{ "cells": [ { "cell_type": "markdown", "id": "86d2536c", "metadata": {}, "source": [ "# Combine information across tables: joins and anti-joins" ] }, { "cell_type": "code", "execution_count": 1, "id": "b6f949f7", "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "markdown", "id": "1d2a4eab", "metadata": {}, "source": [ "# \"Load\" some experimental data" ] }, { "cell_type": "code", "execution_count": 2, "id": "a9450803", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
subject_idcondition_idresponse_timeresponse
0312A10.12LEFT
1312A20.37LEFT
2312C20.68LEFT
3711A14.01RIGHT
4711A20.44LEFT
5313A10.07RIGHT
6313B10.08RIGHT
7712A23.29LEFT
8314A20.29LEFT
9714B23.32RIGHT
10314B10.14RIGHT
11314C20.73RIGHT
12713B15.74LEFT
\n", "
" ], "text/plain": [ " subject_id condition_id response_time response\n", "0 312 A1 0.12 LEFT\n", "1 312 A2 0.37 LEFT\n", "2 312 C2 0.68 LEFT\n", "3 711 A1 4.01 RIGHT\n", "4 711 A2 0.44 LEFT\n", "5 313 A1 0.07 RIGHT\n", "6 313 B1 0.08 RIGHT\n", "7 712 A2 3.29 LEFT\n", "8 314 A2 0.29 LEFT\n", "9 714 B2 3.32 RIGHT\n", "10 314 B1 0.14 RIGHT\n", "11 314 C2 0.73 RIGHT\n", "12 713 B1 5.74 LEFT" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = pd.DataFrame(\n", " data=[\n", " ['312', 'A1', 0.12, 'LEFT'],\n", " ['312', 'A2', 0.37, 'LEFT'],\n", " ['312', 'C2', 0.68, 'LEFT'],\n", " ['711', 'A1', 4.01, 'RIGHT'],\n", " ['711', 'A2', 0.44, 'LEFT'],\n", " ['313', 'A1', 0.07, 'RIGHT'],\n", " ['313', 'B1', 0.08, 'RIGHT'],\n", " ['712', 'A2', 3.29, 'LEFT'],\n", " ['314', 'A2', 0.29, 'LEFT'],\n", " ['714', 'B2', 3.32, 'RIGHT'],\n", " ['314', 'B1', 0.14, 'RIGHT'],\n", " ['314', 'C2', 0.73, 'RIGHT'],\n", " ['713', 'B1', 5.74, 'LEFT'],\n", " ],\n", " columns=['subject_id', 'condition_id', 'response_time', 'response'],\n", ")\n", "data" ] }, { "cell_type": "markdown", "id": "a7e8b09b", "metadata": {}, "source": [ "Each experiment (one row of the table) belongs to one experimental condition, identified by its `condition_id`, but the parameters of each condition are not in the table. They are defined separately in the dictionaries below.\n", "\n", "Note that the table contains condition `C2`, for which no parameters are defined, while the dictionaries define condition `C1`, which never appears in the table." ] }, { "cell_type": "code", "execution_count": 3, "id": "455471d7", "metadata": {}, "outputs": [], "source": [ "condition_to_orientation = {\n", " 'A1': 0,\n", " 'A2': 0,\n", " 'B1': 45,\n", " 'B2': 45,\n", " 'C1': 90,\n", "}\n", "\n", "condition_to_duration = {\n", " 'A1': 0.1,\n", " 'A2': 0.01,\n", " 'B1': 0.1,\n", " 'B2': 0.01,\n", " 'C1': 0.2,\n", "}\n", "\n", "condition_to_surround = {\n", " 'A1': 'FULL',\n", " 'A2': 'NONE',\n", " 'B1': 'NONE',\n", " 'B2': 'FULL',\n", " 'C1': 'FULL',\n", "}\n", "\n", "\n", "condition_to_stimulus_type = {\n", " 'A1': 'LINES',\n", " 'A2': 'DOTS',\n", " 'B1': 'PLAID',\n", " 'B2': 'PLAID',\n", " 'C1': 'WIGGLES',\n", "}\n" ] }, { "cell_type": "markdown", "id": "5ccfd7e7", 
"metadata": {}, "source": [ "# Manually adding the condition parameters to the table" ] }, { "cell_type": "code", "execution_count": 4, "id": "cc32110c", "metadata": {}, "outputs": [], "source": [ "data_with_properties = data.copy()" ] }, { "cell_type": "code", "execution_count": 5, "id": "c322a9af", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 A1\n", "1 A2\n", "2 C2\n", "3 A1\n", "4 A2\n", "5 A1\n", "6 B1\n", "7 A2\n", "8 A2\n", "9 B2\n", "10 B1\n", "11 C2\n", "12 B1\n", "Name: condition_id, dtype: object" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data_with_properties['condition_id']" ] }, { "cell_type": "code", "execution_count": 6, "id": "0dbee78b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 0.0\n", "1 0.0\n", "2 NaN\n", "3 0.0\n", "4 0.0\n", "5 0.0\n", "6 45.0\n", "7 0.0\n", "8 0.0\n", "9 45.0\n", "10 45.0\n", "11 NaN\n", "12 45.0\n", "Name: condition_id, dtype: float64" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data_with_properties['condition_id'].map(condition_to_orientation)" ] }, { "cell_type": "code", "execution_count": 7, "id": "3fb3e3af", "metadata": {}, "outputs": [], "source": [ "data_with_properties['orientation'] = data_with_properties['condition_id'].map(condition_to_orientation)\n", "data_with_properties['duration'] = data_with_properties['condition_id'].map(condition_to_duration)\n", "data_with_properties['surround'] = data_with_properties['condition_id'].map(condition_to_surround)\n", "data_with_properties['stimulus_type'] = data_with_properties['condition_id'].map(condition_to_stimulus_type)" ] }, { "cell_type": "code", "execution_count": 8, "id": "995eff91", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
subject_idcondition_idresponse_timeresponseorientationdurationsurroundstimulus_type
0312A10.12LEFT0.00.10FULLLINES
1312A20.37LEFT0.00.01NONEDOTS
2312C20.68LEFTNaNNaNNaNNaN
3711A14.01RIGHT0.00.10FULLLINES
4711A20.44LEFT0.00.01NONEDOTS
5313A10.07RIGHT0.00.10FULLLINES
6313B10.08RIGHT45.00.10NONEPLAID
7712A23.29LEFT0.00.01NONEDOTS
8314A20.29LEFT0.00.01NONEDOTS
9714B23.32RIGHT45.00.01FULLPLAID
10314B10.14RIGHT45.00.10NONEPLAID
11314C20.73RIGHTNaNNaNNaNNaN
12713B15.74LEFT45.00.10NONEPLAID
\n", "
" ], "text/plain": [ " subject_id condition_id response_time response orientation duration \\\n", "0 312 A1 0.12 LEFT 0.0 0.10 \n", "1 312 A2 0.37 LEFT 0.0 0.01 \n", "2 312 C2 0.68 LEFT NaN NaN \n", "3 711 A1 4.01 RIGHT 0.0 0.10 \n", "4 711 A2 0.44 LEFT 0.0 0.01 \n", "5 313 A1 0.07 RIGHT 0.0 0.10 \n", "6 313 B1 0.08 RIGHT 45.0 0.10 \n", "7 712 A2 3.29 LEFT 0.0 0.01 \n", "8 314 A2 0.29 LEFT 0.0 0.01 \n", "9 714 B2 3.32 RIGHT 45.0 0.01 \n", "10 314 B1 0.14 RIGHT 45.0 0.10 \n", "11 314 C2 0.73 RIGHT NaN NaN \n", "12 713 B1 5.74 LEFT 45.0 0.10 \n", "\n", " surround stimulus_type \n", "0 FULL LINES \n", "1 NONE DOTS \n", "2 NaN NaN \n", "3 FULL LINES \n", "4 NONE DOTS \n", "5 FULL LINES \n", "6 NONE PLAID \n", "7 NONE DOTS \n", "8 NONE DOTS \n", "9 FULL PLAID \n", "10 NONE PLAID \n", "11 NaN NaN \n", "12 NONE PLAID " ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data_with_properties" ] }, { "cell_type": "markdown", "id": "d6e71b13", "metadata": {}, "source": [ "# Using a join operation" ] }, { "cell_type": "code", "execution_count": 9, "id": "d9835d7c", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orientationdurationsurroundstimulus_type
A100.1FULLLINES
A200.01NONEDOTS
B1450.1NONEPLAID
B2450.01FULLPLAID
C1900.2FULLWIGGLES
\n", "
" ], "text/plain": [ " orientation duration surround stimulus_type\n", "A1 0 0.1 FULL LINES\n", "A2 0 0.01 NONE DOTS\n", "B1 45 0.1 NONE PLAID\n", "B2 45 0.01 FULL PLAID\n", "C1 90 0.2 FULL WIGGLES" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Often, this is done using a spreadsheet\n", "condition_properties = pd.DataFrame(\n", " [condition_to_orientation, condition_to_duration, condition_to_surround, condition_to_stimulus_type],\n", " index=['orientation', 'duration', 'surround', 'stimulus_type'],\n", ").T\n", "condition_properties" ] }, { "cell_type": "code", "execution_count": 10, "id": "a9087876", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
subject_idcondition_idresponse_timeresponseorientationdurationsurroundstimulus_type
0312A10.12LEFT00.1FULLLINES
3711A14.01RIGHT00.1FULLLINES
5313A10.07RIGHT00.1FULLLINES
1312A20.37LEFT00.01NONEDOTS
4711A20.44LEFT00.01NONEDOTS
7712A23.29LEFT00.01NONEDOTS
8314A20.29LEFT00.01NONEDOTS
6313B10.08RIGHT450.1NONEPLAID
10314B10.14RIGHT450.1NONEPLAID
12713B15.74LEFT450.1NONEPLAID
9714B23.32RIGHT450.01FULLPLAID
\n", "
" ], "text/plain": [ " subject_id condition_id response_time response orientation duration \\\n", "0 312 A1 0.12 LEFT 0 0.1 \n", "3 711 A1 4.01 RIGHT 0 0.1 \n", "5 313 A1 0.07 RIGHT 0 0.1 \n", "1 312 A2 0.37 LEFT 0 0.01 \n", "4 711 A2 0.44 LEFT 0 0.01 \n", "7 712 A2 3.29 LEFT 0 0.01 \n", "8 314 A2 0.29 LEFT 0 0.01 \n", "6 313 B1 0.08 RIGHT 45 0.1 \n", "10 314 B1 0.14 RIGHT 45 0.1 \n", "12 713 B1 5.74 LEFT 45 0.1 \n", "9 714 B2 3.32 RIGHT 45 0.01 \n", "\n", " surround stimulus_type \n", "0 FULL LINES \n", "3 FULL LINES \n", "5 FULL LINES \n", "1 NONE DOTS \n", "4 NONE DOTS \n", "7 NONE DOTS \n", "8 NONE DOTS \n", "6 NONE PLAID \n", "10 NONE PLAID \n", "12 NONE PLAID \n", "9 FULL PLAID " ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.merge(condition_properties, left_on='condition_id', right_index=True)" ] }, { "cell_type": "code", "execution_count": 11, "id": "61cb65be", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
subject_idcondition_idresponse_timeresponseorientationdurationsurroundstimulus_type
0312A10.12LEFT00.1FULLLINES
1312A20.37LEFT00.01NONEDOTS
2312C20.68LEFTNaNNaNNaNNaN
3711A14.01RIGHT00.1FULLLINES
4711A20.44LEFT00.01NONEDOTS
5313A10.07RIGHT00.1FULLLINES
6313B10.08RIGHT450.1NONEPLAID
7712A23.29LEFT00.01NONEDOTS
8314A20.29LEFT00.01NONEDOTS
9714B23.32RIGHT450.01FULLPLAID
10314B10.14RIGHT450.1NONEPLAID
11314C20.73RIGHTNaNNaNNaNNaN
12713B15.74LEFT450.1NONEPLAID
\n", "
" ], "text/plain": [ " subject_id condition_id response_time response orientation duration \\\n", "0 312 A1 0.12 LEFT 0 0.1 \n", "1 312 A2 0.37 LEFT 0 0.01 \n", "2 312 C2 0.68 LEFT NaN NaN \n", "3 711 A1 4.01 RIGHT 0 0.1 \n", "4 711 A2 0.44 LEFT 0 0.01 \n", "5 313 A1 0.07 RIGHT 0 0.1 \n", "6 313 B1 0.08 RIGHT 45 0.1 \n", "7 712 A2 3.29 LEFT 0 0.01 \n", "8 314 A2 0.29 LEFT 0 0.01 \n", "9 714 B2 3.32 RIGHT 45 0.01 \n", "10 314 B1 0.14 RIGHT 45 0.1 \n", "11 314 C2 0.73 RIGHT NaN NaN \n", "12 713 B1 5.74 LEFT 45 0.1 \n", "\n", " surround stimulus_type \n", "0 FULL LINES \n", "1 NONE DOTS \n", "2 NaN NaN \n", "3 FULL LINES \n", "4 NONE DOTS \n", "5 FULL LINES \n", "6 NONE PLAID \n", "7 NONE DOTS \n", "8 NONE DOTS \n", "9 FULL PLAID \n", "10 NONE PLAID \n", "11 NaN NaN \n", "12 NONE PLAID " ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.merge(condition_properties, left_on='condition_id', right_index=True, how='left')" ] }, { "cell_type": "code", "execution_count": 12, "id": "7b4d23df", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
subject_idcondition_idresponse_timeresponseorientationdurationsurroundstimulus_type
0.0312A10.12LEFT00.1FULLLINES
3.0711A14.01RIGHT00.1FULLLINES
5.0313A10.07RIGHT00.1FULLLINES
1.0312A20.37LEFT00.01NONEDOTS
4.0711A20.44LEFT00.01NONEDOTS
7.0712A23.29LEFT00.01NONEDOTS
8.0314A20.29LEFT00.01NONEDOTS
2.0312C20.68LEFTNaNNaNNaNNaN
11.0314C20.73RIGHTNaNNaNNaNNaN
6.0313B10.08RIGHT450.1NONEPLAID
10.0314B10.14RIGHT450.1NONEPLAID
12.0713B15.74LEFT450.1NONEPLAID
9.0714B23.32RIGHT450.01FULLPLAID
NaNNaNC1NaNNaN900.2FULLWIGGLES
\n", "
" ], "text/plain": [ " subject_id condition_id response_time response orientation duration \\\n", "0.0 312 A1 0.12 LEFT 0 0.1 \n", "3.0 711 A1 4.01 RIGHT 0 0.1 \n", "5.0 313 A1 0.07 RIGHT 0 0.1 \n", "1.0 312 A2 0.37 LEFT 0 0.01 \n", "4.0 711 A2 0.44 LEFT 0 0.01 \n", "7.0 712 A2 3.29 LEFT 0 0.01 \n", "8.0 314 A2 0.29 LEFT 0 0.01 \n", "2.0 312 C2 0.68 LEFT NaN NaN \n", "11.0 314 C2 0.73 RIGHT NaN NaN \n", "6.0 313 B1 0.08 RIGHT 45 0.1 \n", "10.0 314 B1 0.14 RIGHT 45 0.1 \n", "12.0 713 B1 5.74 LEFT 45 0.1 \n", "9.0 714 B2 3.32 RIGHT 45 0.01 \n", "NaN NaN C1 NaN NaN 90 0.2 \n", "\n", " surround stimulus_type \n", "0.0 FULL LINES \n", "3.0 FULL LINES \n", "5.0 FULL LINES \n", "1.0 NONE DOTS \n", "4.0 NONE DOTS \n", "7.0 NONE DOTS \n", "8.0 NONE DOTS \n", "2.0 NaN NaN \n", "11.0 NaN NaN \n", "6.0 NONE PLAID \n", "10.0 NONE PLAID \n", "12.0 NONE PLAID \n", "9.0 FULL PLAID \n", "NaN FULL WIGGLES " ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.merge(condition_properties, left_on='condition_id', right_index=True, how='outer')" ] }, { "cell_type": "markdown", "id": "cba9534f", "metadata": {}, "source": [ "# Anti-join: filter out unwanted data" ] }, { "cell_type": "code", "execution_count": 13, "id": "1cb2bbdb", "metadata": {}, "outputs": [], "source": [ "# We are given a list of subjects that are outliers and should be disregarded in the analysis\n", "outliers = pd.DataFrame([['711'], ['712'], ['713'], ['714'], ['888']], columns=['subject_id'])" ] }, { "cell_type": "code", "execution_count": 14, "id": "e2e627d5", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
subject_idcondition_idresponse_timeresponse
0711A14.01RIGHT
1711A20.44LEFT
2712A23.29LEFT
3714B23.32RIGHT
4713B15.74LEFT
\n", "
" ], "text/plain": [ " subject_id condition_id response_time response\n", "0 711 A1 4.01 RIGHT\n", "1 711 A2 0.44 LEFT\n", "2 712 A2 3.29 LEFT\n", "3 714 B2 3.32 RIGHT\n", "4 713 B1 5.74 LEFT" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.merge(outliers, on='subject_id')" ] }, { "cell_type": "code", "execution_count": 15, "id": "eb809fe0", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
subject_idcondition_idresponse_timeresponse_merge
0312A10.12LEFTleft_only
1312A20.37LEFTleft_only
2312C20.68LEFTleft_only
3711A14.01RIGHTboth
4711A20.44LEFTboth
5313A10.07RIGHTleft_only
6313B10.08RIGHTleft_only
7712A23.29LEFTboth
8314A20.29LEFTleft_only
9314B10.14RIGHTleft_only
10314C20.73RIGHTleft_only
11714B23.32RIGHTboth
12713B15.74LEFTboth
13888NaNNaNNaNright_only
\n", "
" ], "text/plain": [ " subject_id condition_id response_time response _merge\n", "0 312 A1 0.12 LEFT left_only\n", "1 312 A2 0.37 LEFT left_only\n", "2 312 C2 0.68 LEFT left_only\n", "3 711 A1 4.01 RIGHT both\n", "4 711 A2 0.44 LEFT both\n", "5 313 A1 0.07 RIGHT left_only\n", "6 313 B1 0.08 RIGHT left_only\n", "7 712 A2 3.29 LEFT both\n", "8 314 A2 0.29 LEFT left_only\n", "9 314 B1 0.14 RIGHT left_only\n", "10 314 C2 0.73 RIGHT left_only\n", "11 714 B2 3.32 RIGHT both\n", "12 713 B1 5.74 LEFT both\n", "13 888 NaN NaN NaN right_only" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.merge(outliers, on='subject_id', how='outer', indicator=True)" ] }, { "cell_type": "code", "execution_count": 16, "id": "6fdb696e", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
subject_idcondition_idresponse_timeresponse_merge
0312A10.12LEFTleft_only
1312A20.37LEFTleft_only
2312C20.68LEFTleft_only
5313A10.07RIGHTleft_only
6313B10.08RIGHTleft_only
8314A20.29LEFTleft_only
9314B10.14RIGHTleft_only
10314C20.73RIGHTleft_only
\n", "
" ], "text/plain": [ " subject_id condition_id response_time response _merge\n", "0 312 A1 0.12 LEFT left_only\n", "1 312 A2 0.37 LEFT left_only\n", "2 312 C2 0.68 LEFT left_only\n", "5 313 A1 0.07 RIGHT left_only\n", "6 313 B1 0.08 RIGHT left_only\n", "8 314 A2 0.29 LEFT left_only\n", "9 314 B1 0.14 RIGHT left_only\n", "10 314 C2 0.73 RIGHT left_only" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "temp = data.merge(outliers, on='subject_id', how='outer', indicator=True)\n", "data_without_outliers = temp[temp['_merge'] == 'left_only']\n", "data_without_outliers" ] }, { "cell_type": "code", "execution_count": null, "id": "6c3e6baa", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.3" } }, "nbformat": 4, "nbformat_minor": 5 }