{
"cells": [
{
"cell_type": "markdown",
"id": "86d2536c",
"metadata": {},
"source": [
"# Combine information across tables: joins and anti-joins"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "b6f949f7",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "markdown",
"id": "1d2a4eab",
"metadata": {},
"source": [
"# \"Load\" some experimental data"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a9450803",
"metadata": {},
"outputs": [],
"source": [
"data = pd.DataFrame(\n",
" data=[\n",
" ['312', 'A1', 0.12, 'LEFT'],\n",
" ['312', 'A2', 0.37, 'LEFT'],\n",
" ['312', 'C2', 0.68, 'LEFT'],\n",
" ['711', 'A1', 4.01, 'RIGHT'],\n",
" ['711', 'A2', 0.44, 'LEFT'],\n",
" ['313', 'A1', 0.07, 'RIGHT'],\n",
" ['313', 'B1', 0.08, 'RIGHT'],\n",
" ['712', 'A2', 3.29, 'LEFT'],\n",
" ['314', 'A2', 0.29, 'LEFT'],\n",
" ['714', 'B2', 3.32, 'RIGHT'],\n",
" ['314', 'B1', 0.14, 'RIGHT'],\n",
" ['314', 'C2', 0.73, 'RIGHT'],\n",
" ['713', 'B1', 5.74, 'LEFT'],\n",
" ],\n",
" columns=['subject_id', 'condition_id', 'response_time', 'response'],\n",
")\n",
"data"
]
},
{
"cell_type": "markdown",
"id": "a7e8b09b",
"metadata": {},
"source": [
"Each row of the table belongs to one experimental condition (`condition_id`), but the parameters of each condition are not stored in the table itself."
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "455471d7",
"metadata": {},
"outputs": [],
"source": [
"condition_to_orientation = {\n",
" 'A1': 0,\n",
" 'A2': 0,\n",
" 'B1': 45,\n",
" 'B2': 45,\n",
" 'C1': 90,\n",
"}\n",
"\n",
"condition_to_duration = {\n",
" 'A1': 0.1,\n",
" 'A2': 0.01,\n",
" 'B1': 0.1,\n",
" 'B2': 0.01,\n",
" 'C1': 0.2,\n",
"}\n",
"\n",
"condition_to_surround = {\n",
" 'A1': 'FULL',\n",
" 'A2': 'NONE',\n",
" 'B1': 'NONE',\n",
" 'B2': 'FULL',\n",
" 'C1': 'FULL',\n",
"}\n",
"\n",
"\n",
"condition_to_stimulus_type = {\n",
" 'A1': 'LINES',\n",
" 'A2': 'DOTS',\n",
" 'B1': 'PLAID',\n",
" 'B2': 'PLAID',\n",
" 'C1': 'WIGGLES',\n",
"}\n"
]
},
{
"cell_type": "markdown",
"id": "5ccfd7e7",
"metadata": {},
"source": [
"# Manually adding the condition parameters to the table"
]
},
{
"cell_type": "code",
"execution_count": 73,
"id": "cc32110c",
"metadata": {},
"outputs": [],
"source": [
"# Work on an independent (deep) copy so the original `data` stays untouched.\n",
"data_with_properties = data.copy(deep=True)"
]
},
{
"cell_type": "code",
"execution_count": 74,
"id": "c322a9af",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 A1\n",
"1 A2\n",
"2 C2\n",
"3 A1\n",
"4 A2\n",
"5 A1\n",
"6 B1\n",
"7 A2\n",
"8 A2\n",
"9 B2\n",
"10 B1\n",
"11 C2\n",
"12 B1\n",
"Name: condition_id, dtype: object"
]
},
"execution_count": 74,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# The column that links each row to its experimental condition.\n",
"data_with_properties.loc[:, 'condition_id']"
]
},
{
"cell_type": "code",
"execution_count": 75,
"id": "0dbee78b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 0.0\n",
"1 0.0\n",
"2 NaN\n",
"3 0.0\n",
"4 0.0\n",
"5 0.0\n",
"6 45.0\n",
"7 0.0\n",
"8 0.0\n",
"9 45.0\n",
"10 45.0\n",
"11 NaN\n",
"12 45.0\n",
"Name: condition_id, dtype: float64"
]
},
"execution_count": 75,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# `Series.map` with a dict translates every condition id into its orientation;\n",
"# ids that are not keys of the dict come out as NaN.\n",
"data_with_properties.loc[:, 'condition_id'].map(condition_to_orientation)"
]
},
{
"cell_type": "code",
"execution_count": 76,
"id": "3fb3e3af",
"metadata": {},
"outputs": [],
"source": [
"# Look up every condition parameter through the row's condition id.\n",
"# Conditions missing from a lookup dict (here 'C2') produce NaN.\n",
"condition_ids = data_with_properties['condition_id']\n",
"data_with_properties = data_with_properties.assign(\n",
"    orientation=condition_ids.map(condition_to_orientation),\n",
"    duration=condition_ids.map(condition_to_duration),\n",
"    surround=condition_ids.map(condition_to_surround),\n",
"    stimulus_type=condition_ids.map(condition_to_stimulus_type),\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 77,
"id": "995eff91",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" subject_id | \n",
" condition_id | \n",
" response_time | \n",
" response | \n",
" orientation | \n",
" duration | \n",
" surround | \n",
" stimulus_type | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 312 | \n",
" A1 | \n",
" 0.12 | \n",
" LEFT | \n",
" 0.0 | \n",
" 0.10 | \n",
" FULL | \n",
" LINES | \n",
"
\n",
" \n",
" 1 | \n",
" 312 | \n",
" A2 | \n",
" 0.37 | \n",
" LEFT | \n",
" 0.0 | \n",
" 0.01 | \n",
" NONE | \n",
" DOTS | \n",
"
\n",
" \n",
" 2 | \n",
" 312 | \n",
" C2 | \n",
" 0.68 | \n",
" LEFT | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 3 | \n",
" 711 | \n",
" A1 | \n",
" 4.01 | \n",
" RIGHT | \n",
" 0.0 | \n",
" 0.10 | \n",
" FULL | \n",
" LINES | \n",
"
\n",
" \n",
" 4 | \n",
" 711 | \n",
" A2 | \n",
" 0.44 | \n",
" LEFT | \n",
" 0.0 | \n",
" 0.01 | \n",
" NONE | \n",
" DOTS | \n",
"
\n",
" \n",
" 5 | \n",
" 313 | \n",
" A1 | \n",
" 0.07 | \n",
" RIGHT | \n",
" 0.0 | \n",
" 0.10 | \n",
" FULL | \n",
" LINES | \n",
"
\n",
" \n",
" 6 | \n",
" 313 | \n",
" B1 | \n",
" 0.08 | \n",
" RIGHT | \n",
" 45.0 | \n",
" 0.10 | \n",
" NONE | \n",
" PLAID | \n",
"
\n",
" \n",
" 7 | \n",
" 712 | \n",
" A2 | \n",
" 3.29 | \n",
" LEFT | \n",
" 0.0 | \n",
" 0.01 | \n",
" NONE | \n",
" DOTS | \n",
"
\n",
" \n",
" 8 | \n",
" 314 | \n",
" A2 | \n",
" 0.29 | \n",
" LEFT | \n",
" 0.0 | \n",
" 0.01 | \n",
" NONE | \n",
" DOTS | \n",
"
\n",
" \n",
" 9 | \n",
" 714 | \n",
" B2 | \n",
" 3.32 | \n",
" RIGHT | \n",
" 45.0 | \n",
" 0.01 | \n",
" FULL | \n",
" PLAID | \n",
"
\n",
" \n",
" 10 | \n",
" 314 | \n",
" B1 | \n",
" 0.14 | \n",
" RIGHT | \n",
" 45.0 | \n",
" 0.10 | \n",
" NONE | \n",
" PLAID | \n",
"
\n",
" \n",
" 11 | \n",
" 314 | \n",
" C2 | \n",
" 0.73 | \n",
" RIGHT | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 12 | \n",
" 713 | \n",
" B1 | \n",
" 5.74 | \n",
" LEFT | \n",
" 45.0 | \n",
" 0.10 | \n",
" NONE | \n",
" PLAID | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" subject_id condition_id response_time response orientation duration \\\n",
"0 312 A1 0.12 LEFT 0.0 0.10 \n",
"1 312 A2 0.37 LEFT 0.0 0.01 \n",
"2 312 C2 0.68 LEFT NaN NaN \n",
"3 711 A1 4.01 RIGHT 0.0 0.10 \n",
"4 711 A2 0.44 LEFT 0.0 0.01 \n",
"5 313 A1 0.07 RIGHT 0.0 0.10 \n",
"6 313 B1 0.08 RIGHT 45.0 0.10 \n",
"7 712 A2 3.29 LEFT 0.0 0.01 \n",
"8 314 A2 0.29 LEFT 0.0 0.01 \n",
"9 714 B2 3.32 RIGHT 45.0 0.01 \n",
"10 314 B1 0.14 RIGHT 45.0 0.10 \n",
"11 314 C2 0.73 RIGHT NaN NaN \n",
"12 713 B1 5.74 LEFT 45.0 0.10 \n",
"\n",
" surround stimulus_type \n",
"0 FULL LINES \n",
"1 NONE DOTS \n",
"2 NaN NaN \n",
"3 FULL LINES \n",
"4 NONE DOTS \n",
"5 FULL LINES \n",
"6 NONE PLAID \n",
"7 NONE DOTS \n",
"8 NONE DOTS \n",
"9 FULL PLAID \n",
"10 NONE PLAID \n",
"11 NaN NaN \n",
"12 NONE PLAID "
]
},
"execution_count": 77,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_with_properties"
]
},
{
"cell_type": "markdown",
"id": "d6e71b13",
"metadata": {},
"source": [
"# Using a join operation"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "d9835d7c",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" orientation | \n",
" duration | \n",
" surround | \n",
" stimulus_type | \n",
"
\n",
" \n",
" \n",
" \n",
" A1 | \n",
" 0 | \n",
" 0.1 | \n",
" FULL | \n",
" LINES | \n",
"
\n",
" \n",
" A2 | \n",
" 0 | \n",
" 0.01 | \n",
" NONE | \n",
" DOTS | \n",
"
\n",
" \n",
" B1 | \n",
" 45 | \n",
" 0.1 | \n",
" NONE | \n",
" PLAID | \n",
"
\n",
" \n",
" B2 | \n",
" 45 | \n",
" 0.01 | \n",
" FULL | \n",
" PLAID | \n",
"
\n",
" \n",
" C1 | \n",
" 90 | \n",
" 0.2 | \n",
" FULL | \n",
" WIGGLES | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" orientation duration surround stimulus_type\n",
"A1 0 0.1 FULL LINES\n",
"A2 0 0.01 NONE DOTS\n",
"B1 45 0.1 NONE PLAID\n",
"B2 45 0.01 FULL PLAID\n",
"C1 90 0.2 FULL WIGGLES"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# In practice, a lookup table like this is often maintained in a spreadsheet and loaded from there.\n",
"# Build it column by column (dict of dicts: outer keys become columns, inner keys\n",
"# become the index) so that each property keeps its own dtype. Building it row-wise\n",
"# and transposing would turn every column, including the numeric ones, into `object`.\n",
"condition_properties = pd.DataFrame(\n",
"    {\n",
"        'orientation': condition_to_orientation,\n",
"        'duration': condition_to_duration,\n",
"        'surround': condition_to_surround,\n",
"        'stimulus_type': condition_to_stimulus_type,\n",
"    }\n",
")\n",
"condition_properties"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a9087876",
"metadata": {},
"outputs": [],
"source": [
"# Inner join (the default): only rows whose condition id is found in the\n",
"# index of `condition_properties` are kept.\n",
"data.merge(\n",
"    condition_properties,\n",
"    how='inner',\n",
"    left_on='condition_id',\n",
"    right_index=True,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "61cb65be",
"metadata": {},
"outputs": [],
"source": [
"# Left join: every row of `data` is kept; conditions without an entry in\n",
"# `condition_properties` get NaN in the added columns.\n",
"data.merge(\n",
"    condition_properties,\n",
"    how='left',\n",
"    left_on='condition_id',\n",
"    right_index=True,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7b4d23df",
"metadata": {},
"outputs": [],
"source": [
"# Outer join: unmatched rows from both sides are kept and padded with NaN.\n",
"data.merge(\n",
"    condition_properties,\n",
"    how='outer',\n",
"    left_on='condition_id',\n",
"    right_index=True,\n",
")"
]
},
{
"cell_type": "markdown",
"id": "cba9534f",
"metadata": {},
"source": [
"# Anti-join: filter out unwanted data"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "1cb2bbdb",
"metadata": {},
"outputs": [],
"source": [
"# We are given a list of subjects that are outliers and should be disregarded in the analysis\n",
"outliers = pd.DataFrame([['711'], ['712'], ['713'], ['714'], ['888']], columns=['subject_id'])"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "e2e627d5",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" subject_id | \n",
" condition_id | \n",
" response_time | \n",
" response | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 711 | \n",
" A1 | \n",
" 4.01 | \n",
" RIGHT | \n",
"
\n",
" \n",
" 1 | \n",
" 711 | \n",
" A2 | \n",
" 0.44 | \n",
" LEFT | \n",
"
\n",
" \n",
" 2 | \n",
" 712 | \n",
" A2 | \n",
" 3.29 | \n",
" LEFT | \n",
"
\n",
" \n",
" 3 | \n",
" 714 | \n",
" B2 | \n",
" 3.32 | \n",
" RIGHT | \n",
"
\n",
" \n",
" 4 | \n",
" 713 | \n",
" B1 | \n",
" 5.74 | \n",
" LEFT | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" subject_id condition_id response_time response\n",
"0 711 A1 4.01 RIGHT\n",
"1 711 A2 0.44 LEFT\n",
"2 712 A2 3.29 LEFT\n",
"3 714 B2 3.32 RIGHT\n",
"4 713 B1 5.74 LEFT"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# An inner join on the subject id keeps exactly the rows that belong to outlier\n",
"# subjects, i.e. the opposite of what we want.\n",
"data.merge(\n",
"    outliers,\n",
"    how='inner',\n",
"    on='subject_id',\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "eb809fe0",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" subject_id | \n",
" condition_id | \n",
" response_time | \n",
" response | \n",
" _merge | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 312 | \n",
" A1 | \n",
" 0.12 | \n",
" LEFT | \n",
" left_only | \n",
"
\n",
" \n",
" 1 | \n",
" 312 | \n",
" A2 | \n",
" 0.37 | \n",
" LEFT | \n",
" left_only | \n",
"
\n",
" \n",
" 2 | \n",
" 312 | \n",
" C2 | \n",
" 0.68 | \n",
" LEFT | \n",
" left_only | \n",
"
\n",
" \n",
" 3 | \n",
" 711 | \n",
" A1 | \n",
" 4.01 | \n",
" RIGHT | \n",
" both | \n",
"
\n",
" \n",
" 4 | \n",
" 711 | \n",
" A2 | \n",
" 0.44 | \n",
" LEFT | \n",
" both | \n",
"
\n",
" \n",
" 5 | \n",
" 313 | \n",
" A1 | \n",
" 0.07 | \n",
" RIGHT | \n",
" left_only | \n",
"
\n",
" \n",
" 6 | \n",
" 313 | \n",
" B1 | \n",
" 0.08 | \n",
" RIGHT | \n",
" left_only | \n",
"
\n",
" \n",
" 7 | \n",
" 712 | \n",
" A2 | \n",
" 3.29 | \n",
" LEFT | \n",
" both | \n",
"
\n",
" \n",
" 8 | \n",
" 314 | \n",
" A2 | \n",
" 0.29 | \n",
" LEFT | \n",
" left_only | \n",
"
\n",
" \n",
" 9 | \n",
" 314 | \n",
" B1 | \n",
" 0.14 | \n",
" RIGHT | \n",
" left_only | \n",
"
\n",
" \n",
" 10 | \n",
" 314 | \n",
" C2 | \n",
" 0.73 | \n",
" RIGHT | \n",
" left_only | \n",
"
\n",
" \n",
" 11 | \n",
" 714 | \n",
" B2 | \n",
" 3.32 | \n",
" RIGHT | \n",
" both | \n",
"
\n",
" \n",
" 12 | \n",
" 713 | \n",
" B1 | \n",
" 5.74 | \n",
" LEFT | \n",
" both | \n",
"
\n",
" \n",
" 13 | \n",
" 888 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" right_only | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" subject_id condition_id response_time response _merge\n",
"0 312 A1 0.12 LEFT left_only\n",
"1 312 A2 0.37 LEFT left_only\n",
"2 312 C2 0.68 LEFT left_only\n",
"3 711 A1 4.01 RIGHT both\n",
"4 711 A2 0.44 LEFT both\n",
"5 313 A1 0.07 RIGHT left_only\n",
"6 313 B1 0.08 RIGHT left_only\n",
"7 712 A2 3.29 LEFT both\n",
"8 314 A2 0.29 LEFT left_only\n",
"9 314 B1 0.14 RIGHT left_only\n",
"10 314 C2 0.73 RIGHT left_only\n",
"11 714 B2 3.32 RIGHT both\n",
"12 713 B1 5.74 LEFT both\n",
"13 888 NaN NaN NaN right_only"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# `indicator=True` adds a `_merge` column telling where each row's key was found:\n",
"# 'left_only' (only in `data`), 'right_only' (only in `outliers`) or 'both'.\n",
"data.merge(\n",
"    outliers,\n",
"    how='outer',\n",
"    on='subject_id',\n",
"    indicator=True,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "6fdb696e",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" subject_id | \n",
" condition_id | \n",
" response_time | \n",
" response | \n",
" _merge | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 312 | \n",
" A1 | \n",
" 0.12 | \n",
" LEFT | \n",
" left_only | \n",
"
\n",
" \n",
" 1 | \n",
" 312 | \n",
" A2 | \n",
" 0.37 | \n",
" LEFT | \n",
" left_only | \n",
"
\n",
" \n",
" 2 | \n",
" 312 | \n",
" C2 | \n",
" 0.68 | \n",
" LEFT | \n",
" left_only | \n",
"
\n",
" \n",
" 5 | \n",
" 313 | \n",
" A1 | \n",
" 0.07 | \n",
" RIGHT | \n",
" left_only | \n",
"
\n",
" \n",
" 6 | \n",
" 313 | \n",
" B1 | \n",
" 0.08 | \n",
" RIGHT | \n",
" left_only | \n",
"
\n",
" \n",
" 8 | \n",
" 314 | \n",
" A2 | \n",
" 0.29 | \n",
" LEFT | \n",
" left_only | \n",
"
\n",
" \n",
" 9 | \n",
" 314 | \n",
" B1 | \n",
" 0.14 | \n",
" RIGHT | \n",
" left_only | \n",
"
\n",
" \n",
" 10 | \n",
" 314 | \n",
" C2 | \n",
" 0.73 | \n",
" RIGHT | \n",
" left_only | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" subject_id condition_id response_time response _merge\n",
"0 312 A1 0.12 LEFT left_only\n",
"1 312 A2 0.37 LEFT left_only\n",
"2 312 C2 0.68 LEFT left_only\n",
"5 313 A1 0.07 RIGHT left_only\n",
"6 313 B1 0.08 RIGHT left_only\n",
"8 314 A2 0.29 LEFT left_only\n",
"9 314 B1 0.14 RIGHT left_only\n",
"10 314 C2 0.73 RIGHT left_only"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Anti-join: keep only the rows of `data` whose subject is NOT listed in `outliers`.\n",
"# A left join is sufficient, because outlier ids without any data (e.g. '888') are\n",
"# irrelevant for the result. The `_merge` indicator is only a helper for the filter\n",
"# and is dropped afterwards, so the result has the same columns as `data`.\n",
"data_without_outliers = (\n",
"    data\n",
"    .merge(outliers, on='subject_id', how='left', indicator=True)\n",
"    .loc[lambda merged: merged['_merge'] == 'left_only']\n",
"    .drop(columns='_merge')\n",
")\n",
"data_without_outliers"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6c3e6baa",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}