{ "cells": [ { "cell_type": "markdown", "id": "247bbf84", "metadata": {}, "source": [ "# Split-apply-combine operations for tabular data" ] }, { "cell_type": "code", "execution_count": 1, "id": "44584190", "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 2, "id": "ba193f3f", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
subject_idcondition_idresponse_timeresponse
0312A10.12LEFT
1312A20.37LEFT
2312C20.68LEFT
3313A10.07RIGHT
4313B10.08RIGHT
5314A20.29LEFT
6314B10.14RIGHT
7314C20.73RIGHT
8711A14.01RIGHT
9712A23.29LEFT
10713B15.74LEFT
11714B23.32RIGHT
\n", "
" ], "text/plain": [ " subject_id condition_id response_time response\n", "0 312 A1 0.12 LEFT\n", "1 312 A2 0.37 LEFT\n", "2 312 C2 0.68 LEFT\n", "3 313 A1 0.07 RIGHT\n", "4 313 B1 0.08 RIGHT\n", "5 314 A2 0.29 LEFT\n", "6 314 B1 0.14 RIGHT\n", "7 314 C2 0.73 RIGHT\n", "8 711 A1 4.01 RIGHT\n", "9 712 A2 3.29 LEFT\n", "10 713 B1 5.74 LEFT\n", "11 714 B2 3.32 RIGHT" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = pd.DataFrame(\n", " data=[\n", " ['312', 'A1', 0.12, 'LEFT'],\n", " ['312', 'A2', 0.37, 'LEFT'],\n", " ['312', 'C2', 0.68, 'LEFT'],\n", " ['313', 'A1', 0.07, 'RIGHT'],\n", " ['313', 'B1', 0.08, 'RIGHT'],\n", " ['314', 'A2', 0.29, 'LEFT'],\n", " ['314', 'B1', 0.14, 'RIGHT'],\n", " ['314', 'C2', 0.73, 'RIGHT'],\n", " ['711', 'A1', 4.01, 'RIGHT'],\n", " ['712', 'A2', 3.29, 'LEFT'],\n", " ['713', 'B1', 5.74, 'LEFT'],\n", " ['714', 'B2', 3.32, 'RIGHT'],\n", " ],\n", " columns=['subject_id', 'condition_id', 'response_time', 'response'],\n", ")\n", "data" ] }, { "cell_type": "markdown", "id": "8a239e0c", "metadata": {}, "source": [ "# Group-by" ] }, { "cell_type": "markdown", "id": "31eba91e", "metadata": {}, "source": [ "We want to compute the mean response time by condition.\n", "\n", "Let's start by doing it by hand, using for loops!" ] }, { "cell_type": "code", "execution_count": 14, "id": "e8331039", "metadata": {}, "outputs": [], "source": [ "conditions = data['condition_id'].unique()\n", "results_dict = {}\n", "for condition in conditions:\n", " group = data[data['condition_id'] == condition]\n", " results_dict[condition] = group['response_time'].mean()\n", "\n", "results = pd.DataFrame([results_dict], index=['response_time']).T" ] }, { "cell_type": "code", "execution_count": 15, "id": "09cb04c4", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
response_time
A11.400000
A21.316667
C20.705000
B11.986667
B23.320000
\n", "
" ], "text/plain": [ " response_time\n", "A1 1.400000\n", "A2 1.316667\n", "C2 0.705000\n", "B1 1.986667\n", "B2 3.320000" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "results" ] }, { "cell_type": "markdown", "id": "2bc09c66", "metadata": {}, "source": [ "This is a basic operation, and we would need to repeat this pattern a million times!\n", "\n", "Pandas and all other tools for tabular data provide a command for performing operations on groups." ] }, { "cell_type": "code", "execution_count": 29, "id": "0500cd4a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# df.groupby(column_name) groups a DataFrame by the values in the column\n", "data.groupby('condition_id')" ] }, { "cell_type": "code", "execution_count": 3, "id": "c5857c4e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "condition_id\n", "A1 3\n", "A2 3\n", "B1 3\n", "B2 1\n", "C2 2\n", "dtype: int64" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# The group-by object can be used as a DataFrame. 
\n", "# Operations are executed on each group individually, then aggregated\n", "data.groupby('condition_id').size()" ] }, { "cell_type": "code", "execution_count": 33, "id": "5c865cc1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "condition_id\n", "A1 1.400000\n", "A2 1.316667\n", "B1 1.986667\n", "B2 3.320000\n", "C2 0.705000\n", "Name: response_time, dtype: float64" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.groupby('condition_id')['response_time'].mean()" ] }, { "cell_type": "code", "execution_count": 36, "id": "615a4515", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "condition_id\n", "A1 4.01\n", "A2 3.29\n", "B1 5.74\n", "B2 3.32\n", "C2 0.73\n", "Name: response_time, dtype: float64" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.groupby('condition_id')['response_time'].max()" ] }, { "cell_type": "markdown", "id": "b0441458", "metadata": {}, "source": [ "# Pivot tables" ] }, { "cell_type": "markdown", "id": "3feec98d", "metadata": {}, "source": [ "We want to look at response time biases when the subjects respond LEFT vs RIGHT. In principle, we expect them to have the same response time in both cases.\n", "\n", "We compute a summary table with 1) condition_id on the rows; 2) response on the columns; 3) the average response time for all experiments with that condition and response.\n", "\n", "We can do it with `groupby`, with some table manipulation commands." 
] }, { "cell_type": "code", "execution_count": 44, "id": "4a8a7d0d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "condition_id response\n", "A1 LEFT 0.120000\n", " RIGHT 2.040000\n", "A2 LEFT 1.316667\n", "B1 LEFT 5.740000\n", " RIGHT 0.110000\n", "B2 RIGHT 3.320000\n", "C2 LEFT 0.680000\n", " RIGHT 0.730000\n", "Name: response_time, dtype: float64" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "summary = data.groupby(['condition_id', 'response'])['response_time'].mean()\n", "summary" ] }, { "cell_type": "code", "execution_count": 45, "id": "e5a645e0", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
responseLEFTRIGHT
condition_id
A10.1200002.04
A21.316667NaN
B15.7400000.11
B2NaN3.32
C20.6800000.73
\n", "
" ], "text/plain": [ "response LEFT RIGHT\n", "condition_id \n", "A1 0.120000 2.04\n", "A2 1.316667 NaN\n", "B1 5.740000 0.11\n", "B2 NaN 3.32\n", "C2 0.680000 0.73" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "summary.unstack(level=1)" ] }, { "cell_type": "markdown", "id": "3307fcc6", "metadata": {}, "source": [ "Pandas has a command called `pivot_table` that can be used to perform this kind of operation straightforwardly." ] }, { "cell_type": "code", "execution_count": 47, "id": "8941edfe", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
responseLEFTRIGHT
condition_id
A10.1200002.04
A21.316667NaN
B15.7400000.11
B2NaN3.32
C20.6800000.73
\n", "
" ], "text/plain": [ "response LEFT RIGHT\n", "condition_id \n", "A1 0.120000 2.04\n", "A2 1.316667 NaN\n", "B1 5.740000 0.11\n", "B2 NaN 3.32\n", "C2 0.680000 0.73" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.pivot_table(index='condition_id', columns='response', values='response_time', aggfunc='mean')" ] }, { "cell_type": "code", "execution_count": 59, "id": "a7d1d998", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
meanstdcount
responseLEFTRIGHTLEFTRIGHTLEFTRIGHT
condition_id
A10.1200002.04NaN2.7860011.02.0
A21.316667NaN1.709425NaN3.0NaN
B15.7400000.11NaN0.0424261.02.0
B2NaN3.32NaNNaNNaN1.0
C20.6800000.73NaNNaN1.01.0
\n", "
" ], "text/plain": [ " mean std count \n", "response LEFT RIGHT LEFT RIGHT LEFT RIGHT\n", "condition_id \n", "A1 0.120000 2.04 NaN 2.786001 1.0 2.0\n", "A2 1.316667 NaN 1.709425 NaN 3.0 NaN\n", "B1 5.740000 0.11 NaN 0.042426 1.0 2.0\n", "B2 NaN 3.32 NaN NaN NaN 1.0\n", "C2 0.680000 0.73 NaN NaN 1.0 1.0" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(\n", " data\n", " .pivot_table(\n", " index='condition_id', \n", " columns='response', \n", " values='response_time', \n", " aggfunc=['mean', 'std', 'count'],\n", " )\n", ")" ] }, { "cell_type": "code", "execution_count": null, "id": "a770b812", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "0234ccf2", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "0c77c2dc", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.3" } }, "nbformat": 4, "nbformat_minor": 5 }