From 4930de7f0cdac30cf647ec2502222c69be1bcc76 Mon Sep 17 00:00:00 2001 From: Guillermo Aguilar Date: Tue, 23 Sep 2025 13:59:55 +0200 Subject: [PATCH] adds notebooks with live coding --- .../tabular_data/020_join_operations.ipynb | 1789 +++++++++++++++++ .../030_split-apply-combine.ipynb | 814 ++++++++ .../tabular_data/040_window_functions.ipynb | 1429 +++++++++++++ 3 files changed, 4032 insertions(+) create mode 100644 notebooks/tabular_data/020_join_operations.ipynb create mode 100644 notebooks/tabular_data/030_split-apply-combine.ipynb create mode 100644 notebooks/tabular_data/040_window_functions.ipynb diff --git a/notebooks/tabular_data/020_join_operations.ipynb b/notebooks/tabular_data/020_join_operations.ipynb new file mode 100644 index 0000000..542859b --- /dev/null +++ b/notebooks/tabular_data/020_join_operations.ipynb @@ -0,0 +1,1789 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "86d2536c", + "metadata": {}, + "source": [ + "# Combine information across tables: joins and anti-joins" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "b6f949f7", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "id": "1d2a4eab", + "metadata": {}, + "source": [ + "# \"Load\" some experimental data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "a9450803", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
subject_idcondition_idresponse_timeresponse
0312A10.12LEFT
1312A20.37LEFT
2312C20.68LEFT
3711A14.01RIGHT
4711A20.44LEFT
5313A10.07RIGHT
6313B10.08RIGHT
7712A23.29LEFT
8314A20.29LEFT
9714B23.32RIGHT
10314B10.14RIGHT
11314C20.73RIGHT
12713B15.74LEFT
\n", + "
" + ], + "text/plain": [ + " subject_id condition_id response_time response\n", + "0 312 A1 0.12 LEFT\n", + "1 312 A2 0.37 LEFT\n", + "2 312 C2 0.68 LEFT\n", + "3 711 A1 4.01 RIGHT\n", + "4 711 A2 0.44 LEFT\n", + "5 313 A1 0.07 RIGHT\n", + "6 313 B1 0.08 RIGHT\n", + "7 712 A2 3.29 LEFT\n", + "8 314 A2 0.29 LEFT\n", + "9 714 B2 3.32 RIGHT\n", + "10 314 B1 0.14 RIGHT\n", + "11 314 C2 0.73 RIGHT\n", + "12 713 B1 5.74 LEFT" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = pd.DataFrame(\n", + " data=[\n", + " ['312', 'A1', 0.12, 'LEFT'],\n", + " ['312', 'A2', 0.37, 'LEFT'],\n", + " ['312', 'C2', 0.68, 'LEFT'],\n", + " ['711', 'A1', 4.01, 'RIGHT'],\n", + " ['711', 'A2', 0.44, 'LEFT'],\n", + " ['313', 'A1', 0.07, 'RIGHT'],\n", + " ['313', 'B1', 0.08, 'RIGHT'],\n", + " ['712', 'A2', 3.29, 'LEFT'],\n", + " ['314', 'A2', 0.29, 'LEFT'],\n", + " ['714', 'B2', 3.32, 'RIGHT'],\n", + " ['314', 'B1', 0.14, 'RIGHT'],\n", + " ['314', 'C2', 0.73, 'RIGHT'],\n", + " ['713', 'B1', 5.74, 'LEFT'],\n", + " ],\n", + " columns=['subject_id', 'condition_id', 'response_time', 'response'],\n", + ")\n", + "data" + ] + }, + { + "cell_type": "markdown", + "id": "a7e8b09b", + "metadata": {}, + "source": [ + "Each experiment belongs to one experimental condition, but the parameters of each condition are not in the table" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "455471d7", + "metadata": {}, + "outputs": [], + "source": [ + "condition_to_orientation = {\n", + " 'A1': 0,\n", + " 'A2': 0,\n", + " 'B1': 45,\n", + " 'B2': 45,\n", + " 'C1': 90,\n", + "}\n", + "\n", + "condition_to_duration = {\n", + " 'A1': 0.1,\n", + " 'A2': 0.01,\n", + " 'B1': 0.1,\n", + " 'B2': 0.01,\n", + " 'C1': 0.2,\n", + "}\n", + "\n", + "condition_to_surround = {\n", + " 'A1': 'FULL',\n", + " 'A2': 'NONE',\n", + " 'B1': 'NONE',\n", + " 'B2': 'FULL',\n", + " 'C1': 'FULL',\n", + "}\n", + "\n", + "\n", + "condition_to_stimulus_type = {\n", 
+ " 'A1': 'LINES',\n", + " 'A2': 'DOTS',\n", + " 'B1': 'PLAID',\n", + " 'B2': 'PLAID',\n", + " 'C1': 'WIGGLES',\n", + "}\n" + ] + }, + { + "cell_type": "markdown", + "id": "5ccfd7e7", + "metadata": {}, + "source": [ + "# Manually adding the condition parameters to the table" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "cc32110c", + "metadata": {}, + "outputs": [], + "source": [ + "data_with_properties = data.copy()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "c322a9af", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 A1\n", + "1 A2\n", + "2 C2\n", + "3 A1\n", + "4 A2\n", + "5 A1\n", + "6 B1\n", + "7 A2\n", + "8 A2\n", + "9 B2\n", + "10 B1\n", + "11 C2\n", + "12 B1\n", + "Name: condition_id, dtype: object" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_with_properties['condition_id']" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "0dbee78b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 0.0\n", + "1 0.0\n", + "2 NaN\n", + "3 0.0\n", + "4 0.0\n", + "5 0.0\n", + "6 45.0\n", + "7 0.0\n", + "8 0.0\n", + "9 45.0\n", + "10 45.0\n", + "11 NaN\n", + "12 45.0\n", + "Name: condition_id, dtype: float64" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_with_properties['condition_id'].map(condition_to_orientation)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "3fb3e3af", + "metadata": {}, + "outputs": [], + "source": [ + "data_with_properties['orientation'] = data_with_properties['condition_id'].map(condition_to_orientation)\n", + "data_with_properties['duration'] = data_with_properties['condition_id'].map(condition_to_duration)\n", + "data_with_properties['surround'] = data_with_properties['condition_id'].map(condition_to_surround)\n", + "data_with_properties['stimulus_type'] = 
data_with_properties['condition_id'].map(condition_to_stimulus_type)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "995eff91", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
subject_idcondition_idresponse_timeresponseorientationdurationsurroundstimulus_type
0312A10.12LEFT0.00.10FULLLINES
1312A20.37LEFT0.00.01NONEDOTS
2312C20.68LEFTNaNNaNNaNNaN
3711A14.01RIGHT0.00.10FULLLINES
4711A20.44LEFT0.00.01NONEDOTS
5313A10.07RIGHT0.00.10FULLLINES
6313B10.08RIGHT45.00.10NONEPLAID
7712A23.29LEFT0.00.01NONEDOTS
8314A20.29LEFT0.00.01NONEDOTS
9714B23.32RIGHT45.00.01FULLPLAID
10314B10.14RIGHT45.00.10NONEPLAID
11314C20.73RIGHTNaNNaNNaNNaN
12713B15.74LEFT45.00.10NONEPLAID
\n", + "
" + ], + "text/plain": [ + " subject_id condition_id response_time response orientation duration \\\n", + "0 312 A1 0.12 LEFT 0.0 0.10 \n", + "1 312 A2 0.37 LEFT 0.0 0.01 \n", + "2 312 C2 0.68 LEFT NaN NaN \n", + "3 711 A1 4.01 RIGHT 0.0 0.10 \n", + "4 711 A2 0.44 LEFT 0.0 0.01 \n", + "5 313 A1 0.07 RIGHT 0.0 0.10 \n", + "6 313 B1 0.08 RIGHT 45.0 0.10 \n", + "7 712 A2 3.29 LEFT 0.0 0.01 \n", + "8 314 A2 0.29 LEFT 0.0 0.01 \n", + "9 714 B2 3.32 RIGHT 45.0 0.01 \n", + "10 314 B1 0.14 RIGHT 45.0 0.10 \n", + "11 314 C2 0.73 RIGHT NaN NaN \n", + "12 713 B1 5.74 LEFT 45.0 0.10 \n", + "\n", + " surround stimulus_type \n", + "0 FULL LINES \n", + "1 NONE DOTS \n", + "2 NaN NaN \n", + "3 FULL LINES \n", + "4 NONE DOTS \n", + "5 FULL LINES \n", + "6 NONE PLAID \n", + "7 NONE DOTS \n", + "8 NONE DOTS \n", + "9 FULL PLAID \n", + "10 NONE PLAID \n", + "11 NaN NaN \n", + "12 NONE PLAID " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_with_properties" + ] + }, + { + "cell_type": "markdown", + "id": "d6e71b13", + "metadata": {}, + "source": [ + "# Using a join operation" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "d9835d7c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
orientationdurationsurroundstimulus_type
A100.1FULLLINES
A200.01NONEDOTS
B1450.1NONEPLAID
B2450.01FULLPLAID
C1900.2FULLWIGGLES
\n", + "
" + ], + "text/plain": [ + " orientation duration surround stimulus_type\n", + "A1 0 0.1 FULL LINES\n", + "A2 0 0.01 NONE DOTS\n", + "B1 45 0.1 NONE PLAID\n", + "B2 45 0.01 FULL PLAID\n", + "C1 90 0.2 FULL WIGGLES" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Often, this is done using a spreadsheet\n", + "condition_properties = pd.DataFrame(\n", + " [condition_to_orientation, condition_to_duration, condition_to_surround, condition_to_stimulus_type],\n", + " index=['orientation', 'duration', 'surround', 'stimulus_type'],\n", + ").T\n", + "condition_properties" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "a9087876", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
subject_idcondition_idresponse_timeresponseorientationdurationsurroundstimulus_type
0312A10.12LEFT00.1FULLLINES
3711A14.01RIGHT00.1FULLLINES
5313A10.07RIGHT00.1FULLLINES
1312A20.37LEFT00.01NONEDOTS
4711A20.44LEFT00.01NONEDOTS
7712A23.29LEFT00.01NONEDOTS
8314A20.29LEFT00.01NONEDOTS
6313B10.08RIGHT450.1NONEPLAID
10314B10.14RIGHT450.1NONEPLAID
12713B15.74LEFT450.1NONEPLAID
9714B23.32RIGHT450.01FULLPLAID
\n", + "
" + ], + "text/plain": [ + " subject_id condition_id response_time response orientation duration \\\n", + "0 312 A1 0.12 LEFT 0 0.1 \n", + "3 711 A1 4.01 RIGHT 0 0.1 \n", + "5 313 A1 0.07 RIGHT 0 0.1 \n", + "1 312 A2 0.37 LEFT 0 0.01 \n", + "4 711 A2 0.44 LEFT 0 0.01 \n", + "7 712 A2 3.29 LEFT 0 0.01 \n", + "8 314 A2 0.29 LEFT 0 0.01 \n", + "6 313 B1 0.08 RIGHT 45 0.1 \n", + "10 314 B1 0.14 RIGHT 45 0.1 \n", + "12 713 B1 5.74 LEFT 45 0.1 \n", + "9 714 B2 3.32 RIGHT 45 0.01 \n", + "\n", + " surround stimulus_type \n", + "0 FULL LINES \n", + "3 FULL LINES \n", + "5 FULL LINES \n", + "1 NONE DOTS \n", + "4 NONE DOTS \n", + "7 NONE DOTS \n", + "8 NONE DOTS \n", + "6 NONE PLAID \n", + "10 NONE PLAID \n", + "12 NONE PLAID \n", + "9 FULL PLAID " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.merge(condition_properties, left_on='condition_id', right_index=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "61cb65be", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
subject_idcondition_idresponse_timeresponseorientationdurationsurroundstimulus_type
0312A10.12LEFT00.1FULLLINES
1312A20.37LEFT00.01NONEDOTS
2312C20.68LEFTNaNNaNNaNNaN
3711A14.01RIGHT00.1FULLLINES
4711A20.44LEFT00.01NONEDOTS
5313A10.07RIGHT00.1FULLLINES
6313B10.08RIGHT450.1NONEPLAID
7712A23.29LEFT00.01NONEDOTS
8314A20.29LEFT00.01NONEDOTS
9714B23.32RIGHT450.01FULLPLAID
10314B10.14RIGHT450.1NONEPLAID
11314C20.73RIGHTNaNNaNNaNNaN
12713B15.74LEFT450.1NONEPLAID
\n", + "
" + ], + "text/plain": [ + " subject_id condition_id response_time response orientation duration \\\n", + "0 312 A1 0.12 LEFT 0 0.1 \n", + "1 312 A2 0.37 LEFT 0 0.01 \n", + "2 312 C2 0.68 LEFT NaN NaN \n", + "3 711 A1 4.01 RIGHT 0 0.1 \n", + "4 711 A2 0.44 LEFT 0 0.01 \n", + "5 313 A1 0.07 RIGHT 0 0.1 \n", + "6 313 B1 0.08 RIGHT 45 0.1 \n", + "7 712 A2 3.29 LEFT 0 0.01 \n", + "8 314 A2 0.29 LEFT 0 0.01 \n", + "9 714 B2 3.32 RIGHT 45 0.01 \n", + "10 314 B1 0.14 RIGHT 45 0.1 \n", + "11 314 C2 0.73 RIGHT NaN NaN \n", + "12 713 B1 5.74 LEFT 45 0.1 \n", + "\n", + " surround stimulus_type \n", + "0 FULL LINES \n", + "1 NONE DOTS \n", + "2 NaN NaN \n", + "3 FULL LINES \n", + "4 NONE DOTS \n", + "5 FULL LINES \n", + "6 NONE PLAID \n", + "7 NONE DOTS \n", + "8 NONE DOTS \n", + "9 FULL PLAID \n", + "10 NONE PLAID \n", + "11 NaN NaN \n", + "12 NONE PLAID " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.merge(condition_properties, left_on='condition_id', right_index=True, how='left')" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "7b4d23df", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
subject_idcondition_idresponse_timeresponseorientationdurationsurroundstimulus_type
0.0312A10.12LEFT00.1FULLLINES
3.0711A14.01RIGHT00.1FULLLINES
5.0313A10.07RIGHT00.1FULLLINES
1.0312A20.37LEFT00.01NONEDOTS
4.0711A20.44LEFT00.01NONEDOTS
7.0712A23.29LEFT00.01NONEDOTS
8.0314A20.29LEFT00.01NONEDOTS
2.0312C20.68LEFTNaNNaNNaNNaN
11.0314C20.73RIGHTNaNNaNNaNNaN
6.0313B10.08RIGHT450.1NONEPLAID
10.0314B10.14RIGHT450.1NONEPLAID
12.0713B15.74LEFT450.1NONEPLAID
9.0714B23.32RIGHT450.01FULLPLAID
NaNNaNC1NaNNaN900.2FULLWIGGLES
\n", + "
" + ], + "text/plain": [ + " subject_id condition_id response_time response orientation duration \\\n", + "0.0 312 A1 0.12 LEFT 0 0.1 \n", + "3.0 711 A1 4.01 RIGHT 0 0.1 \n", + "5.0 313 A1 0.07 RIGHT 0 0.1 \n", + "1.0 312 A2 0.37 LEFT 0 0.01 \n", + "4.0 711 A2 0.44 LEFT 0 0.01 \n", + "7.0 712 A2 3.29 LEFT 0 0.01 \n", + "8.0 314 A2 0.29 LEFT 0 0.01 \n", + "2.0 312 C2 0.68 LEFT NaN NaN \n", + "11.0 314 C2 0.73 RIGHT NaN NaN \n", + "6.0 313 B1 0.08 RIGHT 45 0.1 \n", + "10.0 314 B1 0.14 RIGHT 45 0.1 \n", + "12.0 713 B1 5.74 LEFT 45 0.1 \n", + "9.0 714 B2 3.32 RIGHT 45 0.01 \n", + "NaN NaN C1 NaN NaN 90 0.2 \n", + "\n", + " surround stimulus_type \n", + "0.0 FULL LINES \n", + "3.0 FULL LINES \n", + "5.0 FULL LINES \n", + "1.0 NONE DOTS \n", + "4.0 NONE DOTS \n", + "7.0 NONE DOTS \n", + "8.0 NONE DOTS \n", + "2.0 NaN NaN \n", + "11.0 NaN NaN \n", + "6.0 NONE PLAID \n", + "10.0 NONE PLAID \n", + "12.0 NONE PLAID \n", + "9.0 FULL PLAID \n", + "NaN FULL WIGGLES " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.merge(condition_properties, left_on='condition_id', right_index=True, how='outer')" + ] + }, + { + "cell_type": "markdown", + "id": "cba9534f", + "metadata": {}, + "source": [ + "# Anti-join: filter out unwanted data" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "1cb2bbdb", + "metadata": {}, + "outputs": [], + "source": [ + "# We are given a list of subjects that are outliers and should be disregarded in the analysis\n", + "outliers = pd.DataFrame([['711'], ['712'], ['713'], ['714'], ['888']], columns=['subject_id'])" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "e2e627d5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
subject_idcondition_idresponse_timeresponse
0711A14.01RIGHT
1711A20.44LEFT
2712A23.29LEFT
3714B23.32RIGHT
4713B15.74LEFT
\n", + "
" + ], + "text/plain": [ + " subject_id condition_id response_time response\n", + "0 711 A1 4.01 RIGHT\n", + "1 711 A2 0.44 LEFT\n", + "2 712 A2 3.29 LEFT\n", + "3 714 B2 3.32 RIGHT\n", + "4 713 B1 5.74 LEFT" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.merge(outliers, on='subject_id')" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "eb809fe0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
subject_idcondition_idresponse_timeresponse_merge
0312A10.12LEFTleft_only
1312A20.37LEFTleft_only
2312C20.68LEFTleft_only
3711A14.01RIGHTboth
4711A20.44LEFTboth
5313A10.07RIGHTleft_only
6313B10.08RIGHTleft_only
7712A23.29LEFTboth
8314A20.29LEFTleft_only
9314B10.14RIGHTleft_only
10314C20.73RIGHTleft_only
11714B23.32RIGHTboth
12713B15.74LEFTboth
13888NaNNaNNaNright_only
\n", + "
" + ], + "text/plain": [ + " subject_id condition_id response_time response _merge\n", + "0 312 A1 0.12 LEFT left_only\n", + "1 312 A2 0.37 LEFT left_only\n", + "2 312 C2 0.68 LEFT left_only\n", + "3 711 A1 4.01 RIGHT both\n", + "4 711 A2 0.44 LEFT both\n", + "5 313 A1 0.07 RIGHT left_only\n", + "6 313 B1 0.08 RIGHT left_only\n", + "7 712 A2 3.29 LEFT both\n", + "8 314 A2 0.29 LEFT left_only\n", + "9 314 B1 0.14 RIGHT left_only\n", + "10 314 C2 0.73 RIGHT left_only\n", + "11 714 B2 3.32 RIGHT both\n", + "12 713 B1 5.74 LEFT both\n", + "13 888 NaN NaN NaN right_only" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.merge(outliers, on='subject_id', how='outer', indicator=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "6fdb696e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
subject_idcondition_idresponse_timeresponse_merge
0312A10.12LEFTleft_only
1312A20.37LEFTleft_only
2312C20.68LEFTleft_only
5313A10.07RIGHTleft_only
6313B10.08RIGHTleft_only
8314A20.29LEFTleft_only
9314B10.14RIGHTleft_only
10314C20.73RIGHTleft_only
\n", + "
" + ], + "text/plain": [ + " subject_id condition_id response_time response _merge\n", + "0 312 A1 0.12 LEFT left_only\n", + "1 312 A2 0.37 LEFT left_only\n", + "2 312 C2 0.68 LEFT left_only\n", + "5 313 A1 0.07 RIGHT left_only\n", + "6 313 B1 0.08 RIGHT left_only\n", + "8 314 A2 0.29 LEFT left_only\n", + "9 314 B1 0.14 RIGHT left_only\n", + "10 314 C2 0.73 RIGHT left_only" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "temp = data.merge(outliers, on='subject_id', how='outer', indicator=True)\n", + "data_without_outliers = temp[temp['_merge'] == 'left_only']\n", + "data_without_outliers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6c3e6baa", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/tabular_data/030_split-apply-combine.ipynb b/notebooks/tabular_data/030_split-apply-combine.ipynb new file mode 100644 index 0000000..5c199e3 --- /dev/null +++ b/notebooks/tabular_data/030_split-apply-combine.ipynb @@ -0,0 +1,814 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "247bbf84", + "metadata": {}, + "source": [ + "# Split-apply-combine operations for tabular data" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "44584190", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "ba193f3f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
subject_idcondition_idresponse_timeresponse
0312A10.12LEFT
1312A20.37LEFT
2312C20.68LEFT
3313A10.07RIGHT
4313B10.08RIGHT
5314A20.29LEFT
6314B10.14RIGHT
7314C20.73RIGHT
8711A14.01RIGHT
9712A23.29LEFT
10713B15.74LEFT
11714B23.32RIGHT
\n", + "
" + ], + "text/plain": [ + " subject_id condition_id response_time response\n", + "0 312 A1 0.12 LEFT\n", + "1 312 A2 0.37 LEFT\n", + "2 312 C2 0.68 LEFT\n", + "3 313 A1 0.07 RIGHT\n", + "4 313 B1 0.08 RIGHT\n", + "5 314 A2 0.29 LEFT\n", + "6 314 B1 0.14 RIGHT\n", + "7 314 C2 0.73 RIGHT\n", + "8 711 A1 4.01 RIGHT\n", + "9 712 A2 3.29 LEFT\n", + "10 713 B1 5.74 LEFT\n", + "11 714 B2 3.32 RIGHT" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = pd.DataFrame(\n", + " data=[\n", + " ['312', 'A1', 0.12, 'LEFT'],\n", + " ['312', 'A2', 0.37, 'LEFT'],\n", + " ['312', 'C2', 0.68, 'LEFT'],\n", + " ['313', 'A1', 0.07, 'RIGHT'],\n", + " ['313', 'B1', 0.08, 'RIGHT'],\n", + " ['314', 'A2', 0.29, 'LEFT'],\n", + " ['314', 'B1', 0.14, 'RIGHT'],\n", + " ['314', 'C2', 0.73, 'RIGHT'],\n", + " ['711', 'A1', 4.01, 'RIGHT'],\n", + " ['712', 'A2', 3.29, 'LEFT'],\n", + " ['713', 'B1', 5.74, 'LEFT'],\n", + " ['714', 'B2', 3.32, 'RIGHT'],\n", + " ],\n", + " columns=['subject_id', 'condition_id', 'response_time', 'response'],\n", + ")\n", + "data" + ] + }, + { + "cell_type": "markdown", + "id": "8a239e0c", + "metadata": {}, + "source": [ + "# Group-by" + ] + }, + { + "cell_type": "markdown", + "id": "31eba91e", + "metadata": {}, + "source": [ + "We want to compute the mean response time by condition.\n", + "\n", + "Let's start by doing it by hand, using for loops!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "e8331039", + "metadata": {}, + "outputs": [], + "source": [ + "conditions = data['condition_id'].unique()\n", + "results_dict = {}\n", + "for condition in conditions:\n", + " group = data[data['condition_id'] == condition]\n", + " results_dict[condition] = group['response_time'].mean()\n", + "\n", + "results = pd.DataFrame([results_dict], index=['response_time']).T" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "09cb04c4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
response_time
A11.400000
A21.316667
C20.705000
B11.986667
B23.320000
\n", + "
" + ], + "text/plain": [ + " response_time\n", + "A1 1.400000\n", + "A2 1.316667\n", + "C2 0.705000\n", + "B1 1.986667\n", + "B2 3.320000" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "results" + ] + }, + { + "cell_type": "markdown", + "id": "2bc09c66", + "metadata": {}, + "source": [ + "This is a basic operation, and we would need to repeat this pattern a million times!\n", + "\n", + "Pandas and all other tools for tabular data provide a command for performing operations on groups." + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "0500cd4a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# df.groupby(column_name) groups a DataFrame by the values in the column\n", + "data.groupby('condition_id')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c5857c4e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "condition_id\n", + "A1 3\n", + "A2 3\n", + "B1 3\n", + "B2 1\n", + "C2 2\n", + "dtype: int64" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# The group-by object can be used as a DataFrame. 
\n", + "# Operations are executed on each group individually, then aggregated\n", + "data.groupby('condition_id').size()" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "5c865cc1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "condition_id\n", + "A1 1.400000\n", + "A2 1.316667\n", + "B1 1.986667\n", + "B2 3.320000\n", + "C2 0.705000\n", + "Name: response_time, dtype: float64" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.groupby('condition_id')['response_time'].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "615a4515", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "condition_id\n", + "A1 4.01\n", + "A2 3.29\n", + "B1 5.74\n", + "B2 3.32\n", + "C2 0.73\n", + "Name: response_time, dtype: float64" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.groupby('condition_id')['response_time'].max()" + ] + }, + { + "cell_type": "markdown", + "id": "b0441458", + "metadata": {}, + "source": [ + "# Pivot tables" + ] + }, + { + "cell_type": "markdown", + "id": "3feec98d", + "metadata": {}, + "source": [ + "We want to look at response time biases when the subjects respond LEFT vs RIGHT. In principle, we expect them to have the same response time in both cases.\n", + "\n", + "We compute a summary table with 1) condition_id on the rows; 2) response on the columns; 3) the average response time for all experiments with a that condition and response\n", + "\n", + "We can do it with `groupby`, with some table manipulation commands." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "4a8a7d0d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "condition_id response\n", + "A1 LEFT 0.120000\n", + " RIGHT 2.040000\n", + "A2 LEFT 1.316667\n", + "B1 LEFT 5.740000\n", + " RIGHT 0.110000\n", + "B2 RIGHT 3.320000\n", + "C2 LEFT 0.680000\n", + " RIGHT 0.730000\n", + "Name: response_time, dtype: float64" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "summary = data.groupby(['condition_id', 'response'])['response_time'].mean()\n", + "summary" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "e5a645e0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
responseLEFTRIGHT
condition_id
A10.1200002.04
A21.316667NaN
B15.7400000.11
B2NaN3.32
C20.6800000.73
\n", + "
" + ], + "text/plain": [ + "response LEFT RIGHT\n", + "condition_id \n", + "A1 0.120000 2.04\n", + "A2 1.316667 NaN\n", + "B1 5.740000 0.11\n", + "B2 NaN 3.32\n", + "C2 0.680000 0.73" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "summary.unstack(level=1)" + ] + }, + { + "cell_type": "markdown", + "id": "3307fcc6", + "metadata": {}, + "source": [ + "Pandas has a command called `pivot_table` that can be used to perform this kind of operation straightforwardly." + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "8941edfe", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
responseLEFTRIGHT
condition_id
A10.1200002.04
A21.316667NaN
B15.7400000.11
B2NaN3.32
C20.6800000.73
\n", + "
" + ], + "text/plain": [ + "response LEFT RIGHT\n", + "condition_id \n", + "A1 0.120000 2.04\n", + "A2 1.316667 NaN\n", + "B1 5.740000 0.11\n", + "B2 NaN 3.32\n", + "C2 0.680000 0.73" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.pivot_table(index='condition_id', columns='response', values='response_time', aggfunc='mean')" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "a7d1d998", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
meanstdcount
responseLEFTRIGHTLEFTRIGHTLEFTRIGHT
condition_id
A10.1200002.04NaN2.7860011.02.0
A21.316667NaN1.709425NaN3.0NaN
B15.7400000.11NaN0.0424261.02.0
B2NaN3.32NaNNaNNaN1.0
C20.6800000.73NaNNaN1.01.0
\n", + "
" + ], + "text/plain": [ + " mean std count \n", + "response LEFT RIGHT LEFT RIGHT LEFT RIGHT\n", + "condition_id \n", + "A1 0.120000 2.04 NaN 2.786001 1.0 2.0\n", + "A2 1.316667 NaN 1.709425 NaN 3.0 NaN\n", + "B1 5.740000 0.11 NaN 0.042426 1.0 2.0\n", + "B2 NaN 3.32 NaN NaN NaN 1.0\n", + "C2 0.680000 0.73 NaN NaN 1.0 1.0" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(\n", + " data\n", + " .pivot_table(\n", + " index='condition_id', \n", + " columns='response', \n", + " values='response_time', \n", + " aggfunc=['mean', 'std', 'count'],\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a770b812", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0234ccf2", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c77c2dc", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/tabular_data/040_window_functions.ipynb b/notebooks/tabular_data/040_window_functions.ipynb new file mode 100644 index 0000000..b6b118f --- /dev/null +++ b/notebooks/tabular_data/040_window_functions.ipynb @@ -0,0 +1,1429 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "247bbf84", + "metadata": {}, + "source": [ + "# Window functions for tabular data" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "44584190", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] 
+ }, + { + "cell_type": "markdown", + "id": "8c3508da", + "metadata": {}, + "source": [ + "# Load experimental data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "8e22f6d4", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv('timed_responses.csv', index_col=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c4504d72", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
subject_idtime (ms)responseaccuracy
5743540RIGHT0.04
11902552LEFT0.43
189521036LEFT0.36
533257RIGHT0.11
1582743RIGHT0.32
5513619LEFT0.25
1602143RIGHT0.65
4131471LEFT0.80
7851121LEFT0.10
13932903RIGHT0.33
6292353LEFT0.17
18293768RIGHT0.26
90211093LEFT0.34
148623RIGHT0.29
\n", + "
" + ], + "text/plain": [ + " subject_id time (ms) response accuracy\n", + "574 3 540 RIGHT 0.04\n", + "1190 2 552 LEFT 0.43\n", + "1895 2 1036 LEFT 0.36\n", + "53 3 257 RIGHT 0.11\n", + "158 2 743 RIGHT 0.32\n", + "551 3 619 LEFT 0.25\n", + "1602 1 43 RIGHT 0.65\n", + "413 1 471 LEFT 0.80\n", + "785 1 121 LEFT 0.10\n", + "1393 2 903 RIGHT 0.33\n", + "629 2 353 LEFT 0.17\n", + "1829 3 768 RIGHT 0.26\n", + "902 1 1093 LEFT 0.34\n", + "1486 2 3 RIGHT 0.29" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "markdown", + "id": "a72f45c6", + "metadata": {}, + "source": [ + "# Split-apply-combine operations return one aggregated value per group" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "0234ccf2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "subject_id\n", + "1 0.80\n", + "2 0.43\n", + "3 0.26\n", + "Name: accuracy, dtype: float64" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.groupby('subject_id')['accuracy'].max()" + ] + }, + { + "cell_type": "markdown", + "id": "b8926b52", + "metadata": {}, + "source": [ + "# However, for some calculations we need to have a value per row\n", + "\n", + "For example: for each subject, rank the responses by decreasing accuracy" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "0c77c2dc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "574 1.0\n", + "1190 6.0\n", + "1895 5.0\n", + "53 2.0\n", + "158 3.0\n", + "551 3.0\n", + "1602 3.0\n", + "413 4.0\n", + "785 1.0\n", + "1393 4.0\n", + "629 1.0\n", + "1829 4.0\n", + "902 2.0\n", + "1486 2.0\n", + "Name: accuracy, dtype: float64" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.groupby('subject_id')['accuracy'].rank()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, 
+ "id": "6803bea3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
subject_idtime (ms)responseaccuracyaccuracy_rank
5743540RIGHT0.044.0
11902552LEFT0.431.0
189521036LEFT0.362.0
533257RIGHT0.113.0
1582743RIGHT0.324.0
5513619LEFT0.252.0
1602143RIGHT0.652.0
4131471LEFT0.801.0
7851121LEFT0.104.0
13932903RIGHT0.333.0
6292353LEFT0.176.0
18293768RIGHT0.261.0
90211093LEFT0.343.0
148623RIGHT0.295.0
\n", + "
" + ], + "text/plain": [ + " subject_id time (ms) response accuracy accuracy_rank\n", + "574 3 540 RIGHT 0.04 4.0\n", + "1190 2 552 LEFT 0.43 1.0\n", + "1895 2 1036 LEFT 0.36 2.0\n", + "53 3 257 RIGHT 0.11 3.0\n", + "158 2 743 RIGHT 0.32 4.0\n", + "551 3 619 LEFT 0.25 2.0\n", + "1602 1 43 RIGHT 0.65 2.0\n", + "413 1 471 LEFT 0.80 1.0\n", + "785 1 121 LEFT 0.10 4.0\n", + "1393 2 903 RIGHT 0.33 3.0\n", + "629 2 353 LEFT 0.17 6.0\n", + "1829 3 768 RIGHT 0.26 1.0\n", + "902 1 1093 LEFT 0.34 3.0\n", + "1486 2 3 RIGHT 0.29 5.0" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['accuracy_rank'] = df.groupby('subject_id')['accuracy'].rank(ascending=False)\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "5690feee", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
subject_idtime (ms)responseaccuracyaccuracy_rank
4131471LEFT0.801.0
1602143RIGHT0.652.0
90211093LEFT0.343.0
7851121LEFT0.104.0
11902552LEFT0.431.0
189521036LEFT0.362.0
13932903RIGHT0.333.0
1582743RIGHT0.324.0
148623RIGHT0.295.0
6292353LEFT0.176.0
18293768RIGHT0.261.0
5513619LEFT0.252.0
533257RIGHT0.113.0
5743540RIGHT0.044.0
\n", + "
" + ], + "text/plain": [ + " subject_id time (ms) response accuracy accuracy_rank\n", + "413 1 471 LEFT 0.80 1.0\n", + "1602 1 43 RIGHT 0.65 2.0\n", + "902 1 1093 LEFT 0.34 3.0\n", + "785 1 121 LEFT 0.10 4.0\n", + "1190 2 552 LEFT 0.43 1.0\n", + "1895 2 1036 LEFT 0.36 2.0\n", + "1393 2 903 RIGHT 0.33 3.0\n", + "158 2 743 RIGHT 0.32 4.0\n", + "1486 2 3 RIGHT 0.29 5.0\n", + "629 2 353 LEFT 0.17 6.0\n", + "1829 3 768 RIGHT 0.26 1.0\n", + "551 3 619 LEFT 0.25 2.0\n", + "53 3 257 RIGHT 0.11 3.0\n", + "574 3 540 RIGHT 0.04 4.0" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.sort_values(['subject_id', 'accuracy_rank'])" + ] + }, + { + "cell_type": "markdown", + "id": "f57d47c8", + "metadata": {}, + "source": [ + "# In many cases, a window functions is combined with a sorting operation\n", + "\n", + "For example: for each subject, count the number of \"LEFT\" responses up until any moment in the experiment" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "f032f5db", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
subject_idtime (ms)responseaccuracyaccuracy_rankis_left
5743540RIGHT0.044.0False
11902552LEFT0.431.0True
189521036LEFT0.362.0True
533257RIGHT0.113.0False
1582743RIGHT0.324.0False
5513619LEFT0.252.0True
1602143RIGHT0.652.0False
4131471LEFT0.801.0True
7851121LEFT0.104.0True
13932903RIGHT0.333.0False
6292353LEFT0.176.0True
18293768RIGHT0.261.0False
90211093LEFT0.343.0True
148623RIGHT0.295.0False
\n", + "
" + ], + "text/plain": [ + " subject_id time (ms) response accuracy accuracy_rank is_left\n", + "574 3 540 RIGHT 0.04 4.0 False\n", + "1190 2 552 LEFT 0.43 1.0 True\n", + "1895 2 1036 LEFT 0.36 2.0 True\n", + "53 3 257 RIGHT 0.11 3.0 False\n", + "158 2 743 RIGHT 0.32 4.0 False\n", + "551 3 619 LEFT 0.25 2.0 True\n", + "1602 1 43 RIGHT 0.65 2.0 False\n", + "413 1 471 LEFT 0.80 1.0 True\n", + "785 1 121 LEFT 0.10 4.0 True\n", + "1393 2 903 RIGHT 0.33 3.0 False\n", + "629 2 353 LEFT 0.17 6.0 True\n", + "1829 3 768 RIGHT 0.26 1.0 False\n", + "902 1 1093 LEFT 0.34 3.0 True\n", + "1486 2 3 RIGHT 0.29 5.0 False" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Add a flag column \"is_left\", so that we can count the number of LEFT reponses using a cumulative sum\n", + "df['is_left'] = df['response'] == 'LEFT'\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "f9420c3d", + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
subject_idtime (ms)responseaccuracyaccuracy_rankis_leftnr_lefts
1602143RIGHT0.652.0False0
4131471LEFT0.801.0True1
7851121LEFT0.104.0True2
90211093LEFT0.343.0True3
11902552LEFT0.431.0True1
189521036LEFT0.362.0True2
1582743RIGHT0.324.0False2
13932903RIGHT0.333.0False2
6292353LEFT0.176.0True3
148623RIGHT0.295.0False3
5743540RIGHT0.044.0False0
533257RIGHT0.113.0False0
5513619LEFT0.252.0True1
18293768RIGHT0.261.0False1
\n", + "
" + ], + "text/plain": [ + " subject_id time (ms) response accuracy accuracy_rank is_left \\\n", + "1602 1 43 RIGHT 0.65 2.0 False \n", + "413 1 471 LEFT 0.80 1.0 True \n", + "785 1 121 LEFT 0.10 4.0 True \n", + "902 1 1093 LEFT 0.34 3.0 True \n", + "1190 2 552 LEFT 0.43 1.0 True \n", + "1895 2 1036 LEFT 0.36 2.0 True \n", + "158 2 743 RIGHT 0.32 4.0 False \n", + "1393 2 903 RIGHT 0.33 3.0 False \n", + "629 2 353 LEFT 0.17 6.0 True \n", + "1486 2 3 RIGHT 0.29 5.0 False \n", + "574 3 540 RIGHT 0.04 4.0 False \n", + "53 3 257 RIGHT 0.11 3.0 False \n", + "551 3 619 LEFT 0.25 2.0 True \n", + "1829 3 768 RIGHT 0.26 1.0 False \n", + "\n", + " nr_lefts \n", + "1602 0 \n", + "413 1 \n", + "785 2 \n", + "902 3 \n", + "1190 1 \n", + "1895 2 \n", + "158 2 \n", + "1393 2 \n", + "629 3 \n", + "1486 3 \n", + "574 0 \n", + "53 0 \n", + "551 1 \n", + "1829 1 " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Without sorting, we get the number of LEFT responses... in no particular order\n", + "df['nr_lefts'] = df.groupby('subject_id')['is_left'].cumsum()\n", + "df.sort_values(['subject_id'])" + ] + }, + { + "cell_type": "markdown", + "id": "ca1e8032", + "metadata": {}, + "source": [ + "# Window functions are also useful to compute changes in the data for each group\n", + "\n", + "In this case, the window function often uses the `shift(n)` method that lags the data by `n` rows" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "18f440d3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
subject_idtime (ms)shifted time
1602143NaN
785112143.0
4131471121.0
90211093471.0
148623NaN
62923533.0
11902552353.0
1582743552.0
13932903743.0
189521036903.0
533257NaN
5743540257.0
5513619540.0
18293768619.0
\n", + "
" + ], + "text/plain": [ + " subject_id time (ms) shifted time\n", + "1602 1 43 NaN\n", + "785 1 121 43.0\n", + "413 1 471 121.0\n", + "902 1 1093 471.0\n", + "1486 2 3 NaN\n", + "629 2 353 3.0\n", + "1190 2 552 353.0\n", + "158 2 743 552.0\n", + "1393 2 903 743.0\n", + "1895 2 1036 903.0\n", + "53 3 257 NaN\n", + "574 3 540 257.0\n", + "551 3 619 540.0\n", + "1829 3 768 619.0" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['shifted time'] = (\n", + " df\n", + " .sort_values('time (ms)')\n", + " .groupby('subject_id')['time (ms)']\n", + " .shift(1)\n", + ")\n", + "df.sort_values(['subject_id', 'time (ms)'])[['subject_id', 'time (ms)', 'shifted time']]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "4f1cb393", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
subject_idtime (ms)time from prev
1602143NaN
785112178.0
4131471350.0
90211093622.0
148623NaN
6292353350.0
11902552199.0
1582743191.0
13932903160.0
189521036133.0
533257NaN
5743540283.0
551361979.0
18293768149.0
\n", + "
" + ], + "text/plain": [ + " subject_id time (ms) time from prev\n", + "1602 1 43 NaN\n", + "785 1 121 78.0\n", + "413 1 471 350.0\n", + "902 1 1093 622.0\n", + "1486 2 3 NaN\n", + "629 2 353 350.0\n", + "1190 2 552 199.0\n", + "158 2 743 191.0\n", + "1393 2 903 160.0\n", + "1895 2 1036 133.0\n", + "53 3 257 NaN\n", + "574 3 540 283.0\n", + "551 3 619 79.0\n", + "1829 3 768 149.0" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['time from prev'] = df['time (ms)'] - df['shifted time']\n", + "df.sort_values(['subject_id', 'time (ms)'])[['subject_id', 'time (ms)', 'time from prev']]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3d06a890", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "87a52b7c", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "61badaf1", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}