{ "cells": [ { "cell_type": "markdown", "id": "247bbf84", "metadata": {}, "source": [ "# Window functions for tabular data" ] }, { "cell_type": "code", "execution_count": 1, "id": "44584190", "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "markdown", "id": "8c3508da", "metadata": {}, "source": [ "# Load experimental data" ] }, { "cell_type": "code", "execution_count": 2, "id": "8e22f6d4", "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv('timed_responses.csv', index_col=0)" ] }, { "cell_type": "code", "execution_count": 3, "id": "c4504d72", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
subject_idtime (ms)responseaccuracy
5743540RIGHT0.04
11902552LEFT0.43
189521036LEFT0.36
533257RIGHT0.11
1582743RIGHT0.32
5513619LEFT0.25
1602143RIGHT0.65
4131471LEFT0.80
7851121LEFT0.10
13932903RIGHT0.33
6292353LEFT0.17
18293768RIGHT0.26
90211093LEFT0.34
148623RIGHT0.29
\n", "
" ], "text/plain": [ " subject_id time (ms) response accuracy\n", "574 3 540 RIGHT 0.04\n", "1190 2 552 LEFT 0.43\n", "1895 2 1036 LEFT 0.36\n", "53 3 257 RIGHT 0.11\n", "158 2 743 RIGHT 0.32\n", "551 3 619 LEFT 0.25\n", "1602 1 43 RIGHT 0.65\n", "413 1 471 LEFT 0.80\n", "785 1 121 LEFT 0.10\n", "1393 2 903 RIGHT 0.33\n", "629 2 353 LEFT 0.17\n", "1829 3 768 RIGHT 0.26\n", "902 1 1093 LEFT 0.34\n", "1486 2 3 RIGHT 0.29" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "id": "a72f45c6", "metadata": {}, "source": [ "# Split-apply-combine operations return one aggregated value per group" ] }, { "cell_type": "code", "execution_count": 4, "id": "0234ccf2", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "subject_id\n", "1 0.80\n", "2 0.43\n", "3 0.26\n", "Name: accuracy, dtype: float64" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.groupby('subject_id')['accuracy'].max()" ] }, { "cell_type": "markdown", "id": "b8926b52", "metadata": {}, "source": [ "# However, for some calculations we need to have a value per row\n", "\n", "For example: for each subject, rank the responses by decreasing accuracy" ] }, { "cell_type": "code", "execution_count": 5, "id": "0c77c2dc", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "574 1.0\n", "1190 6.0\n", "1895 5.0\n", "53 2.0\n", "158 3.0\n", "551 3.0\n", "1602 3.0\n", "413 4.0\n", "785 1.0\n", "1393 4.0\n", "629 1.0\n", "1829 4.0\n", "902 2.0\n", "1486 2.0\n", "Name: accuracy, dtype: float64" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.groupby('subject_id')['accuracy'].rank()" ] }, { "cell_type": "code", "execution_count": 6, "id": "6803bea3", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
subject_idtime (ms)responseaccuracyaccuracy_rank
5743540RIGHT0.044.0
11902552LEFT0.431.0
189521036LEFT0.362.0
533257RIGHT0.113.0
1582743RIGHT0.324.0
5513619LEFT0.252.0
1602143RIGHT0.652.0
4131471LEFT0.801.0
7851121LEFT0.104.0
13932903RIGHT0.333.0
6292353LEFT0.176.0
18293768RIGHT0.261.0
90211093LEFT0.343.0
148623RIGHT0.295.0
\n", "
" ], "text/plain": [ " subject_id time (ms) response accuracy accuracy_rank\n", "574 3 540 RIGHT 0.04 4.0\n", "1190 2 552 LEFT 0.43 1.0\n", "1895 2 1036 LEFT 0.36 2.0\n", "53 3 257 RIGHT 0.11 3.0\n", "158 2 743 RIGHT 0.32 4.0\n", "551 3 619 LEFT 0.25 2.0\n", "1602 1 43 RIGHT 0.65 2.0\n", "413 1 471 LEFT 0.80 1.0\n", "785 1 121 LEFT 0.10 4.0\n", "1393 2 903 RIGHT 0.33 3.0\n", "629 2 353 LEFT 0.17 6.0\n", "1829 3 768 RIGHT 0.26 1.0\n", "902 1 1093 LEFT 0.34 3.0\n", "1486 2 3 RIGHT 0.29 5.0" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['accuracy_rank'] = df.groupby('subject_id')['accuracy'].rank(ascending=False)\n", "df" ] }, { "cell_type": "code", "execution_count": 7, "id": "5690feee", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
subject_idtime (ms)responseaccuracyaccuracy_rank
4131471LEFT0.801.0
1602143RIGHT0.652.0
90211093LEFT0.343.0
7851121LEFT0.104.0
11902552LEFT0.431.0
189521036LEFT0.362.0
13932903RIGHT0.333.0
1582743RIGHT0.324.0
148623RIGHT0.295.0
6292353LEFT0.176.0
18293768RIGHT0.261.0
5513619LEFT0.252.0
533257RIGHT0.113.0
5743540RIGHT0.044.0
\n", "
" ], "text/plain": [ " subject_id time (ms) response accuracy accuracy_rank\n", "413 1 471 LEFT 0.80 1.0\n", "1602 1 43 RIGHT 0.65 2.0\n", "902 1 1093 LEFT 0.34 3.0\n", "785 1 121 LEFT 0.10 4.0\n", "1190 2 552 LEFT 0.43 1.0\n", "1895 2 1036 LEFT 0.36 2.0\n", "1393 2 903 RIGHT 0.33 3.0\n", "158 2 743 RIGHT 0.32 4.0\n", "1486 2 3 RIGHT 0.29 5.0\n", "629 2 353 LEFT 0.17 6.0\n", "1829 3 768 RIGHT 0.26 1.0\n", "551 3 619 LEFT 0.25 2.0\n", "53 3 257 RIGHT 0.11 3.0\n", "574 3 540 RIGHT 0.04 4.0" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.sort_values(['subject_id', 'accuracy_rank'])" ] }, { "cell_type": "markdown", "id": "f57d47c8", "metadata": {}, "source": [ "# In many cases, a window functions is combined with a sorting operation\n", "\n", "For example: for each subject, count the number of \"LEFT\" responses up until any moment in the experiment" ] }, { "cell_type": "code", "execution_count": 8, "id": "f032f5db", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
subject_idtime (ms)responseaccuracyaccuracy_rankis_left
5743540RIGHT0.044.0False
11902552LEFT0.431.0True
189521036LEFT0.362.0True
533257RIGHT0.113.0False
1582743RIGHT0.324.0False
5513619LEFT0.252.0True
1602143RIGHT0.652.0False
4131471LEFT0.801.0True
7851121LEFT0.104.0True
13932903RIGHT0.333.0False
6292353LEFT0.176.0True
18293768RIGHT0.261.0False
90211093LEFT0.343.0True
148623RIGHT0.295.0False
\n", "
" ], "text/plain": [ " subject_id time (ms) response accuracy accuracy_rank is_left\n", "574 3 540 RIGHT 0.04 4.0 False\n", "1190 2 552 LEFT 0.43 1.0 True\n", "1895 2 1036 LEFT 0.36 2.0 True\n", "53 3 257 RIGHT 0.11 3.0 False\n", "158 2 743 RIGHT 0.32 4.0 False\n", "551 3 619 LEFT 0.25 2.0 True\n", "1602 1 43 RIGHT 0.65 2.0 False\n", "413 1 471 LEFT 0.80 1.0 True\n", "785 1 121 LEFT 0.10 4.0 True\n", "1393 2 903 RIGHT 0.33 3.0 False\n", "629 2 353 LEFT 0.17 6.0 True\n", "1829 3 768 RIGHT 0.26 1.0 False\n", "902 1 1093 LEFT 0.34 3.0 True\n", "1486 2 3 RIGHT 0.29 5.0 False" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Add a flag column \"is_left\", so that we can count the number of LEFT reponses using a cumulative sum\n", "df['is_left'] = df['response'] == 'LEFT'\n", "df" ] }, { "cell_type": "code", "execution_count": 9, "id": "f9420c3d", "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
subject_idtime (ms)responseaccuracyaccuracy_rankis_leftnr_lefts
1602143RIGHT0.652.0False0
4131471LEFT0.801.0True1
7851121LEFT0.104.0True2
90211093LEFT0.343.0True3
11902552LEFT0.431.0True1
189521036LEFT0.362.0True2
1582743RIGHT0.324.0False2
13932903RIGHT0.333.0False2
6292353LEFT0.176.0True3
148623RIGHT0.295.0False3
5743540RIGHT0.044.0False0
533257RIGHT0.113.0False0
5513619LEFT0.252.0True1
18293768RIGHT0.261.0False1
\n", "
" ], "text/plain": [ " subject_id time (ms) response accuracy accuracy_rank is_left \\\n", "1602 1 43 RIGHT 0.65 2.0 False \n", "413 1 471 LEFT 0.80 1.0 True \n", "785 1 121 LEFT 0.10 4.0 True \n", "902 1 1093 LEFT 0.34 3.0 True \n", "1190 2 552 LEFT 0.43 1.0 True \n", "1895 2 1036 LEFT 0.36 2.0 True \n", "158 2 743 RIGHT 0.32 4.0 False \n", "1393 2 903 RIGHT 0.33 3.0 False \n", "629 2 353 LEFT 0.17 6.0 True \n", "1486 2 3 RIGHT 0.29 5.0 False \n", "574 3 540 RIGHT 0.04 4.0 False \n", "53 3 257 RIGHT 0.11 3.0 False \n", "551 3 619 LEFT 0.25 2.0 True \n", "1829 3 768 RIGHT 0.26 1.0 False \n", "\n", " nr_lefts \n", "1602 0 \n", "413 1 \n", "785 2 \n", "902 3 \n", "1190 1 \n", "1895 2 \n", "158 2 \n", "1393 2 \n", "629 3 \n", "1486 3 \n", "574 0 \n", "53 0 \n", "551 1 \n", "1829 1 " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Without sorting, we get the number of LEFT responses... in no particular order\n", "df['nr_lefts'] = df.groupby('subject_id')['is_left'].cumsum()\n", "df.sort_values(['subject_id'])" ] }, { "cell_type": "markdown", "id": "ca1e8032", "metadata": {}, "source": [ "# Window functions are also useful to compute changes in the data for each group\n", "\n", "In this case, the window function often uses the `shift(n)` method that lags the data by `n` rows" ] }, { "cell_type": "code", "execution_count": 10, "id": "18f440d3", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
subject_idtime (ms)shifted time
1602143NaN
785112143.0
4131471121.0
90211093471.0
148623NaN
62923533.0
11902552353.0
1582743552.0
13932903743.0
189521036903.0
533257NaN
5743540257.0
5513619540.0
18293768619.0
\n", "
" ], "text/plain": [ " subject_id time (ms) shifted time\n", "1602 1 43 NaN\n", "785 1 121 43.0\n", "413 1 471 121.0\n", "902 1 1093 471.0\n", "1486 2 3 NaN\n", "629 2 353 3.0\n", "1190 2 552 353.0\n", "158 2 743 552.0\n", "1393 2 903 743.0\n", "1895 2 1036 903.0\n", "53 3 257 NaN\n", "574 3 540 257.0\n", "551 3 619 540.0\n", "1829 3 768 619.0" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['shifted time'] = (\n", " df\n", " .sort_values('time (ms)')\n", " .groupby('subject_id')['time (ms)']\n", " .shift(1)\n", ")\n", "df.sort_values(['subject_id', 'time (ms)'])[['subject_id', 'time (ms)', 'shifted time']]" ] }, { "cell_type": "code", "execution_count": 11, "id": "4f1cb393", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
subject_idtime (ms)time from prev
1602143NaN
785112178.0
4131471350.0
90211093622.0
148623NaN
6292353350.0
11902552199.0
1582743191.0
13932903160.0
189521036133.0
533257NaN
5743540283.0
551361979.0
18293768149.0
\n", "
" ], "text/plain": [ " subject_id time (ms) time from prev\n", "1602 1 43 NaN\n", "785 1 121 78.0\n", "413 1 471 350.0\n", "902 1 1093 622.0\n", "1486 2 3 NaN\n", "629 2 353 350.0\n", "1190 2 552 199.0\n", "158 2 743 191.0\n", "1393 2 903 160.0\n", "1895 2 1036 133.0\n", "53 3 257 NaN\n", "574 3 540 283.0\n", "551 3 619 79.0\n", "1829 3 768 149.0" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['time from prev'] = df['time (ms)'] - df['shifted time']\n", "df.sort_values(['subject_id', 'time (ms)'])[['subject_id', 'time (ms)', 'time from prev']]" ] }, { "cell_type": "code", "execution_count": null, "id": "3d06a890", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "87a52b7c", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "61badaf1", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.3" } }, "nbformat": 4, "nbformat_minor": 5 }