2024-heraklion-data/notebooks/030_tabular_data/020_join_operations.ipynb
2024-08-27 15:27:53 +03:00

53 KiB

Combine information across tables: joins and anti-joins

In [1]:
import pandas as pd

"Load" some experimental data

In [2]:
data = pd.DataFrame(
    data=[
        ['312', 'A1', 0.12, 'LEFT'],
        ['312', 'A2', 0.37, 'LEFT'],
        ['312', 'C2', 0.68, 'LEFT'],
        ['711', 'A1', 4.01, 'RIGHT'],
        ['711', 'A2', 0.44, 'LEFT'],
        ['313', 'A1', 0.07, 'RIGHT'],
        ['313', 'B1', 0.08, 'RIGHT'],
        ['712', 'A2', 3.29, 'LEFT'],
        ['314', 'A2', 0.29, 'LEFT'],
        ['714', 'B2', 3.32, 'RIGHT'],
        ['314', 'B1', 0.14, 'RIGHT'],
        ['314', 'C2', 0.73, 'RIGHT'],
        ['713', 'B1', 5.74, 'LEFT'],
    ],
    columns=['subject_id', 'condition_id', 'response_time', 'response'],
)
data
Out[2]:
subject_id condition_id response_time response
0 312 A1 0.12 LEFT
1 312 A2 0.37 LEFT
2 312 C2 0.68 LEFT
3 711 A1 4.01 RIGHT
4 711 A2 0.44 LEFT
5 313 A1 0.07 RIGHT
6 313 B1 0.08 RIGHT
7 712 A2 3.29 LEFT
8 314 A2 0.29 LEFT
9 714 B2 3.32 RIGHT
10 314 B1 0.14 RIGHT
11 314 C2 0.73 RIGHT
12 713 B1 5.74 LEFT

Each experiment belongs to one experimental condition, but the parameters of each condition are not in the table

In [3]:
condition_to_orientation = {
    'A1': 0,
    'A2': 0,
    'B1': 45,
    'B2': 45,
    'C1': 90,
}

condition_to_duration = {
    'A1': 0.1,
    'A2': 0.01,
    'B1': 0.1,
    'B2': 0.01,
    'C1': 0.2,
}

condition_to_surround = {
    'A1': 'FULL',
    'A2': 'NONE',
    'B1': 'NONE',
    'B2': 'FULL',
    'C1': 'FULL',
}


condition_to_stimulus_type = {
    'A1': 'LINES',
    'A2': 'DOTS',
    'B1': 'PLAID',
    'B2': 'PLAID',
    'C1': 'WIGGLES',
}

Manually adding the condition parameters to the table

In [4]:
data_with_properties = data.copy()
In [5]:
data_with_properties['condition_id']
Out[5]:
0     A1
1     A2
2     C2
3     A1
4     A2
5     A1
6     B1
7     A2
8     A2
9     B2
10    B1
11    C2
12    B1
Name: condition_id, dtype: object
In [6]:
data_with_properties['condition_id'].map(condition_to_orientation)
Out[6]:
0      0.0
1      0.0
2      NaN
3      0.0
4      0.0
5      0.0
6     45.0
7      0.0
8      0.0
9     45.0
10    45.0
11     NaN
12    45.0
Name: condition_id, dtype: float64
In [7]:
data_with_properties['orientation'] = data_with_properties['condition_id'].map(condition_to_orientation)
data_with_properties['duration'] = data_with_properties['condition_id'].map(condition_to_duration)
data_with_properties['surround'] = data_with_properties['condition_id'].map(condition_to_surround)
data_with_properties['stimulus_type'] = data_with_properties['condition_id'].map(condition_to_stimulus_type)
In [8]:
data_with_properties
Out[8]:
subject_id condition_id response_time response orientation duration surround stimulus_type
0 312 A1 0.12 LEFT 0.0 0.10 FULL LINES
1 312 A2 0.37 LEFT 0.0 0.01 NONE DOTS
2 312 C2 0.68 LEFT NaN NaN NaN NaN
3 711 A1 4.01 RIGHT 0.0 0.10 FULL LINES
4 711 A2 0.44 LEFT 0.0 0.01 NONE DOTS
5 313 A1 0.07 RIGHT 0.0 0.10 FULL LINES
6 313 B1 0.08 RIGHT 45.0 0.10 NONE PLAID
7 712 A2 3.29 LEFT 0.0 0.01 NONE DOTS
8 314 A2 0.29 LEFT 0.0 0.01 NONE DOTS
9 714 B2 3.32 RIGHT 45.0 0.01 FULL PLAID
10 314 B1 0.14 RIGHT 45.0 0.10 NONE PLAID
11 314 C2 0.73 RIGHT NaN NaN NaN NaN
12 713 B1 5.74 LEFT 45.0 0.10 NONE PLAID

Using a join operation

In [9]:
# Often, this is done using a spreadsheet
condition_properties = pd.DataFrame(
    [condition_to_orientation, condition_to_duration, condition_to_surround, condition_to_stimulus_type],
    index=['orientation', 'duration', 'surround', 'stimulus_type'],
).T
condition_properties
Out[9]:
orientation duration surround stimulus_type
A1 0 0.1 FULL LINES
A2 0 0.01 NONE DOTS
B1 45 0.1 NONE PLAID
B2 45 0.01 FULL PLAID
C1 90 0.2 FULL WIGGLES
In [10]:
data.merge(condition_properties, left_on='condition_id', right_index=True)
Out[10]:
subject_id condition_id response_time response orientation duration surround stimulus_type
0 312 A1 0.12 LEFT 0 0.1 FULL LINES
3 711 A1 4.01 RIGHT 0 0.1 FULL LINES
5 313 A1 0.07 RIGHT 0 0.1 FULL LINES
1 312 A2 0.37 LEFT 0 0.01 NONE DOTS
4 711 A2 0.44 LEFT 0 0.01 NONE DOTS
7 712 A2 3.29 LEFT 0 0.01 NONE DOTS
8 314 A2 0.29 LEFT 0 0.01 NONE DOTS
6 313 B1 0.08 RIGHT 45 0.1 NONE PLAID
10 314 B1 0.14 RIGHT 45 0.1 NONE PLAID
12 713 B1 5.74 LEFT 45 0.1 NONE PLAID
9 714 B2 3.32 RIGHT 45 0.01 FULL PLAID
In [11]:
data.merge(condition_properties, left_on='condition_id', right_index=True, how='left')
Out[11]:
subject_id condition_id response_time response orientation duration surround stimulus_type
0 312 A1 0.12 LEFT 0 0.1 FULL LINES
1 312 A2 0.37 LEFT 0 0.01 NONE DOTS
2 312 C2 0.68 LEFT NaN NaN NaN NaN
3 711 A1 4.01 RIGHT 0 0.1 FULL LINES
4 711 A2 0.44 LEFT 0 0.01 NONE DOTS
5 313 A1 0.07 RIGHT 0 0.1 FULL LINES
6 313 B1 0.08 RIGHT 45 0.1 NONE PLAID
7 712 A2 3.29 LEFT 0 0.01 NONE DOTS
8 314 A2 0.29 LEFT 0 0.01 NONE DOTS
9 714 B2 3.32 RIGHT 45 0.01 FULL PLAID
10 314 B1 0.14 RIGHT 45 0.1 NONE PLAID
11 314 C2 0.73 RIGHT NaN NaN NaN NaN
12 713 B1 5.74 LEFT 45 0.1 NONE PLAID
In [12]:
data.merge(condition_properties, left_on='condition_id', right_index=True, how='outer')
Out[12]:
subject_id condition_id response_time response orientation duration surround stimulus_type
0.0 312 A1 0.12 LEFT 0 0.1 FULL LINES
3.0 711 A1 4.01 RIGHT 0 0.1 FULL LINES
5.0 313 A1 0.07 RIGHT 0 0.1 FULL LINES
1.0 312 A2 0.37 LEFT 0 0.01 NONE DOTS
4.0 711 A2 0.44 LEFT 0 0.01 NONE DOTS
7.0 712 A2 3.29 LEFT 0 0.01 NONE DOTS
8.0 314 A2 0.29 LEFT 0 0.01 NONE DOTS
2.0 312 C2 0.68 LEFT NaN NaN NaN NaN
11.0 314 C2 0.73 RIGHT NaN NaN NaN NaN
6.0 313 B1 0.08 RIGHT 45 0.1 NONE PLAID
10.0 314 B1 0.14 RIGHT 45 0.1 NONE PLAID
12.0 713 B1 5.74 LEFT 45 0.1 NONE PLAID
9.0 714 B2 3.32 RIGHT 45 0.01 FULL PLAID
NaN NaN C1 NaN NaN 90 0.2 FULL WIGGLES

Anti-join: filter out unwanted data

In [13]:
# We are given a list of subjects that are outliers and should be disregarded in the analysis
outliers = pd.DataFrame([['711'], ['712'], ['713'], ['714'], ['888']], columns=['subject_id'])
In [14]:
data.merge(outliers, on='subject_id')
Out[14]:
subject_id condition_id response_time response
0 711 A1 4.01 RIGHT
1 711 A2 0.44 LEFT
2 712 A2 3.29 LEFT
3 714 B2 3.32 RIGHT
4 713 B1 5.74 LEFT
In [15]:
data.merge(outliers, on='subject_id', how='outer', indicator=True)
Out[15]:
subject_id condition_id response_time response _merge
0 312 A1 0.12 LEFT left_only
1 312 A2 0.37 LEFT left_only
2 312 C2 0.68 LEFT left_only
3 711 A1 4.01 RIGHT both
4 711 A2 0.44 LEFT both
5 313 A1 0.07 RIGHT left_only
6 313 B1 0.08 RIGHT left_only
7 712 A2 3.29 LEFT both
8 314 A2 0.29 LEFT left_only
9 314 B1 0.14 RIGHT left_only
10 314 C2 0.73 RIGHT left_only
11 714 B2 3.32 RIGHT both
12 713 B1 5.74 LEFT both
13 888 NaN NaN NaN right_only
In [16]:
temp = data.merge(outliers, on='subject_id', how='outer', indicator=True)
data_without_outliers = temp[temp['_merge'] == 'left_only']
data_without_outliers
Out[16]:
subject_id condition_id response_time response _merge
0 312 A1 0.12 LEFT left_only
1 312 A2 0.37 LEFT left_only
2 312 C2 0.68 LEFT left_only
5 313 A1 0.07 RIGHT left_only
6 313 B1 0.08 RIGHT left_only
8 314 A2 0.29 LEFT left_only
9 314 B1 0.14 RIGHT left_only
10 314 C2 0.73 RIGHT left_only
In [ ]: