53 KiB
53 KiB
Combine information across tables: joins and anti-joins¶
In [1]:
import pandas as pd
"Load" some experimental data¶
In [2]:
# Trial-level measurements: one row per (subject, condition) recording.
data = pd.DataFrame(
    [
        ("312", "A1", 0.12, "LEFT"),
        ("312", "A2", 0.37, "LEFT"),
        ("312", "C2", 0.68, "LEFT"),
        ("711", "A1", 4.01, "RIGHT"),
        ("711", "A2", 0.44, "LEFT"),
        ("313", "A1", 0.07, "RIGHT"),
        ("313", "B1", 0.08, "RIGHT"),
        ("712", "A2", 3.29, "LEFT"),
        ("314", "A2", 0.29, "LEFT"),
        ("714", "B2", 3.32, "RIGHT"),
        ("314", "B1", 0.14, "RIGHT"),
        ("314", "C2", 0.73, "RIGHT"),
        ("713", "B1", 5.74, "LEFT"),
    ],
    columns=["subject_id", "condition_id", "response_time", "response"],
)
data
Out[2]:
Each row (one measurement) belongs to a single experimental condition, identified by `condition_id`, but the parameters of each condition are not stored in the table.
In [3]:
# Per-condition parameters, each keyed by condition id.
# Note: these lookups define 'C1' but not 'C2'.

# Orientation per condition.
condition_to_orientation = dict(A1=0, A2=0, B1=45, B2=45, C1=90)

# Duration per condition.
condition_to_duration = dict(A1=0.1, A2=0.01, B1=0.1, B2=0.01, C1=0.2)

# Surround type per condition.
condition_to_surround = dict(
    A1='FULL',
    A2='NONE',
    B1='NONE',
    B2='FULL',
    C1='FULL',
)

# Stimulus type per condition.
condition_to_stimulus_type = dict(
    A1='LINES',
    A2='DOTS',
    B1='PLAID',
    B2='PLAID',
    C1='WIGGLES',
)
Manually adding the condition parameters to the table¶
In [4]:
# Work on an independent copy so that `data` itself stays unchanged for the
# join examples further down.
data_with_properties = data.copy(deep=True)
In [5]:
# The column that links every row to its experimental condition.
data_with_properties.loc[:, 'condition_id']
Out[5]:
In [6]:
# Preview the lookup: translate each condition id into its orientation.
# Ids that are not keys of the dictionary (here 'C2') come back as NaN.
(
    data_with_properties['condition_id']
    .map(condition_to_orientation)
)
Out[6]:
In [7]:
# Attach every condition parameter as a new column by looking up each row's
# condition id in the matching dictionary (unknown ids yield NaN).
for column_name, lookup in (
    ('orientation', condition_to_orientation),
    ('duration', condition_to_duration),
    ('surround', condition_to_surround),
    ('stimulus_type', condition_to_stimulus_type),
):
    data_with_properties[column_name] = data_with_properties['condition_id'].map(lookup)
In [8]:
# Display the table, which now carries the four condition-parameter columns
# (orientation, duration, surround, stimulus_type) next to the measurements.
data_with_properties
Out[8]:
Using a join operation¶
In [9]:
# In practice, a lookup table like this is often maintained in a spreadsheet
# and loaded from file; here it is assembled from the per-parameter dictionaries.
#
# The table is indexed by condition id, with one column per parameter.
# It is built column-by-column (dict of dicts) instead of row-by-row followed
# by `.T`: transposing a mixed-type frame turns every column into `object`,
# whereas this form lets the numeric columns (orientation, duration) keep
# numeric dtypes.
condition_properties = pd.DataFrame(
    {
        'orientation': condition_to_orientation,
        'duration': condition_to_duration,
        'surround': condition_to_surround,
        'stimulus_type': condition_to_stimulus_type,
    }
)
condition_properties
Out[9]:
In [10]:
# Inner join (the default): keep only rows whose condition_id is present in
# the index of `condition_properties`; rows with condition 'C2' are dropped.
pd.merge(
    data,
    condition_properties,
    how='inner',
    left_on='condition_id',
    right_index=True,
)
Out[10]:
In [11]:
# Left join: keep every row of `data`; conditions without an entry in
# `condition_properties` get missing values in the parameter columns.
pd.merge(
    data,
    condition_properties,
    how='left',
    left_on='condition_id',
    right_index=True,
)
Out[11]:
In [12]:
# Outer join: keep rows from both sides, i.e. also conditions that have
# parameters but no measurements ('C1'), filled with missing values.
pd.merge(
    data,
    condition_properties,
    how='outer',
    left_on='condition_id',
    right_index=True,
)
Out[12]:
Anti-join: filter out unwanted data¶
In [13]:
# Subjects flagged as outliers; their rows must be excluded from the analysis.
# Note that '888' does not occur in `data` at all.
outliers = pd.DataFrame({'subject_id': ['711', '712', '713', '714', '888']})
In [14]:
# Inner join on subject_id: this selects exactly the rows that belong to
# outlier subjects, i.e. the opposite of what we want to keep.
pd.merge(data, outliers, how='inner', on='subject_id')
Out[14]:
In [15]:
# Outer join with `indicator=True`: the extra `_merge` column records whether
# each row came from `data` only, `outliers` only, or both.
pd.merge(
    data,
    outliers,
    how='outer',
    on='subject_id',
    indicator=True,
)
Out[15]:
In [16]:
# Anti-join: keep only the rows of `data` whose subject_id is NOT listed in
# `outliers`.
#
# A left join is sufficient (rows that exist only in `outliers` are never
# wanted) and, unlike an outer join, it keeps the rows in the order of `data`.
# The `_merge` indicator is only a helper for the filter, so it is dropped
# from the final table.
data_without_outliers = (
    data
    .merge(outliers, on='subject_id', how='left', indicator=True)
    .loc[lambda merged: merged['_merge'] == 'left_only']
    .drop(columns='_merge')
)
data_without_outliers
Out[16]:
In [ ]: