28 KiB
28 KiB
Combine information across tables: joins and anti-joins¶
In [1]:
import pandas as pd
"Load" some experimental data¶
In [ ]:
# Toy experimental data set: every row holds one recorded response.
# Subject ids are stored as strings, response times as floats.
data = pd.DataFrame(
    columns=['subject_id', 'condition_id', 'response_time', 'response'],
    data=[
        ('312', 'A1', 0.12, 'LEFT'),
        ('312', 'A2', 0.37, 'LEFT'),
        ('312', 'C2', 0.68, 'LEFT'),
        ('711', 'A1', 4.01, 'RIGHT'),
        ('711', 'A2', 0.44, 'LEFT'),
        ('313', 'A1', 0.07, 'RIGHT'),
        ('313', 'B1', 0.08, 'RIGHT'),
        ('712', 'A2', 3.29, 'LEFT'),
        ('314', 'A2', 0.29, 'LEFT'),
        ('714', 'B2', 3.32, 'RIGHT'),
        ('314', 'B1', 0.14, 'RIGHT'),
        ('314', 'C2', 0.73, 'RIGHT'),
        ('713', 'B1', 5.74, 'LEFT'),
    ],
)
data
Each experiment belongs to one experimental condition, but the parameters of each condition are not included in the table: they are defined separately, keyed by `condition_id`.
In [3]:
# Parameters of each experimental condition, one lookup table per parameter,
# all keyed by condition id.
# NOTE(review): the data table uses condition 'C2' while only 'C1' is described
# here - presumably deliberate, to show how joins treat unmatched keys; confirm.
condition_to_orientation = dict(A1=0, A2=0, B1=45, B2=45, C1=90)
condition_to_duration = dict(A1=0.1, A2=0.01, B1=0.1, B2=0.01, C1=0.2)
condition_to_surround = dict(
    A1='FULL', A2='NONE', B1='NONE', B2='FULL', C1='FULL',
)
condition_to_stimulus_type = dict(
    A1='LINES', A2='DOTS', B1='PLAID', B2='PLAID', C1='WIGGLES',
)
Manually adding the condition parameters to the table¶
In [73]:
# Work on an independent (deep) copy so that adding columns to it
# leaves the original `data` table untouched.
data_with_properties = data.copy(deep=True)
In [74]:
# Inspect the column that holds the condition id of every row.
data_with_properties.loc[:, 'condition_id']
Out[74]:
In [75]:
# Preview the lookup: each condition id is replaced by its orientation;
# ids that are missing from the dictionary come out as NaN.
(
    data_with_properties['condition_id']
    .map(condition_to_orientation)
)
Out[75]:
In [76]:
# Add one column per condition parameter by looking up each row's
# condition id in the corresponding dictionary (columns are added in place,
# in the order listed here).
for column_name, lookup in {
    'orientation': condition_to_orientation,
    'duration': condition_to_duration,
    'surround': condition_to_surround,
    'stimulus_type': condition_to_stimulus_type,
}.items():
    data_with_properties[column_name] = data_with_properties['condition_id'].map(lookup)
In [77]:
# Display the table, now including the manually added condition columns.
data_with_properties
Out[77]:
Using a join operation¶
In [4]:
# In practice, a table like this is often maintained in a spreadsheet.
# Here it is assembled from the lookup dictionaries: one column per
# parameter, one row per condition id (the dictionary keys form the index).
# Building it column by column, rather than row by row followed by a
# transpose, matters: transposing a mixed-type frame turns every column into
# `object` dtype, whereas this form keeps the numeric columns numeric.
condition_properties = pd.DataFrame(
    {
        'orientation': condition_to_orientation,
        'duration': condition_to_duration,
        'surround': condition_to_surround,
        'stimulus_type': condition_to_stimulus_type,
    }
)
condition_properties
Out[4]:
In [ ]:
# Inner join (the default): keep only rows whose condition_id is present
# in the index of `condition_properties`.
pd.merge(
    data,
    condition_properties,
    how='inner',
    left_on='condition_id',
    right_index=True,
)
In [ ]:
# Left join: keep every row of `data`; conditions without an entry in
# `condition_properties` get missing values in the added columns.
pd.merge(
    data,
    condition_properties,
    how='left',
    left_on='condition_id',
    right_index=True,
)
In [ ]:
# Outer join: keep the rows of both tables, filling in missing values
# wherever one side has no match on the other.
pd.merge(
    data,
    condition_properties,
    how='outer',
    left_on='condition_id',
    right_index=True,
)
Anti-join: filter out unwanted data¶
In [5]:
# Subjects flagged as outliers; their data should be left out of the analysis.
# Stored as a one-column table so it can be joined on `subject_id`.
outliers = pd.DataFrame({'subject_id': ['711', '712', '713', '714', '888']})
In [6]:
# Inner join: this selects exactly the rows that belong to outlier subjects,
# i.e. the opposite of what we want to keep.
pd.merge(data, outliers, how='inner', on='subject_id')
Out[6]:
In [7]:
# Outer join with an indicator: the extra `_merge` column records whether a
# row was found in `data` only, in `outliers` only, or in both tables.
pd.merge(
    data,
    outliers,
    how='outer',
    on='subject_id',
    indicator=True,
)
Out[7]:
In [8]:
# Anti-join: keep only the rows of `data` whose subject_id has no match in
# `outliers`. A left join is sufficient here (rows that exist only in
# `outliers` would be filtered out anyway) and, unlike an outer join, it keeps
# the rows of `data` in their original order.
temp = data.merge(outliers, on='subject_id', how='left', indicator=True)
# `_merge` is only bookkeeping for the filter; drop it afterwards so the
# result has the same columns as `data`, and renumber the remaining rows.
data_without_outliers = (
    temp.loc[temp['_merge'] == 'left_only']
    .drop(columns='_merge')
    .reset_index(drop=True)
)
data_without_outliers
Out[8]:
In [ ]: