adds exercises for tabular data part

This commit is contained in:
Guillermo Aguilar 2025-09-23 11:49:18 +02:00
parent 2e60b94c52
commit 26eb146a5c
16 changed files with 60195 additions and 0 deletions

View file

@ -0,0 +1,43 @@
location-id,patient-id
1,217
1,1147
1,1170
1,627
4,541
4,72
4,727
2,500
3,177
5,1123
1,1109
4,1027
3,658
1,1241
3,1004
3,664
4,1248
1,937
5,606
4,926
5,327
1,1244
4,1001
1,959
5,246
5,55
1,899
1,991
4,915
3,4
5,465
5,235
2,1022
2,889
4,789
5,1154
5,35
3,990
2,98
1,580
1,807
3,584
1 location-id patient-id
2 1 217
3 1 1147
4 1 1170
5 1 627
6 4 541
7 4 72
8 4 727
9 2 500
10 3 177
11 5 1123
12 1 1109
13 4 1027
14 3 658
15 1 1241
16 3 1004
17 3 664
18 4 1248
19 1 937
20 5 606
21 4 926
22 5 327
23 1 1244
24 4 1001
25 1 959
26 5 246
27 5 55
28 1 899
29 1 991
30 4 915
31 3 4
32 5 465
33 5 235
34 2 1022
35 2 889
36 4 789
37 5 1154
38 5 35
39 3 990
40 2 98
41 1 580
42 1 807
43 3 584

View file

@ -0,0 +1,657 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "f11a76bf",
"metadata": {},
"source": [
"# Exercise on Joins and anti-joins: add information from other tables"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "b6f2742b",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "markdown",
"id": "2967c84e",
"metadata": {},
"source": [
"# Load data from clinical trial\n",
"\n",
"Data comes in two different files. The file `predimed_records.csv` contains the clinical data for each patient, except for the diet group they were assigned to. The file `predimed_mapping.csv` contains the information of which patient was assigned to which diet group."
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "ed626ee3",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>patient-id</th>\n",
" <th>location-id</th>\n",
" <th>sex</th>\n",
" <th>age</th>\n",
" <th>smoke</th>\n",
" <th>bmi</th>\n",
" <th>waist</th>\n",
" <th>wth</th>\n",
" <th>htn</th>\n",
" <th>diab</th>\n",
" <th>hyperchol</th>\n",
" <th>famhist</th>\n",
" <th>hormo</th>\n",
" <th>p14</th>\n",
" <th>toevent</th>\n",
" <th>event</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>436</td>\n",
" <td>4</td>\n",
" <td>Male</td>\n",
" <td>58</td>\n",
" <td>Former</td>\n",
" <td>33.53</td>\n",
" <td>122</td>\n",
" <td>0.753086</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>Yes</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>10</td>\n",
" <td>5.374401</td>\n",
" <td>Yes</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1130</td>\n",
" <td>4</td>\n",
" <td>Male</td>\n",
" <td>77</td>\n",
" <td>Current</td>\n",
" <td>31.05</td>\n",
" <td>119</td>\n",
" <td>0.730061</td>\n",
" <td>Yes</td>\n",
" <td>Yes</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>10</td>\n",
" <td>6.097194</td>\n",
" <td>No</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1131</td>\n",
" <td>4</td>\n",
" <td>Female</td>\n",
" <td>72</td>\n",
" <td>Former</td>\n",
" <td>30.86</td>\n",
" <td>106</td>\n",
" <td>0.654321</td>\n",
" <td>No</td>\n",
" <td>Yes</td>\n",
" <td>No</td>\n",
" <td>Yes</td>\n",
" <td>No</td>\n",
" <td>8</td>\n",
" <td>5.946612</td>\n",
" <td>No</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1132</td>\n",
" <td>4</td>\n",
" <td>Male</td>\n",
" <td>71</td>\n",
" <td>Former</td>\n",
" <td>27.68</td>\n",
" <td>118</td>\n",
" <td>0.694118</td>\n",
" <td>Yes</td>\n",
" <td>No</td>\n",
" <td>Yes</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>8</td>\n",
" <td>2.907598</td>\n",
" <td>Yes</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1111</td>\n",
" <td>2</td>\n",
" <td>Female</td>\n",
" <td>79</td>\n",
" <td>Never</td>\n",
" <td>35.94</td>\n",
" <td>129</td>\n",
" <td>0.806250</td>\n",
" <td>Yes</td>\n",
" <td>No</td>\n",
" <td>Yes</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>9</td>\n",
" <td>4.761123</td>\n",
" <td>No</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" patient-id location-id sex age smoke bmi waist wth htn \\\n",
"0 436 4 Male 58 Former 33.53 122 0.753086 No \n",
"1 1130 4 Male 77 Current 31.05 119 0.730061 Yes \n",
"2 1131 4 Female 72 Former 30.86 106 0.654321 No \n",
"3 1132 4 Male 71 Former 27.68 118 0.694118 Yes \n",
"4 1111 2 Female 79 Never 35.94 129 0.806250 Yes \n",
"\n",
" diab hyperchol famhist hormo p14 toevent event \n",
"0 No Yes No No 10 5.374401 Yes \n",
"1 Yes No No No 10 6.097194 No \n",
"2 Yes No Yes No 8 5.946612 No \n",
"3 No Yes No No 8 2.907598 Yes \n",
"4 No Yes No No 9 4.761123 No "
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_csv('../../data/predimed_records.csv')\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "48d5375f",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>location-id</th>\n",
" <th>patient-id</th>\n",
" <th>group</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2</td>\n",
" <td>885</td>\n",
" <td>MedDiet + VOO</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>182</td>\n",
" <td>MedDiet + Nuts</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>971</td>\n",
" <td>MedDiet + Nuts</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2</td>\n",
" <td>691</td>\n",
" <td>MedDiet + Nuts</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2</td>\n",
" <td>632</td>\n",
" <td>Control</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" location-id patient-id group\n",
"0 2 885 MedDiet + VOO\n",
"1 1 182 MedDiet + Nuts\n",
"2 1 971 MedDiet + Nuts\n",
"3 2 691 MedDiet + Nuts\n",
"4 2 632 Control"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"info = pd.read_csv('../../data/predimed_mapping.csv')\n",
"info.head()"
]
},
{
"cell_type": "markdown",
"id": "2b4b98ed-d7ec-4b7c-b983-adc616d2f16f",
"metadata": {},
"source": [
"The study was conducted at 5 different locations; each location gave an identification number (`patient-id`) to each participant."
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "b9dbc492-1489-4530-96ac-5f33f7389caa",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([2, 1, 3, 4, 5])"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"info['location-id'].unique()"
]
},
{
"cell_type": "markdown",
"id": "2fef4d37",
"metadata": {},
"source": [
"# 1. Add diet information to the patients' records\n",
"\n",
"* For how many patients do we have clinical information? (i.e., rows in `df`)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "861ac334-14ce-490a-b3c4-877b32789f3e",
"metadata": {},
"outputs": [],
"source": [
"## your code here\n"
]
},
{
"cell_type": "markdown",
"id": "1c1701e2-c295-4032-9e89-0d8470f41593",
"metadata": {},
"source": [
"* For how many patients do we have diet information? (i.e., rows in `info`)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "14f57842-5722-4953-88d6-d7cf3070400c",
"metadata": {},
"outputs": [],
"source": [
"## your code here\n"
]
},
{
"cell_type": "markdown",
"id": "3f23fa17-af3e-41c3-883f-3e1279d4820e",
"metadata": {},
"source": [
"Perform the merge, keeping in mind that it only makes sense to analyze patients with the diet information. \n",
"* Which type of merge would you do? \n",
"* For how many patients do we have full information (records and which diet they followed)? "
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "35e19a53",
"metadata": {},
"outputs": [],
"source": [
"## your code here\n",
"\n",
"\n"
]
},
{
"cell_type": "markdown",
"id": "946beb08-30a5-4020-8612-360385cdfc1e",
"metadata": {},
"source": [
"# 2. Add location information to the patients' records\n",
"\n",
"There were five locations where the study was conducted. Here is a DataFrame containing the information of each location. \n",
"\n",
"- Add a new column to the dataset that contains the city where each patient was recorded.\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "36ce0688-d421-4a07-b00e-0e9b3201f0e0",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>location-id</th>\n",
" <th>City</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>Madrid</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>Valencia</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>Barcelona</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>Bilbao</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>Malaga</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" location-id City\n",
"0 1 Madrid\n",
"1 2 Valencia\n",
"2 3 Barcelona\n",
"3 4 Bilbao\n",
"4 5 Malaga"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"locations = pd.DataFrame.from_dict({'location-id': [1, 2, 3, 4, 5], \n",
" 'City': ['Madrid', 'Valencia', 'Barcelona', 'Bilbao','Malaga']})\n",
"locations"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "b636dde4-129a-4dd1-8cbf-c539c9c8a5f2",
"metadata": {},
"outputs": [],
"source": [
"## your code here:\n"
]
},
{
"cell_type": "markdown",
"id": "44031178",
"metadata": {},
"source": [
"# 3. Remove dropouts from the table\n",
"\n",
"Some patients dropped out of the study early on and should be removed from our analysis. Their IDs are stored in the file `dropped.csv`.\n",
"1. Load the list of patients who dropped out from `dropped.csv`\n",
"2. Use an anti-join to remove them from the table\n",
"3. How many patients (rows) are left in the data?"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "d1d4cc27",
"metadata": {},
"outputs": [],
"source": [
"dropped = pd.read_csv('dropped.csv')"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "fbebbd97",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(42, 2)"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dropped.shape"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "8a3c7943",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>location-id</th>\n",
" <th>patient-id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>217</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>1147</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>1170</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>627</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>541</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" location-id patient-id\n",
"0 1 217\n",
"1 1 1147\n",
"2 1 1170\n",
"3 1 627\n",
"4 4 541"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dropped.head()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "573687e7",
"metadata": {},
"outputs": [],
"source": [
"# your code here\n"
]
},
{
"cell_type": "markdown",
"id": "84270332",
"metadata": {},
"source": [
"# 4. Save final result in `processed_data_predimed.csv`\n",
"\n",
"1. Using the `.to_csv` method of Pandas DataFrames"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "85902eea",
"metadata": {},
"outputs": [],
"source": [
"fname = 'processed_data_predimed.csv'\n",
"\n",
"# your code here\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

File diff suppressed because it is too large Load diff