adds exercises for tabular data part
This commit is contained in:
parent
2e60b94c52
commit
26eb146a5c
16 changed files with 60195 additions and 0 deletions
420
exercises/tabular_split_apply_combine/split_apply_combine.ipynb
Normal file
420
exercises/tabular_split_apply_combine/split_apply_combine.ipynb
Normal file
|
@ -0,0 +1,420 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6f6aa857",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Exercise: Compute summary statistics"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "8f9bc8b1",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%matplotlib inline\n",
|
||||
"\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"import pandas as pd"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "1be11d54",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Load the patient data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "d2dfebd3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df = pd.read_csv('processed_data_predimed.csv')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "09554c84",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"(6245, 18)"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"df.shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "df95a10b",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>patient-id</th>\n",
|
||||
" <th>location-id</th>\n",
|
||||
" <th>sex</th>\n",
|
||||
" <th>age</th>\n",
|
||||
" <th>smoke</th>\n",
|
||||
" <th>bmi</th>\n",
|
||||
" <th>waist</th>\n",
|
||||
" <th>wth</th>\n",
|
||||
" <th>htn</th>\n",
|
||||
" <th>diab</th>\n",
|
||||
" <th>hyperchol</th>\n",
|
||||
" <th>famhist</th>\n",
|
||||
" <th>hormo</th>\n",
|
||||
" <th>p14</th>\n",
|
||||
" <th>toevent</th>\n",
|
||||
" <th>event</th>\n",
|
||||
" <th>group</th>\n",
|
||||
" <th>City</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>Female</td>\n",
|
||||
" <td>77</td>\n",
|
||||
" <td>Never</td>\n",
|
||||
" <td>25.92</td>\n",
|
||||
" <td>94</td>\n",
|
||||
" <td>0.657343</td>\n",
|
||||
" <td>Yes</td>\n",
|
||||
" <td>No</td>\n",
|
||||
" <td>Yes</td>\n",
|
||||
" <td>Yes</td>\n",
|
||||
" <td>No</td>\n",
|
||||
" <td>9</td>\n",
|
||||
" <td>5.538672</td>\n",
|
||||
" <td>No</td>\n",
|
||||
" <td>MedDiet + VOO</td>\n",
|
||||
" <td>Madrid</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>Female</td>\n",
|
||||
" <td>68</td>\n",
|
||||
" <td>Never</td>\n",
|
||||
" <td>34.85</td>\n",
|
||||
" <td>150</td>\n",
|
||||
" <td>0.949367</td>\n",
|
||||
" <td>Yes</td>\n",
|
||||
" <td>No</td>\n",
|
||||
" <td>Yes</td>\n",
|
||||
" <td>Yes</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>10</td>\n",
|
||||
" <td>3.063655</td>\n",
|
||||
" <td>No</td>\n",
|
||||
" <td>MedDiet + Nuts</td>\n",
|
||||
" <td>Madrid</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>Female</td>\n",
|
||||
" <td>66</td>\n",
|
||||
" <td>Never</td>\n",
|
||||
" <td>37.50</td>\n",
|
||||
" <td>120</td>\n",
|
||||
" <td>0.750000</td>\n",
|
||||
" <td>Yes</td>\n",
|
||||
" <td>Yes</td>\n",
|
||||
" <td>No</td>\n",
|
||||
" <td>No</td>\n",
|
||||
" <td>No</td>\n",
|
||||
" <td>6</td>\n",
|
||||
" <td>5.590691</td>\n",
|
||||
" <td>No</td>\n",
|
||||
" <td>MedDiet + Nuts</td>\n",
|
||||
" <td>Madrid</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>4</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>Female</td>\n",
|
||||
" <td>77</td>\n",
|
||||
" <td>Never</td>\n",
|
||||
" <td>29.26</td>\n",
|
||||
" <td>93</td>\n",
|
||||
" <td>0.628378</td>\n",
|
||||
" <td>Yes</td>\n",
|
||||
" <td>Yes</td>\n",
|
||||
" <td>No</td>\n",
|
||||
" <td>No</td>\n",
|
||||
" <td>No</td>\n",
|
||||
" <td>6</td>\n",
|
||||
" <td>5.456537</td>\n",
|
||||
" <td>No</td>\n",
|
||||
" <td>MedDiet + VOO</td>\n",
|
||||
" <td>Madrid</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>5</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>Female</td>\n",
|
||||
" <td>60</td>\n",
|
||||
" <td>Never</td>\n",
|
||||
" <td>30.02</td>\n",
|
||||
" <td>104</td>\n",
|
||||
" <td>0.662420</td>\n",
|
||||
" <td>Yes</td>\n",
|
||||
" <td>No</td>\n",
|
||||
" <td>Yes</td>\n",
|
||||
" <td>No</td>\n",
|
||||
" <td>No</td>\n",
|
||||
" <td>9</td>\n",
|
||||
" <td>2.746064</td>\n",
|
||||
" <td>No</td>\n",
|
||||
" <td>Control</td>\n",
|
||||
" <td>Madrid</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" patient-id location-id sex age smoke bmi waist wth htn \\\n",
|
||||
"0 1 1 Female 77 Never 25.92 94 0.657343 Yes \n",
|
||||
"1 2 1 Female 68 Never 34.85 150 0.949367 Yes \n",
|
||||
"2 3 1 Female 66 Never 37.50 120 0.750000 Yes \n",
|
||||
"3 4 1 Female 77 Never 29.26 93 0.628378 Yes \n",
|
||||
"4 5 1 Female 60 Never 30.02 104 0.662420 Yes \n",
|
||||
"\n",
|
||||
" diab hyperchol famhist hormo p14 toevent event group City \n",
|
||||
"0 No Yes Yes No 9 5.538672 No MedDiet + VOO Madrid \n",
|
||||
"1 No Yes Yes NaN 10 3.063655 No MedDiet + Nuts Madrid \n",
|
||||
"2 Yes No No No 6 5.590691 No MedDiet + Nuts Madrid \n",
|
||||
"3 Yes No No No 6 5.456537 No MedDiet + VOO Madrid \n",
|
||||
"4 No Yes No No 9 2.746064 No Control Madrid "
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"df.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "0b4f6091",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# 1. Did the Mediterranean diet help prevent cardiovascular events?\n",
|
||||
"\n",
|
||||
"To answer this question, we need to compute how many cardiovascular \"events\" occurred in each group of participants, separated by the diet they followed.\n",
|
||||
"In the data, the column `event` contains `Yes` or `No`, indicating whether that patient had a cardiovascular event. The column `group` contains which diet they followed.\n",
|
||||
"\n",
|
||||
"We first convert the column `event` to a binary value (1 for `Yes`, 0 for `No`)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "99b21627-1b48-44ee-bda2-312b0718bd59",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df['event'] = df['event'].map({'Yes': 1, 'No': 0})"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a63e59c5-fe50-433f-a529-f601c795db67",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"* Now compute the total number of events by diet group. Compare the numbers and see if you can answer the question."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "00bb9eb1",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# your code here:\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "b052a40a-ae68-4376-8557-541eafb3face",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"* Check how many patients were in each group."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "db946d0f-8204-43a3-853c-41981a9811f4",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# your code here:\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "64339449-d766-4a2a-85d3-8aafac5533b7",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The groups did not have the same number of patients, so to compare them fairly we need to relate the number of events to the size of each group. For that:\n",
|
||||
"* Calculate how many events occurred relative to the number of patients in each group (as a percentage).\n",
|
||||
"Do this separated by diet group."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "13ad4130-2094-4e7a-a416-f0fd6e810413",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# your code here:\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "9655c0ac-18e9-4297-9f6e-557bfe95ed5e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"It seems that the control group had a higher percentage of events than the other two groups."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "1940d3fe",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# 2. Smoking\n",
|
||||
"\n",
|
||||
"Did smoking make a difference in the outcome of the study?\n",
|
||||
"Calculate how many events occurred by diet group *and* smoking status. The idea is that you arrive at a table like this:\n",
|
||||
"\n",
|
||||
"| group | Current | Former | Never |\n",
|
||||
"|:---------------|----------:|---------:|--------:|\n",
|
||||
"| Control | ... | ... | ... |\n",
|
||||
"| MedDiet + Nuts | ... | ... | ... |\n",
|
||||
"| MedDiet + VOO | ... | ... | ... |\n",
|
||||
"\n",
|
||||
"where each entry in the table is the percentage of events (relative to the number of patients) for that combination of diet group and smoking status.\n",
|
||||
"\n",
|
||||
"Hint: use `pivot_table`"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "5ab4e70e-6261-4a26-8ad9-14eae15be09c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# your code here\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "511c640e-8f0f-449f-af33-85061d89cfd3",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# 3. Age differences?"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "1a04ea5c-8a27-4e67-aafa-ba34580a8d7f",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Finally, check that there were no big differences in the age between the groups.\n",
|
||||
"* Calculate the mean and standard deviation of the patients' age, separated by diet group and gender (column `sex`).\n",
|
||||
"\n",
|
||||
"You should get a table where the diet groups are in the rows and the genders are in the columns, like this (shown here for the mean):\n",
|
||||
"\n",
|
||||
"| group | Female | Male |\n",
|
||||
"|:---------------|---------:|-------:|\n",
|
||||
"| Control | 68 | 66.4 |\n",
|
||||
"| MedDiet + Nuts | 67.4 | 65.8 |\n",
|
||||
"| MedDiet + VOO | 67.7 | 66.1 |\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "196fd111-72bc-4b87-b8fb-293547a8c83d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# your code here:\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.5"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue