adds exercises for tabular data part

2025-09-23 11:49:18 +02:00 · 2025-09-23 11:49:18 +02:00 · 26eb146a5c
commit 26eb146a5c
parent 2e60b94c52
16 changed files with 60195 additions and 0 deletions
--- a/exercises/tabular_split_apply_combine/processed_data_predimed.csv
+++ b/exercises/tabular_split_apply_combine/processed_data_predimed.csv
--- a/exercises/tabular_split_apply_combine/split_apply_combine.ipynb
+++ b/exercises/tabular_split_apply_combine/split_apply_combine.ipynb
@ -0,0 +1,420 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "6f6aa857",
+   "metadata": {},
+   "source": [
+    "# Exercise: Compute summary statistics"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "8f9bc8b1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%matplotlib inline\n",
+    "\n",
+    "import matplotlib.pyplot as plt\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1be11d54",
+   "metadata": {},
+   "source": [
+    "# Load the patient data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "d2dfebd3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.read_csv('processed_data_predimed.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "09554c84",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(6245, 18)"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "df95a10b",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>patient-id</th>\n",
+       "      <th>location-id</th>\n",
+       "      <th>sex</th>\n",
+       "      <th>age</th>\n",
+       "      <th>smoke</th>\n",
+       "      <th>bmi</th>\n",
+       "      <th>waist</th>\n",
+       "      <th>wth</th>\n",
+       "      <th>htn</th>\n",
+       "      <th>diab</th>\n",
+       "      <th>hyperchol</th>\n",
+       "      <th>famhist</th>\n",
+       "      <th>hormo</th>\n",
+       "      <th>p14</th>\n",
+       "      <th>toevent</th>\n",
+       "      <th>event</th>\n",
+       "      <th>group</th>\n",
+       "      <th>City</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>Female</td>\n",
+       "      <td>77</td>\n",
+       "      <td>Never</td>\n",
+       "      <td>25.92</td>\n",
+       "      <td>94</td>\n",
+       "      <td>0.657343</td>\n",
+       "      <td>Yes</td>\n",
+       "      <td>No</td>\n",
+       "      <td>Yes</td>\n",
+       "      <td>Yes</td>\n",
+       "      <td>No</td>\n",
+       "      <td>9</td>\n",
+       "      <td>5.538672</td>\n",
+       "      <td>No</td>\n",
+       "      <td>MedDiet + VOO</td>\n",
+       "      <td>Madrid</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>2</td>\n",
+       "      <td>1</td>\n",
+       "      <td>Female</td>\n",
+       "      <td>68</td>\n",
+       "      <td>Never</td>\n",
+       "      <td>34.85</td>\n",
+       "      <td>150</td>\n",
+       "      <td>0.949367</td>\n",
+       "      <td>Yes</td>\n",
+       "      <td>No</td>\n",
+       "      <td>Yes</td>\n",
+       "      <td>Yes</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>10</td>\n",
+       "      <td>3.063655</td>\n",
+       "      <td>No</td>\n",
+       "      <td>MedDiet + Nuts</td>\n",
+       "      <td>Madrid</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>3</td>\n",
+       "      <td>1</td>\n",
+       "      <td>Female</td>\n",
+       "      <td>66</td>\n",
+       "      <td>Never</td>\n",
+       "      <td>37.50</td>\n",
+       "      <td>120</td>\n",
+       "      <td>0.750000</td>\n",
+       "      <td>Yes</td>\n",
+       "      <td>Yes</td>\n",
+       "      <td>No</td>\n",
+       "      <td>No</td>\n",
+       "      <td>No</td>\n",
+       "      <td>6</td>\n",
+       "      <td>5.590691</td>\n",
+       "      <td>No</td>\n",
+       "      <td>MedDiet + Nuts</td>\n",
+       "      <td>Madrid</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>4</td>\n",
+       "      <td>1</td>\n",
+       "      <td>Female</td>\n",
+       "      <td>77</td>\n",
+       "      <td>Never</td>\n",
+       "      <td>29.26</td>\n",
+       "      <td>93</td>\n",
+       "      <td>0.628378</td>\n",
+       "      <td>Yes</td>\n",
+       "      <td>Yes</td>\n",
+       "      <td>No</td>\n",
+       "      <td>No</td>\n",
+       "      <td>No</td>\n",
+       "      <td>6</td>\n",
+       "      <td>5.456537</td>\n",
+       "      <td>No</td>\n",
+       "      <td>MedDiet + VOO</td>\n",
+       "      <td>Madrid</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>5</td>\n",
+       "      <td>1</td>\n",
+       "      <td>Female</td>\n",
+       "      <td>60</td>\n",
+       "      <td>Never</td>\n",
+       "      <td>30.02</td>\n",
+       "      <td>104</td>\n",
+       "      <td>0.662420</td>\n",
+       "      <td>Yes</td>\n",
+       "      <td>No</td>\n",
+       "      <td>Yes</td>\n",
+       "      <td>No</td>\n",
+       "      <td>No</td>\n",
+       "      <td>9</td>\n",
+       "      <td>2.746064</td>\n",
+       "      <td>No</td>\n",
+       "      <td>Control</td>\n",
+       "      <td>Madrid</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   patient-id  location-id     sex  age  smoke    bmi  waist       wth  htn  \\\n",
+       "0           1            1  Female   77  Never  25.92     94  0.657343  Yes   \n",
+       "1           2            1  Female   68  Never  34.85    150  0.949367  Yes   \n",
+       "2           3            1  Female   66  Never  37.50    120  0.750000  Yes   \n",
+       "3           4            1  Female   77  Never  29.26     93  0.628378  Yes   \n",
+       "4           5            1  Female   60  Never  30.02    104  0.662420  Yes   \n",
+       "\n",
+       "  diab hyperchol famhist hormo  p14   toevent event           group    City  \n",
+       "0   No       Yes     Yes    No    9  5.538672    No   MedDiet + VOO  Madrid  \n",
+       "1   No       Yes     Yes   NaN   10  3.063655    No  MedDiet + Nuts  Madrid  \n",
+       "2  Yes        No      No    No    6  5.590691    No  MedDiet + Nuts  Madrid  \n",
+       "3  Yes        No      No    No    6  5.456537    No   MedDiet + VOO  Madrid  \n",
+       "4   No       Yes      No    No    9  2.746064    No         Control  Madrid  "
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0b4f6091",
+   "metadata": {},
+   "source": [
+    "# 1. Did the mediterranean diet help prevent cardiovascular events?\n",
+    "\n",
+    "To answer this question, we need to compute how many cardiovascular \"events\" occurred in each group of participants, separated by the diet they followed.\n",
+    "In the data, the column `event` contains `Yes` or `No`, indicating whether that patient had a cardiovascular event. The column `group` contains the diet they followed.\n",
+    "\n",
+    "We first convert the column `event` to a binary value."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "99b21627-1b48-44ee-bda2-312b0718bd59",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df['event'] = df['event'].map({'Yes': 1, 'No': 0})"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a63e59c5-fe50-433f-a529-f601c795db67",
+   "metadata": {},
+   "source": [
+    "* Now compute the total number of events by diet group. Compare the numbers and see if you can answer the question."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "00bb9eb1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# your code here:\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b052a40a-ae68-4376-8557-541eafb3face",
+   "metadata": {},
+   "source": [
+    "* Check how many patients each group had"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "db946d0f-8204-43a3-853c-41981a9811f4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# your code here:\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "64339449-d766-4a2a-85d3-8aafac5533b7",
+   "metadata": {},
+   "source": [
+    "The groups do not have the same number of patients, so to be precise we need to put the numbers in relation to the group totals. For that:\n",
+    "* Calculate how many events occurred relative to the number of patients in each group (in percentage). \n",
+    "Do this separated by diet group."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "13ad4130-2094-4e7a-a416-f0fd6e810413",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# your code here:\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9655c0ac-18e9-4297-9f6e-557bfe95ed5e",
+   "metadata": {},
+   "source": [
+    "It seems that the control group had a higher percentage of events than the other two"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1940d3fe",
+   "metadata": {},
+   "source": [
+    "# 2. Smoking\n",
+    "\n",
+    "Did smoking make a difference in the outcome of the study?\n",
+    "Calculate how many events occurred by diet group *and* smoking. The idea is that you arrive at a table like this:\n",
+    "\n",
+    "| group          |   Current |   Former |   Never |\n",
+    "|:---------------|----------:|---------:|--------:|\n",
+    "| Control        |     ...   |    ...   |   ...   |\n",
+    "| MedDiet + Nuts |     ...   |    ...   |   ...   |\n",
+    "| MedDiet + VOO  |     ...   |    ...   |   ...   |\n",
+    "\n",
+    "where each entry in the table has the percentage of events for each group. \n",
+    "\n",
+    "Hint: use `pivot_table`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "5ab4e70e-6261-4a26-8ad9-14eae15be09c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# your code here\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "511c640e-8f0f-449f-af33-85061d89cfd3",
+   "metadata": {},
+   "source": [
+    "# 3. Age differences?"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1a04ea5c-8a27-4e67-aafa-ba34580a8d7f",
+   "metadata": {},
+   "source": [
+    "Finally, check that there were no big differences in age between the groups.\n",
+    "* Calculate the mean and standard deviation of the patients' age, separated by diet group and gender.\n",
+    "\n",
+    "For the mean, you should get a table with the diet groups in the rows and gender in the columns, like this:\n",
+    "\n",
+    "| group          |   Female |   Male |\n",
+    "|:---------------|---------:|-------:|\n",
+    "| Control        |     68   |   66.4 |\n",
+    "| MedDiet + Nuts |     67.4 |   65.8 |\n",
+    "| MedDiet + VOO  |     67.7 |   66.1 |\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "196fd111-72bc-4b87-b8fb-293547a8c83d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# your code here:\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/exercises/tabular_split_apply_combine/split_apply_combine_solution.ipynb
+++ b/exercises/tabular_split_apply_combine/split_apply_combine_solution.ipynb
@ -0,0 +1,927 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "6f6aa857",
+   "metadata": {},
+   "source": [
+    "# Exercise: Compute summary statistics"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "8f9bc8b1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%matplotlib inline\n",
+    "\n",
+    "import matplotlib.pyplot as plt\n",
+    "import pandas as pd\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1be11d54",
+   "metadata": {},
+   "source": [
+    "# Load the patient data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "id": "d2dfebd3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.read_csv('processed_data_predimed.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "id": "09554c84",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(6245, 17)"
+      ]
+     },
+     "execution_count": 26,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "df95a10b",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>patient-id</th>\n",
+       "      <th>location-id</th>\n",
+       "      <th>sex</th>\n",
+       "      <th>age</th>\n",
+       "      <th>smoke</th>\n",
+       "      <th>bmi</th>\n",
+       "      <th>waist</th>\n",
+       "      <th>wth</th>\n",
+       "      <th>htn</th>\n",
+       "      <th>diab</th>\n",
+       "      <th>hyperchol</th>\n",
+       "      <th>famhist</th>\n",
+       "      <th>hormo</th>\n",
+       "      <th>p14</th>\n",
+       "      <th>toevent</th>\n",
+       "      <th>event</th>\n",
+       "      <th>group</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>Female</td>\n",
+       "      <td>77</td>\n",
+       "      <td>Never</td>\n",
+       "      <td>25.92</td>\n",
+       "      <td>94</td>\n",
+       "      <td>0.657343</td>\n",
+       "      <td>Yes</td>\n",
+       "      <td>No</td>\n",
+       "      <td>Yes</td>\n",
+       "      <td>Yes</td>\n",
+       "      <td>No</td>\n",
+       "      <td>9</td>\n",
+       "      <td>5.538672</td>\n",
+       "      <td>No</td>\n",
+       "      <td>MedDiet + VOO</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>2</td>\n",
+       "      <td>1</td>\n",
+       "      <td>Female</td>\n",
+       "      <td>68</td>\n",
+       "      <td>Never</td>\n",
+       "      <td>34.85</td>\n",
+       "      <td>150</td>\n",
+       "      <td>0.949367</td>\n",
+       "      <td>Yes</td>\n",
+       "      <td>No</td>\n",
+       "      <td>Yes</td>\n",
+       "      <td>Yes</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>10</td>\n",
+       "      <td>3.063655</td>\n",
+       "      <td>No</td>\n",
+       "      <td>MedDiet + Nuts</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>3</td>\n",
+       "      <td>1</td>\n",
+       "      <td>Female</td>\n",
+       "      <td>66</td>\n",
+       "      <td>Never</td>\n",
+       "      <td>37.50</td>\n",
+       "      <td>120</td>\n",
+       "      <td>0.750000</td>\n",
+       "      <td>Yes</td>\n",
+       "      <td>Yes</td>\n",
+       "      <td>No</td>\n",
+       "      <td>No</td>\n",
+       "      <td>No</td>\n",
+       "      <td>6</td>\n",
+       "      <td>5.590691</td>\n",
+       "      <td>No</td>\n",
+       "      <td>MedDiet + Nuts</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>4</td>\n",
+       "      <td>1</td>\n",
+       "      <td>Female</td>\n",
+       "      <td>77</td>\n",
+       "      <td>Never</td>\n",
+       "      <td>29.26</td>\n",
+       "      <td>93</td>\n",
+       "      <td>0.628378</td>\n",
+       "      <td>Yes</td>\n",
+       "      <td>Yes</td>\n",
+       "      <td>No</td>\n",
+       "      <td>No</td>\n",
+       "      <td>No</td>\n",
+       "      <td>6</td>\n",
+       "      <td>5.456537</td>\n",
+       "      <td>No</td>\n",
+       "      <td>MedDiet + VOO</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>5</td>\n",
+       "      <td>1</td>\n",
+       "      <td>Female</td>\n",
+       "      <td>60</td>\n",
+       "      <td>Never</td>\n",
+       "      <td>30.02</td>\n",
+       "      <td>104</td>\n",
+       "      <td>0.662420</td>\n",
+       "      <td>Yes</td>\n",
+       "      <td>No</td>\n",
+       "      <td>Yes</td>\n",
+       "      <td>No</td>\n",
+       "      <td>No</td>\n",
+       "      <td>9</td>\n",
+       "      <td>2.746064</td>\n",
+       "      <td>No</td>\n",
+       "      <td>Control</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   patient-id  location-id     sex  age  smoke    bmi  waist       wth  htn  \\\n",
+       "0           1            1  Female   77  Never  25.92     94  0.657343  Yes   \n",
+       "1           2            1  Female   68  Never  34.85    150  0.949367  Yes   \n",
+       "2           3            1  Female   66  Never  37.50    120  0.750000  Yes   \n",
+       "3           4            1  Female   77  Never  29.26     93  0.628378  Yes   \n",
+       "4           5            1  Female   60  Never  30.02    104  0.662420  Yes   \n",
+       "\n",
+       "  diab hyperchol famhist hormo  p14   toevent event           group  \n",
+       "0   No       Yes     Yes    No    9  5.538672    No   MedDiet + VOO  \n",
+       "1   No       Yes     Yes   NaN   10  3.063655    No  MedDiet + Nuts  \n",
+       "2  Yes        No      No    No    6  5.590691    No  MedDiet + Nuts  \n",
+       "3  Yes        No      No    No    6  5.456537    No   MedDiet + VOO  \n",
+       "4   No       Yes      No    No    9  2.746064    No         Control  "
+      ]
+     },
+     "execution_count": 27,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0b4f6091",
+   "metadata": {},
+   "source": [
+    "# 1. Did the mediterranean diet help prevent cardiovascular events?\n",
+    "\n",
+    "To answer this question, we need to compute how many cardiovascular \"events\" occurred in each group of participants, separated by the diet they followed.\n",
+    "In the data, the column `event` contains `Yes` or `No`, indicating whether that patient had a cardiovascular event. The column `group` contains the diet they followed.\n",
+    "\n",
+    "* Convert the column `event` from string to binary (1 for Yes, 0 for No) (this will ease the calculations that follow later).\n",
+    "  Hint: use the method `.map()`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "99b21627-1b48-44ee-bda2-312b0718bd59",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df['event'] = df['event'].map({'Yes': 1, 'No': 0})"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a63e59c5-fe50-433f-a529-f601c795db67",
+   "metadata": {},
+   "source": [
+    "* Now compute the total number of events by diet group. Compare the numbers and see if you can answer the question."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "id": "00bb9eb1",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "group\n",
+       "Control           96\n",
+       "MedDiet + Nuts    69\n",
+       "MedDiet + VOO     83\n",
+       "Name: event, dtype: int64"
+      ]
+     },
+     "execution_count": 29,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.groupby('group')['event'].sum()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b052a40a-ae68-4376-8557-541eafb3face",
+   "metadata": {},
+   "source": [
+    "* Check how many patients each group had"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "id": "db946d0f-8204-43a3-853c-41981a9811f4",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "group\n",
+       "Control           2016\n",
+       "MedDiet + Nuts    2077\n",
+       "MedDiet + VOO     2152\n",
+       "Name: event, dtype: int64"
+      ]
+     },
+     "execution_count": 30,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.groupby('group')['event'].count()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "64339449-d766-4a2a-85d3-8aafac5533b7",
+   "metadata": {},
+   "source": [
+    "The groups do not have the same number of patients, so to be precise we need to put the numbers in relation to the group totals. For that:\n",
+    "* Calculate how many events occurred relative to the number of patients in each group (in percentage). \n",
+    "Do this separated by diet group."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "id": "13ad4130-2094-4e7a-a416-f0fd6e810413",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "group\n",
+       "Control           4.761905\n",
+       "MedDiet + Nuts    3.322099\n",
+       "MedDiet + VOO     3.856877\n",
+       "Name: event, dtype: float64"
+      ]
+     },
+     "execution_count": 31,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.groupby('group')['event'].sum()*100 / df.groupby('group')['event'].count()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9655c0ac-18e9-4297-9f6e-557bfe95ed5e",
+   "metadata": {},
+   "source": [
+    "It seems that the control group had a higher percentage of events than the other two"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1940d3fe",
+   "metadata": {},
+   "source": [
+    "# 2. Smoking\n",
+    "\n",
+    "Did smoking make a difference in the outcome of the study?\n",
+    "Calculate how many events occurred by diet group *and* smoking. The idea is that you arrive at a table like this:\n",
+    "\n",
+    "| group          |   Current |   Former |   Never |\n",
+    "|:---------------|----------:|---------:|--------:|\n",
+    "| Control        |     ...   |    ...   |   ...   |\n",
+    "| MedDiet + Nuts |     ...   |    ...   |   ...   |\n",
+    "| MedDiet + VOO  |     ...   |    ...   |   ...   |\n",
+    "\n",
+    "where each entry in the table has the percentage of events for each group.\n",
+    "\n",
+    "Hint: use `pivot_table`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "id": "5ab4e70e-6261-4a26-8ad9-14eae15be09c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th>smoke</th>\n",
+       "      <th>Current</th>\n",
+       "      <th>Former</th>\n",
+       "      <th>Never</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>group</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>Control</th>\n",
+       "      <td>13</td>\n",
+       "      <td>39</td>\n",
+       "      <td>44</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>MedDiet + Nuts</th>\n",
+       "      <td>15</td>\n",
+       "      <td>20</td>\n",
+       "      <td>34</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>MedDiet + VOO</th>\n",
+       "      <td>20</td>\n",
+       "      <td>29</td>\n",
+       "      <td>34</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "smoke           Current  Former  Never\n",
+       "group                                 \n",
+       "Control              13      39     44\n",
+       "MedDiet + Nuts       15      20     34\n",
+       "MedDiet + VOO        20      29     34"
+      ]
+     },
+     "execution_count": 32,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "counts = df.pivot_table(index='group', columns='smoke', values='event', aggfunc='sum')\n",
+    "counts"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "f854f24d-9108-42bc-a23b-6b5503f5deba",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th>smoke</th>\n",
+       "      <th>Current</th>\n",
+       "      <th>Former</th>\n",
+       "      <th>Never</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>group</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>Control</th>\n",
+       "      <td>264</td>\n",
+       "      <td>485</td>\n",
+       "      <td>1267</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>MedDiet + Nuts</th>\n",
+       "      <td>291</td>\n",
+       "      <td>539</td>\n",
+       "      <td>1247</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>MedDiet + VOO</th>\n",
+       "      <td>290</td>\n",
+       "      <td>531</td>\n",
+       "      <td>1331</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "smoke           Current  Former  Never\n",
+       "group                                 \n",
+       "Control             264     485   1267\n",
+       "MedDiet + Nuts      291     539   1247\n",
+       "MedDiet + VOO       290     531   1331"
+      ]
+     },
+     "execution_count": 17,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "N = df.pivot_table(index='group', columns='smoke', values='event', aggfunc='count')\n",
+    "N"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "1358f6b8-60a5-44db-ba23-4ef7c7af8455",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th>smoke</th>\n",
+       "      <th>Current</th>\n",
+       "      <th>Former</th>\n",
+       "      <th>Never</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>group</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>Control</th>\n",
+       "      <td>4.924242</td>\n",
+       "      <td>8.041237</td>\n",
+       "      <td>3.472770</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>MedDiet + Nuts</th>\n",
+       "      <td>5.154639</td>\n",
+       "      <td>3.710575</td>\n",
+       "      <td>2.726544</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>MedDiet + VOO</th>\n",
+       "      <td>6.896552</td>\n",
+       "      <td>5.461394</td>\n",
+       "      <td>2.554470</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "smoke            Current    Former     Never\n",
+       "group                                       \n",
+       "Control         4.924242  8.041237  3.472770\n",
+       "MedDiet + Nuts  5.154639  3.710575  2.726544\n",
+       "MedDiet + VOO   6.896552  5.461394  2.554470"
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "counts*100/N"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "511c640e-8f0f-449f-af33-85061d89cfd3",
+   "metadata": {},
+   "source": [
+    "# 3. Age differences?"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1a04ea5c-8a27-4e67-aafa-ba34580a8d7f",
+   "metadata": {},
+   "source": [
+    "Finally, check that there were no big differences in age between the groups.\n",
+    "* Calculate the mean and standard deviation of the patients' age, separated by diet group and gender.\n",
+    "\n",
+    "For the mean, you should get a table with the diet groups in the rows and gender in the columns, like this:\n",
+    "\n",
+    "| group          |   Female |   Male |\n",
+    "|:---------------|---------:|-------:|\n",
+    "| Control        |     68   |   66.4 |\n",
+    "| MedDiet + Nuts |     67.4 |   65.8 |\n",
+    "| MedDiet + VOO  |     67.7 |   66.1 |\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "id": "40d8f79e-5595-4a35-822c-042206bde7db",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th>sex</th>\n",
+       "      <th>Female</th>\n",
+       "      <th>Male</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>group</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>Control</th>\n",
+       "      <td>68.0</td>\n",
+       "      <td>66.4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>MedDiet + Nuts</th>\n",
+       "      <td>67.4</td>\n",
+       "      <td>65.8</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>MedDiet + VOO</th>\n",
+       "      <td>67.7</td>\n",
+       "      <td>66.1</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "sex             Female  Male\n",
+       "group                       \n",
+       "Control           68.0  66.4\n",
+       "MedDiet + Nuts    67.4  65.8\n",
+       "MedDiet + VOO     67.7  66.1"
+      ]
+     },
+     "execution_count": 33,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# this works but it is longer than necessary\n",
+    "df.groupby(['group', 'sex'])['age'].mean().reset_index().pivot_table(index='group', columns='sex', values='age').round(1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "id": "196fd111-72bc-4b87-b8fb-293547a8c83d",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th>sex</th>\n",
+       "      <th>Female</th>\n",
+       "      <th>Male</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>group</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>Control</th>\n",
+       "      <td>68.0</td>\n",
+       "      <td>66.4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>MedDiet + Nuts</th>\n",
+       "      <td>67.4</td>\n",
+       "      <td>65.8</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>MedDiet + VOO</th>\n",
+       "      <td>67.7</td>\n",
+       "      <td>66.1</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "sex             Female  Male\n",
+       "group                       \n",
+       "Control           68.0  66.4\n",
+       "MedDiet + Nuts    67.4  65.8\n",
+       "MedDiet + VOO     67.7  66.1"
+      ]
+     },
+     "execution_count": 34,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# instead of grouping first, do the pivot first and pass the aggregation function as an argument\n",
+    "df.pivot_table(index='group', columns='sex', values='age', aggfunc='mean').round(1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "id": "dd5023cd",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead tr th {\n",
+       "        text-align: left;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead tr:last-of-type th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr>\n",
+       "      <th></th>\n",
+       "      <th colspan=\"2\" halign=\"left\">mean</th>\n",
+       "      <th colspan=\"2\" halign=\"left\">std</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>sex</th>\n",
+       "      <th>Female</th>\n",
+       "      <th>Male</th>\n",
+       "      <th>Female</th>\n",
+       "      <th>Male</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>group</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>Control</th>\n",
+       "      <td>68.0</td>\n",
+       "      <td>66.4</td>\n",
+       "      <td>6.0</td>\n",
+       "      <td>6.6</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>MedDiet + Nuts</th>\n",
+       "      <td>67.4</td>\n",
+       "      <td>65.8</td>\n",
+       "      <td>5.6</td>\n",
+       "      <td>6.4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>MedDiet + VOO</th>\n",
+       "      <td>67.7</td>\n",
+       "      <td>66.1</td>\n",
+       "      <td>5.8</td>\n",
+       "      <td>6.6</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                 mean          std     \n",
+       "sex            Female  Male Female Male\n",
+       "group                                  \n",
+       "Control          68.0  66.4    6.0  6.6\n",
+       "MedDiet + Nuts   67.4  65.8    5.6  6.4\n",
+       "MedDiet + VOO    67.7  66.1    5.8  6.6"
+      ]
+     },
+     "execution_count": 37,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# To get the standard deviation you could do the same as last time but pass aggfunc='std'. This will return another dataframe.\n",
+    "\n",
+    "# Alternatively, you can calculate both mean and S.D. in one step by passing a list of aggregation functions.\n",
+    "df.pivot_table(index='group', columns='sex', values='age', aggfunc=['mean', 'std']).round(1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0af53833-f872-4bd1-9c67-90a370dfe6c5",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}