{ "cells": [ { "cell_type": "markdown", "id": "e951a26e", "metadata": {}, "source": [ "# Exercise: Analysis of tuberculosis cases by country and year period\n", "\n" ] }, { "cell_type": "code", "execution_count": 1, "id": "6b181870", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "pd.set_option('display.max_rows', 1000)\n", "pd.set_option('display.max_columns', 100)\n", "pd.set_option(\"display.max_colwidth\", None)" ] }, { "cell_type": "markdown", "id": "9adcc036", "metadata": {}, "source": [ "# Load the TB data from the World Health Organization" ] }, { "cell_type": "code", "execution_count": 2, "id": "5d9e9162", "metadata": {}, "outputs": [], "source": [ "tb_raw = pd.read_csv('who2.csv', index_col='rownames')" ] }, { "cell_type": "markdown", "id": "cf7691e5", "metadata": {}, "source": [ "Only keep data between 2000 and 2012" ] }, { "cell_type": "code", "execution_count": 3, "id": "a953d230", "metadata": {}, "outputs": [], "source": [ "cols = ['country', 'year'] + [c for c in tb_raw.columns if c.startswith('sp')]\n", "tb_raw = tb_raw.loc[tb_raw['year'].between(2000, 2012), cols]" ] }, { "cell_type": "code", "execution_count": 4, "id": "ba962fb7", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(2783, 16)" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tb_raw.shape" ] }, { "cell_type": "code", "execution_count": 5, "id": "c79a5b8d", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
countryyearsp_m_014sp_m_1524sp_m_2534sp_m_3544sp_m_4554sp_m_5564sp_m_65sp_f_014sp_f_1524sp_f_2534sp_f_3544sp_f_4554sp_f_5564sp_f_65
rownames
5551San Marino2009NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
642Belarus20090.066.0173.0208.0287.0134.054.00.041.052.052.041.025.068.0
7234Zimbabwe2007138.0500.03693.00.0716.0292.0153.0185.0739.03311.00.0553.0213.090.0
3471Kuwait20080.018.090.056.034.011.09.02.033.047.027.07.05.06.0
3336Jordan20091.05.015.014.010.07.06.00.07.014.08.03.07.012.0
2689Grenada2008NaN1.0NaN1.02.0NaN1.0NaNNaNNaNNaNNaNNaNNaN
634Belarus20012.0NaNNaNNaNNaNNaNNaN4.0NaNNaNNaNNaNNaNNaN
\n", "
" ], "text/plain": [ " country year sp_m_014 sp_m_1524 sp_m_2534 sp_m_3544 \\\n", "rownames \n", "5551 San Marino 2009 NaN NaN NaN NaN \n", "642 Belarus 2009 0.0 66.0 173.0 208.0 \n", "7234 Zimbabwe 2007 138.0 500.0 3693.0 0.0 \n", "3471 Kuwait 2008 0.0 18.0 90.0 56.0 \n", "3336 Jordan 2009 1.0 5.0 15.0 14.0 \n", "2689 Grenada 2008 NaN 1.0 NaN 1.0 \n", "634 Belarus 2001 2.0 NaN NaN NaN \n", "\n", " sp_m_4554 sp_m_5564 sp_m_65 sp_f_014 sp_f_1524 sp_f_2534 \\\n", "rownames \n", "5551 NaN NaN NaN NaN NaN NaN \n", "642 287.0 134.0 54.0 0.0 41.0 52.0 \n", "7234 716.0 292.0 153.0 185.0 739.0 3311.0 \n", "3471 34.0 11.0 9.0 2.0 33.0 47.0 \n", "3336 10.0 7.0 6.0 0.0 7.0 14.0 \n", "2689 2.0 NaN 1.0 NaN NaN NaN \n", "634 NaN NaN NaN 4.0 NaN NaN \n", "\n", " sp_f_3544 sp_f_4554 sp_f_5564 sp_f_65 \n", "rownames \n", "5551 NaN NaN NaN NaN \n", "642 52.0 41.0 25.0 68.0 \n", "7234 0.0 553.0 213.0 90.0 \n", "3471 27.0 7.0 5.0 6.0 \n", "3336 8.0 3.0 7.0 12.0 \n", "2689 NaN NaN NaN NaN \n", "634 NaN NaN NaN NaN " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tb_raw.sample(7, random_state=727)" ] }, { "cell_type": "code", "execution_count": 6, "id": "6e8b1d89", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", "
countryyearsp_m_014sp_m_1524sp_m_2534sp_m_3544sp_m_4554sp_m_5564sp_m_65sp_f_014sp_f_1524sp_f_2534sp_f_3544sp_f_4554sp_f_5564sp_f_65
rownames
191Angola2000186.0999.01003.0912.0482.0312.0194.0247.01142.01091.0844.0417.0200.0120.0
192Angola2001230.0892.0752.0648.0420.0197.0173.0279.0993.0869.0647.0323.0200.0182.0
193Angola2002435.02223.02292.01915.01187.0624.0444.0640.02610.02208.01600.0972.0533.0305.0
194Angola2003409.02355.02598.01908.01090.0512.0361.0591.03078.02641.01747.01157.0395.0129.0
195Angola2004554.02684.02659.01998.01196.0561.0321.0733.03198.02772.01854.01029.0505.0269.0
196Angola2005520.02549.02797.01918.01255.0665.0461.0704.02926.02682.01797.01138.0581.0417.0
197Angola2006540.02632.03049.02182.01397.0729.0428.0689.02851.02892.01990.01223.0583.0314.0
198Angola2007484.02824.03197.02255.01357.0699.0465.0703.02943.02721.01812.01041.0554.0367.0
199Angola2008367.02970.03493.02418.01480.0733.0420.0512.03199.02786.02082.01209.0556.0337.0
200Angola2009392.03054.03600.02420.01590.0748.0463.0568.03152.02798.01790.01069.0572.0272.0
201Angola2010448.02900.03584.02415.01424.0691.0355.0558.02763.02594.01688.0958.0482.0286.0
202Angola2011501.03000.03792.02386.01395.0680.0455.0708.02731.02563.01683.01006.0457.0346.0
203Angola2012390.02804.03627.02529.01427.0732.0424.0592.02501.02540.01617.01028.0529.0384.0
\n", "
" ], "text/plain": [ " country year sp_m_014 sp_m_1524 sp_m_2534 sp_m_3544 sp_m_4554 \\\n", "rownames \n", "191 Angola 2000 186.0 999.0 1003.0 912.0 482.0 \n", "192 Angola 2001 230.0 892.0 752.0 648.0 420.0 \n", "193 Angola 2002 435.0 2223.0 2292.0 1915.0 1187.0 \n", "194 Angola 2003 409.0 2355.0 2598.0 1908.0 1090.0 \n", "195 Angola 2004 554.0 2684.0 2659.0 1998.0 1196.0 \n", "196 Angola 2005 520.0 2549.0 2797.0 1918.0 1255.0 \n", "197 Angola 2006 540.0 2632.0 3049.0 2182.0 1397.0 \n", "198 Angola 2007 484.0 2824.0 3197.0 2255.0 1357.0 \n", "199 Angola 2008 367.0 2970.0 3493.0 2418.0 1480.0 \n", "200 Angola 2009 392.0 3054.0 3600.0 2420.0 1590.0 \n", "201 Angola 2010 448.0 2900.0 3584.0 2415.0 1424.0 \n", "202 Angola 2011 501.0 3000.0 3792.0 2386.0 1395.0 \n", "203 Angola 2012 390.0 2804.0 3627.0 2529.0 1427.0 \n", "\n", " sp_m_5564 sp_m_65 sp_f_014 sp_f_1524 sp_f_2534 sp_f_3544 \\\n", "rownames \n", "191 312.0 194.0 247.0 1142.0 1091.0 844.0 \n", "192 197.0 173.0 279.0 993.0 869.0 647.0 \n", "193 624.0 444.0 640.0 2610.0 2208.0 1600.0 \n", "194 512.0 361.0 591.0 3078.0 2641.0 1747.0 \n", "195 561.0 321.0 733.0 3198.0 2772.0 1854.0 \n", "196 665.0 461.0 704.0 2926.0 2682.0 1797.0 \n", "197 729.0 428.0 689.0 2851.0 2892.0 1990.0 \n", "198 699.0 465.0 703.0 2943.0 2721.0 1812.0 \n", "199 733.0 420.0 512.0 3199.0 2786.0 2082.0 \n", "200 748.0 463.0 568.0 3152.0 2798.0 1790.0 \n", "201 691.0 355.0 558.0 2763.0 2594.0 1688.0 \n", "202 680.0 455.0 708.0 2731.0 2563.0 1683.0 \n", "203 732.0 424.0 592.0 2501.0 2540.0 1617.0 \n", "\n", " sp_f_4554 sp_f_5564 sp_f_65 \n", "rownames \n", "191 417.0 200.0 120.0 \n", "192 323.0 200.0 182.0 \n", "193 972.0 533.0 305.0 \n", "194 1157.0 395.0 129.0 \n", "195 1029.0 505.0 269.0 \n", "196 1138.0 581.0 417.0 \n", "197 1223.0 583.0 314.0 \n", "198 1041.0 554.0 367.0 \n", "199 1209.0 556.0 337.0 \n", "200 1069.0 572.0 272.0 \n", "201 958.0 482.0 286.0 \n", "202 1006.0 457.0 346.0 \n", "203 1028.0 529.0 384.0 " ] }, "execution_count": 
6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tb_raw[tb_raw['country'] == 'Angola']" ] }, { "cell_type": "code", "execution_count": 7, "id": "116c47ad", "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "Index(['country', 'year', 'sp_m_014', 'sp_m_1524', 'sp_m_2534', 'sp_m_3544',\n", " 'sp_m_4554', 'sp_m_5564', 'sp_m_65', 'sp_f_014', 'sp_f_1524',\n", " 'sp_f_2534', 'sp_f_3544', 'sp_f_4554', 'sp_f_5564', 'sp_f_65'],\n", " dtype='object')" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tb_raw.columns" ] }, { "cell_type": "markdown", "id": "062ed46a", "metadata": {}, "source": [ "# 1. Make data tidy\n", "\n", "The final table should have these columns: `country`, `year`, `gender`, `age_range`, `cases`" ] }, { "cell_type": "code", "execution_count": null, "id": "568c8440", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "9d1f036e", "metadata": {}, "source": [ "# 2. Compute summary tables\n", "\n", "1. Compute the number of cases per country and gender, for data between 2000 and 2006 (inclusive)\n", "2. Compute the number of cases per country and year range (2000-2006, 2007-2012) on rows, and gender on columns" ] }, { "cell_type": "code", "execution_count": null, "id": "c8e9b0e4", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.3" } }, "nbformat": 4, "nbformat_minor": 5 }