diff --git a/exercises/tabular_join/tabular_join.ipynb b/exercises/tabular_join/tabular_join.ipynb index 6cf3ed9..be0b49a 100644 --- a/exercises/tabular_join/tabular_join.ipynb +++ b/exercises/tabular_join/tabular_join.ipynb @@ -10,7 +10,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "b6f2742b", "metadata": {}, "outputs": [], @@ -30,7 +30,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "id": "ed626ee3", "metadata": {}, "outputs": [ @@ -189,7 +189,7 @@ "4 No Yes No No 9 4.761123 No " ] }, - "execution_count": 2, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -201,7 +201,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "id": "48d5375f", "metadata": {}, "outputs": [ @@ -275,7 +275,7 @@ "4 2 632 Control" ] }, - "execution_count": 3, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -326,12 +326,27 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "id": "861ac334-14ce-490a-b3c4-877b32789f3e", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(6324, 16)\n" + ] + } + ], "source": [ - "## your code here\n" + "## your code here\n", + "\n", + "print(df.shape)\n", + "\n", + "identifier = ['location-id', 'patient-id']\n", + "\n", + "\n", + "\n" ] }, { @@ -344,12 +359,25 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "id": "14f57842-5722-4953-88d6-d7cf3070400c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(6287, 3)" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "## your code here\n" + "## your code here\n", + "\n", + "info.shape\n" ] }, { @@ -364,14 +392,553 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 10, + "id": "2dfaf171-2b81-4c7f-9101-c689aa56494d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\u001b[0;31mSignature:\u001b[0m\n", + "\u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmerge\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mleft\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'DataFrame | Series'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mright\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'DataFrame | Series'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mhow\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'MergeHow'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'inner'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mon\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'IndexLabel | AnyArrayLike | None'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mleft_on\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'IndexLabel | AnyArrayLike | None'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mright_on\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'IndexLabel | AnyArrayLike | None'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mleft_index\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'bool'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mright_index\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'bool'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0msort\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'bool'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0msuffixes\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'Suffixes'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;34m'_x'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'_y'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'bool | None'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mindicator\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'str | bool'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mvalidate\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'str | None'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;34m'DataFrame'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mDocstring:\u001b[0m\n", + "Merge DataFrame or named Series objects with a database-style join.\n", + "\n", + "A named Series object is treated as a DataFrame with a single named column.\n", + "\n", + "The join is done on columns or indexes. If joining columns on\n", + "columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes\n", + "on indexes or indexes on a column or columns, the index will be passed on.\n", + "When performing a cross merge, no column specifications to merge on are\n", + "allowed.\n", + "\n", + ".. warning::\n", + "\n", + " If both key columns contain rows where the key is a null value, those\n", + " rows will be matched against each other. This is different from usual SQL\n", + " join behaviour and can lead to unexpected results.\n", + "\n", + "Parameters\n", + "----------\n", + "left : DataFrame or named Series\n", + "right : DataFrame or named Series\n", + " Object to merge with.\n", + "how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner'\n", + " Type of merge to be performed.\n", + "\n", + " * left: use only keys from left frame, similar to a SQL left outer join;\n", + " preserve key order.\n", + " * right: use only keys from right frame, similar to a SQL right outer join;\n", + " preserve key order.\n", + " * outer: use union of keys from both frames, similar to a SQL full outer\n", + " join; sort keys lexicographically.\n", + " * inner: use intersection of keys from both frames, similar to a SQL inner\n", + " join; preserve the order of the left keys.\n", + " * cross: creates the cartesian product from both frames, preserves the order\n", + " of the left keys.\n", + "on : label or list\n", + " Column or index level names to join on. These must be found in both\n", + " DataFrames. If `on` is None and not merging on indexes then this defaults\n", + " to the intersection of the columns in both DataFrames.\n", + "left_on : label or list, or array-like\n", + " Column or index level names to join on in the left DataFrame. Can also\n", + " be an array or list of arrays of the length of the left DataFrame.\n", + " These arrays are treated as if they are columns.\n", + "right_on : label or list, or array-like\n", + " Column or index level names to join on in the right DataFrame. Can also\n", + " be an array or list of arrays of the length of the right DataFrame.\n", + " These arrays are treated as if they are columns.\n", + "left_index : bool, default False\n", + " Use the index from the left DataFrame as the join key(s). If it is a\n", + " MultiIndex, the number of keys in the other DataFrame (either the index\n", + " or a number of columns) must match the number of levels.\n", + "right_index : bool, default False\n", + " Use the index from the right DataFrame as the join key. Same caveats as\n", + " left_index.\n", + "sort : bool, default False\n", + " Sort the join keys lexicographically in the result DataFrame. If False,\n", + " the order of the join keys depends on the join type (how keyword).\n", + "suffixes : list-like, default is (\"_x\", \"_y\")\n", + " A length-2 sequence where each element is optionally a string\n", + " indicating the suffix to add to overlapping column names in\n", + " `left` and `right` respectively. Pass a value of `None` instead\n", + " of a string to indicate that the column name from `left` or\n", + " `right` should be left as-is, with no suffix. At least one of the\n", + " values must not be None.\n", + "copy : bool, default True\n", + " If False, avoid copy if possible.\n", + "\n", + " .. note::\n", + " The `copy` keyword will change behavior in pandas 3.0.\n", + " `Copy-on-Write\n", + " `__\n", + " will be enabled by default, which means that all methods with a\n", + " `copy` keyword will use a lazy copy mechanism to defer the copy and\n", + " ignore the `copy` keyword. The `copy` keyword will be removed in a\n", + " future version of pandas.\n", + "\n", + " You can already get the future behavior and improvements through\n", + " enabling copy on write ``pd.options.mode.copy_on_write = True``\n", + "indicator : bool or str, default False\n", + " If True, adds a column to the output DataFrame called \"_merge\" with\n", + " information on the source of each row. The column can be given a different\n", + " name by providing a string argument. The column will have a Categorical\n", + " type with the value of \"left_only\" for observations whose merge key only\n", + " appears in the left DataFrame, \"right_only\" for observations\n", + " whose merge key only appears in the right DataFrame, and \"both\"\n", + " if the observation's merge key is found in both DataFrames.\n", + "\n", + "validate : str, optional\n", + " If specified, checks if merge is of specified type.\n", + "\n", + " * \"one_to_one\" or \"1:1\": check if merge keys are unique in both\n", + " left and right datasets.\n", + " * \"one_to_many\" or \"1:m\": check if merge keys are unique in left\n", + " dataset.\n", + " * \"many_to_one\" or \"m:1\": check if merge keys are unique in right\n", + " dataset.\n", + " * \"many_to_many\" or \"m:m\": allowed, but does not result in checks.\n", + "\n", + "Returns\n", + "-------\n", + "DataFrame\n", + " A DataFrame of the two merged objects.\n", + "\n", + "See Also\n", + "--------\n", + "merge_ordered : Merge with optional filling/interpolation.\n", + "merge_asof : Merge on nearest keys.\n", + "DataFrame.join : Similar method using indices.\n", + "\n", + "Examples\n", + "--------\n", + ">>> df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],\n", + "... 'value': [1, 2, 3, 5]})\n", + ">>> df2 = pd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'],\n", + "... 'value': [5, 6, 7, 8]})\n", + ">>> df1\n", + " lkey value\n", + "0 foo 1\n", + "1 bar 2\n", + "2 baz 3\n", + "3 foo 5\n", + ">>> df2\n", + " rkey value\n", + "0 foo 5\n", + "1 bar 6\n", + "2 baz 7\n", + "3 foo 8\n", + "\n", + "Merge df1 and df2 on the lkey and rkey columns. The value columns have\n", + "the default suffixes, _x and _y, appended.\n", + "\n", + ">>> df1.merge(df2, left_on='lkey', right_on='rkey')\n", + " lkey value_x rkey value_y\n", + "0 foo 1 foo 5\n", + "1 foo 1 foo 8\n", + "2 bar 2 bar 6\n", + "3 baz 3 baz 7\n", + "4 foo 5 foo 5\n", + "5 foo 5 foo 8\n", + "\n", + "Merge DataFrames df1 and df2 with specified left and right suffixes\n", + "appended to any overlapping columns.\n", + "\n", + ">>> df1.merge(df2, left_on='lkey', right_on='rkey',\n", + "... suffixes=('_left', '_right'))\n", + " lkey value_left rkey value_right\n", + "0 foo 1 foo 5\n", + "1 foo 1 foo 8\n", + "2 bar 2 bar 6\n", + "3 baz 3 baz 7\n", + "4 foo 5 foo 5\n", + "5 foo 5 foo 8\n", + "\n", + "Merge DataFrames df1 and df2, but raise an exception if the DataFrames have\n", + "any overlapping columns.\n", + "\n", + ">>> df1.merge(df2, left_on='lkey', right_on='rkey', suffixes=(False, False))\n", + "Traceback (most recent call last):\n", + "...\n", + "ValueError: columns overlap but no suffix specified:\n", + " Index(['value'], dtype='object')\n", + "\n", + ">>> df1 = pd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]})\n", + ">>> df2 = pd.DataFrame({'a': ['foo', 'baz'], 'c': [3, 4]})\n", + ">>> df1\n", + " a b\n", + "0 foo 1\n", + "1 bar 2\n", + ">>> df2\n", + " a c\n", + "0 foo 3\n", + "1 baz 4\n", + "\n", + ">>> df1.merge(df2, how='inner', on='a')\n", + " a b c\n", + "0 foo 1 3\n", + "\n", + ">>> df1.merge(df2, how='left', on='a')\n", + " a b c\n", + "0 foo 1 3.0\n", + "1 bar 2 NaN\n", + "\n", + ">>> df1 = pd.DataFrame({'left': ['foo', 'bar']})\n", + ">>> df2 = pd.DataFrame({'right': [7, 8]})\n", + ">>> df1\n", + " left\n", + "0 foo\n", + "1 bar\n", + ">>> df2\n", + " right\n", + "0 7\n", + "1 8\n", + "\n", + ">>> df1.merge(df2, how='cross')\n", + " left right\n", + "0 foo 7\n", + "1 foo 8\n", + "2 bar 7\n", + "3 bar 8\n", + "\u001b[0;31mFile:\u001b[0m /usr/lib64/python3.13/site-packages/pandas/core/reshape/merge.py\n", + "\u001b[0;31mType:\u001b[0m function" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "pd.merge?\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, "id": "35e19a53", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
patient-idlocation-idsexagesmokebmiwaistwthhtndiabhypercholfamhisthormop14toeventeventgroup
04364Male58Former33.531220.753086NoNoYesNoNo105.374401YesControl
111304Male77Current31.051190.730061YesYesNoNoNo106.097194NoControl
211314Female72Former30.861060.654321NoYesNoYesNo85.946612NoMedDiet + VOO
311324Male71Former27.681180.694118YesNoYesNoNo82.907598YesMedDiet + Nuts
411112Female79Never35.941290.806250YesNoYesNoNo94.761123NoMedDiet + VOO
......................................................
63191205Female66Never28.511040.645963YesNoYesYesNo83.550992NoControl
63201185Male80Never23.811090.589189YesYesYesYesNo82.743326NoControl
63213513Male57Former25.241000.571429YesNoYesNoNaN70.479124NoMedDiet + Nuts
63224995Female71Never32.04980.653333YesNoYesYesNo62.587269NoMedDiet + VOO
632312575Male58Former24.43930.547059YesYesYesNoNo92.590007NoMedDiet + Nuts
\n", + "

6324 rows × 17 columns

\n", + "
" + ], + "text/plain": [ + " patient-id location-id sex age smoke bmi waist wth \\\n", + "0 436 4 Male 58 Former 33.53 122 0.753086 \n", + "1 1130 4 Male 77 Current 31.05 119 0.730061 \n", + "2 1131 4 Female 72 Former 30.86 106 0.654321 \n", + "3 1132 4 Male 71 Former 27.68 118 0.694118 \n", + "4 1111 2 Female 79 Never 35.94 129 0.806250 \n", + "... ... ... ... ... ... ... ... ... \n", + "6319 120 5 Female 66 Never 28.51 104 0.645963 \n", + "6320 118 5 Male 80 Never 23.81 109 0.589189 \n", + "6321 351 3 Male 57 Former 25.24 100 0.571429 \n", + "6322 499 5 Female 71 Never 32.04 98 0.653333 \n", + "6323 1257 5 Male 58 Former 24.43 93 0.547059 \n", + "\n", + " htn diab hyperchol famhist hormo p14 toevent event group \n", + "0 No No Yes No No 10 5.374401 Yes Control \n", + "1 Yes Yes No No No 10 6.097194 No Control \n", + "2 No Yes No Yes No 8 5.946612 No MedDiet + VOO \n", + "3 Yes No Yes No No 8 2.907598 Yes MedDiet + Nuts \n", + "4 Yes No Yes No No 9 4.761123 No MedDiet + VOO \n", + "... ... ... ... ... ... ... ... ... ... \n", + "6319 Yes No Yes Yes No 8 3.550992 No Control \n", + "6320 Yes Yes Yes Yes No 8 2.743326 No Control \n", + "6321 Yes No Yes No NaN 7 0.479124 No MedDiet + Nuts \n", + "6322 Yes No Yes Yes No 6 2.587269 No MedDiet + VOO \n", + "6323 Yes Yes Yes No No 9 2.590007 No MedDiet + Nuts \n", + "\n", + "[6324 rows x 17 columns]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "## your code here\n", "\n", - "\n" + "\n", + "df_merged = df.merge(info, how = \"left\", on = identifier)\n", + "df_merged" ] }, { @@ -388,7 +955,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 14, "id": "36ce0688-d421-4a07-b00e-0e9b3201f0e0", "metadata": {}, "outputs": [ @@ -456,7 +1023,7 @@ "4 5 Malaga" ] }, - "execution_count": 8, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -469,12 +1036,342 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 16, "id": "b636dde4-129a-4dd1-8cbf-c539c9c8a5f2", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
patient-idlocation-idsexagesmokebmiwaistwthhtndiabhypercholfamhisthormop14toeventeventgroupCity
04364Male58Former33.531220.753086NoNoYesNoNo105.374401YesControlBilbao
111304Male77Current31.051190.730061YesYesNoNoNo106.097194NoControlBilbao
211314Female72Former30.861060.654321NoYesNoYesNo85.946612NoMedDiet + VOOBilbao
311324Male71Former27.681180.694118YesNoYesNoNo82.907598YesMedDiet + NutsBilbao
411112Female79Never35.941290.806250YesNoYesNoNo94.761123NoMedDiet + VOOValencia
.........................................................
63191205Female66Never28.511040.645963YesNoYesYesNo83.550992NoControlMalaga
63201185Male80Never23.811090.589189YesYesYesYesNo82.743326NoControlMalaga
63213513Male57Former25.241000.571429YesNoYesNoNaN70.479124NoMedDiet + NutsBarcelona
63224995Female71Never32.04980.653333YesNoYesYesNo62.587269NoMedDiet + VOOMalaga
632312575Male58Former24.43930.547059YesYesYesNoNo92.590007NoMedDiet + NutsMalaga
\n", + "

6324 rows × 18 columns

\n", + "
" + ], + "text/plain": [ + " patient-id location-id sex age smoke bmi waist wth \\\n", + "0 436 4 Male 58 Former 33.53 122 0.753086 \n", + "1 1130 4 Male 77 Current 31.05 119 0.730061 \n", + "2 1131 4 Female 72 Former 30.86 106 0.654321 \n", + "3 1132 4 Male 71 Former 27.68 118 0.694118 \n", + "4 1111 2 Female 79 Never 35.94 129 0.806250 \n", + "... ... ... ... ... ... ... ... ... \n", + "6319 120 5 Female 66 Never 28.51 104 0.645963 \n", + "6320 118 5 Male 80 Never 23.81 109 0.589189 \n", + "6321 351 3 Male 57 Former 25.24 100 0.571429 \n", + "6322 499 5 Female 71 Never 32.04 98 0.653333 \n", + "6323 1257 5 Male 58 Former 24.43 93 0.547059 \n", + "\n", + " htn diab hyperchol famhist hormo p14 toevent event group \\\n", + "0 No No Yes No No 10 5.374401 Yes Control \n", + "1 Yes Yes No No No 10 6.097194 No Control \n", + "2 No Yes No Yes No 8 5.946612 No MedDiet + VOO \n", + "3 Yes No Yes No No 8 2.907598 Yes MedDiet + Nuts \n", + "4 Yes No Yes No No 9 4.761123 No MedDiet + VOO \n", + "... ... ... ... ... ... ... ... ... ... \n", + "6319 Yes No Yes Yes No 8 3.550992 No Control \n", + "6320 Yes Yes Yes Yes No 8 2.743326 No Control \n", + "6321 Yes No Yes No NaN 7 0.479124 No MedDiet + Nuts \n", + "6322 Yes No Yes Yes No 6 2.587269 No MedDiet + VOO \n", + "6323 Yes Yes Yes No No 9 2.590007 No MedDiet + Nuts \n", + "\n", + " City \n", + "0 Bilbao \n", + "1 Bilbao \n", + "2 Bilbao \n", + "3 Bilbao \n", + "4 Valencia \n", + "... ... \n", + "6319 Malaga \n", + "6320 Malaga \n", + "6321 Barcelona \n", + "6322 Malaga \n", + "6323 Malaga \n", + "\n", + "[6324 rows x 18 columns]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "## your code here:\n" + "## your code here:\n", + "\n", + "df_location = df_merged.merge(locations, on = \"location-id\")\n", + "df_location\n", + "\n" ] }, { @@ -492,7 +1389,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 17, "id": "d1d4cc27", "metadata": {}, "outputs": [], @@ -502,7 +1399,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 18, "id": "fbebbd97", "metadata": {}, "outputs": [ @@ -512,7 +1409,7 @@ "(42, 2)" ] }, - "execution_count": 11, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -523,7 +1420,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 19, "id": "8a3c7943", "metadata": {}, "outputs": [ @@ -591,7 +1488,7 @@ "4 4 541" ] }, - "execution_count": 12, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -602,12 +1499,343 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 31, "id": "573687e7", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
patient-idlocation-idsexagesmokebmiwaistwthhtndiabhypercholfamhisthormop14toeventeventgroupCity
011Female77Never25.92940.657343YesNoYesYesNo95.538672NoMedDiet + VOOMadrid
121Female68Never34.851500.949367YesNoYesYesNaN103.063655NoMedDiet + NutsMadrid
231Female66Never37.501200.750000YesYesNoNoNo65.590691NoMedDiet + NutsMadrid
341Female77Never29.26930.628378YesYesNoNoNo65.456537NoMedDiet + VOOMadrid
451Female60Never30.021040.662420YesNoYesNoNo92.746064NoControlMadrid
.........................................................
631912535Male79Never25.281050.640244YesNoYesNoNo85.828884NoMedDiet + VOOMalaga
632012545Male62Former27.101040.594286YesNoYesYesNo95.067762NoMedDiet + NutsMalaga
632112555Female65Never35.021030.686667YesNoYesNoNo101.993155NoMedDiet + VOOMalaga
632212565Male61Never28.42940.576687YesYesNoNoNo92.039699NoMedDiet + NutsMalaga
632312575Male58Former24.43930.547059YesYesYesNoNo92.590007NoMedDiet + NutsMalaga
\n", + "

6282 rows × 18 columns

\n", + "
" + ], + "text/plain": [ + " patient-id location-id sex age smoke bmi waist wth \\\n", + "0 1 1 Female 77 Never 25.92 94 0.657343 \n", + "1 2 1 Female 68 Never 34.85 150 0.949367 \n", + "2 3 1 Female 66 Never 37.50 120 0.750000 \n", + "3 4 1 Female 77 Never 29.26 93 0.628378 \n", + "4 5 1 Female 60 Never 30.02 104 0.662420 \n", + "... ... ... ... ... ... ... ... ... \n", + "6319 1253 5 Male 79 Never 25.28 105 0.640244 \n", + "6320 1254 5 Male 62 Former 27.10 104 0.594286 \n", + "6321 1255 5 Female 65 Never 35.02 103 0.686667 \n", + "6322 1256 5 Male 61 Never 28.42 94 0.576687 \n", + "6323 1257 5 Male 58 Former 24.43 93 0.547059 \n", + "\n", + " htn diab hyperchol famhist hormo p14 toevent event group \\\n", + "0 Yes No Yes Yes No 9 5.538672 No MedDiet + VOO \n", + "1 Yes No Yes Yes NaN 10 3.063655 No MedDiet + Nuts \n", + "2 Yes Yes No No No 6 5.590691 No MedDiet + Nuts \n", + "3 Yes Yes No No No 6 5.456537 No MedDiet + VOO \n", + "4 Yes No Yes No No 9 2.746064 No Control \n", + "... ... ... ... ... ... ... ... ... ... \n", + "6319 Yes No Yes No No 8 5.828884 No MedDiet + VOO \n", + "6320 Yes No Yes Yes No 9 5.067762 No MedDiet + Nuts \n", + "6321 Yes No Yes No No 10 1.993155 No MedDiet + VOO \n", + "6322 Yes Yes No No No 9 2.039699 No MedDiet + Nuts \n", + "6323 Yes Yes Yes No No 9 2.590007 No MedDiet + Nuts \n", + "\n", + " City \n", + "0 Madrid \n", + "1 Madrid \n", + "2 Madrid \n", + "3 Madrid \n", + "4 Madrid \n", + "... ... \n", + "6319 Malaga \n", + "6320 Malaga \n", + "6321 Malaga \n", + "6322 Malaga \n", + "6323 Malaga \n", + "\n", + "[6282 rows x 18 columns]" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# your code here\n" + "# your code here\n", + "\n", + "df_removed = df_location.merge(dropped, how = \"outer\", on = identifier, indicator = True)\n", + "\n", + "df_removed = df_removed.loc[df_removed[\"_merge\"] == \"left_only\",].drop([\"_merge\"], axis = 1)\n", + "df_removed" ] }, { @@ -622,14 +1850,16 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 32, "id": "85902eea", "metadata": {}, "outputs": [], "source": [ "fname = 'processed_data_predimed.csv'\n", "\n", - "# your code here\n" + "# your code here\n", + "\n", + "df_removed.to_csv(fname)\n" ] } ], @@ -649,7 +1879,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.5" + "version": "3.13.6" } }, "nbformat": 4,