{ "cells": [ { "cell_type": "markdown", "id": "8cc1c960", "metadata": {}, "source": [ "# Pandas, quick introduction" ] }, { "cell_type": "code", "execution_count": 1, "id": "0f55dab1", "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "markdown", "id": "4b377c42", "metadata": {}, "source": [ "# Pandas introduces a tabular data structure, the DataFrame\n", "\n", "* Columns can be of any C-native type\n", "* Columns and rows have indices, i.e. labels that identify each column or row" ] }, { "cell_type": "code", "execution_count": 2, "id": "ec75edbe", "metadata": {}, "outputs": [], "source": [ "df = pd.DataFrame(\n", " data = [\n", " ['Anthony', 28, 1.53], \n", " ['Maria', 31, 1.76], \n", " ['Emma', 26, 1.83], \n", " ['Philip', 41, 1.81], \n", " ['Bill', 27, None],\n", " ],\n", " columns = ['name', 'age', 'height'],\n", " index=['A484', 'C012', 'A123', 'B663', 'A377'],\n", ")" ] }, { "cell_type": "code", "execution_count": 3, "id": "37318480", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameageheight
A484Anthony281.53
C012Maria311.76
A123Emma261.83
B663Philip411.81
A377Bill27NaN
\n", "
" ], "text/plain": [ " name age height\n", "A484 Anthony 28 1.53\n", "C012 Maria 31 1.76\n", "A123 Emma 26 1.83\n", "B663 Philip 41 1.81\n", "A377 Bill 27 NaN" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 4, "id": "b97a9336", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameageheight
A484Anthony281.53
C012Maria311.76
A123Emma261.83
\n", "
" ], "text/plain": [ " name age height\n", "A484 Anthony 28 1.53\n", "C012 Maria 31 1.76\n", "A123 Emma 26 1.83" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head(3)" ] }, { "cell_type": "code", "execution_count": 5, "id": "d3c5fea6", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameageheight
A377Bill27NaN
C012Maria311.76
A484Anthony281.53
\n", "
" ], "text/plain": [ " name age height\n", "A377 Bill 27 NaN\n", "C012 Maria 31 1.76\n", "A484 Anthony 28 1.53" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.sample(3)" ] }, { "cell_type": "markdown", "id": "e31f21c6", "metadata": {}, "source": [ "## DataFrame attributes" ] }, { "cell_type": "code", "execution_count": 6, "id": "41c213bf", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(5, 3)" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.shape" ] }, { "cell_type": "code", "execution_count": 7, "id": "8921c1c6", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "name object\n", "age int64\n", "height float64\n", "dtype: object" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Each column can be a different dtype\n", "# All dtypes are native data types, as in NumPy\n", "df.dtypes" ] }, { "cell_type": "code", "execution_count": 8, "id": "84451023", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['name', 'age', 'height'], dtype='object')" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.columns" ] }, { "cell_type": "code", "execution_count": 9, "id": "223462e3", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['A484', 'C012', 'A123', 'B663', 'A377'], dtype='object')" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.index" ] }, { "cell_type": "markdown", "id": "cb2f33b9", "metadata": {}, "source": [ "## Indexing rows and columns" ] }, { "cell_type": "code", "execution_count": 10, "id": "e3420312", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "A484 28\n", "C012 31\n", "A123 26\n", "B663 41\n", "A377 27\n", "Name: age, dtype: int64" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Default indexing is by column\n", "df['age']" ] }, { "cell_type": "code", "execution_count": 11, "id": "58b29585", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
agename
A48428Anthony
C01231Maria
A12326Emma
B66341Philip
A37727Bill
\n", "
" ], "text/plain": [ " age name\n", "A484 28 Anthony\n", "C012 31 Maria\n", "A123 26 Emma\n", "B663 41 Philip\n", "A377 27 Bill" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Use a list to select multiple columns (like in NumPy's fancy indexing)\n", "df[['age', 'name']]" ] }, { "cell_type": "code", "execution_count": 12, "id": "6458bc59", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1.53" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Indexing by row / column name\n", "df.loc['A484', 'height']" ] }, { "cell_type": "code", "execution_count": 13, "id": "41496582", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1.53" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Indexing by element position like in NumPy (it's a bit of a smell)\n", "df.iloc[0, 2]" ] }, { "cell_type": "markdown", "id": "43ab5233", "metadata": {}, "source": [ "## Examining a column" ] }, { "cell_type": "code", "execution_count": 14, "id": "a0929b25", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['Anthony', 'Maria', 'Emma', 'Philip', 'Bill'], dtype=object)" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['name'].unique()" ] }, { "cell_type": "code", "execution_count": 15, "id": "b6087787", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "5" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['name'].nunique()" ] }, { "cell_type": "code", "execution_count": 16, "id": "bdd587c7", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 4.000000\n", "mean 1.732500\n", "std 0.138173\n", "min 1.530000\n", "25% 1.702500\n", "50% 1.785000\n", "75% 1.815000\n", "max 1.830000\n", "Name: height, dtype: float64" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['height'].describe()" ] }, { "cell_type": "markdown", "id": "fc081b90", "metadata": {}, "source": [ "# Filtering" ] }, { "cell_type": "code", "execution_count": 17, "id": "7d294f17", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameageheight
C012Maria311.76
B663Philip411.81
\n", "
" ], "text/plain": [ " name age height\n", "C012 Maria 31 1.76\n", "B663 Philip 41 1.81" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df['age'] > 30]" ] }, { "cell_type": "code", "execution_count": 18, "id": "85604657", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameageheight
B663Philip411.81
\n", "
" ], "text/plain": [ " name age height\n", "B663 Philip 41 1.81" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "is_old_and_tall = (df['age'] > 30) & (df['height'] > 1.8)\n", "df[is_old_and_tall]" ] }, { "cell_type": "markdown", "id": "a570023a", "metadata": {}, "source": [ "# Basic operations are by column (unlike NumPy)" ] }, { "cell_type": "code", "execution_count": 19, "id": "6eb50844", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "26" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['age'].min()" ] }, { "cell_type": "code", "execution_count": 20, "id": "a95e0e90", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "name Anthony\n", "age 26\n", "height 1.53\n", "dtype: object" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.min()" ] }, { "cell_type": "code", "execution_count": 21, "id": "d5c3f2f4", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_80457/1061404192.py:2: FutureWarning: The default value of numeric_only in DataFrame.mean is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning.\n", " df.mean()\n" ] }, { "data": { "text/plain": [ "age 30.6000\n", "height 1.7325\n", "dtype: float64" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Note that Pandas operations ignore NaNs (they consider them as \"missing\")\n", "df.mean()" ] }, { "cell_type": "code", "execution_count": 22, "id": "fdb4e73e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "age 30.6000\n", "height 1.7325\n", "dtype: float64" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.mean(numeric_only=True)" ] }, { "cell_type": "code", "execution_count": 23, "id": "efc0dcc9", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameageheight
A484Anthony281.53
A377Bill27NaN
A123Emma261.83
C012Maria311.76
B663Philip411.81
\n", "
" ], "text/plain": [ " name age height\n", "A484 Anthony 28 1.53\n", "A377 Bill 27 NaN\n", "A123 Emma 26 1.83\n", "C012 Maria 31 1.76\n", "B663 Philip 41 1.81" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Operations that change the order of the rows keep the index and column labels intact\n", "df.sort_values('name', axis=0)" ] }, { "cell_type": "code", "execution_count": 24, "id": "2ba681da", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameageheight
A484Anthony281.53
C012Maria311.76
A123Emma261.83
B663Philip411.81
A377Bill27NaN
\n", "
" ], "text/plain": [ " name age height\n", "A484 Anthony 28 1.53\n", "C012 Maria 31 1.76\n", "A123 Emma 26 1.83\n", "B663 Philip 41 1.81\n", "A377 Bill 27 NaN" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "id": "7cf9b5d7", "metadata": {}, "source": [ "# Operations on strings" ] }, { "cell_type": "code", "execution_count": 25, "id": "c76ca899", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "A484 t\n", "C012 r\n", "A123 m\n", "B663 i\n", "A377 l\n", "Name: name, dtype: object" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Use `.str` to access string operations\n", "# Third character of each name\n", "df['name'].str[2]" ] }, { "cell_type": "code", "execution_count": 26, "id": "c9d8494d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "A484 ANTHONY\n", "C012 MARIA\n", "A123 EMMA\n", "B663 PHILIP\n", "A377 BILL\n", "Name: name, dtype: object" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Third character of each name\n", "df['name'].str.upper()" ] }, { "cell_type": "code", "execution_count": 27, "id": "5767c6aa", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "A484 0\n", "C012 2\n", "A123 1\n", "B663 0\n", "A377 0\n", "Name: name, dtype: int64" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['name'].str.count('a')" ] }, { "cell_type": "code", "execution_count": 28, "id": "a98f79da", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "A484 1\n", "C012 2\n", "A123 1\n", "B663 0\n", "A377 0\n", "Name: name, dtype: int64" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['name'].str.lower().str.count('a')" ] }, { "cell_type": "markdown", "id": "b2d162d2", "metadata": {}, "source": [ "# Adding new columns" ] }, { "cell_type": "code", "execution_count": 29, "id": "5cdbb3cd", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameageheight
A484Anthony281.53
C012Maria311.76
A123Emma261.83
B663Philip411.81
A377Bill27NaN
\n", "
" ], "text/plain": [ " name age height\n", "A484 Anthony 28 1.53\n", "C012 Maria 31 1.76\n", "A123 Emma 26 1.83\n", "B663 Philip 41 1.81\n", "A377 Bill 27 NaN" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 30, "id": "0e97e98b", "metadata": {}, "outputs": [], "source": [ "df['name_upper'] = df['name'].str.upper()" ] }, { "cell_type": "code", "execution_count": 31, "id": "4f35c1df", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameageheightname_upper
A484Anthony281.53ANTHONY
C012Maria311.76MARIA
A123Emma261.83EMMA
B663Philip411.81PHILIP
A377Bill27NaNBILL
\n", "
" ], "text/plain": [ " name age height name_upper\n", "A484 Anthony 28 1.53 ANTHONY\n", "C012 Maria 31 1.76 MARIA\n", "A123 Emma 26 1.83 EMMA\n", "B663 Philip 41 1.81 PHILIP\n", "A377 Bill 27 NaN BILL" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": null, "id": "2e354ace", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.3" } }, "nbformat": 4, "nbformat_minor": 5 }