2024-heraklion-data/notebooks/030_tabular_data/.ipynb_checkpoints/010_pandas_introduction-checkpoint.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "8cc1c960",
   "metadata": {},
   "source": [
    "# Pandas, quick introduction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "0f55dab1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4b377c42",
   "metadata": {},
   "source": [
    "# Pandas introduces a tabular data structure, the DataFrame\n",
    "\n",
    "* Columns can be of any C-native type\n",
    "* Columns and rows have indices, i.e. labels that identify each column or row"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "ec75edbe",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.DataFrame(\n",
    "    data = [\n",
    "        ['Anthony', 28, 1.53], \n",
    "        ['Maria', 31, 1.76], \n",
    "        ['Emma', 26, 1.83], \n",
    "        ['Philip', 41, 1.81], \n",
    "        ['Bill', 27, None],\n",
    "    ],\n",
    "    columns = ['name', 'age', 'height'],\n",
    "    index=['A484', 'C012', 'A123', 'B663', 'A377'],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "37318480",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "      <th>age</th>\n",
       "      <th>height</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>A484</th>\n",
       "      <td>Anthony</td>\n",
       "      <td>28</td>\n",
       "      <td>1.53</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C012</th>\n",
       "      <td>Maria</td>\n",
       "      <td>31</td>\n",
       "      <td>1.76</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>A123</th>\n",
       "      <td>Emma</td>\n",
       "      <td>26</td>\n",
       "      <td>1.83</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>B663</th>\n",
       "      <td>Philip</td>\n",
       "      <td>41</td>\n",
       "      <td>1.81</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>A377</th>\n",
       "      <td>Bill</td>\n",
       "      <td>27</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         name  age  height\n",
       "A484  Anthony   28    1.53\n",
       "C012    Maria   31    1.76\n",
       "A123     Emma   26    1.83\n",
       "B663   Philip   41    1.81\n",
       "A377     Bill   27     NaN"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "b97a9336",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "      <th>age</th>\n",
       "      <th>height</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>A484</th>\n",
       "      <td>Anthony</td>\n",
       "      <td>28</td>\n",
       "      <td>1.53</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C012</th>\n",
       "      <td>Maria</td>\n",
       "      <td>31</td>\n",
       "      <td>1.76</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>A123</th>\n",
       "      <td>Emma</td>\n",
       "      <td>26</td>\n",
       "      <td>1.83</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         name  age  height\n",
       "A484  Anthony   28    1.53\n",
       "C012    Maria   31    1.76\n",
       "A123     Emma   26    1.83"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "d3c5fea6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "      <th>age</th>\n",
       "      <th>height</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>A377</th>\n",
       "      <td>Bill</td>\n",
       "      <td>27</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C012</th>\n",
       "      <td>Maria</td>\n",
       "      <td>31</td>\n",
       "      <td>1.76</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>A484</th>\n",
       "      <td>Anthony</td>\n",
       "      <td>28</td>\n",
       "      <td>1.53</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         name  age  height\n",
       "A377     Bill   27     NaN\n",
       "C012    Maria   31    1.76\n",
       "A484  Anthony   28    1.53"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.sample(3)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e31f21c6",
   "metadata": {},
   "source": [
    "## DataFrame attributes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "41c213bf",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(5, 3)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "8921c1c6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "name       object\n",
       "age         int64\n",
       "height    float64\n",
       "dtype: object"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Each column can be a different dtype\n",
    "# All dtypes are native data types, as in NumPy\n",
    "df.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "84451023",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['name', 'age', 'height'], dtype='object')"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "223462e3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['A484', 'C012', 'A123', 'B663', 'A377'], dtype='object')"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.index"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cb2f33b9",
   "metadata": {},
   "source": [
    "## Indexing rows and columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "e3420312",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "A484    28\n",
       "C012    31\n",
       "A123    26\n",
       "B663    41\n",
       "A377    27\n",
       "Name: age, dtype: int64"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Default indexing is by column\n",
    "df['age']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "58b29585",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>A484</th>\n",
       "      <td>28</td>\n",
       "      <td>Anthony</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C012</th>\n",
       "      <td>31</td>\n",
       "      <td>Maria</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>A123</th>\n",
       "      <td>26</td>\n",
       "      <td>Emma</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>B663</th>\n",
       "      <td>41</td>\n",
       "      <td>Philip</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>A377</th>\n",
       "      <td>27</td>\n",
       "      <td>Bill</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      age     name\n",
       "A484   28  Anthony\n",
       "C012   31    Maria\n",
       "A123   26     Emma\n",
       "B663   41   Philip\n",
       "A377   27     Bill"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Use a list to select multiple columns (like in NumPy's fancy indexing)\n",
    "df[['age', 'name']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "6458bc59",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.53"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Indexing by row / column name\n",
    "df.loc['A484', 'height']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "41496582",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.53"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Indexing by element position like in NumPy (it's a bit of a smell)\n",
    "df.iloc[0, 2]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "43ab5233",
   "metadata": {},
   "source": [
    "## Examining a column"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "a0929b25",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['Anthony', 'Maria', 'Emma', 'Philip', 'Bill'], dtype=object)"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['name'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "b6087787",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['name'].nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "bdd587c7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "count    4.000000\n",
       "mean     1.732500\n",
       "std      0.138173\n",
       "min      1.530000\n",
       "25%      1.702500\n",
       "50%      1.785000\n",
       "75%      1.815000\n",
       "max      1.830000\n",
       "Name: height, dtype: float64"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['height'].describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fc081b90",
   "metadata": {},
   "source": [
    "# Filtering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "7d294f17",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "      <th>age</th>\n",
       "      <th>height</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>C012</th>\n",
       "      <td>Maria</td>\n",
       "      <td>31</td>\n",
       "      <td>1.76</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>B663</th>\n",
       "      <td>Philip</td>\n",
       "      <td>41</td>\n",
       "      <td>1.81</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        name  age  height\n",
       "C012   Maria   31    1.76\n",
       "B663  Philip   41    1.81"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[df['age'] > 30]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "85604657",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "      <th>age</th>\n",
       "      <th>height</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>B663</th>\n",
       "      <td>Philip</td>\n",
       "      <td>41</td>\n",
       "      <td>1.81</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        name  age  height\n",
       "B663  Philip   41    1.81"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "is_old_and_tall = (df['age'] > 30) & (df['height'] > 1.8)\n",
    "df[is_old_and_tall]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a570023a",
   "metadata": {},
   "source": [
    "# Basic operations are by column (unlike NumPy)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "6eb50844",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "26"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['age'].min()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "a95e0e90",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "name      Anthony\n",
       "age            26\n",
       "height       1.53\n",
       "dtype: object"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.min()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "d5c3f2f4",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_80457/1061404192.py:2: FutureWarning: The default value of numeric_only in DataFrame.mean is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning.\n",
      "  df.mean()\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "age       30.6000\n",
       "height     1.7325\n",
       "dtype: float64"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Note that Pandas operations ignore NaNs (they consider them as \"missing\")\n",
    "df.mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "fdb4e73e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "age       30.6000\n",
       "height     1.7325\n",
       "dtype: float64"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.mean(numeric_only=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "efc0dcc9",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "      <th>age</th>\n",
       "      <th>height</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>A484</th>\n",
       "      <td>Anthony</td>\n",
       "      <td>28</td>\n",
       "      <td>1.53</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>A377</th>\n",
       "      <td>Bill</td>\n",
       "      <td>27</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>A123</th>\n",
       "      <td>Emma</td>\n",
       "      <td>26</td>\n",
       "      <td>1.83</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C012</th>\n",
       "      <td>Maria</td>\n",
       "      <td>31</td>\n",
       "      <td>1.76</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>B663</th>\n",
       "      <td>Philip</td>\n",
       "      <td>41</td>\n",
       "      <td>1.81</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         name  age  height\n",
       "A484  Anthony   28    1.53\n",
       "A377     Bill   27     NaN\n",
       "A123     Emma   26    1.83\n",
       "C012    Maria   31    1.76\n",
       "B663   Philip   41    1.81"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Operations that change the order of the rows keep the index and column labels intact\n",
    "df.sort_values('name', axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "2ba681da",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "      <th>age</th>\n",
       "      <th>height</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>A484</th>\n",
       "      <td>Anthony</td>\n",
       "      <td>28</td>\n",
       "      <td>1.53</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C012</th>\n",
       "      <td>Maria</td>\n",
       "      <td>31</td>\n",
       "      <td>1.76</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>A123</th>\n",
       "      <td>Emma</td>\n",
       "      <td>26</td>\n",
       "      <td>1.83</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>B663</th>\n",
       "      <td>Philip</td>\n",
       "      <td>41</td>\n",
       "      <td>1.81</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>A377</th>\n",
       "      <td>Bill</td>\n",
       "      <td>27</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         name  age  height\n",
       "A484  Anthony   28    1.53\n",
       "C012    Maria   31    1.76\n",
       "A123     Emma   26    1.83\n",
       "B663   Philip   41    1.81\n",
       "A377     Bill   27     NaN"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7cf9b5d7",
   "metadata": {},
   "source": [
    "# Operations on strings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "c76ca899",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "A484    t\n",
       "C012    r\n",
       "A123    m\n",
       "B663    i\n",
       "A377    l\n",
       "Name: name, dtype: object"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Use `.str` to access string operations\n",
    "# Third character of each name\n",
    "df['name'].str[2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "c9d8494d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "A484    ANTHONY\n",
       "C012      MARIA\n",
       "A123       EMMA\n",
       "B663     PHILIP\n",
       "A377       BILL\n",
       "Name: name, dtype: object"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Third character of each name\n",
    "df['name'].str.upper()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "5767c6aa",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "A484    0\n",
       "C012    2\n",
       "A123    1\n",
       "B663    0\n",
       "A377    0\n",
       "Name: name, dtype: int64"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['name'].str.count('a')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "a98f79da",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "A484    1\n",
       "C012    2\n",
       "A123    1\n",
       "B663    0\n",
       "A377    0\n",
       "Name: name, dtype: int64"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['name'].str.lower().str.count('a')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b2d162d2",
   "metadata": {},
   "source": [
    "# Adding new columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "5cdbb3cd",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "      <th>age</th>\n",
       "      <th>height</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>A484</th>\n",
       "      <td>Anthony</td>\n",
       "      <td>28</td>\n",
       "      <td>1.53</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C012</th>\n",
       "      <td>Maria</td>\n",
       "      <td>31</td>\n",
       "      <td>1.76</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>A123</th>\n",
       "      <td>Emma</td>\n",
       "      <td>26</td>\n",
       "      <td>1.83</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>B663</th>\n",
       "      <td>Philip</td>\n",
       "      <td>41</td>\n",
       "      <td>1.81</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>A377</th>\n",
       "      <td>Bill</td>\n",
       "      <td>27</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         name  age  height\n",
       "A484  Anthony   28    1.53\n",
       "C012    Maria   31    1.76\n",
       "A123     Emma   26    1.83\n",
       "B663   Philip   41    1.81\n",
       "A377     Bill   27     NaN"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "0e97e98b",
   "metadata": {},
   "outputs": [],
   "source": [
    "df['name_upper'] = df['name'].str.upper()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "4f35c1df",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "      <th>age</th>\n",
       "      <th>height</th>\n",
       "      <th>name_upper</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>A484</th>\n",
       "      <td>Anthony</td>\n",
       "      <td>28</td>\n",
       "      <td>1.53</td>\n",
       "      <td>ANTHONY</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C012</th>\n",
       "      <td>Maria</td>\n",
       "      <td>31</td>\n",
       "      <td>1.76</td>\n",
       "      <td>MARIA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>A123</th>\n",
       "      <td>Emma</td>\n",
       "      <td>26</td>\n",
       "      <td>1.83</td>\n",
       "      <td>EMMA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>B663</th>\n",
       "      <td>Philip</td>\n",
       "      <td>41</td>\n",
       "      <td>1.81</td>\n",
       "      <td>PHILIP</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>A377</th>\n",
       "      <td>Bill</td>\n",
       "      <td>27</td>\n",
       "      <td>NaN</td>\n",
       "      <td>BILL</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         name  age  height name_upper\n",
       "A484  Anthony   28    1.53    ANTHONY\n",
       "C012    Maria   31    1.76      MARIA\n",
       "A123     Emma   26    1.83       EMMA\n",
       "B663   Philip   41    1.81     PHILIP\n",
       "A377     Bill   27     NaN       BILL"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2e354ace",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}