2024-heraklion-data/notebooks/030_tabular_data/.ipynb_checkpoints/010_pandas_introduction-checkpoint.ipynb

1363 lines
30 KiB
Plaintext
Raw Normal View History

2024-08-27 14:27:53 +02:00
{
"cells": [
{
"cell_type": "markdown",
"id": "8cc1c960",
"metadata": {},
"source": [
"# Pandas, quick introduction"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "0f55dab1",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "markdown",
"id": "4b377c42",
"metadata": {},
"source": [
"# Pandas introduces a tabular data structure, the DataFrame\n",
"\n",
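"* Each column has a single data type (dtype): a C-native type as in NumPy (e.g. `int64`, `float64`), or `object` for generic Python values such as strings\n",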
"* Columns and rows have indices, i.e. labels that identify each column or row"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "ec75edbe",
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame(\n",
" data = [\n",
" ['Anthony', 28, 1.53], \n",
" ['Maria', 31, 1.76], \n",
" ['Emma', 26, 1.83], \n",
" ['Philip', 41, 1.81], \n",
" ['Bill', 27, None],\n",
" ],\n",
" columns = ['name', 'age', 'height'],\n",
" index=['A484', 'C012', 'A123', 'B663', 'A377'],\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "37318480",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>age</th>\n",
" <th>height</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>A484</th>\n",
" <td>Anthony</td>\n",
" <td>28</td>\n",
" <td>1.53</td>\n",
" </tr>\n",
" <tr>\n",
" <th>C012</th>\n",
" <td>Maria</td>\n",
" <td>31</td>\n",
" <td>1.76</td>\n",
" </tr>\n",
" <tr>\n",
" <th>A123</th>\n",
" <td>Emma</td>\n",
" <td>26</td>\n",
" <td>1.83</td>\n",
" </tr>\n",
" <tr>\n",
" <th>B663</th>\n",
" <td>Philip</td>\n",
" <td>41</td>\n",
" <td>1.81</td>\n",
" </tr>\n",
" <tr>\n",
" <th>A377</th>\n",
" <td>Bill</td>\n",
" <td>27</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name age height\n",
"A484 Anthony 28 1.53\n",
"C012 Maria 31 1.76\n",
"A123 Emma 26 1.83\n",
"B663 Philip 41 1.81\n",
"A377 Bill 27 NaN"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "b97a9336",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>age</th>\n",
" <th>height</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>A484</th>\n",
" <td>Anthony</td>\n",
" <td>28</td>\n",
" <td>1.53</td>\n",
" </tr>\n",
" <tr>\n",
" <th>C012</th>\n",
" <td>Maria</td>\n",
" <td>31</td>\n",
" <td>1.76</td>\n",
" </tr>\n",
" <tr>\n",
" <th>A123</th>\n",
" <td>Emma</td>\n",
" <td>26</td>\n",
" <td>1.83</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name age height\n",
"A484 Anthony 28 1.53\n",
"C012 Maria 31 1.76\n",
"A123 Emma 26 1.83"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head(3)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "d3c5fea6",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>age</th>\n",
" <th>height</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>A377</th>\n",
" <td>Bill</td>\n",
" <td>27</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>C012</th>\n",
" <td>Maria</td>\n",
" <td>31</td>\n",
" <td>1.76</td>\n",
" </tr>\n",
" <tr>\n",
" <th>A484</th>\n",
" <td>Anthony</td>\n",
" <td>28</td>\n",
" <td>1.53</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name age height\n",
"A377 Bill 27 NaN\n",
"C012 Maria 31 1.76\n",
"A484 Anthony 28 1.53"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.sample(3)"
]
},
{
"cell_type": "markdown",
"id": "e31f21c6",
"metadata": {},
"source": [
"## DataFrame attributes"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "41c213bf",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(5, 3)"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.shape"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "8921c1c6",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"name object\n",
"age int64\n",
"height float64\n",
"dtype: object"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Each column can be a different dtype\n",
"# Numeric dtypes are native data types, as in NumPy; strings are stored as `object`\n",
"df.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "84451023",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['name', 'age', 'height'], dtype='object')"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.columns"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "223462e3",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['A484', 'C012', 'A123', 'B663', 'A377'], dtype='object')"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.index"
]
},
{
"cell_type": "markdown",
"id": "cb2f33b9",
"metadata": {},
"source": [
"## Indexing rows and columns"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "e3420312",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"A484 28\n",
"C012 31\n",
"A123 26\n",
"B663 41\n",
"A377 27\n",
"Name: age, dtype: int64"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Default indexing is by column\n",
"df['age']"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "58b29585",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>age</th>\n",
" <th>name</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>A484</th>\n",
" <td>28</td>\n",
" <td>Anthony</td>\n",
" </tr>\n",
" <tr>\n",
" <th>C012</th>\n",
" <td>31</td>\n",
" <td>Maria</td>\n",
" </tr>\n",
" <tr>\n",
" <th>A123</th>\n",
" <td>26</td>\n",
" <td>Emma</td>\n",
" </tr>\n",
" <tr>\n",
" <th>B663</th>\n",
" <td>41</td>\n",
" <td>Philip</td>\n",
" </tr>\n",
" <tr>\n",
" <th>A377</th>\n",
" <td>27</td>\n",
" <td>Bill</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" age name\n",
"A484 28 Anthony\n",
"C012 31 Maria\n",
"A123 26 Emma\n",
"B663 41 Philip\n",
"A377 27 Bill"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Use a list to select multiple columns (like in NumPy's fancy indexing)\n",
"df[['age', 'name']]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "6458bc59",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1.53"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Indexing by row / column name\n",
"df.loc['A484', 'height']"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "41496582",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1.53"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Indexing by element position like in NumPy (it's a bit of a smell)\n",
"df.iloc[0, 2]"
]
},
{
"cell_type": "markdown",
"id": "43ab5233",
"metadata": {},
"source": [
"## Examining a column"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "a0929b25",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['Anthony', 'Maria', 'Emma', 'Philip', 'Bill'], dtype=object)"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['name'].unique()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "b6087787",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"5"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['name'].nunique()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "bdd587c7",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 4.000000\n",
"mean 1.732500\n",
"std 0.138173\n",
"min 1.530000\n",
"25% 1.702500\n",
"50% 1.785000\n",
"75% 1.815000\n",
"max 1.830000\n",
"Name: height, dtype: float64"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['height'].describe()"
]
},
{
"cell_type": "markdown",
"id": "fc081b90",
"metadata": {},
"source": [
"# Filtering"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "7d294f17",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>age</th>\n",
" <th>height</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>C012</th>\n",
" <td>Maria</td>\n",
" <td>31</td>\n",
" <td>1.76</td>\n",
" </tr>\n",
" <tr>\n",
" <th>B663</th>\n",
" <td>Philip</td>\n",
" <td>41</td>\n",
" <td>1.81</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name age height\n",
"C012 Maria 31 1.76\n",
"B663 Philip 41 1.81"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df['age'] > 30]"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "85604657",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>age</th>\n",
" <th>height</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>B663</th>\n",
" <td>Philip</td>\n",
" <td>41</td>\n",
" <td>1.81</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name age height\n",
"B663 Philip 41 1.81"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"is_old_and_tall = (df['age'] > 30) & (df['height'] > 1.8)\n",
"df[is_old_and_tall]"
]
},
{
"cell_type": "markdown",
"id": "a570023a",
"metadata": {},
"source": [
"# Basic operations are by column (unlike NumPy)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "6eb50844",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"26"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['age'].min()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "a95e0e90",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"name Anthony\n",
"age 26\n",
"height 1.53\n",
"dtype: object"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.min()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "d5c3f2f4",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_80457/1061404192.py:2: FutureWarning: The default value of numeric_only in DataFrame.mean is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning.\n",
" df.mean()\n"
]
},
{
"data": {
"text/plain": [
"age 30.6000\n",
"height 1.7325\n",
"dtype: float64"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Note that Pandas operations ignore NaNs (they consider them as \"missing\")\n",
"df.mean()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "fdb4e73e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"age 30.6000\n",
"height 1.7325\n",
"dtype: float64"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.mean(numeric_only=True)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "efc0dcc9",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>age</th>\n",
" <th>height</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>A484</th>\n",
" <td>Anthony</td>\n",
" <td>28</td>\n",
" <td>1.53</td>\n",
" </tr>\n",
" <tr>\n",
" <th>A377</th>\n",
" <td>Bill</td>\n",
" <td>27</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>A123</th>\n",
" <td>Emma</td>\n",
" <td>26</td>\n",
" <td>1.83</td>\n",
" </tr>\n",
" <tr>\n",
" <th>C012</th>\n",
" <td>Maria</td>\n",
" <td>31</td>\n",
" <td>1.76</td>\n",
" </tr>\n",
" <tr>\n",
" <th>B663</th>\n",
" <td>Philip</td>\n",
" <td>41</td>\n",
" <td>1.81</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name age height\n",
"A484 Anthony 28 1.53\n",
"A377 Bill 27 NaN\n",
"A123 Emma 26 1.83\n",
"C012 Maria 31 1.76\n",
"B663 Philip 41 1.81"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Operations that change the order of the rows keep the index and column labels intact\n",
"df.sort_values('name', axis=0)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "2ba681da",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>age</th>\n",
" <th>height</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>A484</th>\n",
" <td>Anthony</td>\n",
" <td>28</td>\n",
" <td>1.53</td>\n",
" </tr>\n",
" <tr>\n",
" <th>C012</th>\n",
" <td>Maria</td>\n",
" <td>31</td>\n",
" <td>1.76</td>\n",
" </tr>\n",
" <tr>\n",
" <th>A123</th>\n",
" <td>Emma</td>\n",
" <td>26</td>\n",
" <td>1.83</td>\n",
" </tr>\n",
" <tr>\n",
" <th>B663</th>\n",
" <td>Philip</td>\n",
" <td>41</td>\n",
" <td>1.81</td>\n",
" </tr>\n",
" <tr>\n",
" <th>A377</th>\n",
" <td>Bill</td>\n",
" <td>27</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name age height\n",
"A484 Anthony 28 1.53\n",
"C012 Maria 31 1.76\n",
"A123 Emma 26 1.83\n",
"B663 Philip 41 1.81\n",
"A377 Bill 27 NaN"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "markdown",
"id": "7cf9b5d7",
"metadata": {},
"source": [
"# Operations on strings"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "c76ca899",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"A484 t\n",
"C012 r\n",
"A123 m\n",
"B663 i\n",
"A377 l\n",
"Name: name, dtype: object"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Use `.str` to access string operations\n",
"# Third character of each name\n",
"df['name'].str[2]"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "c9d8494d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"A484 ANTHONY\n",
"C012 MARIA\n",
"A123 EMMA\n",
"B663 PHILIP\n",
"A377 BILL\n",
"Name: name, dtype: object"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Upper-case version of each name\n",
"df['name'].str.upper()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "5767c6aa",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"A484 0\n",
"C012 2\n",
"A123 1\n",
"B663 0\n",
"A377 0\n",
"Name: name, dtype: int64"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['name'].str.count('a')"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "a98f79da",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"A484 1\n",
"C012 2\n",
"A123 1\n",
"B663 0\n",
"A377 0\n",
"Name: name, dtype: int64"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['name'].str.lower().str.count('a')"
]
},
{
"cell_type": "markdown",
"id": "b2d162d2",
"metadata": {},
"source": [
"# Adding new columns"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "5cdbb3cd",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>age</th>\n",
" <th>height</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>A484</th>\n",
" <td>Anthony</td>\n",
" <td>28</td>\n",
" <td>1.53</td>\n",
" </tr>\n",
" <tr>\n",
" <th>C012</th>\n",
" <td>Maria</td>\n",
" <td>31</td>\n",
" <td>1.76</td>\n",
" </tr>\n",
" <tr>\n",
" <th>A123</th>\n",
" <td>Emma</td>\n",
" <td>26</td>\n",
" <td>1.83</td>\n",
" </tr>\n",
" <tr>\n",
" <th>B663</th>\n",
" <td>Philip</td>\n",
" <td>41</td>\n",
" <td>1.81</td>\n",
" </tr>\n",
" <tr>\n",
" <th>A377</th>\n",
" <td>Bill</td>\n",
" <td>27</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name age height\n",
"A484 Anthony 28 1.53\n",
"C012 Maria 31 1.76\n",
"A123 Emma 26 1.83\n",
"B663 Philip 41 1.81\n",
"A377 Bill 27 NaN"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "0e97e98b",
"metadata": {},
"outputs": [],
"source": [
"df['name_upper'] = df['name'].str.upper()"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "4f35c1df",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>age</th>\n",
" <th>height</th>\n",
" <th>name_upper</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>A484</th>\n",
" <td>Anthony</td>\n",
" <td>28</td>\n",
" <td>1.53</td>\n",
" <td>ANTHONY</td>\n",
" </tr>\n",
" <tr>\n",
" <th>C012</th>\n",
" <td>Maria</td>\n",
" <td>31</td>\n",
" <td>1.76</td>\n",
" <td>MARIA</td>\n",
" </tr>\n",
" <tr>\n",
" <th>A123</th>\n",
" <td>Emma</td>\n",
" <td>26</td>\n",
" <td>1.83</td>\n",
" <td>EMMA</td>\n",
" </tr>\n",
" <tr>\n",
" <th>B663</th>\n",
" <td>Philip</td>\n",
" <td>41</td>\n",
" <td>1.81</td>\n",
" <td>PHILIP</td>\n",
" </tr>\n",
" <tr>\n",
" <th>A377</th>\n",
" <td>Bill</td>\n",
" <td>27</td>\n",
" <td>NaN</td>\n",
" <td>BILL</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name age height name_upper\n",
"A484 Anthony 28 1.53 ANTHONY\n",
"C012 Maria 31 1.76 MARIA\n",
"A123 Emma 26 1.83 EMMA\n",
"B663 Philip 41 1.81 PHILIP\n",
"A377 Bill 27 NaN BILL"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2e354ace",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}