2024-heraklion-data/notebooks/.ipynb_checkpoints/notebook-1-sorting-examples-checkpoint.ipynb

387 lines
9.2 KiB
Plaintext
Raw Normal View History

2024-08-27 14:27:53 +02:00
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "8685ea3a",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [],
"source": [
"import numpy as np\n",
"import timeit\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "markdown",
"id": "048881d0",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"# Example: Find common words"
]
},
{
"cell_type": "markdown",
"id": "2464a282",
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
"source": [
"Problem: given two lists of words, extract all the words that are in common"
]
},
{
"cell_type": "markdown",
"id": "71740eab",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"# Implementation with 2x for-loops"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f175c775",
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
"outputs": [],
"source": [
"%%timeit\n",
"\n",
"# Brute force: for every word in words1, search the whole list words2.\n",
"scaling_factor = 1  # also try 10, 100\n",
"\n",
"words1 = ['apple', 'orange', 'banana', 'melon', 'peach'] * scaling_factor\n",
"words2 = ['orange', 'kiwi', 'avocado', 'apple', 'banana'] * scaling_factor\n",
"\n",
"common_for = []\n",
"for w in words1:\n",
"    # `w in words2` on a list is itself a scan over words2 (the second loop).\n",
"    if w in words2:\n",
"        common_for.append(w)\n",
"\n",
"# Measured: 612 ns, 12.3 us, 928 us for scaling_factor = 1, 10, 100"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "affab857",
"metadata": {
"slideshow": {
"slide_type": "subslide"
}
},
"outputs": [],
"source": [
"input_size = [1, 10, 100]\n",
"# Measured %%timeit results in seconds for input_size 1, 10, 100.\n",
"timings_for_loop = [612 / 10**9, 12.4 / 10**6, 928 / 10**6]\n",
"# Normalise by the smallest-input timing T: the values are multiples of T, not seconds.\n",
"results_for_loop = [t / timings_for_loop[0] for t in timings_for_loop]\n",
"\n",
"# Degree-2 polynomial through the three measurements, evaluated on a dense grid.\n",
"x = np.linspace(0, 100, 100)\n",
"fit1 = np.polyfit(input_size, results_for_loop, 2)\n",
"eval1 = np.polyval(fit1, x)\n",
"\n",
"plt.plot(x, eval1, c='orange')\n",
"plt.scatter(input_size, results_for_loop, c='orange', s=100, label='2 for loops')\n",
"\n",
"plt.xlabel('input size')\n",
"plt.ylabel('processing time')\n",
"# Build the tick labels from the plotted values so labels and positions cannot disagree.\n",
"plt.yticks(results_for_loop, ['T'] + [f'{int(r)}x T' for r in results_for_loop[1:]])\n",
"plt.legend()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2a61bf38",
"metadata": {
"slideshow": {
"slide_type": "skip"
}
},
"outputs": [],
"source": [
"# Summary of the nested-loop timings; factors are relative to the 612 ns baseline\n",
"# (12.4 us / 612 ns is about 20, 928 us / 612 ns is about 1500).\n",
"print('Data increase 1x, 10x, 100x')\n",
"print('Time increase 612 ns, 12.4 µs, 928 µs')\n",
"print('time1, ~ 20x time1, ~ 1500x time1')"
]
},
{
"cell_type": "markdown",
"id": "38e47397",
"metadata": {
"slideshow": {
"slide_type": "-"
}
},
"source": [
"What is the big-O complexity of this implementation? "
]
},
{
"cell_type": "markdown",
"id": "4118b38d",
"metadata": {
"slideshow": {
"slide_type": "skip"
}
},
"source": [
"n * n ~ O(n<sup>2</sup>)"
]
},
{
"cell_type": "markdown",
"id": "31cd0e74",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"# Implementation with sorted lists"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c13a24f4",
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
"outputs": [],
"source": [
"%%timeit\n",
"# Sort both lists, then walk through them in step to collect the shared words.\n",
"scaling_factor = 100  # also try 1, 10\n",
"words1 = ['apple', 'orange', 'banana', 'melon', 'peach'] * scaling_factor\n",
"words2 = ['orange', 'kiwi', 'avocado', 'apple', 'banana'] * scaling_factor\n",
"words1 = sorted(words1)\n",
"words2 = sorted(words2)\n",
"\n",
"common_sort_list = []\n",
"idx2 = 0  # cursor into words2; it is never reset, only advanced\n",
"for w in words1:\n",
"    # Skip every entry of words2 that sorts before the current word.\n",
"    while idx2 < len(words2) and words2[idx2] < w:\n",
"        idx2 += 1\n",
"    # words2 is exhausted, so none of the remaining words can match.\n",
"    if idx2 >= len(words2):\n",
"        break\n",
"    if words2[idx2] == w:\n",
"        common_sort_list.append(w)\n",
"\n",
"# Measured: 1.94 us, 17.3 us, 204 us for scaling_factor = 1, 10, 100"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f1e8fed2",
"metadata": {
"slideshow": {
"slide_type": "notes"
}
},
"outputs": [],
"source": [
"# Sorted-lists timings in seconds (microseconds = 10**-6 s):\n",
"# 1.9 * 10**-6\n",
"# 17.9 * 10**-6\n",
"# 205 * 10**-6"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8ce798ab",
"metadata": {
"slideshow": {
"slide_type": "subslide"
}
},
"outputs": [],
"source": [
"input_size = [1, 10, 100]\n",
"# Measured %%timeit results in seconds for input_size 1, 10, 100.\n",
"timings_sorted_lists = [1.9 / 10**6, 17.9 / 10**6, 205 / 10**6]\n",
"# Normalise by this implementation's own smallest-input timing.\n",
"results_sorted_lists = [t / timings_sorted_lists[0] for t in timings_sorted_lists]\n",
"fit2 = np.polyfit(input_size, results_sorted_lists, 2)\n",
"eval2 = np.polyval(fit2, x)\n",
"\n",
"# x, eval1 and results_for_loop come from the nested-loop plotting cell.\n",
"plt.plot(x, eval1, c='orange')\n",
"plt.plot(x, eval2, c='pink')\n",
"plt.scatter(input_size, results_for_loop, c='orange', s=100, label='2 for loops')\n",
"plt.scatter(input_size, results_sorted_lists, c='pink', s=100, label='sorted lists')\n",
"plt.xlabel('input size')\n",
"plt.ylabel('processing time')\n",
"# Build the tick labels from the plotted values so labels and positions cannot disagree.\n",
"tick_values = results_for_loop + results_sorted_lists[1:]\n",
"plt.yticks(tick_values, ['T'] + [f'{int(r)}x T' for r in tick_values[1:]])\n",
"plt.legend()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"id": "1da4c22f",
"metadata": {
"slideshow": {
"slide_type": "-"
}
},
"source": [
"What is the big-O complexity of this implementation? "
]
},
{
"cell_type": "markdown",
"id": "4b068a1b",
"metadata": {
"slideshow": {
"slide_type": "-"
}
},
"source": [
"2 * sorting + traversing two lists = 2 * n log<sub>2</sub>(n) + 2 * n ~ O(n log n)"
]
},
{
"cell_type": "markdown",
"id": "13c96239",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"# Implementation with sets"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "61edb9f3",
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
"outputs": [],
"source": [
"%%timeit\n",
"\n",
"# Same loop as the nested-loop version, but words2 is converted to a set first.\n",
"scaling_factor = 1  # also try 10, 100\n",
"\n",
"words1 = ['apple', 'orange', 'banana', 'melon', 'peach'] * scaling_factor\n",
"words2 = ['orange', 'kiwi', 'avocado', 'apple', 'banana'] * scaling_factor\n",
"\n",
"words2 = set(words2)\n",
"\n",
"common_sets = []\n",
"for w in words1:\n",
"    # Membership test against a set instead of a list.\n",
"    if w in words2:\n",
"        common_sets.append(w)\n",
"\n",
"# Measured: 630 ns, 3.13 us, 28.6 us for scaling_factor = 1, 10, 100"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c90d8e68",
"metadata": {
"slideshow": {
"slide_type": "notes"
}
},
"outputs": [],
"source": [
"# Set-based timings in seconds (ns = 10**-9 s, us = 10**-6 s):\n",
"# 630 * 10**-9\n",
"# 3.13 * 10**-6\n",
"# 28.6 * 10**-6"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "236c132d",
"metadata": {
"scrolled": true,
"slideshow": {
"slide_type": "subslide"
}
},
"outputs": [],
"source": [
"# Measured %%timeit results in seconds for input_size 1, 10, 100.\n",
"# The first value is in nanoseconds (/ 10**9), the other two in microseconds (/ 10**6).\n",
"timings_sets = [630 / 10**9, 3.13 / 10**6, 28.6 / 10**6]\n",
"# Normalise by this implementation's own smallest-input timing.\n",
"results_sets = [t / timings_sets[0] for t in timings_sets]\n",
"fit3 = np.polyfit(input_size, results_sets, 2)\n",
"eval3 = np.polyval(fit3, x)\n",
"\n",
"# input_size, x, eval1, eval2 and the other results_* lists come from the earlier plotting cells.\n",
"plt.plot(x, eval1, c='orange')\n",
"plt.plot(x, eval2, c='pink')\n",
"plt.plot(x, eval3, c='blue')\n",
"plt.scatter(input_size, results_for_loop, c='orange', s=100, label='2 for loops')\n",
"plt.scatter(input_size, results_sorted_lists, c='pink', s=100, label='sorted lists')\n",
"plt.scatter(input_size, results_sets, c='blue', s=100, label='sets')\n",
"plt.xlabel('input size')\n",
"plt.ylabel('processing time')\n",
"# Build the tick labels from the plotted values so labels and positions cannot disagree.\n",
"tick_values = results_for_loop + results_sorted_lists[1:]\n",
"plt.yticks(tick_values, ['T'] + [f'{int(r)}x T' for r in tick_values[1:]])\n",
"plt.legend()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"id": "c9780532",
"metadata": {
"slideshow": {
"slide_type": "-"
}
},
"source": [
"What is the big-O complexity of this implementation? "
]
},
{
"cell_type": "markdown",
"id": "297bcd7d",
"metadata": {
"slideshow": {
"slide_type": "-"
}
},
"source": [
"transforming one list into a set + 1 for-loop = 2 * n ~ O(n)\n",
"\n",
"It's the exact same code as for lists, but now looking up an element in a set\n",
"(`if w in words2`) takes constant time!\n",
"How could you have known that set lookup is fast? Learning about data structures!"
]
}
],
"metadata": {
"celltoolbar": "Slideshow",
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}