2024-heraklion-data/notebooks/.ipynb_checkpoints/notebook-1-sorting-examples-checkpoint.ipynb
2024-08-27 15:27:53 +03:00

387 lines
9.2 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "8685ea3a",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [],
"source": [
"import numpy as np\n",
"import timeit\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "markdown",
"id": "048881d0",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"# Example: Find common words"
]
},
{
"cell_type": "markdown",
"id": "2464a282",
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
"source": [
"Problem: given two lists of words, extract all the words that are in common"
]
},
{
"cell_type": "markdown",
"id": "71740eab",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"# Implementation with 2x for-loops"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f175c775",
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
"outputs": [],
"source": [
"%%timeit\n",
"\n",
"scaling_factor = 1  # also try 10 and 100\n",
"\n",
"words1 = ['apple', 'orange', 'banana', 'melon', 'peach'] * scaling_factor\n",
"words2 = ['orange', 'kiwi', 'avocado', 'apple', 'banana'] * scaling_factor\n",
"\n",
"# Outer loop over words1; `w in words2` checks membership in the words2 list\n",
"common_for = []\n",
"for w in words1:\n",
"    if w in words2:\n",
"        common_for.append(w)\n",
"\n",
"# Timings recorded for scaling_factor = 1, 10, 100: 612 ns, 12.3 us, 928 us"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "affab857",
"metadata": {
"slideshow": {
"slide_type": "subslide"
}
},
"outputs": [],
"source": [
"input_size = [1, 10, 100]\n",
"# Timings of the 2x for-loop version for the three input sizes, in seconds\n",
"timings_for_loop = [612 / 10**9, 12.4 / 10**6, 928 / 10**6]\n",
"# Express every timing as a multiple of the smallest-input timing T\n",
"results_for_loop = [t / timings_for_loop[0] for t in timings_for_loop]\n",
"\n",
"x = np.linspace(0, 100, 100)\n",
"fit1 = np.polyfit(input_size, results_for_loop, 2)\n",
"eval1 = np.polyval(fit1, x)\n",
"\n",
"plt.plot(x, eval1, c='orange')\n",
"plt.scatter(input_size, results_for_loop, c='orange', s=100, label='2 for loops')\n",
"\n",
"plt.xlabel('input size')\n",
"plt.ylabel('processing time')\n",
"# Derive the tick labels from the plotted ratios so every label matches its position\n",
"# (labels computed from a different baseline than the data would mislabel the points)\n",
"plt.yticks(results_for_loop, ['T'] + [f'{int(r)}x T' for r in results_for_loop[1:]])\n",
"plt.legend()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2a61bf38",
"metadata": {
"slideshow": {
"slide_type": "skip"
}
},
"outputs": [],
"source": [
"# Summary of the 2x for-loop scaling: input growth next to measured time growth.\n",
"# NOTE(review): the 513 ns baseline quoted here differs from the 612 ns used for\n",
"# results_for_loop in the plotting cell - confirm which measurement is current.\n",
"print(\n",
"    'Data increase 1x, 10x, 100x',\n",
"    'Time increase 513 ns, 12.4 µs, 928 µs',\n",
"    'time1, ~ 24x time1, ~ 1800x time1',\n",
"    sep='\\n',\n",
")"
]
},
{
"cell_type": "markdown",
"id": "38e47397",
"metadata": {
"slideshow": {
"slide_type": "-"
}
},
"source": [
"What is the big-O complexity of this implementation? "
]
},
{
"cell_type": "markdown",
"id": "4118b38d",
"metadata": {
"slideshow": {
"slide_type": "skip"
}
},
"source": [
"n * n ~ O(n<sup>2</sup>)"
]
},
{
"cell_type": "markdown",
"id": "31cd0e74",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"# Implementation with sorted lists"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c13a24f4",
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
"outputs": [],
"source": [
"%%timeit\n",
"scaling_factor = 100  # also try 1 and 10\n",
"words1 = ['apple', 'orange', 'banana', 'melon', 'peach'] * scaling_factor\n",
"words2 = ['orange', 'kiwi', 'avocado', 'apple', 'banana'] *scaling_factor\n",
"words1 = sorted(words1)\n",
"words2 = sorted(words2)\n",
"\n",
"# Walk both sorted lists together; idx2 only ever moves forward through words2\n",
"common_sort_list = []\n",
"idx2 = 0\n",
"for w in words1:\n",
"    # skip the entries of words2 that sort before the current word\n",
"    while idx2 < len(words2) and words2[idx2] < w:\n",
"        idx2 += 1\n",
"    # words2 is exhausted: no later word of words1 can match any more\n",
"    if idx2 >= len(words2):\n",
"        break\n",
"    if words2[idx2] == w:\n",
"        common_sort_list.append(w)\n",
"\n",
"# Timings recorded for scaling_factor = 1, 10, 100: 1.94 us, 17.3 us, 204 us"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f1e8fed2",
"metadata": {
"slideshow": {
"slide_type": "notes"
}
},
"outputs": [],
"source": [
"# Sorted-lists timings for the three input sizes, as used in the plotting cell below.\n",
"# NOTE(review): presumably microseconds (1.9 us, 17.9 us, 205 us); converting to\n",
"# seconds would be a division by 10**6, not a multiplication - confirm.\n",
"# 1.9 * 10**6\n",
"# 17.9 * 10**6\n",
"# 205 * 10**6"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8ce798ab",
"metadata": {
"slideshow": {
"slide_type": "subslide"
}
},
"outputs": [],
"source": [
"input_size = [1, 10, 100]\n",
"# Sorted-lists timings as multiples of the smallest-input timing T\n",
"# (all three values carry the same unit factor, so it cancels in the ratio)\n",
"results_sorted_lists = [(1.9 * 10**6)/(1.9 * 10**6), (17.9 * 10**6)/(1.9 * 10**6), (205 * 10**6)/(1.9 * 10**6)]\n",
"fit2 = np.polyfit(input_size, results_sorted_lists, 2)\n",
"eval2 = np.polyval(fit2, x)\n",
"\n",
"# x, eval1 and results_for_loop come from the 2x for-loop plotting cell\n",
"plt.plot(x, eval1, c='orange')\n",
"plt.plot(x, eval2, c='pink')\n",
"plt.scatter(input_size, results_for_loop, c='orange', s=100, label='2 for loops')\n",
"plt.scatter(input_size, results_sorted_lists, c='pink', s=100, label='sorted lists')\n",
"plt.xlabel('input size')\n",
"plt.ylabel('processing time')\n",
"\n",
"# Derive the tick labels from the tick positions so every label matches its position;\n",
"# the first position is the shared baseline T\n",
"tick_positions = results_for_loop + results_sorted_lists[1:]\n",
"tick_labels = ['T'] + [f'{int(r)}x T' for r in tick_positions[1:]]\n",
"plt.yticks(tick_positions, tick_labels)\n",
"plt.legend()"
]
},
{
"cell_type": "markdown",
"id": "1da4c22f",
"metadata": {
"slideshow": {
"slide_type": "-"
}
},
"source": [
"What is the big-O complexity of this implementation? "
]
},
{
"cell_type": "markdown",
"id": "4b068a1b",
"metadata": {
"slideshow": {
"slide_type": "-"
}
},
"source": [
"2 * sorting + traversing two lists = 2 * n log<sub>2</sub>(n) + 2 * n ~ O(n log n)"
]
},
{
"cell_type": "markdown",
"id": "13c96239",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"# Implementation with sets"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "61edb9f3",
"metadata": {
"slideshow": {
"slide_type": "fragment"
}
},
"outputs": [],
"source": [
"%%timeit\n",
"\n",
"scaling_factor = 1  # also try 10 and 100\n",
"\n",
"words1 = ['apple', 'orange', 'banana', 'melon', 'peach'] * scaling_factor\n",
"words2 = ['orange', 'kiwi', 'avocado', 'apple', 'banana'] *scaling_factor\n",
"\n",
"# Convert the second list to a set once, then test membership against the set\n",
"words2 = set(words2)\n",
"\n",
"common_sets = []\n",
"for w in words1:\n",
"    if w in words2:\n",
"        common_sets.append(w)\n",
"\n",
"# Timings recorded for scaling_factor = 1, 10, 100: 630 ns, 3.13 us, 28.6 us"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c90d8e68",
"metadata": {
"slideshow": {
"slide_type": "notes"
}
},
"outputs": [],
"source": [
"# Set-based timings for the three input sizes, as used in the plotting cell below.\n",
"# NOTE(review): presumably 630 ns, 3.13 us and 28.6 us; as written the lines multiply\n",
"# by 10**9 / 10**6, whereas converting ns / us to seconds needs a division - confirm.\n",
"# 630 * 10**9\n",
"# 3.13 * 10**6\n",
"# 28.6 * 10**6"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "236c132d",
"metadata": {
"scrolled": true,
"slideshow": {
"slide_type": "subslide"
}
},
"outputs": [],
"source": [
"input_size = [1, 10, 100]\n",
"# Timings of the set-based version for the three input sizes, in seconds\n",
"# (630 ns, 3.13 us, 28.6 us). ns / us are converted by dividing by 10**9 / 10**6;\n",
"# multiplying instead would shrink the 10x and 100x ratios by a factor of 10**6.\n",
"timings_sets = [630 / 10**9, 3.13 / 10**6, 28.6 / 10**6]\n",
"# Express every timing as a multiple of the smallest-input timing T\n",
"results_sets = [t / timings_sets[0] for t in timings_sets]\n",
"fit3 = np.polyfit(input_size, results_sets, 2)\n",
"eval3 = np.polyval(fit3, x)\n",
"\n",
"# x, eval1, eval2, results_for_loop and results_sorted_lists come from the earlier plotting cells\n",
"plt.plot(x, eval1, c='orange')\n",
"plt.plot(x, eval2, c='pink')\n",
"plt.plot(x, eval3, c='blue')\n",
"plt.scatter(input_size, results_for_loop, c='orange', s=100, label='2 for loops')\n",
"plt.scatter(input_size, results_sorted_lists, c='pink', s=100, label='sorted lists')\n",
"plt.scatter(input_size, results_sets, c='blue', s=100, label='sets')\n",
"plt.xlabel('input size')\n",
"plt.ylabel('processing time')\n",
"\n",
"# Derive the tick labels from the tick positions so every label matches its position;\n",
"# the first position is the shared baseline T\n",
"tick_positions = results_for_loop + results_sorted_lists[1:]\n",
"tick_labels = ['T'] + [f'{int(r)}x T' for r in tick_positions[1:]]\n",
"plt.yticks(tick_positions, tick_labels)\n",
"plt.legend()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"id": "c9780532",
"metadata": {
"slideshow": {
"slide_type": "-"
}
},
"source": [
"What is the big-O complexity of this implementation? "
]
},
{
"cell_type": "markdown",
"id": "297bcd7d",
"metadata": {
"slideshow": {
"slide_type": "-"
}
},
"source": [
"transforming one list to a set + 1 for loop = 2 * n ~ O(n)\n",
"\n",
"It's the exact same code as for lists, but now looking up an element in a set\n",
"(`if w in words2`) takes constant time!\n",
"\n",
"How could you have known that set lookup is fast? By learning about data structures!"
]
}
],
"metadata": {
"celltoolbar": "Slideshow",
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}