2024-heraklion-data/notebooks/.ipynb_checkpoints/notebook-1-sorting-examples-checkpoint.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8685ea3a",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import timeit\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "048881d0",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "# Example: Find common words"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2464a282",
   "metadata": {
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "source": [
    "Problem: given two lists of words, extract all the words that are in common"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "71740eab",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "# Implementation with 2x for-loops"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f175c775",
   "metadata": {
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [],
   "source": [
    "%%timeit\n",
    "\n",
    "# Noted run times: 612 ns, 12.3 us, 928 us (for scaling_factor = 1, 10, 100).\n",
    "scaling_factor = 1  # rerun with 10 and 100 to see how the run time grows\n",
    "\n",
    "words1 = scaling_factor * ['apple', 'orange', 'banana', 'melon', 'peach']\n",
    "words2 = scaling_factor * ['orange', 'kiwi', 'avocado', 'apple', 'banana']\n",
    "\n",
    "# Explicit loop over words1; the `w in words2` test on a list is the second, implicit loop.\n",
    "common_for = []\n",
    "for w in words1:\n",
    "    if w in words2:\n",
    "        common_for.append(w)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "affab857",
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "outputs": [],
   "source": [
    "input_size = [1, 10, 100]  # scaling_factor values used for the timing runs\n",
    "# Measured run times of the nested-loop cell, in seconds (612 ns, 12.4 us, 928 us).\n",
    "timings_for_loop = [612e-9, 12.4e-6, 928e-6]\n",
    "# Dimensionless slowdown relative to the smallest input (T = run time at input size 1).\n",
    "results_for_loop = [t / timings_for_loop[0] for t in timings_for_loop]\n",
    "\n",
    "x = np.linspace(0, 100, 100)\n",
    "# Degree-2 polynomial through three points: the curve passes exactly through each measurement.\n",
    "fit1 = np.polyfit(input_size, results_for_loop, 2)\n",
    "eval1 = np.polyval(fit1, x)\n",
    "\n",
    "plt.plot(x, eval1, c='orange')\n",
    "plt.scatter(input_size, results_for_loop, c='orange', s=100, label='2 for loops')\n",
    "\n",
    "plt.xlabel('input size')\n",
    "plt.ylabel('processing time')\n",
    "# Derive the tick labels from the plotted ratios so labels and tick positions cannot disagree\n",
    "# (previously the labels were computed from a different baseline than the data).\n",
    "plt.yticks(results_for_loop, ['T'] + [f'{int(r)}x T' for r in results_for_loop[1:]])\n",
    "plt.legend()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2a61bf38",
   "metadata": {
    "slideshow": {
     "slide_type": "skip"
    }
   },
   "outputs": [],
   "source": [
    "# Summary of the nested-loop measurements: input scale, raw timing, relative slowdown.\n",
    "print(\n",
    "    'Data increase 1x, 10x, 100x',\n",
    "    'Time increase 513 ns, 12.4 µs, 928 µs',\n",
    "    'time1, ~ 24x time1, ~ 1800x time1',\n",
    "    sep='\\n',\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "38e47397",
   "metadata": {
    "slideshow": {
     "slide_type": "-"
    }
   },
   "source": [
    "What is the big-O complexity of this implementation? "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4118b38d",
   "metadata": {
    "slideshow": {
     "slide_type": "skip"
    }
   },
   "source": [
    "n * n ~ O(n<sup>2</sup>)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "31cd0e74",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "# Implementation with sorted lists"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c13a24f4",
   "metadata": {
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [],
   "source": [
    "%%timeit\n",
    "# Noted run times: 1.94 us, 17.3 us, 204 us -- the original note read '1.94 ns',\n",
    "# presumably a typo given the roughly 10x / 100x growth of the other two values.\n",
    "scaling_factor = 100  # the other values to try are listed in the original note as 10, 100\n",
    "left = ['apple', 'orange', 'banana', 'melon', 'peach'] * scaling_factor\n",
    "right = ['orange', 'kiwi', 'avocado', 'apple', 'banana'] * scaling_factor\n",
    "left.sort()\n",
    "right.sort()\n",
    "n_right = len(right)\n",
    "\n",
    "common_sort_list = []\n",
    "pos = 0  # cursor into `right`; it only ever moves forward\n",
    "for word in left:\n",
    "    # Skip every entry of `right` that sorts strictly before the current word.\n",
    "    while pos < n_right and right[pos] < word:\n",
    "        pos += 1\n",
    "    if pos == n_right:\n",
    "        break  # `right` is exhausted, so no later word can match\n",
    "    if right[pos] == word:\n",
    "        common_sort_list.append(word)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f1e8fed2",
   "metadata": {
    "slideshow": {
     "slide_type": "notes"
    }
   },
   "outputs": [],
   "source": [
    "# Noted timings of the sorted-lists cell, in seconds (microseconds = 1 / 10**6 s):\n",
    "# 1.9 / 10**6\n",
    "# 17.9 / 10**6\n",
    "# 205 / 10**6"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8ce798ab",
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "outputs": [],
   "source": [
    "input_size = [1, 10, 100]  # scaling_factor values used for the timing runs\n",
    "# Measured run times of the sorted-lists cell, in seconds (1.9 us, 17.9 us, 205 us).\n",
    "timings_sorted_lists = [1.9e-6, 17.9e-6, 205e-6]\n",
    "# Dimensionless slowdown relative to the smallest input.\n",
    "results_sorted_lists = [t / timings_sorted_lists[0] for t in timings_sorted_lists]\n",
    "fit2 = np.polyfit(input_size, results_sorted_lists, 2)\n",
    "eval2 = np.polyval(fit2, x)\n",
    "\n",
    "# `x`, `eval1` and `results_for_loop` come from the nested-loop plotting cell above.\n",
    "plt.plot(x, eval1, c='orange')\n",
    "plt.plot(x, eval2, c='pink')\n",
    "plt.scatter(input_size, results_for_loop, c='orange', s=100, label='2 for loops')\n",
    "plt.scatter(input_size, results_sorted_lists, c='pink', s=100, label='sorted lists')\n",
    "plt.xlabel('input size')\n",
    "plt.ylabel('processing time')\n",
    "# Both series share the tick at T (= 1), so the first sorted-lists value is skipped.\n",
    "# Labels are derived from the tick positions so the two can never disagree.\n",
    "tick_positions = results_for_loop + results_sorted_lists[1:]\n",
    "plt.yticks(tick_positions, ['T'] + [f'{int(r)}x T' for r in tick_positions[1:]])\n",
    "plt.legend()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1da4c22f",
   "metadata": {
    "slideshow": {
     "slide_type": "-"
    }
   },
   "source": [
    "What is the big-O complexity of this implementation? "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4b068a1b",
   "metadata": {
    "slideshow": {
     "slide_type": "-"
    }
   },
   "source": [
    "2 * sorting + traversing two lists = 2 * n log<sub>2</sub>(n) + 2 * n  ~  O(n log n)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "13c96239",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "# Implementation with sets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "61edb9f3",
   "metadata": {
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [],
   "source": [
    "%%timeit\n",
    "\n",
    "# Noted run times: 630 ns, 3.13 us, 28.6 us.\n",
    "scaling_factor = 1\n",
    "\n",
    "words1 = scaling_factor * ['apple', 'orange', 'banana', 'melon', 'peach']\n",
    "# The only change from the nested-loop version: words2 is turned into a set before the loop.\n",
    "words2 = set(scaling_factor * ['orange', 'kiwi', 'avocado', 'apple', 'banana'])\n",
    "\n",
    "common_sets = []\n",
    "for w in words1:\n",
    "    if w in words2:\n",
    "        common_sets.append(w)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c90d8e68",
   "metadata": {
    "slideshow": {
     "slide_type": "notes"
    }
   },
   "outputs": [],
   "source": [
    "# Noted timings of the sets cell, in seconds (ns = 1 / 10**9 s, us = 1 / 10**6 s):\n",
    "# 630 / 10**9\n",
    "# 3.13 / 10**6\n",
    "# 28.6 / 10**6"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "236c132d",
   "metadata": {
    "scrolled": true,
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "outputs": [],
   "source": [
    "# Measured run times of the sets cell, in seconds (630 ns, 3.13 us, 28.6 us).\n",
    "# The units must be converted consistently: mixing ns and us with positive exponents\n",
    "# made the ratios a factor 10**6 too small and the curve appear to fall.\n",
    "timings_sets = [630e-9, 3.13e-6, 28.6e-6]\n",
    "# Dimensionless slowdown relative to the smallest input.\n",
    "results_sets = [t / timings_sets[0] for t in timings_sets]\n",
    "fit3 = np.polyfit(input_size, results_sets, 2)\n",
    "eval3 = np.polyval(fit3, x)\n",
    "\n",
    "# `input_size`, `x`, `eval1`, `eval2` and the two earlier result lists come from the cells above.\n",
    "plt.plot(x, eval1, c='orange')\n",
    "plt.plot(x, eval2, c='pink')\n",
    "plt.plot(x, eval3, c='blue')\n",
    "plt.scatter(input_size, results_for_loop, c='orange', s=100, label='2 for loops')\n",
    "plt.scatter(input_size, results_sorted_lists, c='pink', s=100, label='sorted lists')\n",
    "plt.scatter(input_size, results_sets, c='blue', s=100, label='sets')\n",
    "plt.xlabel('input size')\n",
    "plt.ylabel('processing time')\n",
    "# Ticks stay on the nested-loop and sorted-lists measurements, as before;\n",
    "# labels are derived from the tick positions so the two can never disagree.\n",
    "tick_positions = results_for_loop + results_sorted_lists[1:]\n",
    "plt.yticks(tick_positions, ['T'] + [f'{int(r)}x T' for r in tick_positions[1:]])\n",
    "plt.legend()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c9780532",
   "metadata": {
    "slideshow": {
     "slide_type": "-"
    }
   },
   "source": [
    "What is the big-O complexity of this implementation? "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "297bcd7d",
   "metadata": {
    "slideshow": {
     "slide_type": "-"
    }
   },
   "source": [
    "transforming one list to set  + 1 for loop  = 2 * n ~  O(n)\n",
    "\n",
    "It’s the exact same code as for lists, but now looking up an element in a set (`if w in words2`) takes constant time on average!\n",
    "\n",
    "How could you have known that set lookup is fast? By learning about data structures!"
   ]
  }
 ],
 "metadata": {
  "celltoolbar": "Slideshow",
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}