{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8685ea3a",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import timeit\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "048881d0",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "# Example: Find common words"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2464a282",
   "metadata": {
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "source": [
    "Problem: given two lists of words, extract all the words that are in common"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "71740eab",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "# Implementation with 2x for-loops"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f175c775",
   "metadata": {
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [],
   "source": [
    "%%timeit\n",
    "\n",
    "scaling_factor = 1  # 10, 100\n",
    "\n",
    "words1 = ['apple', 'orange', 'banana', 'melon', 'peach'] * scaling_factor\n",
    "words2 = ['orange', 'kiwi', 'avocado', 'apple', 'banana'] * scaling_factor\n",
    "\n",
    "common_for = []\n",
    "for w in words1:\n",
    "    # `w in words2` scans the list element by element: this is the hidden second loop\n",
    "    if w in words2:\n",
    "        common_for.append(w)\n",
    "\n",
    "# measured for scaling_factor = 1, 10, 100: 612 ns, 12.4 us, 928 us"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "affab857",
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "outputs": [],
   "source": [
    "input_size = [1, 10, 100]\n",
    "# %%timeit measurements in seconds for scaling_factor = 1, 10, 100\n",
    "times_for_loop = [612e-9, 12.4e-6, 928e-6]\n",
    "# processing time as a multiple of T, the time measured for the smallest input\n",
    "results_for_loop = [t / times_for_loop[0] for t in times_for_loop]\n",
    "\n",
    "x = np.linspace(0, 100, 100)\n",
    "# three points define the degree-2 polynomial exactly; the curve only guides the eye\n",
    "fit1 = np.polyfit(input_size, results_for_loop, 2)\n",
    "eval1 = np.polyval(fit1, x)\n",
    "\n",
    "plt.plot(x, eval1, c='orange')\n",
    "plt.scatter(input_size, results_for_loop, c='orange', s=100, label='2 for loops')\n",
    "\n",
    "plt.xlabel('input size')\n",
    "plt.ylabel('processing time')\n",
    "# derive the tick labels from the plotted values so labels and positions always agree\n",
    "labels_for_loop = ['T'] + [f'{int(r)}x T' for r in results_for_loop[1:]]\n",
    "plt.yticks(results_for_loop, labels_for_loop)\n",
    "plt.legend()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2a61bf38",
   "metadata": {
    "slideshow": {
     "slide_type": "skip"
    }
   },
   "outputs": [],
   "source": [
    "print('Data increase 1x, 10x, 100x')\n",
    "print('Time increase 612 ns, 12.4 µs, 928 µs')\n",
    "print(f'time1, ~ {results_for_loop[1]:.0f}x time1, ~ {results_for_loop[2]:.0f}x time1')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "38e47397",
   "metadata": {
    "slideshow": {
     "slide_type": "-"
    }
   },
   "source": [
    "What is the big-O complexity of this implementation? "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4118b38d",
   "metadata": {
    "slideshow": {
     "slide_type": "skip"
    }
   },
   "source": [
    "n * n ~ $O(n^2)$"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "31cd0e74",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "# Implementation with sorted lists"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c13a24f4",
   "metadata": {
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [],
   "source": [
    "%%timeit\n",
    "scaling_factor = 1  # 10, 100\n",
    "words1 = ['apple', 'orange', 'banana', 'melon', 'peach'] * scaling_factor\n",
    "words2 = ['orange', 'kiwi', 'avocado', 'apple', 'banana'] * scaling_factor\n",
    "words1 = sorted(words1)\n",
    "words2 = sorted(words2)\n",
    "\n",
    "common_sort_list = []\n",
    "idx2 = 0\n",
    "for w in words1:\n",
    "    # move forward in words2 until reaching a word that is not smaller than w;\n",
    "    # idx2 never moves backwards, so both lists are traversed only once\n",
    "    while idx2 < len(words2) and words2[idx2] < w:\n",
    "        idx2 += 1\n",
    "    if idx2 >= len(words2):\n",
    "        break\n",
    "    if words2[idx2] == w:\n",
    "        common_sort_list.append(w)\n",
    "\n",
    "# measured for scaling_factor = 1, 10, 100: 1.94 us, 17.3 us, 204 us"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f1e8fed2",
   "metadata": {
    "slideshow": {
     "slide_type": "notes"
    }
   },
   "outputs": [],
   "source": [
    "# sorted lists, measured times in seconds:\n",
    "# 1.9 * 10**-6\n",
    "# 17.9 * 10**-6\n",
    "# 205 * 10**-6"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8ce798ab",
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "outputs": [],
   "source": [
    "input_size = [1, 10, 100]\n",
    "# %%timeit measurements in seconds for scaling_factor = 1, 10, 100\n",
    "times_sorted_lists = [1.9e-6, 17.9e-6, 205e-6]\n",
    "# each curve is normalised by its own smallest-input time T, so the plot compares growth, not absolute speed\n",
    "results_sorted_lists = [t / times_sorted_lists[0] for t in times_sorted_lists]\n",
    "fit2 = np.polyfit(input_size, results_sorted_lists, 2)\n",
    "eval2 = np.polyval(fit2, x)\n",
    "\n",
    "plt.plot(x, eval1, c='orange')\n",
    "plt.plot(x, eval2, c='pink')\n",
    "plt.scatter(input_size, results_for_loop, c='orange', s=100, label='2 for loops')\n",
    "plt.scatter(input_size, results_sorted_lists, c='pink', s=100, label='sorted lists')\n",
    "\n",
    "plt.xlabel('input size')\n",
    "plt.ylabel('processing time')\n",
    "# derive the tick labels from the plotted values so labels and positions always agree\n",
    "tick_values = results_for_loop + results_sorted_lists[1:]\n",
    "tick_labels = ['T'] + [f'{int(r)}x T' for r in tick_values[1:]]\n",
    "plt.yticks(tick_values, tick_labels)\n",
    "plt.legend()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1da4c22f",
   "metadata": {
    "slideshow": {
     "slide_type": "-"
    }
   },
   "source": [
    "What is the big-O complexity of this implementation? "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4b068a1b",
   "metadata": {
    "slideshow": {
     "slide_type": "-"
    }
   },
   "source": [
    "2 * sorting + traversing two lists = 2 * n log n + 2 * n ~ $O(n \\log n)$"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "13c96239",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "# Implementation with sets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "61edb9f3",
   "metadata": {
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [],
   "source": [
    "%%timeit\n",
    "\n",
    "scaling_factor = 1  # 10, 100\n",
    "\n",
    "words1 = ['apple', 'orange', 'banana', 'melon', 'peach'] * scaling_factor\n",
    "words2 = ['orange', 'kiwi', 'avocado', 'apple', 'banana'] * scaling_factor\n",
    "\n",
    "words2 = set(words2)\n",
    "\n",
    "common_sets = []\n",
    "for w in words1:\n",
    "    # same membership test as in the list version, but on a set it is a hash lookup\n",
    "    if w in words2:\n",
    "        common_sets.append(w)\n",
    "\n",
    "# measured for scaling_factor = 1, 10, 100: 630 ns, 3.13 us, 28.6 us"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c90d8e68",
   "metadata": {
    "slideshow": {
     "slide_type": "notes"
    }
   },
   "outputs": [],
   "source": [
    "# sets, measured times in seconds:\n",
    "# 630 * 10**-9\n",
    "# 3.13 * 10**-6\n",
    "# 28.6 * 10**-6"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "236c132d",
   "metadata": {
    "scrolled": true,
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "outputs": [],
   "source": [
    "# %%timeit measurements in seconds for scaling_factor = 1, 10, 100\n",
    "times_sets = [630e-9, 3.13e-6, 28.6e-6]\n",
    "# processing time as a multiple of T, the time measured for the smallest input\n",
    "results_sets = [t / times_sets[0] for t in times_sets]\n",
    "fit3 = np.polyfit(input_size, results_sets, 2)\n",
    "eval3 = np.polyval(fit3, x)\n",
    "\n",
    "plt.plot(x, eval1, c='orange')\n",
    "plt.plot(x, eval2, c='pink')\n",
    "plt.plot(x, eval3, c='blue')\n",
    "plt.scatter(input_size, results_for_loop, c='orange', s=100, label='2 for loops')\n",
    "plt.scatter(input_size, results_sorted_lists, c='pink', s=100, label='sorted lists')\n",
    "plt.scatter(input_size, results_sets, c='blue', s=100, label='sets')\n",
    "\n",
    "plt.xlabel('input size')\n",
    "plt.ylabel('processing time')\n",
    "# derive the tick labels from the plotted values so labels and positions always agree\n",
    "tick_values = results_for_loop + results_sorted_lists[1:]\n",
    "tick_labels = ['T'] + [f'{int(r)}x T' for r in tick_values[1:]]\n",
    "plt.yticks(tick_values, tick_labels)\n",
    "plt.legend()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c9780532",
   "metadata": {
    "slideshow": {
     "slide_type": "-"
    }
   },
   "source": [
    "What is the big-O complexity of this implementation? "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "297bcd7d",
   "metadata": {
    "slideshow": {
     "slide_type": "-"
    }
   },
   "source": [
    "transforming one list to set + 1 for loop = 2 * n ~ $O(n)$\n",
    "\n",
    "It’s the exact same code as for lists, but now looking up an element in sets (`if w in words2`) takes constant time on average!\n",
    "How could you have known that set lookup is fast? Learning about data structures!"
   ]
  }
 ],
 "metadata": {
  "celltoolbar": "Slideshow",
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}