2024-heraklion-data/notebooks/.ipynb_checkpoints/notebook-1-sorting-examples-checkpoint.ipynb

In [ ]:
import numpy as np
import timeit
import matplotlib.pyplot as plt

Example: Find common words

Problem: given two lists of words, extract all the words that the two lists have in common.

Implementation with two for-loops

In [ ]:
%%timeit

scaling_factor = 1  # also try 10 and 100

words1 = ['apple', 'orange', 'banana', 'melon', 'peach'] * scaling_factor
words2 = ['orange', 'kiwi', 'avocado', 'apple', 'banana'] * scaling_factor

common_for = []
for w in words1:
    if w in words2:  # membership test on a list is itself a loop over words2
        common_for.append(w)   # measured: 513 ns, 12.4 µs, 928 µs for scaling 1, 10, 100
In [ ]:
input_size = [1, 10, 100]
# processing times normalized to T, the time for the smallest input (513 ns)
results_for_loop = [(513/10**9)/(513/10**9), (12.4/10**6)/(513/10**9), (928/10**6)/(513/10**9)]

x = np.linspace(0, 100, 100)
fit1 = np.polyfit(input_size, results_for_loop, 2)
eval1 = np.polyval(fit1, x)

plt.plot(x, eval1, c='orange')
plt.scatter(input_size, results_for_loop, c='orange', s=100, label='2 for loops')

plt.xlabel('input size')
plt.ylabel('processing time')
plt.yticks(results_for_loop, ['T', str(int((12.4/10**6)/(513/10**9))) + 'x T', str(int((928/10**6)/(513/10**9))) + 'x T'])
plt.legend()
plt.legend()
In [ ]:
print('Data increase 1x, 10x, 100x')
print('Time increase 513 ns, 12.4 µs, 928 µs')
print('time1, ~ 24x time1, ~ 1800x time1')

What is the big-O complexity of this implementation?

n · n ~ O(n²)
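
A hypothetical comparison-counting helper (ours, not part of the original timing runs) makes the quadratic growth visible without a stopwatch: every word of words1 that is missing from words2 forces a scan of all of words2, so the total number of comparisons grows roughly with n².

In [ ]:
def count_comparisons_nested(words1, words2):
    """Count the element comparisons made by the list-membership version."""
    comparisons = 0
    common = []
    for w in words1:
        # `w in words2` scans words2 from the left until a match (or the end)
        found = False
        for candidate in words2:
            comparisons += 1
            if candidate == w:
                found = True
                break
        if found:
            common.append(w)
    return comparisons

for scaling_factor in [1, 10, 100]:
    words1 = ['apple', 'orange', 'banana', 'melon', 'peach'] * scaling_factor
    words2 = ['orange', 'kiwi', 'avocado', 'apple', 'banana'] * scaling_factor
    print(len(words1), count_comparisons_nested(words1, words2))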

Implementation with sorted lists

In [ ]:
%%timeit
scaling_factor = 100  # also try 1 and 10
words1 = ['apple', 'orange', 'banana', 'melon', 'peach'] * scaling_factor
words2 = ['orange', 'kiwi', 'avocado', 'apple', 'banana'] * scaling_factor
words1 = sorted(words1)
words2 = sorted(words2)

common_sort_list = []
idx2 = 0
for w in words1:
    # advance idx2 until words2[idx2] is at least as large as w
    while idx2 < len(words2) and words2[idx2] < w:
        idx2 += 1
    if idx2 >= len(words2):
        break
    if words2[idx2] == w:
        common_sort_list.append(w)  # measured: 1.94 µs, 17.3 µs, 204 µs for scaling 1, 10, 100
In [ ]:
# measured times, in seconds:
# 1.9 / 10**6   (1.9 µs)
# 17.9 / 10**6  (17.9 µs)
# 205 / 10**6   (205 µs)
In [ ]:
input_size = [1, 10, 100]
results_sorted_lists = [(1.9/10**6)/(1.9/10**6), (17.9/10**6)/(1.9/10**6), (205/10**6)/(1.9/10**6)]
fit2 = np.polyfit(input_size, results_sorted_lists, 2)
eval2 = np.polyval(fit2, x)
plt.plot(x, eval1, c='orange')
plt.plot(x, eval2, c='pink')
plt.scatter(input_size, results_for_loop, c='orange', s=100, label='2 for loops')
plt.scatter(input_size, results_sorted_lists, c='pink', s=100, label='sorted lists')
plt.xlabel('input size')
plt.ylabel('processing time')
plt.yticks(results_for_loop + results_sorted_lists[1:],
           ['T', str(int((12.4/10**6)/(513/10**9))) + 'x T', str(int((928/10**6)/(513/10**9))) + 'x T',
            str(int((17.9/10**6)/(1.9/10**6))) + 'x T', str(int((205/10**6)/(1.9/10**6))) + 'x T'])
plt.legend()

What is the big-O complexity of this implementation?

2 × sorting + traversing both lists = 2·n·log₂(n) + 2·n ~ O(n log n)
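
The same idea can be packaged as a reusable two-pointer function (a minimal sketch of ours, not code from the original notebook); it assumes both inputs are already sorted and finds all matches in a single simultaneous pass:

In [ ]:
def intersect_sorted(a, b):
    """Common elements of two sorted lists, found in one linear pass."""
    common = []
    i = j = 0
    while i < len(a) and j < len(b):
        if a[i] < b[j]:
            i += 1               # a[i] cannot appear at position j or later in b
        elif a[i] > b[j]:
            j += 1               # b[j] cannot appear at position i or later in a
        else:
            common.append(a[i])  # match: advance both sides
            i += 1
            j += 1
    return common

print(intersect_sorted(sorted(['apple', 'orange', 'banana']), sorted(['orange', 'kiwi', 'apple'])))
# ['apple', 'orange']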

Implementation with sets

In [ ]:
%%timeit

scaling_factor = 1  # also try 10 and 100

words1 = ['apple', 'orange', 'banana', 'melon', 'peach'] * scaling_factor
words2 = ['orange', 'kiwi', 'avocado', 'apple', 'banana'] * scaling_factor

words2 = set(words2)

common_sets = []
for w in words1:
    if w in words2:  # set membership is a hash lookup: O(1) on average
        common_sets.append(w)  # measured: 630 ns, 3.13 µs, 28.6 µs for scaling 1, 10, 100
In [ ]:
# measured times, in seconds:
# 630 / 10**9   (630 ns)
# 3.13 / 10**6  (3.13 µs)
# 28.6 / 10**6  (28.6 µs)
In [ ]:
results_sets = [(630/10**9)/(630/10**9), (3.13/10**6)/(630/10**9), (28.6/10**6)/(630/10**9)]
fit3 = np.polyfit(input_size, results_sets, 2)
eval3 = np.polyval(fit3, x)
plt.plot(x, eval1, c='orange')
plt.plot(x, eval2, c='pink')
plt.plot(x, eval3, c='blue')
plt.scatter(input_size, results_for_loop, c='orange', s=100, label='2 for loops')
plt.scatter(input_size, results_sorted_lists, c='pink', s=100, label='sorted lists')
plt.scatter(input_size, results_sets, c='blue', s=100, label='sets')
plt.xlabel('input size')
plt.ylabel('processing time')
plt.yticks(results_for_loop + results_sorted_lists[1:],
           ['T', str(int((12.4/10**6)/(513/10**9))) + 'x T', str(int((928/10**6)/(513/10**9))) + 'x T',
            str(int((17.9/10**6)/(1.9/10**6))) + 'x T', str(int((205/10**6)/(1.9/10**6))) + 'x T'])
plt.legend()
plt.show()

What is the big-O complexity of this implementation?

transforming one list into a set + one for-loop = 2·n ~ O(n)
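
For comparison, Python's built-in set operations produce the same O(n) result in a single expression (a sketch; note that a set drops duplicates and ordering, whereas the loop above keeps one entry per occurrence in words1):

In [ ]:
words1 = ['apple', 'orange', 'banana', 'melon', 'peach']
words2 = ['orange', 'kiwi', 'avocado', 'apple', 'banana']

# set intersection: linear in the size of the smaller set, on average
print(set(words1) & set(words2))  # {'apple', 'banana', 'orange'} (unordered)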

Back to the loop version: it's exactly the same code as for lists, but looking up an element in a set (if w in words2) now takes constant time on average! How could you have known that set lookup is fast? By learning about data structures!
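
Since timeit is already imported, the %%timeit measurements above can also be reproduced programmatically (a minimal sketch; absolute numbers depend on the machine and will not match the figures above exactly):

In [ ]:
def common_with_sets(words1, words2):
    lookup = set(words2)
    return [w for w in words1 if w in lookup]

for scaling_factor in [1, 10, 100]:
    words1 = ['apple', 'orange', 'banana', 'melon', 'peach'] * scaling_factor
    words2 = ['orange', 'kiwi', 'avocado', 'apple', 'banana'] * scaling_factor
    # best of 5 repeats, 1000 calls each, reported as seconds per call
    t = min(timeit.repeat(lambda: common_with_sets(words1, words2), repeat=5, number=1000)) / 1000
    print(f'scaling {scaling_factor:>3}: {t * 1e6:.2f} µs per call')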