9.2 KiB
9.2 KiB
In [ ]:
import numpy as np
import timeit
import matplotlib.pyplot as plt
Example: Find common words¶
Problem: given two lists of words, extract all the words that are in common
Implementation with 2x for-loops¶
In [ ]:
%%timeit
# Cross-reference two word lists with a nested membership scan.
# `w in words2` is itself a linear scan over a list, so the whole
# cell is O(n^2) in the input size.
scaling_factor = 1  # try 10 and 100 to watch the quadratic growth
words1 = ['apple', 'orange', 'banana', 'melon', 'peach'] * scaling_factor
words2 = ['orange', 'kiwi', 'avocado', 'apple', 'banana'] * scaling_factor
# measured: 612 ns (1x), 12.3 us (10x), 928 us (100x)
common_for = [w for w in words1 if w in words2]
In [ ]:
input_size = [1, 10, 100]
# Measured runtimes of the nested-loop version (from %%timeit):
# 612 ns at 1x, 12.4 us at 10x, 928 us at 100x — all converted to seconds
# and normalized to the smallest measurement so ticks read as multiples of T.
base_time = 612 / 10**9
results_for_loop = [(612 / 10**9) / base_time,
                    (12.4 / 10**6) / base_time,
                    (928 / 10**6) / base_time]
x = np.linspace(0, 100, 100)
# Quadratic fit — we expect O(n^2) growth for the nested-loop approach.
fit1 = np.polyfit(input_size, results_for_loop, 2)
eval1 = np.polyval(fit1, x)
plt.plot(x, eval1, c='orange')
plt.scatter(input_size, results_for_loop, c='orange', s=100, label='2 for loops')
plt.xlabel('input size')
plt.ylabel('processing time')
# Derive the tick labels from the plotted ratios themselves: the original
# hard-coded a 513 ns base in the labels while the data used 612 ns,
# so labels and positions disagreed.
plt.yticks(results_for_loop,
           ['T'] + [str(int(r)) + 'x T' for r in results_for_loop[1:]])
plt.legend()
In [ ]:
# Summary of the measured scaling of the nested-loop version.
# One joined print produces byte-identical stdout to three separate prints.
summary_lines = ('Data increase 1x, 10x, 100x',
                 'Time increase 513 ns, 12.4 µs, 928 µs',
                 'time1, ~ 24x time1, ~ 1800x time1')
print('\n'.join(summary_lines))
What is the big-O complexity of this implementation?
n comparisons for each of the n words: n · n ⇒ O(n²)
Implementation with sorted lists¶
In [ ]:
%%timeit
# Merge-style intersection over two pre-sorted lists: a single cursor
# walks words2 while the outer loop walks words1, so after the two
# O(n log n) sorts the matching pass itself is linear.
scaling_factor = 100  # try 1 and 10 as well
words1 = ['apple', 'orange', 'banana', 'melon', 'peach'] * scaling_factor
words2 = ['orange', 'kiwi', 'avocado', 'apple', 'banana'] * scaling_factor
words1 = sorted(words1)
words2 = sorted(words2)
common_sort_list = []
idx2 = 0
n2 = len(words2)
for word in words1:
    # Skip everything in words2 that sorts strictly before `word`.
    while idx2 < n2 and words2[idx2] < word:
        idx2 += 1
    if idx2 == n2:
        break  # words2 exhausted: nothing later in words1 can match
    if words2[idx2] == word:
        common_sort_list.append(word)
# measured: ~1.9 us (1x), 17.3 us (10x), 204 us (100x)
In [ ]:
# measured runtimes (sorted-list version): 1.9 µs, 17.9 µs, 205 µs
# i.e. 1.9/10**6 s, 17.9/10**6 s, 205/10**6 s
In [ ]:
input_size = [1, 10, 100]
# Measured runtimes of the sorted-list version: 1.9 us, 17.9 us, 205 us,
# converted to seconds and normalized to the smallest measurement.
# (The original wrote these as `1.9 * 10**6` etc.; the ratios happened to
# survive because the wrong factor cancelled, but the units were wrong.)
base_sorted = 1.9 / 10**6
results_sorted_lists = [(1.9 / 10**6) / base_sorted,
                        (17.9 / 10**6) / base_sorted,
                        (205 / 10**6) / base_sorted]
fit2 = np.polyfit(input_size, results_sorted_lists, 2)
eval2 = np.polyval(fit2, x)
plt.plot(x, eval1, c='orange')
plt.plot(x, eval2, c='pink')
plt.scatter(input_size, results_for_loop, c='orange', s=100, label='2 for loops')
plt.scatter(input_size, results_sorted_lists, c='pink', s=100, label='sorted lists')
plt.xlabel('input size')
plt.ylabel('processing time')
# Derive tick labels from the plotted ratios so labels and positions can
# never disagree (the original mixed a 513 ns base into the labels while
# the for-loop data used a 612 ns base).
tick_positions = results_for_loop + results_sorted_lists[1:]
tick_labels = (['T']
               + [str(int(r)) + 'x T' for r in results_for_loop[1:]]
               + [str(int(r)) + 'x T' for r in results_sorted_lists[1:]])
plt.yticks(tick_positions, tick_labels)
plt.legend()
What is the big-O complexity of this implementation?
two sorts + one linear merge pass over both lists = 2·(n log n) + 2n ⇒ O(n log n)
Implementation with sets¶
In [ ]:
%%timeit
# Same lookup loop as the list version, but words2 becomes a set first,
# so each membership test is O(1) on average instead of O(n).
scaling_factor = 1
words1 = ['apple', 'orange', 'banana', 'melon', 'peach'] * scaling_factor
words2 = ['orange', 'kiwi', 'avocado', 'apple', 'banana'] * scaling_factor
words2 = set(words2)
# measured: 630 ns (1x), 3.13 us (10x), 28.6 us (100x)
common_sets = [w for w in words1 if w in words2]
In [ ]:
# measured runtimes (set version): 630 ns, 3.13 µs, 28.6 µs
# i.e. 630/10**9 s, 3.13/10**6 s, 28.6/10**6 s
In [ ]:
# Measured runtimes of the set version: 630 ns, 3.13 us, 28.6 us,
# converted to seconds and normalized to the smallest measurement.
# BUG FIX: the original wrote the base as `630 * 10**9` instead of
# `630 / 10**9`, which made results_sets = [1, ~5e-6, ~4.5e-5] — every
# ratio after the first was effectively zero and the blue curve was
# meaningless. Correct ratios are ~[1, 5, 45].
base_sets = 630 / 10**9
results_sets = [(630 / 10**9) / base_sets,
                (3.13 / 10**6) / base_sets,
                (28.6 / 10**6) / base_sets]
fit3 = np.polyfit(input_size, results_sets, 2)
eval3 = np.polyval(fit3, x)
plt.plot(x, eval1, c='orange')
plt.plot(x, eval2, c='pink')
plt.plot(x, eval3, c='blue')
plt.scatter(input_size, results_for_loop, c='orange', s=100, label='2 for loops')
plt.scatter(input_size, results_sorted_lists, c='pink', s=100, label='sorted lists')
plt.scatter(input_size, results_sets, c='blue', s=100, label='sets')
plt.xlabel('input size')
plt.ylabel('processing time')
# Tick labels computed from the plotted ratios themselves, instead of
# re-hard-coding timing constants (which had drifted out of sync).
tick_positions = results_for_loop + results_sorted_lists[1:]
tick_labels = (['T']
               + [str(int(r)) + 'x T' for r in results_for_loop[1:]]
               + [str(int(r)) + 'x T' for r in results_sorted_lists[1:]])
plt.yticks(tick_positions, tick_labels)
plt.legend()
plt.show()
What is the big-O complexity of this implementation?
transforming one list to set + 1 for loop = 2 * n ~ O(n)
It’s the exact same code as for lists, but now looking up an element in sets (if w in words2) takes constant time! How could you have known that set lookup is fast? Learning about data structures!