From 01e7a23ae28a50627cfdbcc472fc14580a05e5ab Mon Sep 17 00:00:00 2001
From: Tiziano Zito <opossumnano@gmail.com>
Date: Wed, 13 Aug 2025 13:57:19 +0200
Subject: [PATCH] first commit

---
 analog_programming.md  |  51 ++++++++++
 exercise.ipynb         | 213 +++++++++++++++++++++++++++++++++++++++++
 parallel.ipynb         | 150 +++++++++++++++++++++++++++++
 parallel/overcommit.py |   7 ++
 parallel/submit.sh     |   8 ++
 puzzle.ipynb           | 169 ++++++++++++++++++++++++++++++++
 6 files changed, 598 insertions(+)
 create mode 100644 analog_programming.md
 create mode 100644 exercise.ipynb
 create mode 100644 parallel.ipynb
 create mode 100644 parallel/overcommit.py
 create mode 100755 parallel/submit.sh
 create mode 100644 puzzle.ipynb

diff --git a/analog_programming.md b/analog_programming.md
new file mode 100644
index 0000000..3ff3d7f
--- /dev/null
+++ b/analog_programming.md
@@ -0,0 +1,51 @@
+# Analog programming
+Two exercises to activate the body and the mind
+
+The common goal of both exercises is to sort a deck of tarot cards by value.
+
+Before starting, acquaint yourself with the meaning and the power of the tarot cards. Carefully read [this booklet](https://aspp.school/_media/tarot-runic.pdf).
+
+# First experiment: human sorting
+## Setup
+- 1 volunteer to keep the time spent sorting
+- each person picks up a tarot card from the randomly shuffled deck on the table
+- moving around and speaking is allowed until the tarot cards are displayed sorted on the table
+
+# Second experiment: machine sorting
+## Setup
+We will keep some timings:
+- time spent *programming* and *compiling*
+- time spent *executing* the program
+
+## Programming in groups
+Divide into groups and write pseudo-code on paper to sort a randomly shuffled deck of tarot cards.
+
+## Rules
+- You have to "compile" the program in our special Assembly language, which consists of the following instructions:
+  - **`fetch(ADDR, REG)`**: fetch a value from the memory address `ADDR` and store it into register `REG`
+  - **`push(ADDR, REG)`**: push a value from the register `REG` into the memory address `ADDR`
+  - **`sort(REG_X, REG_Y)`**: sort the values in `REG_X` and `REG_Y` so that the value in `REG_X` is **greater than or equal to** the value in `REG_Y`
+  - **`stop`**: stop the execution of the program
+- "compile" in this context means translating the pseudo-code into a long sequence of the four instructions above
+- note that if you use loops, you have to "unroll" them when you compile the program
+- you are going to be using the hardware described below to execute the compiled program
+
+Present your solution to the general assembly and choose one implementation to try out!
+
+## Hardware setup
+- 4 empty seats will be the 4 CPU registers: `REG0`, `REG1`, `REG2`, `REG3`
+- distribute memory addresses (in hexadecimal) to all the remaining seats which are occupied by a participant
+- distribute the shuffled cards to every participant: they represent values and are held by the participants
+- A tutor will be the CPU. A CPU can only execute one of the above instructions at a time
+- Everyone who is not a CPU is data:
+  - you'll have an address when you are stored in memory
+  - you'll have a value (the value of the tarot card you hold in your hand): don't let it go!
+  - when `fetch`-ed by the CPU, you'll take the PCI-bus and go sit on the corresponding register, taking the card with you
+  - when `push`-ed by the CPU, you'll take the PCI-bus and go sit on the corresponding memory address, taking the card with you
+  - when `sort`-ed, move to the appropriate register, according to the semantics of the `sort` instruction defined above
+
+## Run the compiled code!
+The CPU will run the program by saying the instructions out loud and everyone else will move around accordingly. You can also literally "run" if you want to speed up the process a bit.
+
+Do not forget to start the timing!
+
diff --git a/exercise.ipynb b/exercise.ipynb
new file mode 100644
index 0000000..f34ca00
--- /dev/null
+++ b/exercise.ipynb
@@ -0,0 +1,213 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2024-03-04T09:40:28.904Z",
+     "iopub.status.busy": "2024-03-04T09:40:28.896Z",
+     "iopub.status.idle": "2024-03-04T09:40:28.978Z",
+     "shell.execute_reply": "2024-03-04T09:40:28.967Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import numpy as np"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2024-03-04T10:02:39.062Z",
+     "iopub.status.busy": "2024-03-04T10:02:39.057Z",
+     "iopub.status.idle": "2024-03-04T10:02:39.068Z",
+     "shell.execute_reply": "2024-03-04T10:02:39.071Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "n_series = 32\n",
+    "len_one_series = 5*2**20\n",
+    "time_series = np.random.rand(n_series, len_one_series)\n",
+    "gap = 16*2**10"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2024-03-04T10:02:41.027Z",
+     "iopub.status.busy": "2024-03-04T10:02:41.020Z",
+     "iopub.status.idle": "2024-03-04T10:02:41.036Z",
+     "shell.execute_reply": "2024-03-04T10:02:41.040Z"
+    },
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "print(f'Size of one time series: {int(time_series[0].nbytes/2**20)} M')\n",
+    "print(f'Size of collection: {int(time_series.nbytes/2**20)} M')\n",
+    "print(f'Gap size: {int(gap*8/2**10)} K')\n",
+    "print(f'Gapped series size: {int(time_series[0, ::gap].nbytes/2**10)} K')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The following function implements an approximation of a power series over every `gap`-th value in our time series.\n",
+    "\n",
+    "If we define one time series of length `N` to be:\n",
+    "\n",
+    "$[x_0, x_1, x_2, \\dots, x_{N-1}]$,\n",
+    "\n",
+    "then the \"gapped\" series with `gap=g` is:\n",
+    "\n",
+    "$[x_0, x_g, x_{2g}, \\dots, x_{kg}]$,\n",
+    "\n",
+    "where $k = \\lfloor (N-1)/g \\rfloor$, i.e. the \"gapped\" series has $k+1 \\approx N/g$ elements.\n",
+    "\n",
+    "The approximation of the power series up to power `29` for our \"gapped\" series is defined as:\n",
+    "\n",
+    "$$\\mathbf{P} = \\sum_{p=0}^{29} \\sum_i x_i^{p} = \\sum_i x_i^0 + \\sum_i x_i^1 + \\sum_i x_i^2 + \\dots + \\sum_i x_i^{29} $$\n",
+    "\n",
+    "where $i$ runs over the indices of the \"gapped\" series, $i \\in [0, g, 2g, \\dots]$"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2024-03-04T10:06:08.461Z",
+     "iopub.status.busy": "2024-03-04T10:06:08.459Z",
+     "iopub.status.idle": "2024-03-04T10:06:08.466Z",
+     "shell.execute_reply": "2024-03-04T10:06:08.468Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# compute an approximation of a power series for a collection of gapped timeseries\n",
+    "def power(time_series, P, gap):\n",
+    "    for row in range(time_series.shape[0]):\n",
+    "        for pwr in range(30):\n",
+    "            P[row] += (time_series[row, ::gap]**pwr).sum()\n",
+    "    return P\n",
+    "       "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Challenge\n",
+    "- Can you improve on the above implementation of the `power` function?\n",
+    "- Change the following `power_improved` function and see what you can do\n",
+    "- **Remember**: you can't change any other cell in this notebook!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2024-03-04T10:06:08.461Z",
+     "iopub.status.busy": "2024-03-04T10:06:08.459Z",
+     "iopub.status.idle": "2024-03-04T10:06:08.466Z",
+     "shell.execute_reply": "2024-03-04T10:06:08.468Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "def power_improved(time_series, P, gap):\n",
+    "    for row in range(time_series.shape[0]):\n",
+    "        for pwr in range(30):\n",
+    "            P[row] += (time_series[row, ::gap]**pwr).sum()\n",
+    "    return P"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# verify that they yield the same results\n",
+    "P = np.zeros(n_series, dtype='float64')\n",
+    "out1 = power(time_series, P, gap)\n",
+    "P = np.zeros(n_series, dtype='float64')\n",
+    "out2 = power_improved(time_series, P, gap)\n",
+    "np.testing.assert_allclose(out1, out2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2024-03-04T10:06:14.959Z",
+     "iopub.status.busy": "2024-03-04T10:06:14.956Z",
+     "iopub.status.idle": "2024-03-04T10:06:17.437Z",
+     "shell.execute_reply": "2024-03-04T10:06:17.443Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "P = np.zeros(n_series, dtype='float64')\n",
+    "%timeit power(time_series, P, gap)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2024-03-04T10:06:20.056Z",
+     "iopub.status.busy": "2024-03-04T10:06:20.053Z",
+     "iopub.status.idle": "2024-03-04T10:06:21.695Z",
+     "shell.execute_reply": "2024-03-04T10:06:21.700Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "P = np.zeros(n_series, dtype='float64')\n",
+    "%timeit power_improved(time_series, P, gap)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.5"
+  },
+  "nteract": {
+   "version": "0.28.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/parallel.ipynb b/parallel.ipynb
new file mode 100644
index 0000000..81bd0ac
--- /dev/null
+++ b/parallel.ipynb
@@ -0,0 +1,150 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "source": [
+        "import os\n",
+        "import pprint\n",
+        "import numpy as np\n",
+        "# you may need to pip-install this\n",
+        "import threadpoolctl as th"
+      ],
+      "outputs": [],
+      "execution_count": null,
+      "metadata": {},
+      "id": "7480c6b9"
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "N = 10000\n",
+        "x = np.zeros((N, N), dtype=\"float64\")"
+      ],
+      "outputs": [],
+      "execution_count": null,
+      "metadata": {},
+      "id": "653a14c9"
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# open a terminal and look at htop while running this,\n",
+        "# then repeat by changing N ➔ notice how the workload is distributed and\n",
+        "# how the frequencies of the CPUs are adjusted!\n",
+        "y = x @ x"
+      ],
+      "outputs": [],
+      "execution_count": null,
+      "metadata": {},
+      "id": "3f3e7872"
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# now control the number of OpenMP/BLAS threads with threadpoolctl\n",
+        "# monitor with htop ➔ see how the one process jumps around CPUs"
+      ],
+      "outputs": [],
+      "execution_count": null,
+      "metadata": {},
+      "id": "eeec6203"
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "with th.threadpool_limits(limits=1, user_api='blas'):\n",
+        "    y = x @ x"
+      ],
+      "outputs": [],
+      "execution_count": null,
+      "metadata": {},
+      "id": "fbe3e515"
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# OpenMP/BLAS info\n",
+        "pprint.pprint(th.threadpool_info())"
+      ],
+      "outputs": [],
+      "execution_count": null,
+      "metadata": {},
+      "id": "6b1ef107"
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# How to limit the jumping around?\n",
+        "os.sched_getaffinity(0) # 0 is the \"calling\" process, i.e. this very process"
+      ],
+      "outputs": [],
+      "execution_count": null,
+      "metadata": {},
+      "id": "9616d412"
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# let's make our process stick to CPU0!\n",
+        "with th.threadpool_limits(limits=1, user_api='blas'):\n",
+        "    os.sched_setaffinity(0, {0})\n",
+        "    y = x @ x"
+      ],
+      "outputs": [],
+      "execution_count": null,
+      "metadata": {},
+      "id": "a85806f8"
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# let's see what happens if we move it to an E-core\n",
+        "with th.threadpool_limits(limits=1, user_api='blas'):\n",
+        "    os.sched_setaffinity(0, {10})\n",
+        "    y = x @ x"
+      ],
+      "outputs": [],
+      "execution_count": null,
+      "metadata": {},
+      "id": "3efc9fa1"
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# and now let's try to force it to use the two physical P-cores, and go around HyperThreading ;-)\n",
+        "# note that we are changing to limits=2!\n",
+        "with th.threadpool_limits(limits=2, user_api='blas'):\n",
+        "    os.sched_setaffinity(0, {0,2})\n",
+        "    y = x @ x"
+      ],
+      "outputs": [],
+      "execution_count": null,
+      "metadata": {},
+      "id": "de5c8f61"
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3 (ipykernel)",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.11.8"
+    },
+    "nteract": {
+      "version": "0.28.0"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+}
\ No newline at end of file
diff --git a/parallel/overcommit.py b/parallel/overcommit.py
new file mode 100644
index 0000000..951a0c0
--- /dev/null
+++ b/parallel/overcommit.py
@@ -0,0 +1,7 @@
+import sys
+import numpy as np
+
+N = int(sys.argv[1]) if len(sys.argv) > 1 else 10000  # matrix side length: first CLI argument if given, else 10000
+x = np.zeros((N, N), dtype="float64")  # N x N array of zeros, 8*N*N bytes (about 800 MB for N=10000)
+
+y = x @ x  # matrix product of x with itself; y is not used afterwards
diff --git a/parallel/submit.sh b/parallel/submit.sh
new file mode 100755
index 0000000..d5affe2
--- /dev/null
+++ b/parallel/submit.sh
@@ -0,0 +1,8 @@
+# the number of python processes to start, P=3 if not set
+P=${1:-3}
+# the size of the matrix, use the default in overcommit.py if not set
+N=$2
+for i in $(seq $P); do 
+    echo -n "Starting process ${i}… "
+    python overcommit.py $N & echo $!
+done
diff --git a/puzzle.ipynb b/puzzle.ipynb
new file mode 100644
index 0000000..35788c4
--- /dev/null
+++ b/puzzle.ipynb
@@ -0,0 +1,169 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2024-03-04T09:40:28.904Z",
+     "iopub.status.busy": "2024-03-04T09:40:28.896Z",
+     "iopub.status.idle": "2024-03-04T09:40:28.978Z",
+     "shell.execute_reply": "2024-03-04T09:40:28.967Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import numpy as np"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2024-03-04T10:02:39.062Z",
+     "iopub.status.busy": "2024-03-04T10:02:39.057Z",
+     "iopub.status.idle": "2024-03-04T10:02:39.068Z",
+     "shell.execute_reply": "2024-03-04T10:02:39.071Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# create a collection of time series\n",
+    "# in real life, this data comes from an experiment/simulation\n",
+    "n_series = 128\n",
+    "len_one_series = 2**20  # ➔ 2^20 = 1,048,576 items (8 B x 2^20 = 8,388,608 B = 8 M)\n",
+    "time_series = np.zeros((n_series, len_one_series), dtype='float64')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2024-03-04T10:02:41.027Z",
+     "iopub.status.busy": "2024-03-04T10:02:41.020Z",
+     "iopub.status.idle": "2024-03-04T10:02:41.036Z",
+     "shell.execute_reply": "2024-03-04T10:02:41.040Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# how much memory does one time series need?\n",
+    "ts_size = time_series[0].nbytes/2**20 # -> 2^20 is 1 M\n",
+    "total_size = time_series.nbytes/2**20\n",
+    "print(f'Size of one time series: {int(ts_size)} M')\n",
+    "print(f'Size of collection: {int(total_size)} M')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2024-03-04T10:06:08.461Z",
+     "iopub.status.busy": "2024-03-04T10:06:08.459Z",
+     "iopub.status.idle": "2024-03-04T10:06:08.466Z",
+     "shell.execute_reply": "2024-03-04T10:06:08.468Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# let's load the collection in one big array\n",
+    "def load_data_row(x, time_series):\n",
+    "    \"\"\"Store one time series per row\"\"\"\n",
+    "    for row, ts in enumerate(time_series):\n",
+    "        x[row,:] = ts\n",
+    "    return x"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2024-03-04T10:06:10.280Z",
+     "iopub.status.busy": "2024-03-04T10:06:10.277Z",
+     "iopub.status.idle": "2024-03-04T10:06:10.284Z",
+     "shell.execute_reply": "2024-03-04T10:06:10.288Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# let's load the collection in one big array\n",
+    "def load_data_column(x, time_series):\n",
+    "    \"\"\"Store one time series per column\"\"\"\n",
+    "    for column, ts in enumerate(time_series):\n",
+    "        x[:,column] = ts\n",
+    "    return x"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2024-03-04T10:06:14.959Z",
+     "iopub.status.busy": "2024-03-04T10:06:14.956Z",
+     "iopub.status.idle": "2024-03-04T10:06:17.437Z",
+     "shell.execute_reply": "2024-03-04T10:06:17.443Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "x = np.zeros((n_series, len_one_series), dtype='float64')\n",
+    "%timeit load_data_row(x, time_series)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2024-03-04T10:06:20.056Z",
+     "iopub.status.busy": "2024-03-04T10:06:20.053Z",
+     "iopub.status.idle": "2024-03-04T10:06:21.695Z",
+     "shell.execute_reply": "2024-03-04T10:06:21.700Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "x = np.zeros((len_one_series, n_series), dtype='float64')\n",
+    "%timeit load_data_column(x, time_series)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# on my machine: 31 ms vs 1240 ms ≈ 40x slowdown!!!"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.6"
+  },
+  "nteract": {
+   "version": "0.28.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}