diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 241dd84..e24b666 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -8,43 +8,43 @@ jobs: strategy: max-parallel: 5 matrix: - python-version: ['3.8', '3.9', '3.10'] + python-version: ['3.11', '3.12', 'pypy-3.11'] steps: - name: Setup Julia - uses: julia-actions/setup-julia@v1 + uses: julia-actions/setup-julia@v2 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - - name: Install dependencies + - name: Checkout + uses: actions/checkout@v4 + + - name: Build and install deps run: | - # install HPy from source if depending on a dev version - # git clone -b master --single-branch https://github.com/hpyproject/hpy - # git checkout 1234abcd - # cd hpy - # pip install . - pip install numpy cython pytest transonic pythran 'setuptools>=60.2' 'hpy>=0.9.0rc1' + pip install -e .[full] - - name: Checkout - uses: actions/checkout@v3 - with: - fetch-depth: 0 + - if: startsWith(matrix.python-version, 'pypy') != true + name: Build universal extension (only needed for CPython) + run: | + pip install -e . --config-settings="--global-option=--hpy-abi=universal" - - name: build + - name: Remove _piconumpy_hpy.py run: | - python setup.py develop - python setup.py --hpy-abi=universal develop + rm -f piconumpy/_piconumpy_hpy.py - name: Run tests run: | - pytest -s + pytest -v - name: Run bench run: | cd bench + make tmp_result_julia.txt + make bench_hpy + make bench_full + # rerun bench_hpy to get these results also at the end make bench_hpy - make diff --git a/.gitignore b/.gitignore index 9a709bb..7a37679 100644 --- a/.gitignore +++ b/.gitignore @@ -9,5 +9,10 @@ build **/tmp*.* **/tmp*.* +**/tmp/* -*_cython.c \ No newline at end of file +*_cython.c + +piconumpy/_piconumpy_hpy.py + +.venv* diff --git a/.mdformat.toml b/.mdformat.toml new file mode 100644 index 0000000..972483a --- /dev/null +++ b/.mdformat.toml @@ -0,0 +1,3 @@ +wrap = 89 +number = true +end_of_line = "lf" diff --git a/LICENSE b/LICENSE index 44b8153..17e7869 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ BSD 3-Clause License -Copyright (c) 2020, Pierre Augier +Copyright (c) 2020-2025, Pierre Augier Copyright (c) 2021, 2023, Oracle and/or it's affiliates All rights reserved. diff --git a/Makefile b/Makefile index f7f7c54..92877da 100644 --- a/Makefile +++ b/Makefile @@ -1,38 +1,57 @@ ifeq ($(PYTHON),) -PYTHON := python +PYTHON := python3 endif +IMPLEMENTATION := $(shell $(PYTHON) -c "import sys; print(sys.implementation.name)") + + all: - make develop_universal -ifeq ($(PYTHON),python) - make build_ext + make editable_universal +ifeq ($(IMPLEMENTATION),cpython) + make editable endif + +rm_hpy_py: + rm -f piconumpy/_piconumpy_hpy.py + +editable: + $(PYTHON) -m pip install -e . + make rm_hpy_py + +editable_universal: + $(PYTHON) -m pip install -e . 
--config-settings="--global-option=--hpy-abi=universal" + make rm_hpy_py + +editable_full: + $(PYTHON) -m pip install -e .[full] + make rm_hpy_py + + +# deprecated but let's keep them develop: $(PYTHON) setup.py develop + make rm_hpy_py develop_universal: $(PYTHON) setup.py --hpy-abi=universal develop - rm -f piconumpy/_piconumpy_hpy.py - -pip: - $(PYTHON) -m pip install -e .[dev] + make rm_hpy_py build_ext_universal: $(PYTHON) setup.py --hpy-abi=universal build_ext -if + make rm_hpy_py build_ext: $(PYTHON) setup.py build_ext -if + make rm_hpy_py -full: - $(PYTHON) -m pip install -e .[full] format: black -l 82 setup.py piconumpy/*.py clang-format-7 -i piconumpy/*cpython_capi.c -tests: +tests: rm_hpy_py $(PYTHON) -m pytest piconumpy -s clean: @@ -40,4 +59,21 @@ clean: rm -rf build dist piconumpy.egg-info black: - black -l 82 . \ No newline at end of file + black -l 82 . + + +install_pypy: + uv python install pypy + +install_graalpy: + uv python install graalpy + +create_venv_cpy: + $(PYTHON) -m venv .venv_cpy --upgrade-deps + +create_venv_pypy: + $(shell uv python find pypy) -m venv .venv_pypy --upgrade-deps + +create_venv_graalpy: + # cannot use --upgrade-deps because pip is patched for GraalPy + $(shell uv python find graalpy) -m venv .venv_graalpy diff --git a/README.md b/README.md index a0bad3d..f473ec5 100644 --- a/README.md +++ b/README.md @@ -5,81 +5,64 @@ **An experiment about Numpy and HPy** The C API of CPython is one of the cause of the success of Python in scientific -computing. In particular, Numpy (and all the Python scientific stack) is built -on top of this API. However, some characteristics of this API start to be an -issue for the future of scientific Python (see [1], [2], [HPy]). +computing. In particular, Numpy (and all the Python scientific stack) is built on top of +this API. However, some characteristics of this API start to be an issue for the future +of scientific Python (see [1], [2], [HPy]). -[1]: https://faster-cpython.readthedocs.io/ -[2]: https://morepypy.blogspot.com/2019/12/hpy-kick-off-sprint-report.html -[HPy]: https://github.com/hpyproject/hpy - -[HPy] is a very ambitious and promissing project to design a new and better C -API for interacting with Python interpreters. It should allow people to write -Python extensions efficient on different interpreters (CPython, PyPy, Jython, -IronPython, GraalPython, RustPython, etc.). +[HPy] is a very ambitious and promising project to design a new and better C API for +interacting with Python interpreters. It should allow people to write Python extensions +efficient on different interpreters (CPython, PyPy, Jython, IronPython, GraalPython, +RustPython, etc.). -PyPy would be especially useful for some scientific applications. For example -for Integration and ODEs -([scipy.integrate](https://docs.scipy.org/doc/scipy/reference/integrate.html)), -for which there are a lot of callbacks of very small functions. This repository -contains [a tiny benchmark](bench/without_numpy) showing that as long as Numpy -is not used, PyPy is very efficient for such task. Unfortunately, as soon as -Numpy is used, PyPy becomes very slow! +PyPy would be especially useful for some scientific applications. For example for +Integration and ODEs +([scipy.integrate](https://docs.scipy.org/doc/scipy/reference/integrate.html)), for which +there are a lot of callbacks of very small functions. This repository contains +[a tiny benchmark](bench/without_numpy) showing that as long as Numpy is not used, PyPy +is very efficient for such task. 
Unfortunately, as soon as Numpy is used, PyPy becomes +very slow! -[bench/without_numpy]: https://github.com/paugier/piconumpy/blob/master/bench/without_numpy/ +With PicoNumpy, I'd like to study if [HPy] could help for codes using Numpy and callbacks +of small Python functions. -With PicoNumpy, I'd like to study if [HPy] could help for codes using Numpy and -callbacks of small Python functions. +We start by a [simple but realistic benchmark](bench/bench_array1d.py) (the slow loops +only involve pure-Python and very simple Numpy). We then wrote a tiny ("pico") +implementation of a Numpy like object (just sufficient to run the benchmark). -We start by a [simple but realistic benchmark](bench/bench_array1d.py) (the -slow loops only involve pure-Python and very simple Numpy). We then wrote a -tiny ("pico") implementation of a Numpy like object (just sufficient to run the -benchmark). +The next task is to reimplement PicoNumpy using [HPy] and to check if PyPy could +efficiently accelerate [our main benchmark](bench/bench_array1d.py). -The next task is to reimplement PicoNumpy using [HPy] and to check if PyPy -could efficiently accelerate [our main benchmark](bench/bench_array1d.py). - -PicoNumpy is really tiny. It just provides an `array` class (one-dimensional) -supporting: +PicoNumpy is really tiny. It just provides an `array` class (one-dimensional) supporting: - Instantiation from a list of floats -- Elementwise multiplication and division by a float -- Elementwise addition (of 2 arrays) +- Element-wise multiplication and division by a float +- Element-wise addition (of 2 arrays) - Indexing - `len` -A good acceleration by PyPy of our example would be a great proof that the -scientific Python community has to invest time and energy on [HPy]. - -In the script [bench_array1d.py](bench/bench_array1d.py), Transonic is used for -the benchmark and comparison. With Transonic-Pythran, we typically get a 50 -speedup compared to CPython (and ~400 versus PyPy, which is still very slow for -such codes using Numpy). +A good acceleration by PyPy of our example would be a great proof that the scientific +Python community has to invest time and energy on [HPy]. -[bench/bench_array1d.py]: https://github.com/paugier/piconumpy/blob/master/bench/bench_array1d.py +In the script [bench_array1d.py](bench/bench_array1d.py), Transonic is used for the +benchmark and comparison. With Transonic-Pythran, we typically get a 50 speed-up compared +to CPython (and ~400 versus PyPy, which is still very slow for such codes using Numpy). ## Install and run the benchmarks -**Warning:** PicoNumpy now depends on HPy, which still has to be installed from -the [Git repository](https://github.com/hpyproject/hpy). For now, the -installation is a bit more complex that what is described here (more about this -[here](#more-precise-notes-on-how-to-install-and-run-the-benchmarks-with-PyPy)). - -`make` should install the package in editable mode. `cd bench; make` should run -the benchmarks. For the benchmarks, Julia is used for a good comparison point -so the command `julia` has to be available. +`pip install -e .[full]` should build and install the package in editable mode and all +dependencies necessary for testing, benchmarking and profiling. -For PyPy, the Makefiles are sensible to the environment variable `PYTHON`, so -you could do: +For the benchmarks, Julia is used for a good comparison point so the command `julia` has +to be available. 
Different benchmarks can be run with -```bash -export PYTHON=pypy3 -make +```sh cd bench -make +make clean +make bench_hpy +make bench_full ``` -The benchmark code can be profiled for the different implementations with the +The benchmark code can be profiled for the different piconumpy implementations with the commands (you need gprof2dot and graphviz): ```bash @@ -90,97 +73,192 @@ make profile METHOD="purepy" make profile METHOD="cython" ``` -### More precise notes on how to install and run the benchmarks with PyPy +### Notes on PyPy -Download and extract a nightly PyPy build -. Add to the `PATH` environment variable -the path of the directory containing the `pypy` executable (something like -`~/opt/pypy-c-jit-101190-b661dc329618-linux64/bin`). Then, you should be able -to run: +PyPy can be downloaded with UV or manually (for example from + for a nightly build). -```bash -pypy -m ensurepip -pypy -m pip install pip -U -pypy -m pip install numpy cython pytest transonic pythran +With UV, one can run + +```sh +uv python install pypy +``` + +and then get the path towards `pypy` executable with: + +```sh +uv python find pypy ``` -We need to install the correct version of HPy for the version of PyPy we are using: +which can give something like +`~/.local/share/uv/python/pypy-3.11.11-linux-x86_64-gnu/bin/pypy`. + +Then, you should be able to create a virtual environment, activate it and build-install +PicoNumpy with ```bash -pypy -c "import hpy.universal as u; print(u.get_version())" +cd ~/dev/piconumpy +$(uv python find pypy) -m venv .venv_pypy --upgrade-deps +. .venv_pypy/bin/activate +pip install -e .[full] ``` -gives `('0.0.2rc2.dev12+gc9660c2', 'c9660c2')`. +and run the benchmarks with: ```bash -cd ~/Dev/hpy -# update to the correct commit -pypy setup.py develop +cd bench +make clean +make bench_hpy +make bench_full ``` -Now we can build-install PicoNumpy: +Note that one can check which HPy version is vendored with PyPy: ```bash -cd ~/Dev/piconumpy -pypy setup.py --hpy-abi=universal develop +python -c "import hpy.universal as u; print(u.get_version())" ``` -And run the benchmarks with: +### Notes on GraalPy + +GraalPy can be downloaded with UV with + +```sh +uv python install graalpy +``` + +Then, one can run + +```sh +cd ~/dev/piconumpy +# cannot use --upgrade-deps because pip is patched for GraalPy +$(uv python find graalpy) -m venv .venv_graalpy +. .venv_graalpy/bin/activate +# we don't try to run the full benchmarks using Pythran on GraalPy +pip install -e .[test,profile] +``` + +and run the benchmarks with: ```bash -export PYTHON="pypy" +cd bench make clean make bench_hpy -make ``` ## Few results -As of today (6 July 2021), HPy is not yet ready for high performance, but at -least (with HPy 0.0.2) it runs ! 
- -### At home (Intel(R) Core(TM) i5-8400 CPU @ 2.80GHz) +### Full benchmarks - With CPython ``` -Julia : 1 * norm = 0.00196 s -PicoNumpy (CPython C-API) : 9.42 * norm -PicoNumpy (HPy CPy ABI) : 9.95 * norm -PicoNumpy (HPy Universal) : 10.4 * norm -Transonic-Pythran : 0.497 * norm -Numpy : 27.5 * norm -PicoNumpy (purepy) : 37.3 * norm -PicoNumpy (purepy_array) : 37.7 * norm -PicoNumpy (Cython) : 28.9 * norm +{'cache_tag': 'cpython-311', + 'version': sys.version_info(major=3, minor=11, micro=2, releaselevel='final', serial=0)} +hostname: meige7ltpa212 +Julia : 1 * norm = 0.0129 s +PicoNumpy (CPython C-API) : 6.55 * norm +PicoNumpy (HPy CPy ABI) : 7.46 * norm +PicoNumpy (HPy Universal) : 7.92 * norm +Transonic-Pythran : 0.581 * norm +Numpy : 27.1 * norm +PicoNumpy (purepy) : 18.8 * norm +PicoNumpy (purepy_array) : 31.7 * norm +PicoNumpy (Cython) : 23.3 * norm ``` - With PyPy3 ``` -Julia : 1 * norm = 0.00196 s -PicoNumpy (CPython C-API) : 34.1 * norm -PicoNumpy (HPy Universal) : 12.8 * norm -Transonic-Pythran : 0.539 * norm -Numpy : 232 * norm -PicoNumpy (purepy) : 4.39 * norm -PicoNumpy (purepy_array) : 6.33 * norm -PicoNumpy (Cython) : 274 * norm +{'cache_tag': 'pypy311', + 'version': sys.pypy_version_info(major=7, minor=3, micro=19, releaselevel='final', serial=0)} +hostname: meige7ltpa212 +Julia : 1 * norm = 0.0129 s +PicoNumpy (CPython C-API) : 35.5 * norm +PicoNumpy (HPy Universal) : 44.7 * norm +Transonic-Pythran : 0.609 * norm +Numpy : 168 * norm +PicoNumpy (purepy) : 2.98 * norm +PicoNumpy (purepy_array) : 8.7 * norm +PicoNumpy (Cython) : 288 * norm ``` -#### Simpler benchmarks (bench/bench_cpy_vs_hpy.py) +Discussion: PyPy with HPy universal is really too slow (44.7x slower than Julia, 6x slower than +CPython with its C-API and even a bit slower that PyPy with cpyext!). This is a big issue +for HPy! + +A reasonable target would be as fast as CPython with its C-API... 
+ +Profiling shows that the issue is related to slow element-wise operations as in the micro-benchmark + +```sh +cd microbench_low_level +make bench_element_wise +``` - With CPython +```sh +bench element_wise +hostname: meige7ltpa212 +{'cache_tag': 'cpython-311', + 'version': sys.version_info(major=3, minor=11, micro=2, releaselevel='final', serial=0)} +piconumpy.purepy : 7.88e-06 s ( 21.9 * Julia) +numpy : 7.88e-06 s ( 21.9 * Julia) +piconumpy.hpy (universal) : 1.34e-06 s ( 3.7 * Julia) +piconumpy.cpython_capi : 6.12e-07 s ( 1.7 * Julia) ``` -CPython C-API: 1.92 seconds -HPy [Universal]: 2.08 seconds -HPy [CPy ABI]: 2.02 seconds + +- With PyPy3 + +```sh +bench element_wise +hostname: meige7ltpa212 +{'cache_tag': 'pypy311', + 'version': sys.pypy_version_info(major=7, minor=3, micro=19, releaselevel='final', serial=0)} +piconumpy.purepy : 1.46e-06 s ( 4.1 * Julia) +numpy : 4.39e-05 s (121.9 * Julia) +piconumpy.hpy (universal) : 4.27e-06 s ( 11.9 * Julia) +piconumpy.cpython_capi : 1.84e-06 s ( 5.1 * Julia) +``` + +### Simpler benchmarks (bench/bench_cpy_vs_hpy.py) + +- With CPython + +``` +{'cache_tag': 'cpython-311', + 'version': sys.version_info(major=3, minor=11, micro=2, releaselevel='final', serial=0)} +hostname: meige7ltpa212 +Julia: 0.013 seconds +CPython C-API: 0.084 seconds ( 6.5 * Julia) +HPy [Universal]: 0.102 seconds ( 7.9 * Julia) +HPy [CPy ABI]: 0.096 seconds ( 7.4 * Julia) ``` - With PyPy3 ``` -CPython C-API: 5.75 seconds -HPy [Universal]: 2.11 seconds +{'cache_tag': 'pypy311', + 'version': sys.pypy_version_info(major=7, minor=3, micro=19, releaselevel='final', serial=0)} +hostname: meige7ltpa212 +Julia: 0.013 seconds +CPython C-API: 0.382 seconds (29.6 * Julia) +HPy [Universal]: 0.487 seconds (37.6 * Julia) +Python list: 0.037 seconds ( 2.9 * Julia) ``` + +- GraalPy + +``` +{'cache_tag': 'graalpy242-311', + 'version': sys.version_info(major=3, minor=11, micro=7, releaselevel='final', serial=0)} +hostname: meige7ltpa212 +Julia: 0.013 seconds +CPython C-API: 2.123 seconds (164.2 * Julia) +HPy [Universal]: 1.541 seconds (119.2 * Julia) +Python list: 0.542 seconds (41.9 * Julia) +``` + +[1]: https://faster-cpython.readthedocs.io/ +[2]: https://morepypy.blogspot.com/2019/12/hpy-kick-off-sprint-report.html +[hpy]: https://github.com/hpyproject/hpy diff --git a/bench/Makefile b/bench/Makefile index 7da6e64..eb4c4d4 100644 --- a/bench/Makefile +++ b/bench/Makefile @@ -7,7 +7,7 @@ ifeq ($(METHOD),) METHOD := cpython-c-api endif -all: tmp.py tmp_result_julia.txt +bench_full: rm_hpy_py tmp.py tmp_result_julia.txt $(PYTHON) tmp.py tmp.py: bench_array1d.py make_bench_piconumpy.py @@ -20,11 +20,14 @@ clean: tmp_result_julia.txt: julia bench.jl > tmp_result_julia.txt -profile: tmp.py +profile: rm_hpy_py tmp.py $(PYTHON) profile_piconumpy.py $(METHOD) # with gprof2dot and graphviz (command dot) gprof2dot -f pstats tmp.pstats | dot -Tpng -o tmp_$(METHOD).png eog tmp_$(METHOD).png -bench_hpy: +bench_hpy: rm_hpy_py $(PYTHON) bench_cpy_vs_hpy.py + +rm_hpy_py: + rm -f ../piconumpy/_piconumpy_hpy.py diff --git a/bench/bench.jl b/bench/bench.jl index 00cedff..bd98571 100644 --- a/bench/bench.jl +++ b/bench/bench.jl @@ -65,7 +65,7 @@ function bench(n_sleds, n_time) end -n_sleds = 10 +n_sleds = 100 n_time = 200 nb_runs = 200 diff --git a/bench/bench_array1d.py b/bench/bench_array1d.py index a73a635..ba4426f 100644 --- a/bench/bench_array1d.py +++ b/bench/bench_array1d.py @@ -1,9 +1,14 @@ +import sys + import numpy as np from numpy import array from math import pi, cos, sin -from transonic import jit 
+from transonic import jit, wait_for_all_extensions + +IS_CPY = sys.implementation.name == "cpython" +IS_PYPY = sys.implementation.name == "pypy" # begin code functions (don't remove this line) @@ -75,15 +80,15 @@ def bench(n_sleds, n_time): # end code functions (don't remove this line) +if IS_CPY or IS_PYPY: -bench_pythran = jit(bench) -# Numba does not support this code... -# bench_numba = jit(backend="numba")(bench) -from transonic import wait_for_all_extensions + bench_pythran = jit(bench) + # Numba does not support this code... + # bench_numba = jit(backend="numba")(bench) -# warmup (compilation of the Pythran extension) -bench_pythran(1, 1) -wait_for_all_extensions() + # warmup (compilation of the Pythran extension) + bench_pythran(1, 1) + wait_for_all_extensions() if __name__ == "__main__": diff --git a/bench/bench_cpy_vs_hpy.py b/bench/bench_cpy_vs_hpy.py index 1b36278..1bb35dd 100644 --- a/bench/bench_cpy_vs_hpy.py +++ b/bench/bench_cpy_vs_hpy.py @@ -1,8 +1,11 @@ -import sys -import time import random +import socket +import sys + from math import pi, cos, sin from pathlib import Path +from pprint import pprint +from time import perf_counter here = Path(__file__).absolute().parent @@ -14,7 +17,7 @@ def my_randn(mod, n): return result -IS_PYPY = hasattr(sys, "pypy_version_info") +IS_CPY = sys.implementation.name == "cpython" def runge_kutta_step(mod, f, x0, dt, t=None): @@ -75,14 +78,18 @@ def bench(mod, n_sleds, n_time): u_init = mod.zeros(n_sleds) for i in range(n_sleds): u_init[i] += 3.5 - start = time.time() - solver(mod, board, x_init, y_init, u_init, v_init, 0.01, n_time) - end = time.time() - return end - start + times = [] + for _ in range(20): + start = perf_counter() + solver(mod, board, x_init, y_init, u_init, v_init, 0.01, n_time) + times.append(perf_counter() - start) + + times.sort() + return times[len(times) // 2] N_SLEDS = 100 -N_TIME = 2000 +N_TIME = 200 def import_piconumpy_hpy_universal(): @@ -101,18 +108,48 @@ def main(): import piconumpy._piconumpy_cpython_capi as pnp_capi - t = bench(pnp_capi, N_SLEDS, N_TIME) - print(f"CPython C-API: {t:.2f} seconds") + pprint({key: sys.implementation.__dict__[key] for key in ("cache_tag", "version")}) + print(f"hostname: {socket.gethostname()}") + + tmp_result_julia = Path("tmp_result_julia.txt") + if tmp_result_julia.exists(): + with open("tmp_result_julia.txt") as file: + norm = float(file.read()) + end = "" + print(f"Julia: {norm:.3f} seconds") + else: + norm = False + end = "\n" + + t_capi = bench(pnp_capi, N_SLEDS, N_TIME) + print(f"CPython C-API: {t_capi:.3f} seconds", end=end) + if norm: + print(f" ({t_capi/norm:4.1f} * Julia)") pnp_hpy_universal = import_piconumpy_hpy_universal() - t = bench(pnp_hpy_universal, N_SLEDS, N_TIME) - print(f"HPy [Universal]: {t:.2f} seconds") + t_hpy_univ = bench(pnp_hpy_universal, N_SLEDS, N_TIME) + print(f"HPy [Universal]: {t_hpy_univ:.3f} seconds", end=end) - if not IS_PYPY: + if norm: + print(f" ({t_hpy_univ/norm:4.1f} * Julia)") + + if IS_CPY: import piconumpy._piconumpy_hpy as pnp_hpy - t = bench(pnp_hpy, N_SLEDS, N_TIME) - print(f"HPy [CPy ABI]: {t:.2f} seconds") + t_hpy_cpy_abi = bench(pnp_hpy, N_SLEDS, N_TIME) + print(f"HPy [CPy ABI]: {t_hpy_cpy_abi:.3f} seconds", end=end) + + if norm: + print(f" ({t_hpy_cpy_abi/norm:4.1f} * Julia)") + + if not IS_CPY: + import piconumpy.purepy as pnp_with_list + + t_with_list = bench(pnp_with_list, N_SLEDS, N_TIME) + print(f"Python list: {t_with_list:.3f} seconds", end=end) + + if norm: + print(f" ({t_with_list/norm:4.1f} * Julia)") if __name__ 
== "__main__": diff --git a/bench/make_bench_piconumpy.py b/bench/make_bench_piconumpy.py index 4fbf5c0..4f92bcc 100644 --- a/bench/make_bench_piconumpy.py +++ b/bench/make_bench_piconumpy.py @@ -9,8 +9,9 @@ def create_tmp_file(name_module): if name_module == "_piconumpy_hpy_universal": code_import = """ -from piconumpy import _piconumpy_hpy -array = _piconumpy_hpy.array +from piconumpy.util_hpy import import_ext +ext = import_ext() +array = ext.array """ else: code_import = f"from piconumpy.{name_module} import array" @@ -42,12 +43,19 @@ def create_tmp_file(name_module): code = ( """ +import socket import sys + +from math import pi, cos, sin +from pathlib import Path +from pprint import pprint + import numpy as np + from piconumpy import array -from math import pi, cos, sin -IS_PYPY = hasattr(sys, 'pypy_version_info') +IS_CPY = sys.implementation.name == "cpython" + """ + code_functions + """ @@ -61,12 +69,16 @@ def create_tmp_file(name_module): from tmp_purepy_array import bench as bench_piconumpy_purepy_array from tmp_cython import bench as bench_cython -if not IS_PYPY: +if IS_CPY: from tmp_hpy import bench as bench_hpy +pprint({key: sys.implementation.__dict__[key] for key in ("cache_tag", "version")}) +print(f"hostname: {socket.gethostname()}") # get norm from Julia benchmark -with open("tmp_result_julia.txt") as file: - norm = float(file.read()) + +path_julia_result = Path("tmp_result_julia.txt") +assert path_julia_result.exists() +norm = float(path_julia_result.read_text()) max_length_name = len("piconumpy (CPython C-API)") + 2 @@ -74,12 +86,12 @@ def create_tmp_file(name_module): name = fmt_name.format("Julia") print(f"{name}: 1 * norm = {norm:4.3g} s") -n_sleds = 10 +n_sleds = 100 n_time = 200 g = locals() -def timeit(name_func, name): +def timeit(name_func, name, total_duration=2): return timeit_verbose( name_func + "(n_sleds, n_time)", globals=g, @@ -87,21 +99,28 @@ def timeit(name_func, name): print_time=False, norm=norm, max_length_name=max_length_name, + total_duration=total_duration, ) timeit("bench", name="PicoNumpy (CPython C-API)") -if not IS_PYPY: +if IS_CPY: timeit("bench_hpy", name="PicoNumpy (HPy CPy ABI)") timeit("bench_hpy_universal", name="PicoNumpy (HPy Universal)") timeit("bench_pythran", name="Transonic-Pythran") -timeit("bench_numpy", name="Numpy") +try: + timeit("bench_numpy", name="Numpy", total_duration=8) +except RuntimeError: + print("Skip bench_numpy because it's too slow") timeit( "bench_piconumpy_purepy", name="PicoNumpy (purepy)", ) timeit( "bench_piconumpy_purepy_array", name="PicoNumpy (purepy_array)", ) -timeit("bench_cython", name="PicoNumpy (Cython)") +try: + timeit("bench_cython", name="PicoNumpy (Cython)", total_duration=8) +except RuntimeError: + print("Skip bench_cython because it's too slow") """ ) diff --git a/bench/microbench_low_level/Makefile b/bench/microbench_low_level/Makefile new file mode 100644 index 0000000..5f874dd --- /dev/null +++ b/bench/microbench_low_level/Makefile @@ -0,0 +1,48 @@ + +IMPLEMENTATION=$(shell python -c 'import sys; print(sys.implementation.cache_tag)') + +.PHONY : clean bench_sum_loop bench_sum_loop_index bench_cort bench_init_zeros bench_instantiate + +bench_sum_loop: NAME_BENCH=sum_loop +bench_sum_loop: tmp/sum_loop_julia.txt _bench + +bench_sum_loop_index: NAME_BENCH=sum_loop_index +bench_sum_loop_index: tmp/sum_loop_index_julia.txt _bench + +bench_cort: NAME_BENCH=cort +bench_cort: tmp/cort_julia.txt _bench + +bench_init_zeros: NAME_BENCH=init_zeros +bench_init_zeros: tmp/init_zeros_julia.txt _bench + 
+bench_board: NAME_BENCH=board +bench_board: tmp/board_julia.txt _bench + +bench_instantiate: NAME_BENCH=instantiate +bench_instantiate: tmp/instantiate_julia.txt _bench + +bench_element_wise: NAME_BENCH=element_wise +bench_element_wise: tmp/element_wise_julia.txt _bench + +_bench: + @echo bench $(NAME_BENCH) + @python -c "from socket import gethostname as f; print('hostname:', f())" + @python -c "import sys; from pprint import pprint as p; p({key: sys.implementation.__dict__[key] for key in ('cache_tag', 'version')})" + @python bench.py list $(NAME_BENCH) | tee tmp/$(NAME_BENCH)_$(IMPLEMENTATION)_list.txt + @python bench.py purepy $(NAME_BENCH) | tee tmp/$(NAME_BENCH)_$(IMPLEMENTATION)_purepy.txt + @python bench.py numpy $(NAME_BENCH) | tee tmp/$(NAME_BENCH)_$(IMPLEMENTATION)_numpy.txt + @python bench.py _piconumpy_hpy $(NAME_BENCH) | tee tmp/$(NAME_BENCH)_$(IMPLEMENTATION)_hpy.txt + @python bench.py _piconumpy_cpython_capi $(NAME_BENCH) | tee tmp/$(NAME_BENCH)_$(IMPLEMENTATION)_cpy_api.txt + +tmp/%_julia.txt: julia/bench_%.jl + @mkdir -p tmp + @julia julia/bench_$*.jl > $@ + +clean: + rm -rf tmp + +produce_traces: tmp/sum_loop_julia.txt + @mkdir -p tmp + PYPYLOG=jit-log-opt,jit-summary,jit-backend-counts:tmp/pypylog_list.txt pypy bench.py list + PYPYLOG=jit-log-opt,jit-summary,jit-backend-counts:tmp/pypylog_piconumpy_list.txt pypy bench.py purepy + PYPYLOG=jit-log-opt,jit-summary,jit-backend-counts:tmp/pypylog_piconumpy_hpy.txt pypy bench.py _piconumpy_hpy diff --git a/bench/microbench_low_level/README.md b/bench/microbench_low_level/README.md new file mode 100644 index 0000000..7965c76 --- /dev/null +++ b/bench/microbench_low_level/README.md @@ -0,0 +1,25 @@ +# Microbenchmarks low level Python code + +We measure the performance for functions containing low level Python code. + +- `sum_loop` (command `make bench`): `for value in arr` and summation + +- `sum_loop_index` (command `make bench_sum_loop_index`): + `for index in range(5000)` and summation + +- `init_zeros` (command `make bench_init_zeros`): set values to zeros + +- `cort` (command `make bench_cort`): normalized cosine similarity measure + between derivatives + +- `board` (command `make bench_board`): few indexing, simple float computations + with sin/cos and instantiation of a small array. + +- `instantiate` (command `make bench_instantiate`): dominated by the + instantiation/deletion of small arrays of 4 floats. + +- `element_wise` (command `make bench_element_wise`): dominated by the + instantiation/deletion of small arrays of 4 floats and calling element-wise + operations. + +The files result_*.txt contain few results. 
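Each microbenchmark feeds the same pure-Python kernel with arrays built by the different backends. A minimal interactive sketch of that idea follows (the real harness is `bench.py` below, which additionally warms up, takes the median of repeated runs, and normalizes by the Julia reference stored in `tmp/<name>_julia.txt`):

```python
from random import random

# two of the backends exercised by bench.py
from piconumpy.purepy import array as array_purepy                  # list-backed array
from piconumpy._piconumpy_cpython_capi import array as array_capi   # CPython C-API extension


def sum_loop(arr):
    # identical kernel whatever the backend: plain iteration and summation
    result = 0.0
    for value in arr:
        result += value
    return result


data = [random() for _ in range(10_000)]
print(sum_loop(array_purepy(data)))
print(sum_loop(array_capi(data)))
```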
diff --git a/bench/microbench_low_level/bench.py b/bench/microbench_low_level/bench.py new file mode 100644 index 0000000..50a7d48 --- /dev/null +++ b/bench/microbench_low_level/bench.py @@ -0,0 +1,186 @@ +import sys +from time import perf_counter +from pathlib import Path +from random import random +from math import sqrt, pi, sin, cos + +try: + method = sys.argv[1] +except IndexError: + method = "purepy" + +try: + name_bench = sys.argv[2] +except IndexError: + name_bench = "sum_loop" + +try: + size = sys.argv[3] +except IndexError: + size = None + +if method == "_piconumpy_hpy": + from piconumpy.util_hpy import import_ext + + ext = import_ext() + array = ext.array +elif method == "list": + array = list + if name_bench == "element_wise": + sys.exit(0) + +elif method == "numpy": + + try: + import numpy as np + except ImportError: + print(f"{method:30s}: ImportError numpy") + sys.exit(0) + + array = np.array +else: + d = {} + exec(f"from piconumpy.{method} import array", d) + array = d["array"] + if "piconumpy" not in method: + method = f"piconumpy.{method}" + +if "_piconumpy_" in method: + method = method.replace("_piconumpy_", "piconumpy.") + +if method.endswith("hpy"): + method += " (universal)" + +tmp_result_julia = Path(f"tmp/{name_bench}_julia.txt") +if tmp_result_julia.exists(): + with open(tmp_result_julia) as file: + norm = float(file.read()) +else: + raise RuntimeError( + f"{tmp_result_julia} does not exist. First execute with `make`" + ) + + +def sum_loop(arr): + result = 0.0 + for value in arr: + result += value + return result + + +def sum_loop_index(arr): + result = 0.0 + for index in range(5000): + result += arr[index] + return result + + +def init_zeros(arr): + for index in range(len(arr)): + arr[index] = 0.0 + + +def _cort(s1, s2): + num = 0.0 + sum_square_x = 0.0 + sum_square_y = 0.0 + for t in range(len(s1) - 1): + slope_1 = s1[t + 1] - s1[t] + slope_2 = s2[t + 1] - s2[t] + num += slope_1 * slope_2 + sum_square_x += slope_1 * slope_1 + sum_square_y += slope_2 * slope_2 + return num / (sqrt(sum_square_x * sum_square_y)) + + +def cort(arr): + return _cort(arr, arr) + + +def board(X_0): + x0 = X_0[0] + y0 = X_0[1] + u0 = X_0[2] + v0 = X_0[3] + + g = 9.81 + b = 0.5 + a = 0.25 + c = 0.5 + p = (2 * pi) / 10.0 + q = (2 * pi) / 4.0 + + H_x = -a + b * p * sin(p * x0) * cos(q * y0) + H_xx = b * p ** 2 * cos(p * x0) * cos(q * y0) + H_y = b * q * cos(p * x0) * sin(q * y0) + H_yy = b * q ** 2 * cos(p * x0) * cos(q * y0) + H_xy = -b * q * p * sin(p * x0) * sin(q * y0) + + F = (g + H_xx * u0 ** 2 + 2 * H_xy * u0 * v0 + H_yy * v0 ** 2) / ( + 1 + H_x ** 2 + H_y ** 2 + ) + + dU = -F * H_x - c * u0 + dV = -F * H_y - c * v0 + + return array([u0, v0, dU, dV]) + + +def instantiate(arr): + x = arr[0] + result = array([x, 3 * x, 6 * x, 9 * x]) + result[0] = 2 * result[1] + return result + + +def element_wise(arr): + + dt = 0.1 + x0 = arr + + k1 = x0 * dt + k2 = (x0 + k1 / 2) * dt + k3 = (x0 + k2 / 2) * dt + k4 = (x0 + k3) * dt + # workaround for a pypy bug + # see https://foss.heptapod.net/pypy/pypy/-/issues/3509 + # x_new = x0 + (k1 + 2 * k2 + 2 * k3 + k4) / 6 + x_new = x0 + (k1 + k2 * 2 + k3 * 2 + k4) / 6 + return x_new + + +compute_from_arr = locals()[name_bench] + +if size is None: + if name_bench.startswith("sum_loop") or name_bench == "cort": + size = 10000 + else: + size = 4 + +print(f"{method:30s}:", end="", flush=True) + +# warming during ~ 1s +data_as_list = [random() for _ in range(size)] +arr = array(data_as_list) +t_start = perf_counter() +while perf_counter() - t_start < 1.0: + 
compute_from_arr(arr) + + +def median(sequence): + tmp = sorted(sequence) + return tmp[len(tmp) // 2] + + +# measure during ~ 4s +t0 = perf_counter() +times = [] +while perf_counter() - t0 < 4.0: + data_as_list = [random() for _ in range(size)] + arr = array(data_as_list) + t_start = perf_counter() + compute_from_arr(arr) + times.append(perf_counter() - t_start) + +time = median(times) +print(f" {time:.2e} s ({time / norm:5.1f} * Julia)") diff --git a/bench/microbench_low_level/julia/bench_board.jl b/bench/microbench_low_level/julia/bench_board.jl new file mode 100644 index 0000000..69d8b64 --- /dev/null +++ b/bench/microbench_low_level/julia/bench_board.jl @@ -0,0 +1,44 @@ +using Statistics + +function board(X_0::Array) + + x0 = copy(X_0[1]) + y0 = copy(X_0[2]) + u0 = copy(X_0[3]) + v0 = copy(X_0[4]) + + g = 9.81 + a = 0.25 + b = 0.5 + c = 0.5 + p = (2*π)/10.0 + q = (2*π)/4.0 + + H_x = -a + b*p*sin(p*x0)*cos(q*y0) + H_xx = b*p^2 * cos(p*x0)*cos(q*y0) + H_y = b*q*cos(p*x0)*sin(q*y0) + H_yy = b*q^2 * cos(p*x0)*cos(q*y0) + H_xy = -b*q*p*sin(p*x0)*sin(q*y0) + + F = (g + H_xx*u0^2 + 2*H_xy*u0*v0 + H_yy*v0^2)/(1 + H_x^2 + H_y^2) + + dU = -F*H_x - c*u0 + dV = -F*H_y - c*v0 + + return [u0, v0, dU, dV] + +end + +compute_from_arr = board + +size = 4 +nb_runs = 200 + +times = zeros(nb_runs) + +for irun in 1:nb_runs + arr = rand(size) + times[irun] = @elapsed compute_from_arr(arr) +end + +println(median(times)) diff --git a/bench/microbench_low_level/julia/bench_cort.jl b/bench/microbench_low_level/julia/bench_cort.jl new file mode 100644 index 0000000..a816541 --- /dev/null +++ b/bench/microbench_low_level/julia/bench_cort.jl @@ -0,0 +1,35 @@ +using Statistics + + +function cort(s1, s2) + num = 0.0 + sum_square_x = 0.0 + sum_square_y = 0.0 + for t in 1:length(s1)-1 + slope_1 = s1[t + 1] - s1[t] + slope_2 = s2[t + 1] - s2[t] + num += slope_1 * slope_2 + sum_square_x += slope_1 * slope_1 + sum_square_y += slope_2 * slope_2 + end + return num / (sqrt(sum_square_x * sum_square_y)) +end + +function use_cort(arr) + return cort(arr, arr) +end + + +compute_from_arr = use_cort + +size = 10000 +nb_runs = 200 + +times = zeros(nb_runs) + +for irun in 1:nb_runs + arr = rand(size) + times[irun] = @elapsed compute_from_arr(arr) +end + +println(median(times)) diff --git a/bench/microbench_low_level/julia/bench_element_wise.jl b/bench/microbench_low_level/julia/bench_element_wise.jl new file mode 100644 index 0000000..c91a16f --- /dev/null +++ b/bench/microbench_low_level/julia/bench_element_wise.jl @@ -0,0 +1,30 @@ +using Statistics + +function element_wise(arr::Array) + + dt = 0.1 + x0 = arr + + k1 = x0 * dt + k2 = (x0 + k1 / 2) * dt + k3 = (x0 + k2 / 2) * dt + k4 = (x0 + k3) * dt + x_new = x0 + (k1 + 2 * k2 + 2 * k3 + k4) / 6 + + return x_new + +end + +compute_from_arr = element_wise + +size = 4 +nb_runs = 2000 + +times = zeros(nb_runs) + +for irun in 1:nb_runs + arr = rand(size) + times[irun] = @elapsed compute_from_arr(arr) +end + +println(median(times)) diff --git a/bench/microbench_low_level/julia/bench_init_zeros.jl b/bench/microbench_low_level/julia/bench_init_zeros.jl new file mode 100644 index 0000000..4ac2656 --- /dev/null +++ b/bench/microbench_low_level/julia/bench_init_zeros.jl @@ -0,0 +1,21 @@ +using Statistics + +function init_zeros(arr) + for i in eachindex(arr) + arr[i] = 0.0 + end +end + +compute_from_arr = init_zeros + +size = 4 +nb_runs = 200 + +times = zeros(nb_runs) + +for irun in 1:nb_runs + arr = rand(size) + times[irun] = @elapsed compute_from_arr(arr) +end + +println(median(times)) diff 
--git a/bench/microbench_low_level/julia/bench_instantiate.jl b/bench/microbench_low_level/julia/bench_instantiate.jl new file mode 100644 index 0000000..5116e07 --- /dev/null +++ b/bench/microbench_low_level/julia/bench_instantiate.jl @@ -0,0 +1,22 @@ +using Statistics + +function instantiate(arr::Array) + x = arr[1] + result = [x, 3*x, 6*x, 9*x] + result[1] = 2 * result[2] + return result +end + +compute_from_arr = instantiate + +size = 4 +nb_runs = 200 + +times = zeros(nb_runs) + +for irun in 1:nb_runs + arr = rand(size) + times[irun] = @elapsed compute_from_arr(arr) +end + +println(median(times)) diff --git a/bench/microbench_low_level/julia/bench_sum_loop.jl b/bench/microbench_low_level/julia/bench_sum_loop.jl new file mode 100644 index 0000000..5c38b52 --- /dev/null +++ b/bench/microbench_low_level/julia/bench_sum_loop.jl @@ -0,0 +1,23 @@ +using Statistics + +function sum_loop(arr) + result = 0. + for i in eachindex(arr) + result += arr[i] + end + return result +end + +compute_from_arr = sum_loop + +size = 10000 +nb_runs = 200 + +times = zeros(nb_runs) + +for irun in 1:nb_runs + arr = rand(size) + times[irun] = @elapsed compute_from_arr(arr) +end + +println(median(times)) diff --git a/bench/microbench_low_level/julia/bench_sum_loop_index.jl b/bench/microbench_low_level/julia/bench_sum_loop_index.jl new file mode 100644 index 0000000..b4c682c --- /dev/null +++ b/bench/microbench_low_level/julia/bench_sum_loop_index.jl @@ -0,0 +1,23 @@ +using Statistics + +function sum_loop_index(arr) + result = 0. + for i = 1:5000 + result += arr[i] + end + return result +end + +compute_from_arr = sum_loop_index + +size = 10000 +nb_runs = 200 + +times = zeros(nb_runs) + +for irun in 1:nb_runs + arr = rand(size) + times[irun] = @elapsed compute_from_arr(arr) +end + +println(median(times)) diff --git a/bench/microbench_low_level/result_board.md b/bench/microbench_low_level/result_board.md new file mode 100644 index 0000000..30b407b --- /dev/null +++ b/bench/microbench_low_level/result_board.md @@ -0,0 +1,77 @@ +# Microbenchmark board + +We measure the performance for this function: + +```python +def board(X_0): + x0 = X_0[0] + y0 = X_0[1] + u0 = X_0[2] + v0 = X_0[3] + + g = 9.81 + b = 0.5 + a = 0.25 + c = 0.5 + p = (2 * pi) / 10.0 + q = (2 * pi) / 4.0 + + H_x = -a + b * p * sin(p * x0) * cos(q * y0) + H_xx = b * p ** 2 * cos(p * x0) * cos(q * y0) + H_y = b * q * cos(p * x0) * sin(q * y0) + H_yy = b * q ** 2 * cos(p * x0) * cos(q * y0) + H_xy = -b * q * p * sin(p * x0) * sin(q * y0) + + F = (g + H_xx * u0 ** 2 + 2 * H_xy * u0 * v0 + H_yy * v0 ** 2) / ( + 1 + H_x ** 2 + H_y ** 2 + ) + + dU = -F * H_x - c * u0 + dV = -F * H_y - c * v0 + + return array([u0, v0, dU, dV]) +``` + +One can run the benchmarks with `make bench_board`. 
+ +With PyPy3.7, I get: + +``` +bench board +hostname: voyage +{'cache_tag': 'pypy37', + 'version': sys.pypy_version_info(major=7, minor=3, micro=7, releaselevel='final', serial=0)} +list : 3.21e-07 s ( 0.9 * Julia) +piconumpy.purepy : 1.37e-05 s ( 36.9 * Julia) +numpy : 1.18e-04 s (316.6 * Julia) +piconumpy.hpy : 1.26e-05 s ( 33.8 * Julia) +piconumpy.cpython_capi : 5.52e-05 s (148.6 * Julia) +``` + +With CPython: + +``` +bench board +hostname: voyage +{'cache_tag': 'cpython-39', + 'version': sys.version_info(major=3, minor=9, micro=7, releaselevel='final', serial=0)} +list : 5.16e-06 s ( 13.9 * Julia) +piconumpy.purepy : 8.04e-06 s ( 21.6 * Julia) +numpy : 1.01e-05 s ( 27.1 * Julia) +piconumpy.hpy : 5.90e-06 s ( 15.9 * Julia) +piconumpy.cpython_capi : 5.56e-06 s ( 15.0 * Julia) +``` + +With Python 3.8.5 (GraalVM CE Native 21.3.0) + +``` +bench board +hostname: voyage +{'cache_tag': 'graalpython-38', + 'version': sys.version_info(major=3, minor=8, micro=5, releaselevel='alpha', serial=0)} +list : 1.15e-05 s ( 30.9 * Julia) +piconumpy.purepy : 1.74e-05 s ( 46.8 * Julia) +numpy : ImportError numpy +piconumpy.hpy : 4.91e-05 s (132.2 * Julia) +piconumpy.cpython_capi : 6.19e-05 s (166.7 * Julia) +``` diff --git a/bench/microbench_low_level/result_cort.md b/bench/microbench_low_level/result_cort.md new file mode 100644 index 0000000..b5578bf --- /dev/null +++ b/bench/microbench_low_level/result_cort.md @@ -0,0 +1,64 @@ +# Microbenchmark cort + +We measure the performance for this function: + +```python +def cort(arr): + return _cort(arr, arr) + +def _cort(s1, s2): + num = 0.0 + sum_square_x = 0.0 + sum_square_y = 0.0 + for t in range(len(s1) - 1): + slope_1 = s1[t + 1] - s1[t] + slope_2 = s2[t + 1] - s2[t] + num += slope_1 * slope_2 + sum_square_x += slope_1 * slope_1 + sum_square_y += slope_2 * slope_2 + return num / (sqrt(sum_square_x * sum_square_y)) +``` + +One can run the benchmarks with `make bench_cort`. 
+ +With PyPy3.7, I get: + +``` +bench cort +hostname: voyage +{'cache_tag': 'pypy37', + 'version': sys.pypy_version_info(major=7, minor=3, micro=7, releaselevel='final', serial=0)} +list : 4.29e-05 s ( 1.8 * Julia) +piconumpy.purepy : 4.12e-05 s ( 1.7 * Julia) +numpy : 4.77e-02 s (1975.5 * Julia) +piconumpy.hpy : 1.46e-03 s ( 60.5 * Julia) +piconumpy.cpython_capi : 6.96e-03 s (288.5 * Julia) +``` + +With CPython: + +``` +bench cort +hostname: voyage +{'cache_tag': 'cpython-39', + 'version': sys.version_info(major=3, minor=9, micro=7, releaselevel='final', serial=0)} +list : 4.42e-03 s (183.4 * Julia) +piconumpy.purepy : 1.04e-02 s (430.0 * Julia) +numpy : 9.76e-03 s (404.4 * Julia) +piconumpy.hpy : 5.66e-03 s (234.7 * Julia) +piconumpy.cpython_capi : 4.77e-03 s (197.7 * Julia) +``` + +With Python 3.8.5 (GraalVM CE Native 21.3.0) + +``` +bench cort +hostname: voyage +{'cache_tag': 'graalpython-38', + 'version': sys.version_info(major=3, minor=8, micro=5, releaselevel='alpha', serial=0)} +list : 2.44e-05 s ( 1.0 * Julia) +piconumpy.purepy : 3.13e-05 s ( 1.3 * Julia) +numpy : ImportError numpy +piconumpy.hpy : 1.69e-04 s ( 7.0 * Julia) +piconumpy.cpython_capi : 3.55e-04 s ( 14.7 * Julia) +``` diff --git a/bench/microbench_low_level/result_init_zeros.md b/bench/microbench_low_level/result_init_zeros.md new file mode 100644 index 0000000..b88e4bd --- /dev/null +++ b/bench/microbench_low_level/result_init_zeros.md @@ -0,0 +1,53 @@ +# Microbenchmark sum_init_zeros + +We measure the performance for this function: + +```python +def init_zeros(arr): + for index in range(len(arr)): + arr[index] = 0.0 +``` + +One can run the benchmarks with `make bench_init_zeros`. + +With PyPy3.7, I get: + +``` +bench init_zeros +hostname: voyage +{'cache_tag': 'pypy37', + 'version': sys.pypy_version_info(major=7, minor=3, micro=7, releaselevel='final', serial=0)} +list : 2.63e-05 s ( 5.4 * Julia) +piconumpy.purepy : 2.99e-05 s ( 6.1 * Julia) +numpy : 1.17e-02 s (2403.5 * Julia) +piconumpy.hpy : 4.58e-04 s ( 94.1 * Julia) +piconumpy.cpython_capi : 8.46e-04 s (173.6 * Julia) +``` + +With CPython: + +``` +bench init_zeros +hostname: voyage +{'cache_tag': 'cpython-39', + 'version': sys.version_info(major=3, minor=9, micro=7, releaselevel='final', serial=0)} +list : 5.34e-04 s (109.6 * Julia) +piconumpy.purepy : 2.03e-03 s (417.4 * Julia) +numpy : 1.17e-03 s (239.3 * Julia) +piconumpy.hpy : 7.51e-04 s (154.1 * Julia) +piconumpy.cpython_capi : 5.44e-04 s (111.5 * Julia) +``` + +With Python 3.8.5 (GraalVM CE Native 21.3.0) + +``` +bench init_zeros +hostname: voyage +{'cache_tag': 'graalpython-38', + 'version': sys.version_info(major=3, minor=8, micro=5, releaselevel='alpha', serial=0)} +list : 1.37e-05 s ( 2.8 * Julia) +piconumpy.purepy : 1.93e-05 s ( 4.0 * Julia) +numpy : ImportError numpy +piconumpy.hpy : 4.68e-05 s ( 9.6 * Julia) +piconumpy.cpython_capi : 1.74e-04 s ( 35.8 * Julia) +``` diff --git a/bench/microbench_low_level/result_instantiate.md b/bench/microbench_low_level/result_instantiate.md new file mode 100644 index 0000000..883cea1 --- /dev/null +++ b/bench/microbench_low_level/result_instantiate.md @@ -0,0 +1,55 @@ +# Microbenchmark instantiate + +We measure the performance for this function: + +```python +def instantiate(arr): + x = arr[0] + result = array([x, 3 * x, 6 * x, 9 * x]) + result[0] = 2 * result[1] + return result +``` + +One can run the benchmarks with `make bench_instantiate`. 
+ +With PyPy3.7, I get: + +``` +bench instantiate +hostname: meige8pcpa79 +{'cache_tag': 'pypy37', + 'version': sys.pypy_version_info(major=7, minor=3, micro=7, releaselevel='final', serial=0)} +list : 1.13e-07 s ( 0.9 * Julia) +piconumpy.purepy : 8.50e-08 s ( 0.7 * Julia) +numpy : ImportError numpy +piconumpy.hpy : 1.69e-06 s ( 13.1 * Julia) +piconumpy.cpython_capi : 1.53e-05 s (118.3 * Julia) +``` + +With CPython: + +``` +bench instantiate +hostname: meige8pcpa79 +{'cache_tag': 'cpython-39', + 'version': sys.version_info(major=3, minor=9, micro=7, releaselevel='final', serial=0)} +list : 1.19e-06 s ( 9.2 * Julia) +piconumpy.purepy : 2.59e-06 s ( 20.0 * Julia) +numpy : 3.63e-06 s ( 28.1 * Julia) +piconumpy.hpy : 1.84e-06 s ( 14.3 * Julia) +piconumpy.cpython_capi : 1.35e-06 s ( 10.5 * Julia) +``` + +With Python 3.8.5 (GraalVM CE Native 21.3.0) + +``` +bench instantiate +hostname: meige8pcpa79 +{'cache_tag': 'graalpython-38', + 'version': sys.version_info(major=3, minor=8, micro=5, releaselevel='alpha', serial=0)} +list : 4.16e-06 s ( 32.3 * Julia) +piconumpy.purepy : 4.15e-06 s ( 32.2 * Julia) +numpy : ImportError numpy +piconumpy.hpy : 7.32e-06 s ( 56.8 * Julia) +piconumpy.cpython_capi : 9.68e-06 s ( 75.0 * Julia) +``` diff --git a/bench/microbench_low_level/result_sum_loop.md b/bench/microbench_low_level/result_sum_loop.md new file mode 100644 index 0000000..062840b --- /dev/null +++ b/bench/microbench_low_level/result_sum_loop.md @@ -0,0 +1,201 @@ +# Microbenchmark sum_loop + +We measure the performance for this function: + +```python +def sum_loop(arr): + result = 0.0 + for value in arr: + result += value + return result +``` + +One can run the benchmarks with `make bench_sum_loop`. + +With PyPy3.7, I get: + +``` +bench sum_loop +hostname: voyage +{'cache_tag': 'pypy37', + 'version': sys.pypy_version_info(major=7, minor=3, micro=7, releaselevel='final', serial=0)} +list : 2.35e-05 s ( 1.8 * Julia) +piconumpy.purepy : 2.60e-05 s ( 2.0 * Julia) +numpy : 8.97e-03 s (677.0 * Julia) +piconumpy.hpy : 3.73e-04 s ( 28.2 * Julia) +piconumpy.cpython_capi : 1.75e-03 s (132.4 * Julia) +``` + +With CPython: + +``` +bench sum_loop +hostname: voyage +{'cache_tag': 'cpython-39', + 'version': sys.version_info(major=3, minor=9, micro=7, releaselevel='final', serial=0)} +list : 3.65e-04 s ( 27.5 * Julia) +piconumpy.purepy : 2.17e-03 s (164.1 * Julia) +numpy : 1.09e-03 s ( 82.2 * Julia) +piconumpy.hpy : 7.39e-04 s ( 55.8 * Julia) +piconumpy.cpython_capi : 5.07e-04 s ( 38.3 * Julia) +``` + +With Python 3.8.5 (GraalVM CE Native 21.3.0) + +``` +bench sum_loop +hostname: voyage +{'cache_tag': 'graalpython-38', + 'version': sys.version_info(major=3, minor=8, micro=5, releaselevel='alpha', serial=0)} +list : 1.92e-05 s ( 1.4 * Julia) +piconumpy.purepy : 3.61e-05 s ( 2.7 * Julia) +numpy : ImportError numpy +piconumpy.hpy : 5.03e-04 s ( 38.0 * Julia) +piconumpy.cpython_capi : 2.90e-03 s (219.1 * Julia) +``` + +## Summary + +- PyPy is fast with list (1.3 * Julia, same order of magnitude than with Julia) +and as fast for a piconumpy array based on a list ("piconumpy.purepy", zero +cost abstraction!) + +- Numpy and _piconumpy_cpython_capi are both much slower with PyPy than with +Cpython. We can guess that the Numpy port to HPy would fix that. + +- piconumpy_hpy is a bit faster with PyPy (19 * Julia) than with CPython (40 * +Julia), however, we see that PyPy does not strongly accelerate piconumpy_hpy +(19 * Julia, 14 * piconumpy_list). 
+ +## Traces PyPy `sum_loop` + +### List + +``` ++557: label(p0, p1, p6, p9, f35, f30, p15, p22, p26, i32, i27, p29, descr=TargetToken(140447503809120)) +debug_merge_point(0, 0, 'sum_loop;bench.py:23-25~#10 FOR_ITER') ++606: i44 = uint_ge(i32, i27) +guard_false(i44, descr=) [p0, p6, p9, p15, p1, i32, i27, i44, p26, f30, f35] ++615: f45 = getarrayitem_gc_f(p29, i32, descr=) ++622: i47 = int_add(i32, 1) +debug_merge_point(0, 0, 'sum_loop;bench.py:23-25~#12 STORE_FAST') +debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#14 LOAD_FAST') +debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#16 LOAD_FAST') +debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#18 INPLACE_ADD') ++626: f48 = float_add(f35, f45) +debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#20 STORE_FAST') +debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#22 JUMP_ABSOLUTE') ++630: setfield_gc(p15, i47, descr=) ++634: guard_not_invalidated(descr=) [p0, p6, p9, p15, p1, f45, f48, None, None] ++634: i51 = getfield_raw_i(140447672379264, descr=) ++647: i53 = int_sub(i51, 1) ++651: setfield_raw(140447672379264, i53, descr=) ++654: i56 = int_lt(i53, 0) ++658: guard_false(i56, descr=) [p0, p6, p9, p15, p1, i53, f45, f48, None, None] +debug_merge_point(0, 0, 'sum_loop;bench.py:23-25~#10 FOR_ITER') ++664: i57 = arraylen_gc(p29, descr=) ++664: jump(p0, p1, p6, p9, f48, f45, p15, p22, p26, i47, i27, p29, descr=TargetToken(140447503809120)) +``` + +### piconumpy purepy (based on list) + +``` ++705: label(p0, p1, p6, p9, f53, f46, p15, p22, i49, p29, p38, p42, i43, p45, descr=TargetToken(139748702723776)) +debug_merge_point(0, 0, 'sum_loop;bench.py:23-25~#10 FOR_ITER') ++760: guard_not_invalidated(descr=) [p0, p6, p9, p15, p1, p22, i49, f46, f53] ++760: p62 = force_token() ++760: enter_portal_frame(21, 28364) +debug_merge_point(1, 1, '__getitem__;/home/pierre/Dev/piconumpy/piconumpy/purepy.py:27-28~#0 LOAD_FAST') +debug_merge_point(1, 1, '__getitem__;/home/pierre/Dev/piconumpy/piconumpy/purepy.py:27-28~#2 LOAD_ATTR') +debug_merge_point(1, 1, '__getitem__;/home/pierre/Dev/piconumpy/piconumpy/purepy.py:27-28~#4 LOAD_FAST') +debug_merge_point(1, 1, '__getitem__;/home/pierre/Dev/piconumpy/piconumpy/purepy.py:27-28~#6 BINARY_SUBSCR') ++760: i65 = uint_ge(i49, i43) ++763: guard_false(i65, descr=) [p0, p6, p9, p15, p1, p22, i49, f46, f53] ++769: f66 = getarrayitem_gc_f(p45, i49, descr=) +debug_merge_point(1, 1, '__getitem__;/home/pierre/Dev/piconumpy/piconumpy/purepy.py:27-28~#8 RETURN_VALUE') ++776: leave_portal_frame(21) ++776: i69 = int_add(i49, 1) +debug_merge_point(0, 0, 'sum_loop;bench.py:23-25~#12 STORE_FAST') +debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#14 LOAD_FAST') +debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#16 LOAD_FAST') +debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#18 INPLACE_ADD') ++780: f70 = float_add(f53, f66) +debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#20 STORE_FAST') +debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#22 JUMP_ABSOLUTE') ++784: i72 = getfield_raw_i(139748871243648, descr=) ++797: i74 = int_sub(i72, 3) ++801: setfield_raw(139748871243648, i74, descr=) ++804: setfield_gc(p15, i69, descr=) ++808: i77 = int_lt(i74, 0) ++812: guard_false(i77, descr=) [p0, p6, p9, p15, p1, i74, f66, f70, None, None, None] +debug_merge_point(0, 0, 'sum_loop;bench.py:23-25~#10 FOR_ITER') ++818: i78 = arraylen_gc(p45, descr=) ++818: jump(p0, p1, p6, p9, f70, f66, p15, p22, i69, p29, p38, p42, i43, p45, descr=TargetToken(139748702723776)) +``` + +### piconumpy hpy + +``` ++1339: label(p0, p1, p6, p9, f73, p63, p15, i68, 
p62, descr=TargetToken(139865876151520)) +debug_merge_point(0, 0, 'sum_loop;bench.py:23-25~#10 FOR_ITER') ++1352: p82 = getfield_gc_r(p15, descr=) ++1356: guard_nonnull_class(p82, 139866025815200, descr=) [p0, p6, p9, p63, p15, p1, p82, f73] ++1376: p84 = getfield_gc_r(p82, descr=) ++1387: guard_value(p84, ConstPtr(ptr85), descr=) [p0, p6, p9, p63, p15, p1, p82, f73] ++1396: guard_not_invalidated(descr=) [p0, p6, p9, p63, p15, p1, p82, f73] ++1403: p87 = getfield_gc_r(ConstPtr(ptr86), descr=) ++1414: guard_value(p87, ConstPtr(ptr88), descr=) [p0, p6, p9, p63, p15, p1, p82, f73] ++1423: i90 = getfield_gc_i(ConstPtr(ptr89), descr=) ++1427: i92 = int_lt(i68, 0) ++1431: guard_false(i92, descr=) [p0, p6, p9, p63, p15, p1, p82, f73] ++1444: i94 = getfield_gc_i(ConstPtr(ptr93), descr=) ++1448: i95 = int_is_zero(i94) ++1451: guard_false(i95, descr=) [p0, p6, p9, p63, p15, p1, p82, f73] ++1457: i97 = int_sub(i94, 1) ++1461: p99 = getfield_gc_r(ConstPtr(ptr98), descr=) ++1465: i100 = getarrayitem_gc_i(p99, i97, descr=) ++1470: i101 = arraylen_gc(p99, descr=) ++1474: i103 = int_rshift(i101, 1) ++1477: i105 = int_sub(i103, 5) ++1481: i106 = int_lt(i97, i105) ++1484: cond_call(i106, ConstClass(_ll_list_resize_hint_really_look_inside_iff__listPtr_Signed_Bool), ConstPtr(ptr108), i97, 0, descr=) ++1490: guard_no_exception(descr=) [p0, p6, p9, p63, p15, p1, i68, i90, i100, p82, i97, f73] ++1490: setfield_gc(ConstPtr(ptr110), i97, descr=) ++1494: i112 = int_lt(i100, 0) ++1498: guard_false(i112, descr=) [p0, p6, p9, p63, p15, p1, i68, i90, i100, p82, f73] ++1522: setarrayitem_gc(p62, i100, p82, descr=) ++1527: p113 = force_token() ++1548: setfield_gc(p0, p113, descr=) ++1552: i115 = call_may_force_i(i90, 139866044538144, i100, i68, descr=) ++1663: guard_not_forced(descr=) [p0, p6, p9, p63, p15, p1, i100, i115, i68, p82, f73] ++1674: guard_no_exception(descr=) [p0, p6, p9, p63, p15, p1, i100, i115, i68, p82, f73] ++1688: call_n(ConstClass(close), i100, descr=) ++1754: guard_no_exception(descr=) [p0, p6, p9, p63, p15, p1, i115, i68, p82, f73] ++1768: i117 = int_is_true(i115) ++1771: guard_true(i117, descr=) [p0, p6, p9, p63, p15, p1, i115, i68, p82, f73] ++1784: p119 = getfield_gc_r(ConstPtr(ptr118), descr=) ++1788: p120 = getarrayitem_gc_r(p119, i115, descr=) ++1793: call_n(ConstClass(close), i115, descr=) ++1866: guard_no_exception(descr=) [p0, p6, p9, p63, p15, p1, p120, i68, p82, f73] ++1880: guard_nonnull_class(p120, ConstClass(W_FloatObject), descr=) [p0, p6, p9, p63, p15, p1, p120, i68, p82, f73] ++1907: i123 = getfield_gc_i(p15, descr=) ++1918: i125 = int_add(i123, 1) +debug_merge_point(0, 0, 'sum_loop;bench.py:23-25~#12 STORE_FAST') +debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#14 LOAD_FAST') +debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#16 LOAD_FAST') +debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#18 INPLACE_ADD') ++1923: setfield_gc(p15, i125, descr=) ++1927: f126 = getfield_gc_f(p120, descr=) ++1933: f127 = float_add(f73, f126) +debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#20 STORE_FAST') +debug_merge_point(0, 0, 'sum_loop;bench.py:23-26~#22 JUMP_ABSOLUTE') ++1947: guard_not_invalidated(descr=) [p0, p6, p9, p120, p15, p1, f127, None, None, None] ++1947: i129 = getfield_raw_i(139866044675968, descr=) ++1960: i131 = int_sub(i129, 3) ++1964: setfield_raw(139866044675968, i131, descr=) ++1967: i134 = int_lt(i131, 0) ++1971: guard_false(i134, descr=) [p0, p6, p9, p120, p15, p1, i131, f127, None, None, None] +debug_merge_point(0, 0, 'sum_loop;bench.py:23-25~#10 FOR_ITER') ++1977: i135 
= arraylen_gc(p119, descr=) ++1977: jump(p0, p1, p6, p9, f127, p120, p15, i125, p119, descr=TargetToken(139865876151520)) +``` diff --git a/bench/microbench_low_level/result_sum_loop_index.md b/bench/microbench_low_level/result_sum_loop_index.md new file mode 100644 index 0000000..fd63301 --- /dev/null +++ b/bench/microbench_low_level/result_sum_loop_index.md @@ -0,0 +1,55 @@ +# Microbenchmark sum_loop_index + +We measure the performance for this function: + +```python +def sum_loop_index(arr): + result = 0.0 + for index in range(5000): + result += arr[index] + return result +``` + +One can run the benchmarks with `make bench_sum_loop_index`. + +With PyPy3.7, I get: + +``` +bench sum_loop_index +hostname: voyage +{'cache_tag': 'pypy37', + 'version': sys.pypy_version_info(major=7, minor=3, micro=7, releaselevel='final', serial=0)} +list : 1.19e-05 s ( 2.0 * Julia) +piconumpy.purepy : 1.64e-05 s ( 2.8 * Julia) +numpy : 4.18e-03 s (711.4 * Julia) +piconumpy.hpy : 1.73e-04 s ( 29.4 * Julia) +piconumpy.cpython_capi : 8.44e-04 s (143.8 * Julia) +``` + +With CPython: + +``` +bench sum_loop_index +hostname: voyage +{'cache_tag': 'cpython-39', + 'version': sys.version_info(major=3, minor=9, micro=7, releaselevel='final', serial=0)} +list : 3.91e-04 s ( 66.5 * Julia) +piconumpy.purepy : 1.11e-03 s (188.3 * Julia) +numpy : 8.93e-04 s (152.1 * Julia) +piconumpy.hpy : 5.42e-04 s ( 92.3 * Julia) +piconumpy.cpython_capi : 4.17e-04 s ( 71.0 * Julia) +``` + +With Python 3.8.5 (GraalVM CE Native 21.3.0) + +``` +bench sum_loop_index +hostname: voyage +{'cache_tag': 'graalpython-38', + 'version': sys.version_info(major=3, minor=8, micro=5, releaselevel='alpha', serial=0)} +list : 1.36e-05 s ( 2.3 * Julia) +piconumpy.purepy : 1.81e-05 s ( 3.1 * Julia) +numpy : ImportError numpy +piconumpy.hpy : 3.68e-05 s ( 6.3 * Julia) +piconumpy.cpython_capi : 1.08e-04 s ( 18.5 * Julia) +``` diff --git a/bench/profile_piconumpy.py b/bench/profile_piconumpy.py index b7de388..3bde5ae 100644 --- a/bench/profile_piconumpy.py +++ b/bench/profile_piconumpy.py @@ -7,12 +7,14 @@ import tmp_purepy import tmp_purepy_array import tmp_cython +import tmp_hpy_universal methods = { "cpython-c-api": bench_array1d, "purepy": tmp_purepy, "purepy_array": tmp_purepy_array, "cython": tmp_cython, + "universal": tmp_hpy_universal, } module = methods.get(sys.argv[-1], bench_array1d) diff --git a/piconumpy/bench.py b/piconumpy/bench.py index a704e5f..f5d4d8d 100644 --- a/piconumpy/bench.py +++ b/piconumpy/bench.py @@ -11,6 +11,13 @@ def timeit_verbose( print_time=False, max_length_name=33, ): + if name is None: + name = stmt.split("(")[0] + + fmt_name = f"{{:{max_length_name}s}}" + name = fmt_name.format(name) + print(f"{name}:", end="", flush=True) + result = timeit( stmt, setup=setup, total_duration=total_duration, globals=globals ) @@ -20,18 +27,12 @@ def timeit_verbose( else: norm_given = True - if name is None: - name = stmt.split("(")[0] - - fmt_name = f"{{:{max_length_name}s}}" - name = fmt_name.format(name) - if print_time: raw_time = f" = {result:7.3g} s" else: raw_time = "" - print(f"{name}: {result/norm:5.3g} * norm{raw_time}") + print(f" {result/norm:5.3g} * norm{raw_time}") if not norm_given and not print_time: print(f"norm = {norm:5.3g} s") diff --git a/piconumpy/purepy.py b/piconumpy/purepy.py index a84ad31..bfa4b03 100644 --- a/piconumpy/purepy.py +++ b/piconumpy/purepy.py @@ -2,7 +2,7 @@ class array: __slots__ = ["data", "size"] def __init__(self, data): - self.data = list(float(number) for number in data) + self.data = list(data) 
self.size = len(self.data) def __add__(self, other): @@ -30,9 +30,10 @@ def __getitem__(self, index): def __setitem__(self, index, value): self.data[index] = value + def empty(size): - return array([0]*size) + return array([0] * size) -def zeros(size): - return array([0]*size) +def zeros(size): + return array([0] * size) diff --git a/piconumpy/purepy_array.py b/piconumpy/purepy_array.py index ba801a2..7306cff 100644 --- a/piconumpy/purepy_array.py +++ b/piconumpy/purepy_array.py @@ -23,8 +23,10 @@ def __mul__(self, other): def __truediv__(self, other): return self.__class__(number / other for number in self) + def empty(size): - return array([0]*size) + return array([0] * size) + def zeros(size): - return array([0]*size) + return array([0] * size) diff --git a/piconumpy/test_cpython_capi.py b/piconumpy/test_cpython_capi.py index a1638dc..cedbed5 100644 --- a/piconumpy/test_cpython_capi.py +++ b/piconumpy/test_cpython_capi.py @@ -6,6 +6,7 @@ class Tests: piconumpy = _piconumpy_cpython_capi + def _array(self, *args): return self.piconumpy.array(*args) diff --git a/piconumpy/test_cython.py b/piconumpy/test_cython.py index 44cf1c5..438adc7 100644 --- a/piconumpy/test_cython.py +++ b/piconumpy/test_cython.py @@ -1,4 +1,5 @@ from .test_cpython_capi import Tests as _Tests + class Tests(_Tests): from . import _piconumpy_cython as piconumpy diff --git a/piconumpy/test_hpy_universal.py b/piconumpy/test_hpy_universal.py index 358f037..2a470ca 100644 --- a/piconumpy/test_hpy_universal.py +++ b/piconumpy/test_hpy_universal.py @@ -1,16 +1,31 @@ +import sys + import pytest +from .util_hpy import import_ext from .test_cpython_capi import Tests as _Tests try: - from . import _piconumpy_hpy + piconumpy_universal = import_ext() except ImportError: - _piconumpy_hpy = False + piconumpy_universal = False @pytest.mark.skipif( - not _piconumpy_hpy, reason="ImportError piconumpy HPy Universal" + not piconumpy_universal, reason="ImportError piconumpy HPy Universal" ) class TestsCPyABI(_Tests): - piconumpy = _piconumpy_hpy + piconumpy = piconumpy_universal + + def test_multiply(self): + if sys.implementation.name == "pypy": + pytest.xfail("Expected failure with PyPy (but should work)") + + super().test_multiply() + + def test_add(self): + if sys.implementation.name == "pypy": + pytest.xfail("Expected failure with PyPy (but should work)") + + super().test_add() diff --git a/piconumpy/test_purepy.py b/piconumpy/test_purepy.py index 0793611..e7320e0 100644 --- a/piconumpy/test_purepy.py +++ b/piconumpy/test_purepy.py @@ -1,4 +1,5 @@ from .test_cpython_capi import Tests as _Tests + class Tests(_Tests): from . import purepy as piconumpy diff --git a/piconumpy/test_purepy_array.py b/piconumpy/test_purepy_array.py index b41a8b7..4c3da8c 100644 --- a/piconumpy/test_purepy_array.py +++ b/piconumpy/test_purepy_array.py @@ -1,4 +1,5 @@ from .test_cpython_capi import Tests as _Tests + class Tests(_Tests): from . 
import purepy_array as piconumpy diff --git a/piconumpy/util_hpy.py b/piconumpy/util_hpy.py new file mode 100644 index 0000000..1fbc47c --- /dev/null +++ b/piconumpy/util_hpy.py @@ -0,0 +1,21 @@ +from importlib.util import spec_from_file_location +from pathlib import Path + +from hpy.universal import load + + +def import_from_path(path): + name_ext = "_piconumpy_hpy" + ext_filepath = str(path) + spec = spec_from_file_location(name_ext, ext_filepath) + m = load(name_ext, ext_filepath, spec) + m.__file__ = ext_filepath + m.__loader__ = __loader__ + m.__name__ = __name__ + m.__package__ = __package__ + return m + + +def import_ext(): + path = Path(__file__).parent / "_piconumpy_hpy.hpy0.so" + return import_from_path(path) diff --git a/pyproject.toml b/pyproject.toml index 3234fad..ff6b793 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,10 +5,12 @@ description = "An experiment about Numpy and pyhandle/hpy." authors = [ {name = "Pierre Augier", email = "pierre.augier@univ-grenoble-alpes.fr"}, ] -license = {text = "BSD 3-Clause"} +license = "BSD-3-Clause" +license-files = ["LICENSE"] readme = "README.md" keywords = ["numpy", "hpy", "PyPy"] requires-python = ">=3.8" +dependencies = ["hpy>=0.9.0; implementation_name == 'cpython'"] [project.urls] homepage = "https://github.com/paugier/piconumpy" @@ -16,15 +18,20 @@ repository = "https://github.com/paugier/piconumpy" documentation = "https://github.com/paugier/piconumpy" [project.optional-dependencies] -dev = ['transonic', 'numpy', 'pytest', 'pythran'] -full = ['black'] +test = ["pytest", "numpy"] +# pythran 0.18.0 needed but not yet on PyPI +# (see https://github.com/serge-sans-paille/pythran/pull/2310#issuecomment-2871805768) +bench = ['transonic', 'numpy', 'pythran@git+https://github.com/serge-sans-paille/pythran.git@0.18.0'] +profile = ["gprof2dot"] +format = ['black'] +full = ["piconumpy[test,bench,profile,format]"] [build-system] requires = [ - "setuptools >= 35.0.2", + "setuptools>=35.0.2", "wheel", "cython", - "hpy >= 0.9.0" + "hpy>=0.9.0; implementation_name == 'cpython'" ] [tool.black]
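For reference, a minimal usage sketch of the new `piconumpy/util_hpy.py` helper, mirroring how `make_bench_piconumpy.py` and `test_hpy_universal.py` use it. It assumes the universal-ABI extension has been built, e.g. with `pip install -e . --config-settings="--global-option=--hpy-abi=universal"`; the helper is needed because `_piconumpy_hpy.hpy0.so` is loaded through `hpy.universal.load` rather than the regular import machinery:

```python
from piconumpy.util_hpy import import_ext

# loads piconumpy/_piconumpy_hpy.hpy0.so through hpy.universal.load
ext = import_ext()

a = ext.array([1.0, 2.0, 3.0])
# element-wise operations supported by the array class
# (note: marked xfail on PyPy in test_hpy_universal.py)
b = a * 2.0 + a
print(len(b), b[0])
```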