diff --git a/MANIFEST.in b/MANIFEST.in
index c5a3ea2b..471001a3 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -9,6 +9,7 @@ prune examples
 prune tests
 prune .github
 prune .tox
+prune benchmark
 
 exclude CODE_OF_CONDUCT.md
 exclude CONTRIBUTING.rst
diff --git a/benchmark/README.rst b/benchmark/README.rst
new file mode 100644
index 00000000..54398fad
--- /dev/null
+++ b/benchmark/README.rst
@@ -0,0 +1,23 @@
+``benchmark``: performance sanity checking
+==========================================
+
+This provides a basic benchmarking script intended to catch major performance regressions (or improvements!). It emulates a "training"-style workload of computing energy and forces for a number of similar systems with pre-computed neighborlists and settings. In particular, we run PME forward and backward calculations for supercells of cubic CsCl crystals, in different sizes. Results are stored as a time-stamped ``.yaml`` file, together with some basic system and version information (which is just the output of ``git`` for now).
+
+
+Usage
+-----
+
+First, make sure to install ``torch-pme`` with the right dependencies in a fresh enviroment:
+
+.. code-block:: bash
+
+    pip install .[metatensor,examples]
+
+Then, run
+
+.. code-block:: bash
+
+    python run.py
+
+
+Please include the output ``.yaml`` file in any major pull request. We recommend using a H100 as benchmark GPU (internally, we use the ``kuma`` cluster).
diff --git a/benchmark/geometry.in b/benchmark/geometry.in
new file mode 100644
index 00000000..3543f1c5
--- /dev/null
+++ b/benchmark/geometry.in
@@ -0,0 +1,11 @@
+#===============================================================================
+# Created using the Atomic Simulation Environment (ASE)
+
+# Fri Sep 20 14:44:08 2024
+
+#=======================================================
+lattice_vector 1.0000000000000000 0.0000000000000000 0.0000000000000000 
+lattice_vector 0.0000000000000000 1.0000000000000000 0.0000000000000000 
+lattice_vector 0.0000000000000000 0.0000000000000000 1.0000000000000000 
+atom 0.0000000000000000 0.0000000000000000 0.0000000000000000 Cs
+atom 0.5000000000000000 0.5000000000000000 0.5000000000000000 Cl
diff --git a/benchmark/run.py b/benchmark/run.py
new file mode 100644
index 00000000..81ae5d57
--- /dev/null
+++ b/benchmark/run.py
@@ -0,0 +1,394 @@
+#!/usr/bin/env python
+
+import datetime
+import os
+import platform
+import subprocess
+from pathlib import Path
+from time import monotonic
+
+import metatensor.torch as mt
+import numpy as np
+import torch
+import yaml
+from ase.io import read
+from vesin import NeighborList
+
+import torchpme
+
+primitive = read(Path(__file__).parent / "geometry.in")
+
+# -- settings --
+multipliers = [4, 8]
+repeats = 16
+
+# -- setup --
+devices = []
+if torch.cuda.is_available():
+    devices.append("cuda")
+
+# run CUDA first!
+devices.append("cpu")
+
+systems = {}
+for multiplier in multipliers:
+    atoms = primitive * [multiplier, multiplier, multiplier]
+    N = len(atoms)
+
+    systems[N] = atoms
+
+# -- hypers --
+all_hypers = {}
+
+for N, atoms in systems.items():
+    cell = atoms.get_cell().array
+
+    cell_dimensions = np.linalg.norm(cell, axis=1)
+    half_cell = np.min(cell_dimensions) / 2 - 1e-6
+    atomic_smearing = half_cell / 5.0
+
+    # tight settings
+    cutoff_tight = half_cell
+    mesh_spacing_tight = atomic_smearing / 8.0
+    lr_wavelength_tight = atomic_smearing / 2.0
+
+    # light settings, roughly 4 times less computational costs
+    cutoff_light = cutoff_tight / 2.0
+    mesh_spacing_light = mesh_spacing_tight * 2.0
+    lr_wavelength_light = lr_wavelength_tight * 2.0
+
+    all_hypers[N] = {
+        "tight": {
+            "cutoff": cutoff_tight,
+            "mesh_spacing": mesh_spacing_tight,
+            "lr_wavelength": lr_wavelength_tight,
+            "atomic_smearing": atomic_smearing,
+        },
+        "light": {
+            "cutoff": cutoff_light,
+            "mesh_spacing": mesh_spacing_light,
+            "lr_wavelength": lr_wavelength_light,
+            "atomic_smearing": atomic_smearing,
+        },
+    }
+
+# -- system info --
+system = {
+    "platform": platform.platform(),
+    "cpu": platform.processor(),
+    "node": platform.node(),
+}
+
+if "cuda" in devices:
+    system["gpu"] = torch.cuda.get_device_name(0)
+
+
+# -- version info --
+cwd = os.path.dirname(os.path.abspath(__file__))
+try:
+    torch_pme_commit = subprocess.check_output(
+        ["git", "log", "--oneline", "-1"], cwd=cwd
+    ).decode("utf-8")
+    torch_pme_status = subprocess.check_output(
+        ["git", "status", "--porcelain"], cwd=cwd
+    ).decode("utf-8")
+except subprocess.CalledProcessError:
+    torch_pme_commit = "not found"
+    torch_pme_status = "not found"
+
+version = {
+    "torch": str(torch.__version__),
+    "torch-pme-commit": torch_pme_commit,
+    "torch-pme-status": torch_pme_status,
+    "torch-pme-version": torchpme.__version__,
+}
+
+
+# -- what do we have to do? --
+runs = {}
+
+for N, atoms in systems.items():
+    for h, hypers in all_hypers[N].items():
+        for d in devices:
+            if h == "tight" and d == "cpu":
+                # avoid very expensive work on CPU
+                continue
+
+            name = f"{N}_{h}_{d}"
+
+            runs[name] = {
+                "N": N,
+                "device": d,
+                "hypers": hypers,
+                "atoms": atoms.copy(),
+            }
+
+
+def compute_distances(positions, neighbor_indices, cell=None, neighbor_shifts=None):
+    """Compute pairwise distances."""
+    atom_is = neighbor_indices[:, 0]
+    atom_js = neighbor_indices[:, 1]
+
+    pos_is = positions[atom_is]
+    pos_js = positions[atom_js]
+
+    distance_vectors = pos_js - pos_is
+
+    if cell is not None and neighbor_shifts is not None:
+        shifts = neighbor_shifts.type(cell.dtype)
+        distance_vectors += shifts @ cell
+    elif cell is not None and neighbor_shifts is None:
+        raise ValueError("Provided `cell` but no `neighbor_shifts`.")
+    elif cell is None and neighbor_shifts is not None:
+        raise ValueError("Provided `neighbor_shifts` but no `cell`.")
+
+    return torch.linalg.norm(distance_vectors, dim=1)
+
+
+def atoms_to_inputs(atoms, device, cutoff):
+    """Convert atoms to inputs for calculate function."""
+    charges = torch.tensor([-1.0, 1.0]).type(torch.float32).to(device)
+    charges = charges.repeat(len(atoms) // 2).reshape(-1, 1)
+    positions = torch.from_numpy(atoms.positions).type(torch.float32).to(device)
+    cell = torch.from_numpy(atoms.cell.array).type(torch.float32).to(device)
+
+    # nl = NeighborList(cutoff=cutoff, full_list=True)
+    nl = NeighborList(cutoff=cutoff, full_list=False)
+
+    i, j, S = nl.compute(
+        points=positions.cpu(), box=cell.cpu(), periodic=True, quantities="ijS"
+    )
+
+    i = torch.from_numpy(i.astype(int)).to(device)
+    j = torch.from_numpy(j.astype(int)).to(device)
+    neighbor_indices = torch.stack([i, j], dim=1).to(device)
+    neighbor_shifts = torch.from_numpy(S.astype(int)).to(device)
+
+    return charges, positions, cell, neighbor_indices, neighbor_shifts
+
+
+def atoms_to_metatensor_inputs(atoms, device, cutoff):
+    """Convert atoms to metatensor inputs for calculate functions."""
+    from metatensor.torch.atomistic.ase_calculator import _compute_ase_neighbors
+
+    system = mt.atomistic.systems_to_torch(
+        atoms, device=device, positions_requires_grad=True
+    )
+    nl_options = mt.atomistic.NeighborListOptions(cutoff=cutoff, full_list=False)
+
+    charges = torch.tensor([-1.0, 1.0]).type(torch.float32).to(device)
+    charges = charges.repeat(len(atoms) // 2).reshape(-1, 1)
+    data = mt.TensorBlock(
+        values=charges,
+        samples=mt.Labels.range("atom", charges.shape[0]).to(device),
+        components=[],
+        properties=mt.Labels.range("charge", charges.shape[1]).to(device),
+    )
+
+    system.add_data(name="charges", data=data)
+    neighbors = _compute_ase_neighbors(
+        atoms, nl_options, dtype=torch.float32, device=device
+    )
+    mt.atomistic.register_autograd_neighbors(system, neighbors)
+
+    return (system, neighbors)
+
+
+def get_calculate_fn(calculator):
+    """Define a calculate function for a given calculator."""
+
+    def calculate(charges, positions, cell, neighbor_indices, neighbor_shifts):
+        positions.requires_grad = True
+        distances = compute_distances(
+            positions, neighbor_indices, cell, neighbor_shifts
+        )
+        potentials = calculator(charges, cell, positions, neighbor_indices, distances)
+        energy = potentials.sum()  # we don't benchmark multiplying with charges
+        forces = -torch.autograd.grad(energy, positions)[0]
+        return energy, forces
+
+    return calculate
+
+
+def get_metatensor_calculate_fn(calculator):
+    """Get a metatensor calculate function from calculator."""
+
+    def calculate(system, neighbors):
+        potentials = calculator(system, neighbors)
+        energy = (potentials.block_by_id(0).values).sum()
+        forces = -torch.autograd.grad(energy, system.positions)[0]
+        return energy, forces
+
+    return calculate
+
+
+def timed_run(calculate_fn, inputs, repeats, device, warmup=True):
+    """Helper for repeated and timed running of a function with warmup."""
+    if warmup:
+        for i in range(repeats):
+            calculate_fn(*inputs[i])
+        if device == "cuda":
+            torch.cuda.synchronize()
+
+    start = monotonic()
+    for i in range(repeats):
+        calculate_fn(*inputs[i])
+
+    # we allow parallel execution of the repeats -- just like in real training!
+    if device == "cuda":
+        torch.cuda.synchronize()
+    end = monotonic()
+    duration = end - start
+    return duration / repeats
+
+
+def execute(run):
+    """Do a benchmark for given settings."""
+    device = run["device"]
+    atoms = run["atoms"]
+    hypers = run["hypers"]
+    run["N"]
+
+    cutoff = hypers["cutoff"]
+    mesh_spacing = hypers["mesh_spacing"]
+    lr_wavelength = hypers["lr_wavelength"]
+    atomic_smearing = hypers["atomic_smearing"]
+
+    rng = np.random.default_rng(42)
+
+    results = {}
+
+    inputs = []
+    atomss = []
+    metatensor_inputs = []
+    for _ in range(repeats):
+        a = atoms.copy()
+        a.rattle(stdev=0.01, rng=rng)
+        inputs.append(atoms_to_inputs(atoms, device, cutoff))
+        atomss.append(a)
+
+    potential = torchpme.CoulombPotential(smearing=atomic_smearing)
+
+    # PME, no metatensor
+    calculator = torchpme.PMECalculator(
+        potential,
+        mesh_spacing=mesh_spacing,
+        full_neighbor_list=False,
+    )
+    calculator = torch.jit.script(calculator)
+    calculator = calculator.to(device)
+    calculate = get_calculate_fn(calculator)
+    time_per_calculation = timed_run(calculate, inputs, repeats, device)
+    results["PME"] = time_per_calculation
+
+    # PME, metatensor
+    metatensor_inputs = [
+        atoms_to_metatensor_inputs(atoms, device, cutoff) for atoms in atomss
+    ]
+    warmup_metatensor_inputs = [
+        atoms_to_metatensor_inputs(atoms, device, cutoff) for atoms in atomss
+    ]
+    calculator = torchpme.metatensor.PMECalculator(
+        potential,
+        mesh_spacing=mesh_spacing,
+        full_neighbor_list=False,
+    )
+    calculator = torch.jit.script(calculator)
+    calculator = calculator.to(device)
+    calculate = get_metatensor_calculate_fn(calculator)
+
+    # metatensor doesn't reset grads, so we can't reuse inputs for warmup
+    timed_run(calculate, warmup_metatensor_inputs, repeats, device, warmup=False)
+    time_per_calculation = timed_run(
+        calculate, metatensor_inputs, repeats, device, warmup=False
+    )
+    results["PME_MT"] = time_per_calculation
+
+    # P3M, no metatensor
+    calculator = torchpme.P3MCalculator(
+        potential,
+        mesh_spacing=mesh_spacing,
+        full_neighbor_list=False,
+    )
+    calculator = torch.jit.script(calculator)
+    calculator = calculator.to(device)
+    calculate = get_calculate_fn(calculator)
+    time_per_calculation = timed_run(calculate, inputs, repeats, device)
+    results["P3M"] = time_per_calculation
+
+    # P3M, metatensor
+    metatensor_inputs = [
+        atoms_to_metatensor_inputs(atoms, device, cutoff) for atoms in atomss
+    ]
+    warmup_metatensor_inputs = [
+        atoms_to_metatensor_inputs(atoms, device, cutoff) for atoms in atomss
+    ]
+    calculator = torchpme.metatensor.P3MCalculator(
+        potential,
+        mesh_spacing=mesh_spacing,
+        full_neighbor_list=False,
+    )
+    calculator = torch.jit.script(calculator)
+    calculator = calculator.to(device)
+    calculate = get_metatensor_calculate_fn(calculator)
+
+    # metatensor doesn't reset grads, so we can't reuse inputs for warmup
+    timed_run(calculate, warmup_metatensor_inputs, repeats, device, warmup=False)
+    time_per_calculation = timed_run(
+        calculate, metatensor_inputs, repeats, device, warmup=False
+    )
+    results["P3M_MT"] = time_per_calculation
+
+    # Ewald, no metatensor
+    calculator = torchpme.EwaldCalculator(
+        potential,
+        lr_wavelength=lr_wavelength,
+        full_neighbor_list=False,
+    )
+    calculator = torch.jit.script(calculator)
+    calculator = calculator.to(device)
+    calculate = get_calculate_fn(calculator)
+    time_per_calculation = timed_run(calculate, inputs, repeats, device)
+    results["E"] = time_per_calculation
+
+    # Ewald, metatensor
+    metatensor_inputs = [
+        atoms_to_metatensor_inputs(atoms, device, cutoff) for atoms in atomss
+    ]
+    warmup_metatensor_inputs = [
+        atoms_to_metatensor_inputs(atoms, device, cutoff) for atoms in atomss
+    ]
+    calculator = torchpme.metatensor.EwaldCalculator(
+        potential,
+        lr_wavelength=lr_wavelength,
+        full_neighbor_list=False,
+    )
+    calculator = torch.jit.script(calculator)
+    calculator = calculator.to(device)
+    calculate = get_metatensor_calculate_fn(calculator)
+
+    # metatensor doesn't reset grads, so we can't reuse inputs for warmup
+    timed_run(calculate, warmup_metatensor_inputs, repeats, device, warmup=False)
+    time_per_calculation = timed_run(
+        calculate, metatensor_inputs, repeats, device, warmup=False
+    )
+    results["E_MT"] = time_per_calculation
+
+    return results
+
+
+results = {}
+for name, run in runs.items():
+    print(f"Running {name}...")
+    r = execute(run)
+    print(r)
+    print()
+    results[name] = r
+
+timestamp = datetime.datetime.now().isoformat()
+out = {"timestamp": timestamp, "system": system, "version": version, "results": results}
+
+with open(f"{timestamp}.yaml", "w") as file:
+    yaml.dump(out, file)
+
+print("Have a nice day.")
diff --git a/tox.ini b/tox.ini
index 8756baa9..28b458d2 100644
--- a/tox.ini
+++ b/tox.ini
@@ -10,7 +10,8 @@ lint_folders =
     "{toxinidir}/docs/src" \
     "{toxinidir}/examples" \
     "{toxinidir}/src" \
-    "{toxinidir}/tests"
+    "{toxinidir}/tests" \
+    "{toxinidir}/benchmark"
 
 [testenv:build]
 description = Asserts package build integrity.
@@ -57,6 +58,7 @@ deps =
     ruff
     mypy
     sphinx-lint
+    types-PyYAML
 commands =
     ruff format --diff {[testenv]lint_folders}
     ruff check {[testenv]lint_folders}