3 changes: 3 additions & 0 deletions .gitignore
@@ -33,6 +33,9 @@ coverage.xml
 
 # Sphinx documentation
 docs/_build/
+docs/build/
+docs/source/generated/
+docs/jupyter_execute/
 
 # Jupyter Notebook
 .ipynb_checkpoints
22 changes: 17 additions & 5 deletions metriq_gym/benchmarks/bseq.py
@@ -1,8 +1,20 @@
-""" "Bell state effective qubits" BSEQ benchmark for the Metriq Gym
-(credit to Paul Nation for the original code for IBM devices).
-
-This benchmark evaluates a quantum device's ability to produce entangled states and measure correlations that violate
-the CHSH inequality. The violation of this inequality indicates successful entanglement between qubits.
+"""BSEQ (Bell state effective qubits) benchmark implementation.
+
+Summary:
+    Evaluates how well a device generates Bell pairs that violate the CHSH inequality across
+    its connectivity graph. Circuits are built per colouring of the topology and executed in
+    four measurement bases to detect correlations.
+
+Result interpretation:
+    Polling returns BSEQResult with:
+    - largest_connected_size: size of the biggest connected subgraph of qubit pairs that
+      violated CHSH (> 2). Higher means entanglement spans more of the device.
+    - fraction_connected: largest_connected_size normalised by the discovered qubit count,
+      making it easier to compare devices of different sizes.
+
+References:
+    - Original routines attributed to Paul Nation (Qiskit Device Benchmarking).
+    - J. F. Clauser et al., Phys. Rev. Lett. 23, 880 (1969).
 """
 
 from dataclasses import dataclass
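To make these metrics concrete, here is a minimal sketch of the arithmetic the new docstring describes: scoring one qubit pair from counts taken in the four CHSH bases, then sizing the violating subgraph. The helper names are illustrative, not this module's API.

```python
# Illustrative sketch, not the module's API: CHSH scoring for one qubit pair
# and the largest-connected-subgraph metric over the violating pairs.
import networkx as nx


def correlation(counts: dict[str, int]) -> float:
    """E = P(same outcome) - P(different outcome) for one measurement basis."""
    shots = sum(counts.values())
    same = counts.get("00", 0) + counts.get("11", 0)
    return (2 * same - shots) / shots


def chsh_value(basis_counts: list[dict[str, int]]) -> float:
    """S = E1 + E2 + E3 - E4; |S| > 2 violates the classical CHSH bound."""
    e1, e2, e3, e4 = (correlation(c) for c in basis_counts)
    return e1 + e2 + e3 - e4


def largest_connected_size(pairs: list[tuple[int, int]], scores: list[float]) -> int:
    """Keep edges whose pairs violated CHSH; return the biggest component size."""
    graph = nx.Graph(edge for edge, s in zip(pairs, scores) if abs(s) > 2)
    return max((len(c) for c in nx.connected_components(graph)), default=0)
```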
2 changes: 2 additions & 0 deletions metriq_gym/benchmarks/clops.py
@@ -1,3 +1,5 @@
+"""CLOPS (circuit layer operations per second) benchmark implementation."""
+
 import copy
 from dataclasses import dataclass
 
18 changes: 18 additions & 0 deletions metriq_gym/benchmarks/lr_qaoa.py
@@ -1,3 +1,21 @@
+"""Linear Ramp QAOA benchmark implementation.
+
+Summary:
+    Solves weighted Max-Cut instances with a linear-ramp parameter schedule and compares
+    results against classical optima to estimate approximation ratios and optimal sampling
+    probabilities.
+
+Result interpretation:
+    Polling returns LinearRampQAOAResult with metrics including:
+    - approx_ratio_mean / stddev: how close average costs are to the optimum.
+    - optimal_probability_mean / stddev: frequency of sampling an optimal bitstring.
+    - confidence_pass: boolean indicating whether results meet the configured confidence.
+    Higher approximation ratios and optimal probabilities reflect better QAOA performance.
+
+Reference:
+    - Based on the linear-ramp QAOA studies described in Wurtz and Love, arXiv:2106.13250.
+"""
+
 import math
 import statistics
 import networkx as nx
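As a rough sketch of how the two headline metrics can be derived from Max-Cut samples (the helper names and the node-to-bit mapping are assumptions, not this module's code):

```python
# Illustrative sketch, assuming nodes are 0..n-1 and node i reads bit i of each
# sampled bitstring; not the module's actual implementation.
import networkx as nx


def cut_value(graph: nx.Graph, bitstring: str) -> float:
    """Total weight of edges whose endpoints fall on opposite sides of the cut."""
    return sum(
        data.get("weight", 1.0)
        for u, v, data in graph.edges(data=True)
        if bitstring[u] != bitstring[v]
    )


def lr_qaoa_metrics(
    graph: nx.Graph, counts: dict[str, int], optimal_cut: float
) -> tuple[float, float]:
    shots = sum(counts.values())
    mean_cut = sum(cut_value(graph, b) * n for b, n in counts.items()) / shots
    # Approximation ratio: mean sampled cut relative to the classical optimum.
    approx_ratio = mean_cut / optimal_cut
    # Optimal probability: fraction of shots that hit an optimal bitstring.
    optimal_probability = (
        sum(n for b, n in counts.items() if cut_value(graph, b) == optimal_cut) / shots
    )
    return approx_ratio, optimal_probability
```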
23 changes: 16 additions & 7 deletions metriq_gym/benchmarks/mirror_circuits.py
@@ -1,10 +1,19 @@
-"""
-Mirror circuits benchmark for the Metriq Gym.
-
-This benchmark evaluates a quantum device's ability to execute mirror circuits,
-which are quantum circuits with a reflection structure that perform calculations
-and then reverse them. Mirror circuits provide scalable benchmarking capabilities
-for quantum computers as defined in Proctor et al., arXiv:2008.11294.
+"""Mirror Circuits benchmark implementation.
+
+Summary:
+    Generates randomly parameterised mirror circuits that apply layers of Clifford gates,
+    then invert them to test how well a device preserves state fidelity across the forward
+    and reverse halves of the circuit.
+
+Result interpretation:
+    Polling yields MirrorCircuitsResult with:
+    - success_probability: fraction of runs matching the expected bitstring.
+    - polarization: decay parameter relative to an exponential threshold; higher implies
+      better coherence.
+    - binary_success: boolean indicating whether polarization exceeded 1/e.
+
+Reference:
+    - Proctor et al., "Measuring the Capabilities of Quantum Computers", arXiv:2008.11294.
 """
 
 from dataclasses import dataclass
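The polarization mentioned above is, in the usual mirror-circuit convention, the success probability rescaled so that a fully depolarized device scores 0; a sketch under that assumption, not quoting the module:

```python
# Sketch assuming the standard rescaling p = (S - 1/2^n) / (1 - 1/2^n), where S
# is the success probability and 1/2^n is the random-guessing baseline.
import math


def polarization(counts: dict[str, int], expected: str) -> float:
    dim = 2 ** len(expected)  # dimension of the n-qubit outcome space
    s = counts.get(expected, 0) / sum(counts.values())  # success probability
    return (s - 1 / dim) / (1 - 1 / dim)


def binary_success(counts: dict[str, int], expected: str) -> bool:
    """Apply the 1/e threshold described in the docstring."""
    return polarization(counts, expected) > 1 / math.e
```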
18 changes: 13 additions & 5 deletions metriq_gym/benchmarks/qedc_benchmarks.py
@@ -1,9 +1,17 @@
-"""
-A general structure for dispatching and polling QED-C benchmarks.
-Credit to QED-C for implementing the benchmarks.
+"""QED-C application-oriented benchmark wrapper.
+
+Summary:
+    Provides a generic dispatch/poll pipeline around the QED-C benchmark suite (Bernstein-
+    Vazirani, Phase Estimation, Hidden Shift, Quantum Fourier Transform) via the QC-App-
+    Oriented-Benchmarks submodule.
+
+Result interpretation:
+    Polling returns QEDCResult.circuit_metrics, a nested dictionary keyed by qubit count and
+    circuit identifier, populated with the fidelity or related metrics computed by the QED-C
+    analyser. Inspect the per-circuit entries to understand performance trends.
 
-The benchmarks generate N circuits for M qubits ranging from min_qubits to max_qubits.
-Each circuit is then run, and the metrics are computed.
+Reference:
+    - QED-C QC-App-Oriented-Benchmarks repository for algorithm-specific methodology.
 """
 
 from dataclasses import dataclass
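A small sketch of walking that nested structure; the {num_qubits: {circuit_id: {metric: value}}} layout and the "fidelity" key are assumptions based on the description above:

```python
def print_fidelities(circuit_metrics: dict) -> None:
    # Outer keys: qubit counts; inner keys: circuit identifiers.
    for num_qubits, circuits in sorted(circuit_metrics.items(), key=lambda kv: int(kv[0])):
        for circuit_id, metrics in circuits.items():
            print(f"{num_qubits} qubits, circuit {circuit_id}: fidelity={metrics.get('fidelity')}")
```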
29 changes: 24 additions & 5 deletions metriq_gym/benchmarks/qml_kernel.py
@@ -1,3 +1,19 @@
+"""Quantum Machine Learning Kernel benchmark implementation.
+
+Summary:
+    Constructs a ZZ feature map kernel, computes the inner-product circuit, and measures the
+    probability of returning to the all-zero state as a proxy for kernel quality.
+
+Result interpretation:
+    Polling returns QMLKernelResult.accuracy_score as a BenchmarkScore where:
+    - value: fraction of shots measuring the expected all-zero bitstring.
+    - uncertainty: binomial standard deviation from the sample counts.
+    Higher accuracy suggests better kernel reproducibility on the selected hardware.
+
+Reference:
+    - Inspired by ZZ-feature map approaches, e.g., arXiv:2405.09724.
+"""
+
 import numpy as np
 from dataclasses import dataclass
 
@@ -27,7 +43,9 @@ class QMLKernelData(BenchmarkData):
 
 
 class QMLKernelResult(BenchmarkResult):
-    accuracy_score: BenchmarkScore = Field(..., json_schema_extra={"direction": MetricDirection.HIGHER})
+    accuracy_score: BenchmarkScore = Field(
+        ..., json_schema_extra={"direction": MetricDirection.HIGHER}
+    )
 
 
 def ZZfeature_circuit(num_qubits: int) -> QuantumCircuit:
@@ -79,7 +97,10 @@ def create_inner_product_circuit(num_qubits: int, seed: int = 0) -> QuantumCircuit:
 def calculate_accuracy_score(num_qubits: int, count_results: "MeasCount") -> list[float]:
     expected_state = "0" * num_qubits
     accuracy_score = count_results.get(expected_state, 0) / sum(count_results.values())
-    return [accuracy_score, np.sqrt(accuracy_score * (1 - accuracy_score) / sum(count_results.values()))]
+    return [
+        accuracy_score,
+        np.sqrt(accuracy_score * (1 - accuracy_score) / sum(count_results.values())),
+    ]
 
 
 class QMLKernel(Benchmark):
@@ -96,9 +117,7 @@ def poll_handler(
         result_data: list["GateModelResultData"],
         quantum_jobs: list["QuantumJob"],
     ) -> QMLKernelResult:
-        metrics = calculate_accuracy_score(
-            self.params.num_qubits, flatten_counts(result_data)[0]
-        )
+        metrics = calculate_accuracy_score(self.params.num_qubits, flatten_counts(result_data)[0])
         return QMLKernelResult(
             accuracy_score=BenchmarkScore(
                 value=metrics[0],
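A quick worked example of calculate_accuracy_score from the hunk above (a plain dict stands in for MeasCount, which only needs .get and .values here):

```python
counts = {"0000": 960, "0101": 25, "1111": 15}  # 1000 shots total
value, uncertainty = calculate_accuracy_score(4, counts)
# value = 960 / 1000 = 0.96
# uncertainty = sqrt(0.96 * 0.04 / 1000) ≈ 0.0062
```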
2 changes: 2 additions & 0 deletions metriq_gym/benchmarks/quantum_volume.py
@@ -1,3 +1,5 @@
+"""Quantum Volume benchmark implementation."""
+
 import math
 import statistics
 from scipy.stats import binom
24 changes: 16 additions & 8 deletions metriq_gym/benchmarks/wit.py
@@ -1,13 +1,21 @@
-"""WIT (Wormhole-inspired teleportation) benchmark for the Metriq Gym
-(credit to Paul Nation for the original code for IBM devices).
+"""WIT (wormhole-inspired teleportation) benchmark implementation.
 
-The WIT benchmark is based on the following paper:
-Towards Quantum Gravity in the Lab on Quantum Processors
-Illya Shapoval, Vincent Paul Su, Wibe de Jong, Miro Urbanek, Brian Swingle
-Quantum 7, 1138 (2023)
+Summary:
+    Runs a six- or seven-qubit teleportation-inspired circuit that mimics the protocol from
+    Shapoval et al. (2023) and reports a Pauli-Z expectation value with binomial uncertainty.
 
-A generalized version of the WIT benchmark software can also be found as a companion [software
-repository](https://gitlab.com/ishapova/qglab/-/blob/master/scripts/wormhole.py) to the above paper.
+Result interpretation:
+    Polling returns WITResult.expectation_value as a BenchmarkScore:
+    - value: estimated Pauli-Z expectation (ideal teleportation trends toward +1).
+    - uncertainty: binomial standard deviation computed from the observed counts.
+    Compare value versus uncertainty to decide whether more shots are required or if noise is
+    degrading the teleportation fidelity.
+
+References:
+    - I. Shapoval et al., "Towards Quantum Gravity in the Lab on Quantum Processors", Quantum 7,
+      1138 (2023), arXiv:2205.14081.
+    - Companion script: https://gitlab.com/ishapova/qglab/-/blob/master/scripts/wormhole.py.
+    - Implementation lineage credited to Paul Nation (IBM Quantum).
 """
 
 import numpy as np
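For the reported expectation value, a minimal sketch assuming a single readout qubit (the module's own circuit and post-processing may differ):

```python
# Sketch, assuming one readout qubit with counts keyed by "0"/"1".
import numpy as np


def z_expectation(counts: dict[str, int]) -> tuple[float, float]:
    shots = sum(counts.values())
    p0 = counts.get("0", 0) / shots  # probability of measuring |0>
    value = 2 * p0 - 1  # <Z> = P(0) - P(1)
    uncertainty = 2 * np.sqrt(p0 * (1 - p0) / shots)  # error propagated from p0
    return value, uncertainty
```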