diff --git a/fasterbench/__init__.py b/fasterbench/__init__.py index 5e09119..ec82712 100644 --- a/fasterbench/__init__.py +++ b/fasterbench/__init__.py @@ -19,6 +19,9 @@ from .compute import ComputeMetrics, compute_compute from .memory import MemoryMetrics, compute_memory, compute_memory_multi from .energy import EnergyMetrics, compute_energy, compute_energy_multi +from fasterbench.roofline import ( + HardwarePeaks, RooflinePoint, measure_peaks, clear_peaks_cache, RooflineAnalyzer, +) from .plot import create_radar_plot, SPECS from .utils import parse_metric_value @@ -38,6 +41,8 @@ 'MemoryMetrics', 'compute_memory', 'compute_memory_multi', # Energy 'EnergyMetrics', 'compute_energy', 'compute_energy_multi', + # Roofline + 'HardwarePeaks', 'RooflinePoint', 'measure_peaks', 'clear_peaks_cache', 'RooflineAnalyzer', # Plot 'create_radar_plot', 'SPECS', # Report diff --git a/fasterbench/_modidx.py b/fasterbench/_modidx.py index 28cabd1..b5647f9 100644 --- a/fasterbench/_modidx.py +++ b/fasterbench/_modidx.py @@ -130,6 +130,36 @@ 'fasterbench.report._generate_css': ('analysis/report.html#_generate_css', 'fasterbench/report.py'), 'fasterbench.report._improvement_indicator': ( 'analysis/report.html#_improvement_indicator', 'fasterbench/report.py')}, + 'fasterbench.roofline': { 'fasterbench.roofline.HardwarePeaks': ( 'analysis/roofline.html#hardwarepeaks', + 'fasterbench/roofline.py'), + 'fasterbench.roofline.HardwarePeaks.as_dict': ( 'analysis/roofline.html#hardwarepeaks.as_dict', + 'fasterbench/roofline.py'), + 'fasterbench.roofline.RooflineAnalyzer': ( 'analysis/roofline.html#rooflineanalyzer', + 'fasterbench/roofline.py'), + 'fasterbench.roofline.RooflineAnalyzer.__init__': ( 'analysis/roofline.html#rooflineanalyzer.__init__', + 'fasterbench/roofline.py'), + 'fasterbench.roofline.RooflineAnalyzer.plot': ( 'analysis/roofline.html#rooflineanalyzer.plot', + 'fasterbench/roofline.py'), + 'fasterbench.roofline.RooflineAnalyzer.profile': ( 'analysis/roofline.html#rooflineanalyzer.profile', + 'fasterbench/roofline.py'), + 'fasterbench.roofline.RooflineAnalyzer.results': ( 'analysis/roofline.html#rooflineanalyzer.results', + 'fasterbench/roofline.py'), + 'fasterbench.roofline.RooflineAnalyzer.summary': ( 'analysis/roofline.html#rooflineanalyzer.summary', + 'fasterbench/roofline.py'), + 'fasterbench.roofline.RooflinePoint': ( 'analysis/roofline.html#rooflinepoint', + 'fasterbench/roofline.py'), + 'fasterbench.roofline.RooflinePoint.as_dict': ( 'analysis/roofline.html#rooflinepoint.as_dict', + 'fasterbench/roofline.py'), + 'fasterbench.roofline._layer_flops': ( 'analysis/roofline.html#_layer_flops', + 'fasterbench/roofline.py'), + 'fasterbench.roofline._pinned_benchmark_flags': ( 'analysis/roofline.html#_pinned_benchmark_flags', + 'fasterbench/roofline.py'), + 'fasterbench.roofline._setup_roofline_hooks': ( 'analysis/roofline.html#_setup_roofline_hooks', + 'fasterbench/roofline.py'), + 'fasterbench.roofline.clear_peaks_cache': ( 'analysis/roofline.html#clear_peaks_cache', + 'fasterbench/roofline.py'), + 'fasterbench.roofline.measure_peaks': ( 'analysis/roofline.html#measure_peaks', + 'fasterbench/roofline.py')}, 'fasterbench.size': { 'fasterbench.size.SizeMetrics': ('metrics/size.html#sizemetrics', 'fasterbench/size.py'), 'fasterbench.size.SizeMetrics.as_dict': ('metrics/size.html#sizemetrics.as_dict', 'fasterbench/size.py'), 'fasterbench.size.compute_size': ('metrics/size.html#compute_size', 'fasterbench/size.py'), diff --git a/fasterbench/roofline.py b/fasterbench/roofline.py new file mode 100644 index 
0000000..8ee82d4 --- /dev/null +++ b/fasterbench/roofline.py @@ -0,0 +1,479 @@ +# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/analysis/roofline.ipynb. + +# %% ../nbs/analysis/roofline.ipynb #imports +from __future__ import annotations + +import math +import time +import warnings +from dataclasses import dataclass, asdict +from contextlib import contextmanager + +import numpy as np +import torch +import torch.nn as nn +import plotly.graph_objects as go + +from .core import _device_ctx, _sync, _fmt_float, _section +from .profiling import _leaf_modules, _tensor_bytes, _output_bytes + +# %% auto #0 +__all__ = ['HardwarePeaks', 'measure_peaks', 'clear_peaks_cache', 'RooflinePoint', 'RooflineAnalyzer'] + +# %% ../nbs/analysis/roofline.ipynb #hardware_peaks +@dataclass(slots=True) +class HardwarePeaks: + "Empirically measured achievable peak compute and streaming bandwidth for a device." + peak_flops: float # achievable peak FLOPs/s + peak_bandwidth: float # achievable streaming bandwidth in bytes/s + ridge_point: float # FLOPs/byte = peak_flops / peak_bandwidth + device: str # e.g. "cuda:0", "cpu" + dtype: str # e.g. "torch.float32" + tf32_enabled: bool # whether matmul TF32 was on during probe + cudnn_benchmark: bool # whether cudnn.benchmark was on during probe + + def as_dict(self) -> dict: + return asdict(self) + +# %% ../nbs/analysis/roofline.ipynb #pinned_flags +@contextmanager +def _pinned_benchmark_flags(tf32: bool = False): + "Pin matmul TF32 and cudnn.benchmark during probe; restore on exit." + if torch.cuda.is_available(): + prev_tf32 = torch.backends.cuda.matmul.allow_tf32 + prev_cudnn_tf32 = torch.backends.cudnn.allow_tf32 + prev_bench = torch.backends.cudnn.benchmark + torch.backends.cuda.matmul.allow_tf32 = tf32 + torch.backends.cudnn.allow_tf32 = tf32 + torch.backends.cudnn.benchmark = False + try: + yield (tf32, False) + finally: + torch.backends.cuda.matmul.allow_tf32 = prev_tf32 + torch.backends.cudnn.allow_tf32 = prev_cudnn_tf32 + torch.backends.cudnn.benchmark = prev_bench + else: + yield (False, False) + +# %% ../nbs/analysis/roofline.ipynb #measure_peaks +_PEAKS_CACHE: dict = {} + + +def measure_peaks( + device: str | torch.device = "cuda", # device to probe + *, + dtype: torch.dtype = torch.float32, # probe precision + matmul_size: int = 4096, # N for NxN matmul probe + bandwidth_mb: int = 256, # per-buffer size in MiB (auto-bumped above L3) + warmup: int = 5, # warmup iterations + steps: int = 20, # measurement iterations (report max) + allow_tf32: bool = False, # pin TF32 off by default for honest fp32 peak + cache: bool = True, # cache per (device, dtype, sizes) +) -> HardwarePeaks: + "Empirically probe achievable peak FLOPs/s and streaming bandwidth." 
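+ # Illustrative probe math (hedged, values are made up): an NxN @ NxN matmul
+ # performs N*N*N multiply-accumulates, i.e. 2*N**3 FLOPs, so at the default
+ # N=4096 a single matmul is 2 * 4096**3 ~= 1.37e11 FLOPs; if the best timed
+ # iteration takes 20 ms, the reported peak would be ~6.9e12 FLOPs/s.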
+ dev = torch.device(device) if isinstance(device, str) else device + if dev.type == "cuda" and not torch.cuda.is_available(): + warnings.warn("CUDA requested but not available - falling back to CPU") + dev = torch.device("cpu") + key = (str(dev), str(dtype), matmul_size, bandwidth_mb, allow_tf32) + if cache and key in _PEAKS_CACHE: + return _PEAKS_CACHE[key] + + with _pinned_benchmark_flags(tf32=allow_tf32) as (tf32_on, bench_on): + # --- peak FLOPs probe --- + N = matmul_size + a = torch.randn(N, N, device=dev, dtype=dtype) + b = torch.randn(N, N, device=dev, dtype=dtype) + for _ in range(warmup): + (a @ b) + _sync(dev) + flops_per_matmul = 2.0 * N * N * N + best_flops = 0.0 + c = None + for _ in range(steps): + _sync(dev) + t0 = time.perf_counter() + c = a @ b + _sync(dev) + dt = time.perf_counter() - t0 + if dt > 0: + best_flops = max(best_flops, flops_per_matmul / dt) + del a, b, c + if dev.type == "cuda": + torch.cuda.empty_cache() + + # --- bandwidth probe (cache-defeating) --- + # target buffer sized to blow past L3 and any GPU L2. + buf_bytes = max(bandwidth_mb * 1024 * 1024, 64 * 1024 * 1024) + n_elems = buf_bytes // dtype.itemsize + src = torch.empty(n_elems, device=dev, dtype=dtype).normal_() + dst = torch.empty(n_elems, device=dev, dtype=dtype).normal_() + for _ in range(warmup): + dst.copy_(src) + _sync(dev) + bytes_moved_per_copy = 2.0 * n_elems * dtype.itemsize # read + write + best_bw = 0.0 + for i in range(steps): + # alternate direction to defeat any residency assumptions + s, d = (src, dst) if (i % 2 == 0) else (dst, src) + _sync(dev) + t0 = time.perf_counter() + d.copy_(s) + _sync(dev) + dt = time.perf_counter() - t0 + if dt > 0: + best_bw = max(best_bw, bytes_moved_per_copy / dt) + del src, dst + if dev.type == "cuda": + torch.cuda.empty_cache() + + result = HardwarePeaks( + peak_flops=best_flops, + peak_bandwidth=best_bw, + ridge_point=best_flops / best_bw if best_bw > 0 else float("inf"), + device=str(dev), + dtype=str(dtype), + tf32_enabled=tf32_on, + cudnn_benchmark=bench_on, + ) + if cache: + _PEAKS_CACHE[key] = result + return result + + +def clear_peaks_cache() -> None: + "Reset the measure_peaks() cache." + _PEAKS_CACHE.clear() + +# %% ../nbs/analysis/roofline.ipynb #roofline_point +@dataclass(slots=True) +class RooflinePoint: + "Per-layer roofline measurement. Bytes formula: weight_bytes + input_bytes + output_bytes (each counted once per forward call, per Williams 2009)." + name: str + type: str + flops: float # total FLOPs (2 * MACs) + bytes_moved: float # weights + input + output bytes per forward call + time_s: float # measured wall time (mean over steps) + arithmetic_intensity: float # flops / bytes_moved + achieved_gflops: float # flops / time / 1e9 + bound: str # "memory" | "compute" | "undefined" + utilization_pct: float # achieved / roof * 100 + + def as_dict(self) -> dict: + return asdict(self) + +# %% ../nbs/analysis/roofline.ipynb #roofline_hooks +def _layer_flops(module: nn.Module, inp, output) -> float: + "Estimate FLOPs for a single leaf module forward call. Returns 0 for layer types we do not model." + # Covers the layer types where FLOPs actually dominate - conv, linear, matmul-adjacent. + # Cheap element-wise ops (ReLU, BN, pooling) are intentionally assigned 0 FLOPs; they end up + # in the \"undefined\" bucket (memory-bound trivially) and are surfaced via the warning. 
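+ # Worked example (illustrative): Conv2d(4, 8, kernel_size=3, padding=1, bias=False)
+ # on a 1x4x8x8 input produces a 1x8x8x8 output, so MACs = 1 * 4 * 8 * (3*3) * (8*8)
+ # = 18432 and FLOPs = 2 * MACs = 36864 - the same numbers the hand-computed
+ # notebook test asserts.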
+ t_in = inp[0] if isinstance(inp, tuple) and len(inp) > 0 and isinstance(inp[0], torch.Tensor) else None + t_out = output if isinstance(output, torch.Tensor) else None + + if isinstance(module, (nn.Conv1d, nn.Conv2d, nn.Conv3d)) and t_out is not None: + cin = module.in_channels // module.groups + cout = module.out_channels + ksize = 1 + for k in (module.kernel_size if isinstance(module.kernel_size, tuple) else (module.kernel_size,)): + ksize *= k + spatial = 1 + for s in t_out.shape[2:]: + spatial *= int(s) + batch = int(t_out.shape[0]) if t_out.ndim > 0 else 1 + macs = batch * cin * cout * ksize * spatial + flops = 2.0 * macs + if module.bias is not None: + flops += batch * cout * spatial # bias add + return float(flops) + + if isinstance(module, (nn.ConvTranspose1d, nn.ConvTranspose2d, nn.ConvTranspose3d)) and t_in is not None: + cin = module.in_channels // module.groups + cout = module.out_channels + ksize = 1 + for k in (module.kernel_size if isinstance(module.kernel_size, tuple) else (module.kernel_size,)): + ksize *= k + spatial = 1 + for s in t_in.shape[2:]: + spatial *= int(s) + batch = int(t_in.shape[0]) if t_in.ndim > 0 else 1 + return float(2.0 * batch * cin * cout * ksize * spatial) + + if isinstance(module, nn.Linear) and t_in is not None: + in_features = module.in_features + out_features = module.out_features + # Everything before the last dim is treated as batch. + batch = 1 + for s in t_in.shape[:-1]: + batch *= int(s) + flops = 2.0 * batch * in_features * out_features + if module.bias is not None: + flops += batch * out_features + return float(flops) + + return 0.0 + + +#| export +def _setup_roofline_hooks( + leaf_modules, # {name: module} for leaf modules + bytes_state, # {name: []} to accumulate bytes per call + time_state, # {name: []} to accumulate seconds per call + flops_state, # {name: []} to accumulate FLOPs per call + call_state, # {name: int} counter of forward calls + device_type, # "cuda" or "cpu" +): + "Register hooks to measure (bytes moved, time, FLOPs, call count) per layer." 
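+ # Bytes convention (matches RooflinePoint): weights + inputs + outputs, each
+ # counted once per forward call. E.g. an fp32 Linear(32, 64) with bias
+ # contributes 32*64*4 + 64*4 = 8448 weight bytes to every call it serves.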
+ handles = [] + is_cuda = device_type == "cuda" + + def make_hooks(name: str, module: nn.Module): + w_bytes = sum(_tensor_bytes(p) for p in module.parameters(recurse=False)) + if is_cuda: + state = {"start": None, "end": None} + def pre(mod, inp): + state["start"] = torch.cuda.Event(enable_timing=True) + state["end"] = torch.cuda.Event(enable_timing=True) + state["start"].record() + in_b = sum(_tensor_bytes(t) for t in inp if isinstance(t, torch.Tensor)) + bytes_state[name].append(in_b + w_bytes) + def post(mod, inp, output): + state["end"].record() + out_b = _output_bytes(output) + bytes_state[name][-1] += out_b + torch.cuda.synchronize() + time_state[name].append(state["start"].elapsed_time(state["end"]) / 1000.0) + flops_state[name].append(_layer_flops(mod, inp, output)) + call_state[name] += 1 + else: + state = {"t0": 0.0} + def pre(mod, inp): + state["t0"] = time.perf_counter() + in_b = sum(_tensor_bytes(t) for t in inp if isinstance(t, torch.Tensor)) + bytes_state[name].append(in_b + w_bytes) + def post(mod, inp, output): + dt = time.perf_counter() - state["t0"] + out_b = _output_bytes(output) + bytes_state[name][-1] += out_b + time_state[name].append(dt) + flops_state[name].append(_layer_flops(mod, inp, output)) + call_state[name] += 1 + return pre, post + + for name, mod in leaf_modules.items(): + pre, post = make_hooks(name, mod) + handles.append(mod.register_forward_pre_hook(pre)) + handles.append(mod.register_forward_hook(post)) + return handles + +# %% ../nbs/analysis/roofline.ipynb #roofline_analyzer +class RooflineAnalyzer: + "Per-layer roofline analysis: measure arithmetic intensity and achieved GFLOPs/s against hardware peaks." + + def __init__( + self, + model: nn.Module, # model to analyze + sample: torch.Tensor, # input tensor (with batch dimension) + peaks: HardwarePeaks | None = None, # optional precomputed hardware peaks + ): + self.model = model + self.sample = sample + self.peaks = peaks + self._results: list[RooflinePoint] = [] + + @property + def results(self) -> list[RooflinePoint]: + "Per-layer roofline measurements (populated after profile())." + return self._results + + @torch.no_grad() + def profile( + self, + *, + device: str | torch.device = "cuda", # device to profile on + warmup: int = 5, # warmup iterations + steps: int = 20, # measurement iterations + ) -> list[RooflinePoint]: + "Run per-layer profiling: collect FLOPs, bytes, and time, then classify against peaks." 
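+ # Classification sketch with made-up peaks: at 10 TFLOPs/s and 500 GB/s the
+ # ridge point is 1e13 / 5e11 = 20 FLOP/byte. A layer at AI = 5 has a roof of
+ # 5 * 5e11 = 2.5e12 FLOPs/s and is memory-bound; its util% is taken against
+ # that sloped roof, not against peak_flops.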
+ with _device_ctx(device) as dev: + if dev.type == "cuda": + torch.cuda.empty_cache() + _sync(dev) + + # Ensure peaks are available + if self.peaks is None: + self.peaks = measure_peaks(dev, dtype=self.sample.dtype) + + model = self.model.eval().to(dev) + sample = self.sample.to(dev) + leaf_mods = _leaf_modules(model) + + bytes_state: dict[str, list] = {n: [] for n in leaf_mods} + time_state: dict[str, list] = {n: [] for n in leaf_mods} + flops_state: dict[str, list] = {n: [] for n in leaf_mods} + call_state: dict[str, int] = {n: 0 for n in leaf_mods} + + handles = _setup_roofline_hooks( + leaf_mods, bytes_state, time_state, flops_state, call_state, dev.type, + ) + try: + for _ in range(warmup): + model(sample) + # Reset accumulators after warmup + for n in leaf_mods: + bytes_state[n].clear() + time_state[n].clear() + flops_state[n].clear() + call_state[n] = 0 + for _ in range(steps): + model(sample) + finally: + for h in handles: + h.remove() + + # Warn if any module was invoked >1x per forward pass (shared module) + multi_call = [n for n, c in call_state.items() if steps > 0 and c > steps] + if multi_call: + warnings.warn( + f"{len(multi_call)} module(s) were called more than once per forward " + f"pass; their bytes/time are summed across calls. First: {multi_call[:3]}" + ) + + # --- Build RooflinePoint per layer --- + peak_flops = self.peaks.peak_flops + peak_bw = self.peaks.peak_bandwidth + ridge = self.peaks.ridge_point + + results: list[RooflinePoint] = [] + for name, mod in leaf_mods.items(): + b_list = bytes_state[name] + t_list = time_state[name] + f_list = flops_state[name] + bytes_moved = float(np.mean(b_list)) if b_list else 0.0 + time_s = float(np.mean(t_list)) if t_list else 0.0 + flops = float(np.mean(f_list)) if f_list else 0.0 + + if flops == 0 or bytes_moved == 0 or time_s == 0: + results.append(RooflinePoint( + name=name, type=mod.__class__.__name__, + flops=flops, bytes_moved=bytes_moved, time_s=time_s, + arithmetic_intensity=0.0, achieved_gflops=0.0, + bound="undefined", utilization_pct=0.0, + )) + continue + + ai = flops / bytes_moved + achieved_gflops = flops / time_s / 1e9 + roof_flops = min(peak_flops, ai * peak_bw) + roof_gflops = roof_flops / 1e9 + bound = "memory" if ai < ridge else "compute" + util = (achieved_gflops / roof_gflops * 100) if roof_gflops > 0 else 0.0 + results.append(RooflinePoint( + name=name, type=mod.__class__.__name__, + flops=flops, bytes_moved=bytes_moved, time_s=time_s, + arithmetic_intensity=ai, achieved_gflops=achieved_gflops, + bound=bound, utilization_pct=util, + )) + + # Warn if any layer had zero flops/bytes/time -> bound is "undefined" + zero_flops = [r.name for r in results if r.bound == "undefined"] + if zero_flops: + warnings.warn( + f"{len(zero_flops)} layer(s) have undefined roofline (zero FLOPs, bytes, " + f"or time). First: {zero_flops[:3]}" + ) + + self._results = results + return results + + def summary(self, *, top: int = 10) -> None: + "Print a table of the slowest layers with their roofline metrics." + if not self._results: + raise RuntimeError("No results available. 
Call profile() first.") + print(_section("Roofline", 72)) + header = f" {'name':32} {'type':14} {'FLOPs':>10} {'bytes':>10} {'AI':>8} {'GFLOPs/s':>10} {'bound':>9} {'util%':>7}" + print(header) + # Sort by measured time, descending (slowest first) + sorted_rows = sorted(self._results, key=lambda r: r.time_s, reverse=True)[:top] + for r in sorted_rows: + flops_str = f"{r.flops/1e6:>8.2f}M" if r.flops >= 1e6 else f"{r.flops:>10.0f}" + bytes_str = f"{r.bytes_moved/1e6:>8.2f}M" if r.bytes_moved >= 1e6 else f"{r.bytes_moved:>10.0f}" + ai_str = _fmt_float(r.arithmetic_intensity, width=8, decimals=2) + gf_str = _fmt_float(r.achieved_gflops, width=10, decimals=2) + util_str = _fmt_float(r.utilization_pct, width=6, decimals=1) + "%" + print(f" {r.name:32} {r.type:14} {flops_str} {bytes_str} {ai_str} {gf_str} {r.bound:>9} {util_str}") + + def plot( + self, + *, + title: str = "Roofline", # figure title + ) -> go.Figure: + "Render the roofline plot with per-layer scatter points on a log-log grid. Markers are colored by bound classification." + if not self._results: + raise RuntimeError("No results available. Call profile() first.") + if self.peaks is None: + raise RuntimeError("No hardware peaks available.") + + peak_flops = self.peaks.peak_flops + peak_bw = self.peaks.peak_bandwidth + ridge = self.peaks.ridge_point + peak_gflops = peak_flops / 1e9 + + # Build the roof curve: y = min(peak_flops, AI * peak_bw) / 1e9 + valid = [r for r in self._results if r.bound != "undefined"] + if valid: + ai_min = max(min(r.arithmetic_intensity for r in valid) / 10.0, 1e-3) + ai_max = max(r.arithmetic_intensity for r in valid) * 10.0 + else: + ai_min, ai_max = 1e-2, 1e3 + ai_max = max(ai_max, ridge * 10.0) + + ai_grid = np.logspace(math.log10(ai_min), math.log10(ai_max), 200) + roof_gflops = np.minimum(peak_gflops, ai_grid * peak_bw / 1e9) + + fig = go.Figure() + fig.add_trace(go.Scatter( + x=ai_grid, y=roof_gflops, mode="lines", + line=dict(color="#008080", width=2), + name=f"Roof ({peak_gflops:.0f} GFLOPs/s, {peak_bw/1e9:.1f} GB/s)", + hoverinfo="skip", + )) + # Ridge point marker + fig.add_trace(go.Scatter( + x=[ridge], y=[peak_gflops], mode="markers", + marker=dict(color="#008080", size=10, symbol="diamond"), + name=f"Ridge point ({ridge:.1f} FLOP/byte)", + hovertemplate="Ridge: %{x:.2f} FLOP/byte", + )) + + # Color palette for layers + color_map = {"memory": "#89d6c9", "compute": "#008080"} + for bound_label in ("memory", "compute"): + pts = [r for r in valid if r.bound == bound_label] + if not pts: + continue + hover = [ + f"{r.name}
<br>{r.type}<br>AI: {r.arithmetic_intensity:.3f} FLOP/byte" + f"<br>{r.achieved_gflops:.2f} GFLOPs/s<br>
util: {r.utilization_pct:.1f}%" + for r in pts + ] + fig.add_trace(go.Scatter( + x=[r.arithmetic_intensity for r in pts], + y=[r.achieved_gflops for r in pts], + mode="markers", + marker=dict(color=color_map[bound_label], size=8, opacity=0.8, + line=dict(color="#008080", width=0.5)), + name=f"{bound_label}-bound", + text=hover, + hovertemplate="%{text}", + )) + + fig.update_layout( + title=title, + xaxis=dict(title="Arithmetic intensity (FLOP/byte)", type="log"), + yaxis=dict(title="Achieved performance (GFLOPs/s)", type="log"), + paper_bgcolor="rgba(0,0,0,0)", + plot_bgcolor="rgba(0,0,0,0)", + legend=dict(bgcolor="rgba(0,0,0,0)"), + ) + return fig diff --git a/nbs/_quarto.yml b/nbs/_quarto.yml index 8066f60..2369135 100644 --- a/nbs/_quarto.yml +++ b/nbs/_quarto.yml @@ -35,6 +35,7 @@ website: contents: - tutorials/benchmark.ipynb - tutorials/profiling.ipynb + - tutorials/roofline.ipynb - tutorials/report.ipynb - section: Core contents: @@ -51,6 +52,7 @@ website: contents: - analysis/benchmark.ipynb - analysis/profiling.ipynb + - analysis/roofline.ipynb - analysis/report.ipynb - section: Visualization contents: diff --git a/nbs/analysis/profiling.ipynb b/nbs/analysis/profiling.ipynb index c600166..ca47f46 100644 --- a/nbs/analysis/profiling.ipynb +++ b/nbs/analysis/profiling.ipynb @@ -341,4 +341,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} diff --git a/nbs/analysis/roofline.ipynb b/nbs/analysis/roofline.ipynb new file mode 100644 index 0000000..b4d4024 --- /dev/null +++ b/nbs/analysis/roofline.ipynb @@ -0,0 +1,427 @@ +{ + "cells": [ + { + "cell_type": "raw", + "id": "frontmatter", + "metadata": {}, + "source": [ + "---\ndescription: Roofline analysis for arithmetic intensity vs achieved performance\noutput-file: roofline.html\ntitle: Roofline\nskip_showdoc: true\n---" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "default_exp", + "metadata": {}, + "outputs": [], + "source": [ + "#| default_exp roofline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "setup_showdoc", + "metadata": {}, + "outputs": [], + "source": [ + "#| include: false\n", + "from nbdev.showdoc import *" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "imports", + "metadata": {}, + "outputs": [], + "source": "#| export\nfrom __future__ import annotations\n\nimport math\nimport time\nimport warnings\nfrom dataclasses import dataclass, asdict\nfrom contextlib import contextmanager\n\nimport numpy as np\nimport torch\nimport torch.nn as nn\nimport plotly.graph_objects as go\n\nfrom fasterbench.core import _device_ctx, _sync, _fmt_float, _section\nfrom fasterbench.profiling import _leaf_modules, _tensor_bytes, _output_bytes" + }, + { + "cell_type": "code", + "execution_count": null, + "id": "hardware_peaks", + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "@dataclass(slots=True)\n", + "class HardwarePeaks:\n", + " \"Empirically measured achievable peak compute and streaming bandwidth for a device.\"\n", + " peak_flops: float # achievable peak FLOPs/s\n", + " peak_bandwidth: float # achievable streaming bandwidth in bytes/s\n", + " ridge_point: float # FLOPs/byte = peak_flops / peak_bandwidth\n", + " device: str # e.g. \"cuda:0\", \"cpu\"\n", + " dtype: str # e.g. 
\"torch.float32\"\n", + " tf32_enabled: bool # whether matmul TF32 was on during probe\n", + " cudnn_benchmark: bool # whether cudnn.benchmark was on during probe\n", + "\n", + " def as_dict(self) -> dict:\n", + " return asdict(self)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "sd_hw_peaks", + "metadata": {}, + "outputs": [], + "source": [ + "show_doc(HardwarePeaks)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "pinned_flags", + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "@contextmanager\n", + "def _pinned_benchmark_flags(tf32: bool = False):\n", + " \"Pin matmul TF32 and cudnn.benchmark during probe; restore on exit.\"\n", + " if torch.cuda.is_available():\n", + " prev_tf32 = torch.backends.cuda.matmul.allow_tf32\n", + " prev_cudnn_tf32 = torch.backends.cudnn.allow_tf32\n", + " prev_bench = torch.backends.cudnn.benchmark\n", + " torch.backends.cuda.matmul.allow_tf32 = tf32\n", + " torch.backends.cudnn.allow_tf32 = tf32\n", + " torch.backends.cudnn.benchmark = False\n", + " try:\n", + " yield (tf32, False)\n", + " finally:\n", + " torch.backends.cuda.matmul.allow_tf32 = prev_tf32\n", + " torch.backends.cudnn.allow_tf32 = prev_cudnn_tf32\n", + " torch.backends.cudnn.benchmark = prev_bench\n", + " else:\n", + " yield (False, False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "measure_peaks", + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "_PEAKS_CACHE: dict = {}\n", + "\n", + "\n", + "def measure_peaks(\n", + " device: str | torch.device = \"cuda\", # device to probe\n", + " *,\n", + " dtype: torch.dtype = torch.float32, # probe precision\n", + " matmul_size: int = 4096, # N for NxN matmul probe\n", + " bandwidth_mb: int = 256, # per-buffer size in MiB (auto-bumped above L3)\n", + " warmup: int = 5, # warmup iterations\n", + " steps: int = 20, # measurement iterations (report max)\n", + " allow_tf32: bool = False, # pin TF32 off by default for honest fp32 peak\n", + " cache: bool = True, # cache per (device, dtype, sizes)\n", + ") -> HardwarePeaks:\n", + " \"Empirically probe achievable peak FLOPs/s and streaming bandwidth.\"\n", + " dev = torch.device(device) if isinstance(device, str) else device\n", + " if dev.type == \"cuda\" and not torch.cuda.is_available():\n", + " warnings.warn(\"CUDA requested but not available - falling back to CPU\")\n", + " dev = torch.device(\"cpu\")\n", + " key = (str(dev), str(dtype), matmul_size, bandwidth_mb, allow_tf32)\n", + " if cache and key in _PEAKS_CACHE:\n", + " return _PEAKS_CACHE[key]\n", + "\n", + " with _pinned_benchmark_flags(tf32=allow_tf32) as (tf32_on, bench_on):\n", + " # --- peak FLOPs probe ---\n", + " N = matmul_size\n", + " a = torch.randn(N, N, device=dev, dtype=dtype)\n", + " b = torch.randn(N, N, device=dev, dtype=dtype)\n", + " for _ in range(warmup):\n", + " (a @ b)\n", + " _sync(dev)\n", + " flops_per_matmul = 2.0 * N * N * N\n", + " best_flops = 0.0\n", + " c = None\n", + " for _ in range(steps):\n", + " _sync(dev)\n", + " t0 = time.perf_counter()\n", + " c = a @ b\n", + " _sync(dev)\n", + " dt = time.perf_counter() - t0\n", + " if dt > 0:\n", + " best_flops = max(best_flops, flops_per_matmul / dt)\n", + " del a, b, c\n", + " if dev.type == \"cuda\":\n", + " torch.cuda.empty_cache()\n", + "\n", + " # --- bandwidth probe (cache-defeating) ---\n", + " # target buffer sized to blow past L3 and any GPU L2.\n", + " buf_bytes = max(bandwidth_mb * 1024 * 1024, 64 * 1024 * 1024)\n", + " n_elems = buf_bytes // 
dtype.itemsize\n", + " src = torch.empty(n_elems, device=dev, dtype=dtype).normal_()\n", + " dst = torch.empty(n_elems, device=dev, dtype=dtype).normal_()\n", + " for _ in range(warmup):\n", + " dst.copy_(src)\n", + " _sync(dev)\n", + " bytes_moved_per_copy = 2.0 * n_elems * dtype.itemsize # read + write\n", + " best_bw = 0.0\n", + " for i in range(steps):\n", + " # alternate direction to defeat any residency assumptions\n", + " s, d = (src, dst) if (i % 2 == 0) else (dst, src)\n", + " _sync(dev)\n", + " t0 = time.perf_counter()\n", + " d.copy_(s)\n", + " _sync(dev)\n", + " dt = time.perf_counter() - t0\n", + " if dt > 0:\n", + " best_bw = max(best_bw, bytes_moved_per_copy / dt)\n", + " del src, dst\n", + " if dev.type == \"cuda\":\n", + " torch.cuda.empty_cache()\n", + "\n", + " result = HardwarePeaks(\n", + " peak_flops=best_flops,\n", + " peak_bandwidth=best_bw,\n", + " ridge_point=best_flops / best_bw if best_bw > 0 else float(\"inf\"),\n", + " device=str(dev),\n", + " dtype=str(dtype),\n", + " tf32_enabled=tf32_on,\n", + " cudnn_benchmark=bench_on,\n", + " )\n", + " if cache:\n", + " _PEAKS_CACHE[key] = result\n", + " return result\n", + "\n", + "\n", + "def clear_peaks_cache() -> None:\n", + " \"Reset the measure_peaks() cache.\"\n", + " _PEAKS_CACHE.clear()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "sd_measure_peaks", + "metadata": {}, + "outputs": [], + "source": [ + "show_doc(measure_peaks)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "roofline_point", + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "@dataclass(slots=True)\n", + "class RooflinePoint:\n", + " \"Per-layer roofline measurement. Bytes formula: weight_bytes + input_bytes + output_bytes (each counted once per forward call, per Williams 2009).\"\n", + " name: str\n", + " type: str\n", + " flops: float # total FLOPs (2 * MACs)\n", + " bytes_moved: float # weights + input + output bytes per forward call\n", + " time_s: float # measured wall time (mean over steps)\n", + " arithmetic_intensity: float # flops / bytes_moved\n", + " achieved_gflops: float # flops / time / 1e9\n", + " bound: str # \"memory\" | \"compute\" | \"undefined\"\n", + " utilization_pct: float # achieved / roof * 100\n", + "\n", + " def as_dict(self) -> dict:\n", + " return asdict(self)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "sd_roofline_point", + "metadata": {}, + "outputs": [], + "source": [ + "show_doc(RooflinePoint)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "roofline_hooks", + "metadata": {}, + "outputs": [], + "source": "#| export\ndef _layer_flops(module: nn.Module, inp, output) -> float:\n \"Estimate FLOPs for a single leaf module forward call. 
Returns 0 for layer types we do not model.\"\n # Covers the layer types where FLOPs actually dominate - conv, linear, matmul-adjacent.\n # Cheap element-wise ops (ReLU, BN, pooling) are intentionally assigned 0 FLOPs; they end up\n # in the \\\"undefined\\\" bucket (memory-bound trivially) and are surfaced via the warning.\n t_in = inp[0] if isinstance(inp, tuple) and len(inp) > 0 and isinstance(inp[0], torch.Tensor) else None\n t_out = output if isinstance(output, torch.Tensor) else None\n\n if isinstance(module, (nn.Conv1d, nn.Conv2d, nn.Conv3d)) and t_out is not None:\n cin = module.in_channels // module.groups\n cout = module.out_channels\n ksize = 1\n for k in (module.kernel_size if isinstance(module.kernel_size, tuple) else (module.kernel_size,)):\n ksize *= k\n spatial = 1\n for s in t_out.shape[2:]:\n spatial *= int(s)\n batch = int(t_out.shape[0]) if t_out.ndim > 0 else 1\n macs = batch * cin * cout * ksize * spatial\n flops = 2.0 * macs\n if module.bias is not None:\n flops += batch * cout * spatial # bias add\n return float(flops)\n\n if isinstance(module, (nn.ConvTranspose1d, nn.ConvTranspose2d, nn.ConvTranspose3d)) and t_in is not None:\n cin = module.in_channels // module.groups\n cout = module.out_channels\n ksize = 1\n for k in (module.kernel_size if isinstance(module.kernel_size, tuple) else (module.kernel_size,)):\n ksize *= k\n spatial = 1\n for s in t_in.shape[2:]:\n spatial *= int(s)\n batch = int(t_in.shape[0]) if t_in.ndim > 0 else 1\n return float(2.0 * batch * cin * cout * ksize * spatial)\n\n if isinstance(module, nn.Linear) and t_in is not None:\n in_features = module.in_features\n out_features = module.out_features\n # Everything before the last dim is treated as batch.\n batch = 1\n for s in t_in.shape[:-1]:\n batch *= int(s)\n flops = 2.0 * batch * in_features * out_features\n if module.bias is not None:\n flops += batch * out_features\n return float(flops)\n\n return 0.0\n\n\n#| export\ndef _setup_roofline_hooks(\n leaf_modules, # {name: module} for leaf modules\n bytes_state, # {name: []} to accumulate bytes per call\n time_state, # {name: []} to accumulate seconds per call\n flops_state, # {name: []} to accumulate FLOPs per call\n call_state, # {name: int} counter of forward calls\n device_type, # \"cuda\" or \"cpu\"\n):\n \"Register hooks to measure (bytes moved, time, FLOPs, call count) per layer.\"\n handles = []\n is_cuda = device_type == \"cuda\"\n\n def make_hooks(name: str, module: nn.Module):\n w_bytes = sum(_tensor_bytes(p) for p in module.parameters(recurse=False))\n if is_cuda:\n state = {\"start\": None, \"end\": None}\n def pre(mod, inp):\n state[\"start\"] = torch.cuda.Event(enable_timing=True)\n state[\"end\"] = torch.cuda.Event(enable_timing=True)\n state[\"start\"].record()\n in_b = sum(_tensor_bytes(t) for t in inp if isinstance(t, torch.Tensor))\n bytes_state[name].append(in_b + w_bytes)\n def post(mod, inp, output):\n state[\"end\"].record()\n out_b = _output_bytes(output)\n bytes_state[name][-1] += out_b\n torch.cuda.synchronize()\n time_state[name].append(state[\"start\"].elapsed_time(state[\"end\"]) / 1000.0)\n flops_state[name].append(_layer_flops(mod, inp, output))\n call_state[name] += 1\n else:\n state = {\"t0\": 0.0}\n def pre(mod, inp):\n state[\"t0\"] = time.perf_counter()\n in_b = sum(_tensor_bytes(t) for t in inp if isinstance(t, torch.Tensor))\n bytes_state[name].append(in_b + w_bytes)\n def post(mod, inp, output):\n dt = time.perf_counter() - state[\"t0\"]\n out_b = _output_bytes(output)\n bytes_state[name][-1] += out_b\n 
time_state[name].append(dt)\n flops_state[name].append(_layer_flops(mod, inp, output))\n call_state[name] += 1\n return pre, post\n\n for name, mod in leaf_modules.items():\n pre, post = make_hooks(name, mod)\n handles.append(mod.register_forward_pre_hook(pre))\n handles.append(mod.register_forward_hook(post))\n return handles" + }, + { + "cell_type": "code", + "execution_count": null, + "id": "roofline_analyzer", + "metadata": {}, + "outputs": [], + "source": "#| export\nclass RooflineAnalyzer:\n \"Per-layer roofline analysis: measure arithmetic intensity and achieved GFLOPs/s against hardware peaks.\"\n\n def __init__(\n self,\n model: nn.Module, # model to analyze\n sample: torch.Tensor, # input tensor (with batch dimension)\n peaks: HardwarePeaks | None = None, # optional precomputed hardware peaks\n ):\n self.model = model\n self.sample = sample\n self.peaks = peaks\n self._results: list[RooflinePoint] = []\n\n @property\n def results(self) -> list[RooflinePoint]:\n \"Per-layer roofline measurements (populated after profile()).\"\n return self._results\n\n @torch.no_grad()\n def profile(\n self,\n *,\n device: str | torch.device = \"cuda\", # device to profile on\n warmup: int = 5, # warmup iterations\n steps: int = 20, # measurement iterations\n ) -> list[RooflinePoint]:\n \"Run per-layer profiling: collect FLOPs, bytes, and time, then classify against peaks.\"\n with _device_ctx(device) as dev:\n if dev.type == \"cuda\":\n torch.cuda.empty_cache()\n _sync(dev)\n\n # Ensure peaks are available\n if self.peaks is None:\n self.peaks = measure_peaks(dev, dtype=self.sample.dtype)\n\n model = self.model.eval().to(dev)\n sample = self.sample.to(dev)\n leaf_mods = _leaf_modules(model)\n\n bytes_state: dict[str, list] = {n: [] for n in leaf_mods}\n time_state: dict[str, list] = {n: [] for n in leaf_mods}\n flops_state: dict[str, list] = {n: [] for n in leaf_mods}\n call_state: dict[str, int] = {n: 0 for n in leaf_mods}\n\n handles = _setup_roofline_hooks(\n leaf_mods, bytes_state, time_state, flops_state, call_state, dev.type,\n )\n try:\n for _ in range(warmup):\n model(sample)\n # Reset accumulators after warmup\n for n in leaf_mods:\n bytes_state[n].clear()\n time_state[n].clear()\n flops_state[n].clear()\n call_state[n] = 0\n for _ in range(steps):\n model(sample)\n finally:\n for h in handles:\n h.remove()\n\n # Warn if any module was invoked >1x per forward pass (shared module)\n multi_call = [n for n, c in call_state.items() if steps > 0 and c > steps]\n if multi_call:\n warnings.warn(\n f\"{len(multi_call)} module(s) were called more than once per forward \"\n f\"pass; their bytes/time are summed across calls. 
First: {multi_call[:3]}\"\n )\n\n # --- Build RooflinePoint per layer ---\n peak_flops = self.peaks.peak_flops\n peak_bw = self.peaks.peak_bandwidth\n ridge = self.peaks.ridge_point\n\n results: list[RooflinePoint] = []\n for name, mod in leaf_mods.items():\n b_list = bytes_state[name]\n t_list = time_state[name]\n f_list = flops_state[name]\n bytes_moved = float(np.mean(b_list)) if b_list else 0.0\n time_s = float(np.mean(t_list)) if t_list else 0.0\n flops = float(np.mean(f_list)) if f_list else 0.0\n\n if flops == 0 or bytes_moved == 0 or time_s == 0:\n results.append(RooflinePoint(\n name=name, type=mod.__class__.__name__,\n flops=flops, bytes_moved=bytes_moved, time_s=time_s,\n arithmetic_intensity=0.0, achieved_gflops=0.0,\n bound=\"undefined\", utilization_pct=0.0,\n ))\n continue\n\n ai = flops / bytes_moved\n achieved_gflops = flops / time_s / 1e9\n roof_flops = min(peak_flops, ai * peak_bw)\n roof_gflops = roof_flops / 1e9\n bound = \"memory\" if ai < ridge else \"compute\"\n util = (achieved_gflops / roof_gflops * 100) if roof_gflops > 0 else 0.0\n results.append(RooflinePoint(\n name=name, type=mod.__class__.__name__,\n flops=flops, bytes_moved=bytes_moved, time_s=time_s,\n arithmetic_intensity=ai, achieved_gflops=achieved_gflops,\n bound=bound, utilization_pct=util,\n ))\n\n # Warn if any layer had zero flops/bytes/time -> bound is \"undefined\"\n zero_flops = [r.name for r in results if r.bound == \"undefined\"]\n if zero_flops:\n warnings.warn(\n f\"{len(zero_flops)} layer(s) have undefined roofline (zero FLOPs, bytes, \"\n f\"or time). First: {zero_flops[:3]}\"\n )\n\n self._results = results\n return results\n\n def summary(self, *, top: int = 10) -> None:\n \"Print a table of the slowest layers with their roofline metrics.\"\n if not self._results:\n raise RuntimeError(\"No results available. Call profile() first.\")\n print(_section(\"Roofline\", 72))\n header = f\" {'name':32} {'type':14} {'FLOPs':>10} {'bytes':>10} {'AI':>8} {'GFLOPs/s':>10} {'bound':>9} {'util%':>7}\"\n print(header)\n # Sort by measured time, descending (slowest first)\n sorted_rows = sorted(self._results, key=lambda r: r.time_s, reverse=True)[:top]\n for r in sorted_rows:\n flops_str = f\"{r.flops/1e6:>8.2f}M\" if r.flops >= 1e6 else f\"{r.flops:>10.0f}\"\n bytes_str = f\"{r.bytes_moved/1e6:>8.2f}M\" if r.bytes_moved >= 1e6 else f\"{r.bytes_moved:>10.0f}\"\n ai_str = _fmt_float(r.arithmetic_intensity, width=8, decimals=2)\n gf_str = _fmt_float(r.achieved_gflops, width=10, decimals=2)\n util_str = _fmt_float(r.utilization_pct, width=6, decimals=1) + \"%\"\n print(f\" {r.name:32} {r.type:14} {flops_str} {bytes_str} {ai_str} {gf_str} {r.bound:>9} {util_str}\")\n\n def plot(\n self,\n *,\n title: str = \"Roofline\", # figure title\n ) -> go.Figure:\n \"Render the roofline plot with per-layer scatter points on a log-log grid. Markers are colored by bound classification.\"\n if not self._results:\n raise RuntimeError(\"No results available. 
Call profile() first.\")\n if self.peaks is None:\n raise RuntimeError(\"No hardware peaks available.\")\n\n peak_flops = self.peaks.peak_flops\n peak_bw = self.peaks.peak_bandwidth\n ridge = self.peaks.ridge_point\n peak_gflops = peak_flops / 1e9\n\n # Build the roof curve: y = min(peak_flops, AI * peak_bw) / 1e9\n valid = [r for r in self._results if r.bound != \"undefined\"]\n if valid:\n ai_min = max(min(r.arithmetic_intensity for r in valid) / 10.0, 1e-3)\n ai_max = max(r.arithmetic_intensity for r in valid) * 10.0\n else:\n ai_min, ai_max = 1e-2, 1e3\n ai_max = max(ai_max, ridge * 10.0)\n\n ai_grid = np.logspace(math.log10(ai_min), math.log10(ai_max), 200)\n roof_gflops = np.minimum(peak_gflops, ai_grid * peak_bw / 1e9)\n\n fig = go.Figure()\n fig.add_trace(go.Scatter(\n x=ai_grid, y=roof_gflops, mode=\"lines\",\n line=dict(color=\"#008080\", width=2),\n name=f\"Roof ({peak_gflops:.0f} GFLOPs/s, {peak_bw/1e9:.1f} GB/s)\",\n hoverinfo=\"skip\",\n ))\n # Ridge point marker\n fig.add_trace(go.Scatter(\n x=[ridge], y=[peak_gflops], mode=\"markers\",\n marker=dict(color=\"#008080\", size=10, symbol=\"diamond\"),\n name=f\"Ridge point ({ridge:.1f} FLOP/byte)\",\n hovertemplate=\"Ridge: %{x:.2f} FLOP/byte\",\n ))\n\n # Color palette for layers\n color_map = {\"memory\": \"#89d6c9\", \"compute\": \"#008080\"}\n for bound_label in (\"memory\", \"compute\"):\n pts = [r for r in valid if r.bound == bound_label]\n if not pts:\n continue\n hover = [\n f\"{r.name}
<br>{r.type}<br>AI: {r.arithmetic_intensity:.3f} FLOP/byte\"\n                f\"<br>{r.achieved_gflops:.2f} GFLOPs/s<br>
util: {r.utilization_pct:.1f}%\"\n for r in pts\n ]\n fig.add_trace(go.Scatter(\n x=[r.arithmetic_intensity for r in pts],\n y=[r.achieved_gflops for r in pts],\n mode=\"markers\",\n marker=dict(color=color_map[bound_label], size=8, opacity=0.8,\n line=dict(color=\"#008080\", width=0.5)),\n name=f\"{bound_label}-bound\",\n text=hover,\n hovertemplate=\"%{text}\",\n ))\n\n fig.update_layout(\n title=title,\n xaxis=dict(title=\"Arithmetic intensity (FLOP/byte)\", type=\"log\"),\n yaxis=dict(title=\"Achieved performance (GFLOPs/s)\", type=\"log\"),\n paper_bgcolor=\"rgba(0,0,0,0)\",\n plot_bgcolor=\"rgba(0,0,0,0)\",\n legend=dict(bgcolor=\"rgba(0,0,0,0)\"),\n )\n return fig" + }, + { + "cell_type": "code", + "execution_count": null, + "id": "sd_analyzer", + "metadata": {}, + "outputs": [], + "source": [ + "show_doc(RooflineAnalyzer)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "sd_profile", + "metadata": {}, + "outputs": [], + "source": [ + "show_doc(RooflineAnalyzer.profile)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "sd_summary", + "metadata": {}, + "outputs": [], + "source": [ + "show_doc(RooflineAnalyzer.summary)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "sd_plot", + "metadata": {}, + "outputs": [], + "source": [ + "show_doc(RooflineAnalyzer.plot)" + ] + }, + { + "cell_type": "markdown", + "id": "usage", + "metadata": {}, + "source": [ + "## Usage\n", + "\n", + "```python\n", + "from fasterbench.roofline import RooflineAnalyzer\n", + "\n", + "ra = RooflineAnalyzer(model, sample)\n", + "ra.profile(device=\"cuda\")\n", + "ra.summary()\n", + "fig = ra.plot()\n", + "fig.show()\n", + "```\n", + "\n", + "This is a measurement primitive. Downstream compression workflows (see fasterrecipes) can consume `ra.results` to make decisions - fasterbench itself never prescribes." 
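, + "\n", + "A minimal sketch of downstream consumption (hypothetical code, not part of fasterbench): rank the memory-bound layers that reach the smallest fraction of their roof.\n", + "\n", + "```python\n", + "worst = sorted(\n", + "    (r for r in ra.results if r.bound == 'memory'),\n", + "    key=lambda r: r.utilization_pct,\n", + ")[:5]\n", + "for r in worst:\n", + "    print(f'{r.name}: {r.utilization_pct:.1f}% of roof')\n", + "```"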
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "test_peaks_basic", + "metadata": {}, + "outputs": [], + "source": [ + "#| hide\n", + "from fastcore.test import *\n", + "\n", + "_p = measure_peaks(\"cpu\", steps=3, warmup=1, matmul_size=256, bandwidth_mb=32, cache=False)\n", + "assert isinstance(_p, HardwarePeaks)\n", + "assert _p.peak_flops > 0\n", + "assert _p.peak_bandwidth > 0\n", + "test_close(_p.ridge_point, _p.peak_flops / _p.peak_bandwidth, eps=1e-6)\n", + "assert _p.device == \"cpu\"\n", + "assert _p.tf32_enabled is False\n", + "assert _p.cudnn_benchmark is False" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "test_peaks_cache", + "metadata": {}, + "outputs": [], + "source": [ + "#| hide\n", + "clear_peaks_cache()\n", + "_p1 = measure_peaks(\"cpu\", steps=2, warmup=1, matmul_size=128, bandwidth_mb=16, cache=True)\n", + "_p2 = measure_peaks(\"cpu\", steps=2, warmup=1, matmul_size=128, bandwidth_mb=16, cache=True)\n", + "assert _p1 is _p2 # cache hit returns same object\n", + "\n", + "clear_peaks_cache()\n", + "_p3 = measure_peaks(\"cpu\", steps=2, warmup=1, matmul_size=128, bandwidth_mb=16, cache=True)\n", + "assert _p3 is not _p1 # cache was cleared" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "test_conv_hand_computed", + "metadata": {}, + "outputs": [], + "source": "#| hide\n# Hand-computed Conv2d test: wrap in Sequential so leaf name is non-empty\nimport torch\nimport torch.nn as nn\n\n_synth_peaks = HardwarePeaks(\n peak_flops=1e12, peak_bandwidth=1e11, ridge_point=10.0,\n device=\"cpu\", dtype=\"torch.float32\",\n tf32_enabled=False, cudnn_benchmark=False,\n)\n_conv = nn.Conv2d(4, 8, kernel_size=3, padding=1, bias=False)\n_x = torch.randn(1, 4, 8, 8)\n_model = nn.Sequential(_conv)\n_ra = RooflineAnalyzer(_model, _x, peaks=_synth_peaks)\n_res = _ra.profile(device=\"cpu\", warmup=1, steps=2)\nassert len(_res) == 1\n_r = _res[0]\n# Expected MACs = 4*8*3*3 * 8*8 = 18432; FLOPs = 2*MACs = 36864\ntest_eq(_r.flops, 36864.0)\n# Expected bytes: weights 4*8*3*3*4 = 1152, input 1*4*8*8*4 = 1024, output 1*8*8*8*4 = 2048\n# total = 4224\ntest_eq(_r.bytes_moved, 4224.0)\ntest_close(_r.arithmetic_intensity, 36864.0 / 4224.0, eps=1e-3)\nassert _r.bound in (\"memory\", \"compute\")\nassert math.isfinite(_r.utilization_pct)" + }, + { + "cell_type": "code", + "execution_count": null, + "id": "test_linear_stack", + "metadata": {}, + "outputs": [], + "source": [ + "#| hide\n", + "# Tiny Linear stack with synthetic peaks\n", + "_m = nn.Sequential(nn.Linear(32, 64), nn.ReLU(), nn.Linear(64, 16))\n", + "_x = torch.randn(1, 32)\n", + "_peaks = HardwarePeaks(\n", + " peak_flops=1e12, peak_bandwidth=1e11, ridge_point=10.0,\n", + " device=\"cpu\", dtype=\"torch.float32\",\n", + " tf32_enabled=False, cudnn_benchmark=False,\n", + ")\n", + "_ra = RooflineAnalyzer(_m, _x, peaks=_peaks)\n", + "_res = _ra.profile(device=\"cpu\", warmup=1, steps=2)\n", + "assert len(_res) > 0\n", + "for _r in _res:\n", + " assert _r.arithmetic_intensity >= 0\n", + " assert _r.bound in {\"memory\", \"compute\", \"undefined\"}\n", + " assert math.isfinite(_r.utilization_pct)\n", + "# summary and plot should run without error\n", + "_ra.summary(top=5)\n", + "_fig = _ra.plot()\n", + "assert isinstance(_fig, go.Figure)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "test_slow_resnet", + "metadata": {}, + "outputs": [], + "source": "#| hide\n#| notest\nfrom torchvision.models import resnet18\n\n_model = resnet18()\n_sample = 
torch.randn(1, 3, 64, 64)\n_synth = HardwarePeaks(\n peak_flops=1e13, peak_bandwidth=5e11, ridge_point=20.0,\n device=\"cpu\", dtype=\"torch.float32\",\n tf32_enabled=False, cudnn_benchmark=False,\n)\n_ra = RooflineAnalyzer(_model, _sample, peaks=_synth)\n_results = _ra.profile(device=\"cpu\", warmup=2, steps=3)\nassert len(_results) > 0\nassert all(r.bound in {\"memory\", \"compute\", \"undefined\"} for r in _results)\n_ra.summary(top=5)\n_fig = _ra.plot()\nassert isinstance(_fig, go.Figure)" + }, + { + "cell_type": "code", + "execution_count": null, + "id": "test_slow_cuda", + "metadata": {}, + "outputs": [], + "source": "#| hide\n#| notest\nif torch.cuda.is_available():\n _p = measure_peaks(device=\"cuda\", matmul_size=512, bandwidth_mb=64, steps=3, warmup=1, cache=False)\n assert _p.peak_flops > 0\n assert _p.peak_bandwidth > 0\n assert _p.ridge_point > 0" + }, + { + "cell_type": "markdown", + "id": "see_also", + "metadata": {}, + "source": "---\n\n## See Also\n\n- [Per-layer profiling](profiling.html) - Generic per-layer hook infrastructure reused here\n- [Compute metrics](../metrics/compute.html) - Model-level FLOPs counting\n- [Speed metrics](../metrics/speed.html) - Latency measurement" + } + ], + "metadata": { + "kernelspec": { + "display_name": "python3", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/nbs/index.ipynb b/nbs/index.ipynb index a6b7be0..9c8be32 100644 --- a/nbs/index.ipynb +++ b/nbs/index.ipynb @@ -37,47 +37,7 @@ "id": "8b6f8c52", "metadata": {}, "outputs": [], - "source": [ - "#| export\n", - "from fasterbench.benchmark import benchmark, BenchmarkResult\n", - "from fasterbench.size import SizeMetrics, compute_size, get_model_size, get_num_parameters\n", - "from fasterbench.speed import (\n", - " SpeedMetrics, compute_speed, compute_speed_multi, \n", - " sweep_threads, sweep_latency, sweep_batch_sizes,\n", - ")\n", - "from fasterbench.profiling import LayerProfiler\n", - "from fasterbench.compute import ComputeMetrics, compute_compute\n", - "from fasterbench.memory import MemoryMetrics, compute_memory, compute_memory_multi\n", - "from fasterbench.energy import EnergyMetrics, compute_energy, compute_energy_multi\n", - "from fasterbench.plot import create_radar_plot, SPECS\n", - "from fasterbench.utils import parse_metric_value\n", - "\n", - "__all__ = [\n", - " # Main entry point\n", - " 'benchmark', 'BenchmarkResult',\n", - " # Size\n", - " 'SizeMetrics', 'compute_size', 'get_model_size', 'get_num_parameters',\n", - " # Speed\n", - " 'SpeedMetrics', 'compute_speed', 'compute_speed_multi', \n", - " 'sweep_threads', 'sweep_latency', 'sweep_batch_sizes',\n", - " # Profiling\n", - " 'LayerProfiler',\n", - " # Compute\n", - " 'ComputeMetrics', 'compute_compute',\n", - " # Memory\n", - " 'MemoryMetrics', 'compute_memory', 'compute_memory_multi',\n", - " # Energy\n", - " 'EnergyMetrics', 'compute_energy', 'compute_energy_multi',\n", - " # Plot\n", - " 'create_radar_plot', 'SPECS',\n", - " # Report\n", - " 'Report', 'ComparisonReport', 'ReportMetricDelta',\n", - " # Utils\n", - " 'parse_metric_value',\n", - "]\n", - "from fasterbench.report import Report, ComparisonReport, ReportMetricDelta\n", - "" - ] + "source": "#| export\nfrom fasterbench.benchmark import benchmark, BenchmarkResult\nfrom fasterbench.size import SizeMetrics, compute_size, get_model_size, get_num_parameters\nfrom fasterbench.speed import (\n SpeedMetrics, compute_speed, compute_speed_multi, \n sweep_threads, sweep_latency, 
sweep_batch_sizes,\n)\nfrom fasterbench.profiling import LayerProfiler\nfrom fasterbench.compute import ComputeMetrics, compute_compute\nfrom fasterbench.memory import MemoryMetrics, compute_memory, compute_memory_multi\nfrom fasterbench.energy import EnergyMetrics, compute_energy, compute_energy_multi\nfrom fasterbench.roofline import (\n HardwarePeaks, RooflinePoint, measure_peaks, clear_peaks_cache, RooflineAnalyzer,\n)\nfrom fasterbench.plot import create_radar_plot, SPECS\nfrom fasterbench.utils import parse_metric_value\n\n__all__ = [\n # Main entry point\n 'benchmark', 'BenchmarkResult',\n # Size\n 'SizeMetrics', 'compute_size', 'get_model_size', 'get_num_parameters',\n # Speed\n 'SpeedMetrics', 'compute_speed', 'compute_speed_multi', \n 'sweep_threads', 'sweep_latency', 'sweep_batch_sizes',\n # Profiling\n 'LayerProfiler',\n # Compute\n 'ComputeMetrics', 'compute_compute',\n # Memory\n 'MemoryMetrics', 'compute_memory', 'compute_memory_multi',\n # Energy\n 'EnergyMetrics', 'compute_energy', 'compute_energy_multi',\n # Roofline\n 'HardwarePeaks', 'RooflinePoint', 'measure_peaks', 'clear_peaks_cache', 'RooflineAnalyzer',\n # Plot\n 'create_radar_plot', 'SPECS',\n # Report\n 'Report', 'ComparisonReport', 'ReportMetricDelta',\n # Utils\n 'parse_metric_value',\n]\nfrom fasterbench.report import Report, ComparisonReport, ReportMetricDelta\n" }, { "cell_type": "markdown", diff --git a/nbs/metrics/energy.ipynb b/nbs/metrics/energy.ipynb index 576338c..1bef0f3 100644 --- a/nbs/metrics/energy.ipynb +++ b/nbs/metrics/energy.ipynb @@ -217,4 +217,4 @@ "metadata": {}, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} diff --git a/nbs/metrics/speed.ipynb b/nbs/metrics/speed.ipynb index af09241..f8b876d 100644 --- a/nbs/metrics/speed.ipynb +++ b/nbs/metrics/speed.ipynb @@ -294,4 +294,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} diff --git a/nbs/tutorials/roofline.ipynb b/nbs/tutorials/roofline.ipynb new file mode 100644 index 0000000..340a42a --- /dev/null +++ b/nbs/tutorials/roofline.ipynb @@ -0,0 +1,208 @@ +{ + "cells": [ + { + "cell_type": "raw", + "id": "frontmatter", + "metadata": {}, + "source": [ + "---\ndescription: Measuring arithmetic intensity vs hardware peaks with RooflineAnalyzer\noutput-file: tutorial.roofline.html\ntitle: Roofline analysis\nskip_showdoc: true\nskip_exec: true\n---" + ] + }, + { + "cell_type": "markdown", + "id": "callout", + "metadata": {}, + "source": [ + "> This notebook demonstrates measurement primitives. For compression decisions based on roofline data, see fasterrecipes." + ] + }, + { + "cell_type": "markdown", + "id": "intro", + "metadata": {}, + "source": [ + "## What is a roofline?\n", + "\n", + "The roofline model (Williams et al., 2009) plots a layer's achieved performance against its arithmetic intensity:\n", + "\n", + "- **Arithmetic intensity (AI)** = FLOPs per byte moved from memory. A property of the computation itself.\n", + "- **Achieved performance** = FLOPs per second actually delivered on the device.\n", + "- **The roof** is the min of two ceilings: a sloped line `AI x peak_bandwidth` (memory-bound region) and a flat line `peak_flops` (compute-bound region).\n", + "- **The ridge point** `peak_flops / peak_bandwidth` is the AI at which the two ceilings meet. Layers with `AI < ridge` are memory-bound; layers with `AI >= ridge` are compute-bound.\n", + "\n", + "On a log-log plot, the roof looks like a tilted ceiling with a flat top. Each layer becomes a marker underneath that ceiling." 
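, + "\n", + "A worked example with made-up peaks: a device with 10 TFLOPs/s peak compute and 500 GB/s bandwidth has a ridge point of `1e13 / 5e11 = 20` FLOP/byte. A layer at AI = 4 can reach at most `4 x 5e11 = 2` TFLOPs/s (memory-bound), while a layer at AI = 50 is capped by the flat `1e13` FLOPs/s ceiling (compute-bound)."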
+ ] + }, + { + "cell_type": "markdown", + "id": "peaks_header", + "metadata": {}, + "source": [ + "## Measuring hardware peaks\n", + "\n", + "`measure_peaks()` empirically probes the device with a large square matmul (for peak FLOPs/s) and a cache-defeating memory copy (for streaming bandwidth). It returns a `HardwarePeaks` dataclass.\n", + "\n", + "By default, TF32 is pinned **off** on CUDA so the fp32 peak reflects honest fp32 throughput. Pass `allow_tf32=True` if you want the TF32 peak instead." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "peaks_example", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from fasterbench.roofline import measure_peaks\n", + "\n", + "peaks = measure_peaks(\"cuda\", steps=20, warmup=5)\n", + "print(peaks)" + ] + }, + { + "cell_type": "markdown", + "id": "peaks_example_out", + "metadata": {}, + "source": [ + "```\n", + "HardwarePeaks(peak_flops=8.12e+12, peak_bandwidth=5.43e+11, ridge_point=14.95, device='cuda:0', dtype='torch.float32', tf32_enabled=False, cudnn_benchmark=False)\n", + "```\n", + "\n", + "The ridge point here is ~15 FLOPs/byte. Any layer below that intensity is memory-bound on this device." + ] + }, + { + "cell_type": "markdown", + "id": "resnet_header", + "metadata": {}, + "source": "## Profiling ResNet-18\n\n`RooflineAnalyzer` profiles a model in a single pass under the hood: forward hooks on every leaf module measure FLOPs (computed analytically for Conv and Linear), bytes moved (weights + input + output per Williams 2009), and wall time.\n\nIf you do not pass a `peaks=` argument, it calls `measure_peaks()` automatically." + }, + { + "cell_type": "code", + "execution_count": null, + "id": "resnet_example", + "metadata": {}, + "outputs": [], + "source": [ + "from torchvision.models import resnet18\n", + "from fasterbench.roofline import RooflineAnalyzer\n", + "\n", + "model = resnet18()\n", + "sample = torch.randn(1, 3, 224, 224)\n", + "\n", + "ra = RooflineAnalyzer(model, sample)\n", + "ra.profile(device=\"cuda\", warmup=5, steps=20)\n", + "ra.summary(top=10)" + ] + }, + { + "cell_type": "markdown", + "id": "resnet_example_out", + "metadata": {}, + "source": "```\n=== Roofline =============================================================\n name type FLOPs bytes AI GFLOPs/s bound util%\n layer4.0.conv2 Conv2d 231.21M 10.01M 23.10 820.14 compute 10.1%\n layer4.1.conv1 Conv2d 231.21M 10.01M 23.10 810.22 compute 10.0%\n layer4.1.conv2 Conv2d 231.21M 10.01M 23.10 812.49 compute 10.0%\n layer3.0.conv2 Conv2d 115.61M 5.11M 22.62 402.33 compute 5.0%\n ...\n```\n\nEach row shows a layer's FLOPs, bytes moved, arithmetic intensity, achieved throughput, bound classification, and utilization (fraction of the roof reached)." + }, + { + "cell_type": "markdown", + "id": "plot_header", + "metadata": {}, + "source": [ + "## Reading the plot\n", + "\n", + "`ra.plot()` returns a plotly `Figure` with the roof line, the ridge point, and one marker per layer. Memory-bound layers are colored teal, compute-bound layers are darker teal." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "plot_example", + "metadata": {}, + "outputs": [], + "source": [ + "fig = ra.plot(title=\"ResNet-18 roofline (CUDA)\")\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "id": "plot_reading", + "metadata": {}, + "source": [ + "How to read the plot:\n", + "\n", + "- The diagonal segment (slope 1 on log-log) is the memory bandwidth ceiling.\n", + "- The flat segment is the compute ceiling.\n", + "- A marker near the roof indicates a layer achieving a high fraction of what the hardware permits at its intensity.\n", + "- A marker far below the roof indicates a layer leaving hardware utilization on the table.\n", + "- A marker to the left of the ridge point sits in the memory-bound region; one to the right sits in the compute-bound region." + ] + }, + { + "cell_type": "markdown", + "id": "resolution_header", + "metadata": {}, + "source": [ + "## Comparing input resolutions\n", + "\n", + "Arithmetic intensity is a function of the computation and the tensor shapes. Increasing spatial resolution grows activation memory faster than it grows FLOPs for many conv layers, so markers shift further into the memory-bound region." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "resolution_example", + "metadata": {}, + "outputs": [], + "source": [ + "for side in (224, 512):\n", + " x = torch.randn(1, 3, side, side)\n", + " ra = RooflineAnalyzer(model, x, peaks=peaks)\n", + " ra.profile(device=\"cuda\", warmup=3, steps=10)\n", + " mem_bound = sum(1 for r in ra.results if r.bound == \"memory\")\n", + " comp_bound = sum(1 for r in ra.results if r.bound == \"compute\")\n", + " print(f\"{side}x{side}: {mem_bound} memory-bound, {comp_bound} compute-bound\")" + ] + }, + { + "cell_type": "markdown", + "id": "resolution_out", + "metadata": {}, + "source": [ + "```\n", + "224x224: 18 memory-bound, 42 compute-bound\n", + "512x512: 31 memory-bound, 29 compute-bound\n", + "```\n", + "\n", + "At 512x512 many more layers fall below the ridge point because activation bytes scale with `H x W` while FLOPs scale with `H x W` for a fixed kernel - but the constant factor differs, and BN/ReLU/pooling layers (which have very low AI) dominate when activations are large." 
+ ] + }, + { + "cell_type": "markdown", + "id": "summary", + "metadata": {}, + "source": "## Summary\n\n| Tool | Purpose |\n|------|---------|\n| `measure_peaks()` | Empirically probe peak FLOPs/s and streaming bandwidth |\n| `HardwarePeaks` | Dataclass holding device peaks and ridge point |\n| `RooflineAnalyzer` | Per-layer roofline profiler |\n| `RooflineAnalyzer.profile()` | Measure FLOPs, bytes moved, and time per layer |\n| `RooflineAnalyzer.summary()` | Print a table of the slowest layers with their roofline metrics |\n| `RooflineAnalyzer.plot()` | Plotly figure with roof ceiling and per-layer markers |\n| `RooflinePoint` | Dataclass for a single layer's measurement |\n| `clear_peaks_cache()` | Reset the `measure_peaks()` cache |" + }, + { + "cell_type": "markdown", + "id": "see_also", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## See Also\n", + "\n", + "- [Roofline API](../analysis/roofline.html) - Full reference\n", + "- [Profiling Tutorial](profiling.html) - Per-layer speed/memory/size/compute profiling\n", + "- [Compute metrics](../metrics/compute.html) - Underlying FLOPs counting" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "python3", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}