diff --git a/fasterbench/__init__.py b/fasterbench/__init__.py
index 5e09119..ec82712 100644
--- a/fasterbench/__init__.py
+++ b/fasterbench/__init__.py
@@ -19,6 +19,9 @@
from .compute import ComputeMetrics, compute_compute
from .memory import MemoryMetrics, compute_memory, compute_memory_multi
from .energy import EnergyMetrics, compute_energy, compute_energy_multi
+from fasterbench.roofline import (
+ HardwarePeaks, RooflinePoint, measure_peaks, clear_peaks_cache, RooflineAnalyzer,
+)
from .plot import create_radar_plot, SPECS
from .utils import parse_metric_value
@@ -38,6 +41,8 @@
'MemoryMetrics', 'compute_memory', 'compute_memory_multi',
# Energy
'EnergyMetrics', 'compute_energy', 'compute_energy_multi',
+ # Roofline
+ 'HardwarePeaks', 'RooflinePoint', 'measure_peaks', 'clear_peaks_cache', 'RooflineAnalyzer',
# Plot
'create_radar_plot', 'SPECS',
# Report
diff --git a/fasterbench/_modidx.py b/fasterbench/_modidx.py
index 28cabd1..b5647f9 100644
--- a/fasterbench/_modidx.py
+++ b/fasterbench/_modidx.py
@@ -130,6 +130,36 @@
'fasterbench.report._generate_css': ('analysis/report.html#_generate_css', 'fasterbench/report.py'),
'fasterbench.report._improvement_indicator': ( 'analysis/report.html#_improvement_indicator',
'fasterbench/report.py')},
+ 'fasterbench.roofline': { 'fasterbench.roofline.HardwarePeaks': ( 'analysis/roofline.html#hardwarepeaks',
+ 'fasterbench/roofline.py'),
+ 'fasterbench.roofline.HardwarePeaks.as_dict': ( 'analysis/roofline.html#hardwarepeaks.as_dict',
+ 'fasterbench/roofline.py'),
+ 'fasterbench.roofline.RooflineAnalyzer': ( 'analysis/roofline.html#rooflineanalyzer',
+ 'fasterbench/roofline.py'),
+ 'fasterbench.roofline.RooflineAnalyzer.__init__': ( 'analysis/roofline.html#rooflineanalyzer.__init__',
+ 'fasterbench/roofline.py'),
+ 'fasterbench.roofline.RooflineAnalyzer.plot': ( 'analysis/roofline.html#rooflineanalyzer.plot',
+ 'fasterbench/roofline.py'),
+ 'fasterbench.roofline.RooflineAnalyzer.profile': ( 'analysis/roofline.html#rooflineanalyzer.profile',
+ 'fasterbench/roofline.py'),
+ 'fasterbench.roofline.RooflineAnalyzer.results': ( 'analysis/roofline.html#rooflineanalyzer.results',
+ 'fasterbench/roofline.py'),
+ 'fasterbench.roofline.RooflineAnalyzer.summary': ( 'analysis/roofline.html#rooflineanalyzer.summary',
+ 'fasterbench/roofline.py'),
+ 'fasterbench.roofline.RooflinePoint': ( 'analysis/roofline.html#rooflinepoint',
+ 'fasterbench/roofline.py'),
+ 'fasterbench.roofline.RooflinePoint.as_dict': ( 'analysis/roofline.html#rooflinepoint.as_dict',
+ 'fasterbench/roofline.py'),
+ 'fasterbench.roofline._layer_flops': ( 'analysis/roofline.html#_layer_flops',
+ 'fasterbench/roofline.py'),
+ 'fasterbench.roofline._pinned_benchmark_flags': ( 'analysis/roofline.html#_pinned_benchmark_flags',
+ 'fasterbench/roofline.py'),
+ 'fasterbench.roofline._setup_roofline_hooks': ( 'analysis/roofline.html#_setup_roofline_hooks',
+ 'fasterbench/roofline.py'),
+ 'fasterbench.roofline.clear_peaks_cache': ( 'analysis/roofline.html#clear_peaks_cache',
+ 'fasterbench/roofline.py'),
+ 'fasterbench.roofline.measure_peaks': ( 'analysis/roofline.html#measure_peaks',
+ 'fasterbench/roofline.py')},
'fasterbench.size': { 'fasterbench.size.SizeMetrics': ('metrics/size.html#sizemetrics', 'fasterbench/size.py'),
'fasterbench.size.SizeMetrics.as_dict': ('metrics/size.html#sizemetrics.as_dict', 'fasterbench/size.py'),
'fasterbench.size.compute_size': ('metrics/size.html#compute_size', 'fasterbench/size.py'),
diff --git a/fasterbench/roofline.py b/fasterbench/roofline.py
new file mode 100644
index 0000000..8ee82d4
--- /dev/null
+++ b/fasterbench/roofline.py
@@ -0,0 +1,479 @@
+# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/analysis/roofline.ipynb.
+
+# %% ../nbs/analysis/roofline.ipynb #imports
+from __future__ import annotations
+
+import math
+import time
+import warnings
+from dataclasses import dataclass, asdict
+from contextlib import contextmanager
+
+import numpy as np
+import torch
+import torch.nn as nn
+import plotly.graph_objects as go
+
+from .core import _device_ctx, _sync, _fmt_float, _section
+from .profiling import _leaf_modules, _tensor_bytes, _output_bytes
+
+# %% auto #0
+__all__ = ['HardwarePeaks', 'measure_peaks', 'clear_peaks_cache', 'RooflinePoint', 'RooflineAnalyzer']
+
+# %% ../nbs/analysis/roofline.ipynb #hardware_peaks
+@dataclass(slots=True)
+class HardwarePeaks:
+ "Empirically measured achievable peak compute and streaming bandwidth for a device."
+ peak_flops: float # achievable peak FLOPs/s
+ peak_bandwidth: float # achievable streaming bandwidth in bytes/s
+ ridge_point: float # FLOPs/byte = peak_flops / peak_bandwidth
+ device: str # e.g. "cuda:0", "cpu"
+ dtype: str # e.g. "torch.float32"
+ tf32_enabled: bool # whether matmul TF32 was on during probe
+ cudnn_benchmark: bool # whether cudnn.benchmark was on during probe
+
+ def as_dict(self) -> dict:
+ return asdict(self)
+
+# %% ../nbs/analysis/roofline.ipynb #pinned_flags
+@contextmanager
+def _pinned_benchmark_flags(tf32: bool = False):
+ "Pin matmul TF32 and cudnn.benchmark during probe; restore on exit."
+ if torch.cuda.is_available():
+ prev_tf32 = torch.backends.cuda.matmul.allow_tf32
+ prev_cudnn_tf32 = torch.backends.cudnn.allow_tf32
+ prev_bench = torch.backends.cudnn.benchmark
+ torch.backends.cuda.matmul.allow_tf32 = tf32
+ torch.backends.cudnn.allow_tf32 = tf32
+ torch.backends.cudnn.benchmark = False
+ try:
+ yield (tf32, False)
+ finally:
+ torch.backends.cuda.matmul.allow_tf32 = prev_tf32
+ torch.backends.cudnn.allow_tf32 = prev_cudnn_tf32
+ torch.backends.cudnn.benchmark = prev_bench
+ else:
+ yield (False, False)
+
+# %% ../nbs/analysis/roofline.ipynb #measure_peaks
+_PEAKS_CACHE: dict = {}
+
+
+def measure_peaks(
+ device: str | torch.device = "cuda", # device to probe
+ *,
+ dtype: torch.dtype = torch.float32, # probe precision
+ matmul_size: int = 4096, # N for NxN matmul probe
+ bandwidth_mb: int = 256, # per-buffer size in MiB (auto-bumped above L3)
+ warmup: int = 5, # warmup iterations
+ steps: int = 20, # measurement iterations (report max)
+ allow_tf32: bool = False, # pin TF32 off by default for honest fp32 peak
+ cache: bool = True, # cache per (device, dtype, sizes)
+) -> HardwarePeaks:
+ "Empirically probe achievable peak FLOPs/s and streaming bandwidth."
+ dev = torch.device(device) if isinstance(device, str) else device
+ if dev.type == "cuda" and not torch.cuda.is_available():
+ warnings.warn("CUDA requested but not available - falling back to CPU")
+ dev = torch.device("cpu")
+ key = (str(dev), str(dtype), matmul_size, bandwidth_mb, allow_tf32)
+ if cache and key in _PEAKS_CACHE:
+ return _PEAKS_CACHE[key]
+
+ with _pinned_benchmark_flags(tf32=allow_tf32) as (tf32_on, bench_on):
+ # --- peak FLOPs probe ---
+ N = matmul_size
+ a = torch.randn(N, N, device=dev, dtype=dtype)
+ b = torch.randn(N, N, device=dev, dtype=dtype)
+ for _ in range(warmup):
+ (a @ b)
+ _sync(dev)
+ flops_per_matmul = 2.0 * N * N * N
+ best_flops = 0.0
+ c = None
+ for _ in range(steps):
+ _sync(dev)
+ t0 = time.perf_counter()
+ c = a @ b
+ _sync(dev)
+ dt = time.perf_counter() - t0
+ if dt > 0:
+ best_flops = max(best_flops, flops_per_matmul / dt)
+ del a, b, c
+ if dev.type == "cuda":
+ torch.cuda.empty_cache()
+
+ # --- bandwidth probe (cache-defeating) ---
+ # target buffer sized to blow past L3 and any GPU L2.
+ buf_bytes = max(bandwidth_mb * 1024 * 1024, 64 * 1024 * 1024)
+ n_elems = buf_bytes // dtype.itemsize
+ src = torch.empty(n_elems, device=dev, dtype=dtype).normal_()
+ dst = torch.empty(n_elems, device=dev, dtype=dtype).normal_()
+ for _ in range(warmup):
+ dst.copy_(src)
+ _sync(dev)
+ bytes_moved_per_copy = 2.0 * n_elems * dtype.itemsize # read + write
+ best_bw = 0.0
+ for i in range(steps):
+ # alternate direction to defeat any residency assumptions
+ s, d = (src, dst) if (i % 2 == 0) else (dst, src)
+ _sync(dev)
+ t0 = time.perf_counter()
+ d.copy_(s)
+ _sync(dev)
+ dt = time.perf_counter() - t0
+ if dt > 0:
+ best_bw = max(best_bw, bytes_moved_per_copy / dt)
+ del src, dst
+ if dev.type == "cuda":
+ torch.cuda.empty_cache()
+
+ result = HardwarePeaks(
+ peak_flops=best_flops,
+ peak_bandwidth=best_bw,
+ ridge_point=best_flops / best_bw if best_bw > 0 else float("inf"),
+ device=str(dev),
+ dtype=str(dtype),
+ tf32_enabled=tf32_on,
+ cudnn_benchmark=bench_on,
+ )
+ if cache:
+ _PEAKS_CACHE[key] = result
+ return result
+
+
+def clear_peaks_cache() -> None:
+ "Reset the measure_peaks() cache."
+ _PEAKS_CACHE.clear()
+
+# %% ../nbs/analysis/roofline.ipynb #roofline_point
+@dataclass(slots=True)
+class RooflinePoint:
+ "Per-layer roofline measurement. Bytes formula: weight_bytes + input_bytes + output_bytes (each counted once per forward call, per Williams 2009)."
+ name: str
+ type: str
+ flops: float # total FLOPs (2 * MACs)
+ bytes_moved: float # weights + input + output bytes per forward call
+ time_s: float # measured wall time (mean over steps)
+ arithmetic_intensity: float # flops / bytes_moved
+ achieved_gflops: float # flops / time / 1e9
+ bound: str # "memory" | "compute" | "undefined"
+ utilization_pct: float # achieved / roof * 100
+
+ def as_dict(self) -> dict:
+ return asdict(self)
+
+# %% ../nbs/analysis/roofline.ipynb #roofline_hooks
+def _layer_flops(module: nn.Module, inp, output) -> float:
+ "Estimate FLOPs for a single leaf module forward call. Returns 0 for layer types we do not model."
+ # Covers the layer types where FLOPs actually dominate - conv, linear, matmul-adjacent.
+ # Cheap element-wise ops (ReLU, BN, pooling) are intentionally assigned 0 FLOPs; they end up
+ # in the \"undefined\" bucket (memory-bound trivially) and are surfaced via the warning.
+ t_in = inp[0] if isinstance(inp, tuple) and len(inp) > 0 and isinstance(inp[0], torch.Tensor) else None
+ t_out = output if isinstance(output, torch.Tensor) else None
+
+ if isinstance(module, (nn.Conv1d, nn.Conv2d, nn.Conv3d)) and t_out is not None:
+ cin = module.in_channels // module.groups
+ cout = module.out_channels
+ ksize = 1
+ for k in (module.kernel_size if isinstance(module.kernel_size, tuple) else (module.kernel_size,)):
+ ksize *= k
+ spatial = 1
+ for s in t_out.shape[2:]:
+ spatial *= int(s)
+ batch = int(t_out.shape[0]) if t_out.ndim > 0 else 1
+ macs = batch * cin * cout * ksize * spatial
+ flops = 2.0 * macs
+ if module.bias is not None:
+ flops += batch * cout * spatial # bias add
+ return float(flops)
+
+ if isinstance(module, (nn.ConvTranspose1d, nn.ConvTranspose2d, nn.ConvTranspose3d)) and t_in is not None:
+ cin = module.in_channels // module.groups
+ cout = module.out_channels
+ ksize = 1
+ for k in (module.kernel_size if isinstance(module.kernel_size, tuple) else (module.kernel_size,)):
+ ksize *= k
+ spatial = 1
+ for s in t_in.shape[2:]:
+ spatial *= int(s)
+ batch = int(t_in.shape[0]) if t_in.ndim > 0 else 1
+ return float(2.0 * batch * cin * cout * ksize * spatial)
+
+ if isinstance(module, nn.Linear) and t_in is not None:
+ in_features = module.in_features
+ out_features = module.out_features
+ # Everything before the last dim is treated as batch.
+ batch = 1
+ for s in t_in.shape[:-1]:
+ batch *= int(s)
+ flops = 2.0 * batch * in_features * out_features
+ if module.bias is not None:
+ flops += batch * out_features
+ return float(flops)
+
+ return 0.0
+
+
+#| export
+def _setup_roofline_hooks(
+ leaf_modules, # {name: module} for leaf modules
+ bytes_state, # {name: []} to accumulate bytes per call
+ time_state, # {name: []} to accumulate seconds per call
+ flops_state, # {name: []} to accumulate FLOPs per call
+ call_state, # {name: int} counter of forward calls
+ device_type, # "cuda" or "cpu"
+):
+ "Register hooks to measure (bytes moved, time, FLOPs, call count) per layer."
+ handles = []
+ is_cuda = device_type == "cuda"
+
+ def make_hooks(name: str, module: nn.Module):
+ w_bytes = sum(_tensor_bytes(p) for p in module.parameters(recurse=False))
+ if is_cuda:
+ state = {"start": None, "end": None}
+ def pre(mod, inp):
+ state["start"] = torch.cuda.Event(enable_timing=True)
+ state["end"] = torch.cuda.Event(enable_timing=True)
+ state["start"].record()
+ in_b = sum(_tensor_bytes(t) for t in inp if isinstance(t, torch.Tensor))
+ bytes_state[name].append(in_b + w_bytes)
+ def post(mod, inp, output):
+ state["end"].record()
+ out_b = _output_bytes(output)
+ bytes_state[name][-1] += out_b
+ torch.cuda.synchronize()
+ time_state[name].append(state["start"].elapsed_time(state["end"]) / 1000.0)
+ flops_state[name].append(_layer_flops(mod, inp, output))
+ call_state[name] += 1
+ else:
+ state = {"t0": 0.0}
+ def pre(mod, inp):
+ state["t0"] = time.perf_counter()
+ in_b = sum(_tensor_bytes(t) for t in inp if isinstance(t, torch.Tensor))
+ bytes_state[name].append(in_b + w_bytes)
+ def post(mod, inp, output):
+ dt = time.perf_counter() - state["t0"]
+ out_b = _output_bytes(output)
+ bytes_state[name][-1] += out_b
+ time_state[name].append(dt)
+ flops_state[name].append(_layer_flops(mod, inp, output))
+ call_state[name] += 1
+ return pre, post
+
+ for name, mod in leaf_modules.items():
+ pre, post = make_hooks(name, mod)
+ handles.append(mod.register_forward_pre_hook(pre))
+ handles.append(mod.register_forward_hook(post))
+ return handles
+
+# %% ../nbs/analysis/roofline.ipynb #roofline_analyzer
+class RooflineAnalyzer:
+ "Per-layer roofline analysis: measure arithmetic intensity and achieved GFLOPs/s against hardware peaks."
+
+ def __init__(
+ self,
+ model: nn.Module, # model to analyze
+ sample: torch.Tensor, # input tensor (with batch dimension)
+ peaks: HardwarePeaks | None = None, # optional precomputed hardware peaks
+ ):
+ self.model = model
+ self.sample = sample
+ self.peaks = peaks
+ self._results: list[RooflinePoint] = []
+
+ @property
+ def results(self) -> list[RooflinePoint]:
+ "Per-layer roofline measurements (populated after profile())."
+ return self._results
+
+ @torch.no_grad()
+ def profile(
+ self,
+ *,
+ device: str | torch.device = "cuda", # device to profile on
+ warmup: int = 5, # warmup iterations
+ steps: int = 20, # measurement iterations
+ ) -> list[RooflinePoint]:
+ "Run per-layer profiling: collect FLOPs, bytes, and time, then classify against peaks."
+ with _device_ctx(device) as dev:
+ if dev.type == "cuda":
+ torch.cuda.empty_cache()
+ _sync(dev)
+
+ # Ensure peaks are available
+ if self.peaks is None:
+ self.peaks = measure_peaks(dev, dtype=self.sample.dtype)
+
+ model = self.model.eval().to(dev)
+ sample = self.sample.to(dev)
+ leaf_mods = _leaf_modules(model)
+
+ bytes_state: dict[str, list] = {n: [] for n in leaf_mods}
+ time_state: dict[str, list] = {n: [] for n in leaf_mods}
+ flops_state: dict[str, list] = {n: [] for n in leaf_mods}
+ call_state: dict[str, int] = {n: 0 for n in leaf_mods}
+
+ handles = _setup_roofline_hooks(
+ leaf_mods, bytes_state, time_state, flops_state, call_state, dev.type,
+ )
+ try:
+ for _ in range(warmup):
+ model(sample)
+ # Reset accumulators after warmup
+ for n in leaf_mods:
+ bytes_state[n].clear()
+ time_state[n].clear()
+ flops_state[n].clear()
+ call_state[n] = 0
+ for _ in range(steps):
+ model(sample)
+ finally:
+ for h in handles:
+ h.remove()
+
+ # Warn if any module was invoked >1x per forward pass (shared module)
+ multi_call = [n for n, c in call_state.items() if steps > 0 and c > steps]
+ if multi_call:
+ warnings.warn(
+ f"{len(multi_call)} module(s) were called more than once per forward "
+ f"pass; their bytes/time are summed across calls. First: {multi_call[:3]}"
+ )
+
+ # --- Build RooflinePoint per layer ---
+ peak_flops = self.peaks.peak_flops
+ peak_bw = self.peaks.peak_bandwidth
+ ridge = self.peaks.ridge_point
+
+ results: list[RooflinePoint] = []
+ for name, mod in leaf_mods.items():
+ b_list = bytes_state[name]
+ t_list = time_state[name]
+ f_list = flops_state[name]
+ bytes_moved = float(np.mean(b_list)) if b_list else 0.0
+ time_s = float(np.mean(t_list)) if t_list else 0.0
+ flops = float(np.mean(f_list)) if f_list else 0.0
+
+ if flops == 0 or bytes_moved == 0 or time_s == 0:
+ results.append(RooflinePoint(
+ name=name, type=mod.__class__.__name__,
+ flops=flops, bytes_moved=bytes_moved, time_s=time_s,
+ arithmetic_intensity=0.0, achieved_gflops=0.0,
+ bound="undefined", utilization_pct=0.0,
+ ))
+ continue
+
+ ai = flops / bytes_moved
+ achieved_gflops = flops / time_s / 1e9
+ roof_flops = min(peak_flops, ai * peak_bw)
+ roof_gflops = roof_flops / 1e9
+ bound = "memory" if ai < ridge else "compute"
+ util = (achieved_gflops / roof_gflops * 100) if roof_gflops > 0 else 0.0
+ results.append(RooflinePoint(
+ name=name, type=mod.__class__.__name__,
+ flops=flops, bytes_moved=bytes_moved, time_s=time_s,
+ arithmetic_intensity=ai, achieved_gflops=achieved_gflops,
+ bound=bound, utilization_pct=util,
+ ))
+
+ # Warn if any layer had zero flops/bytes/time -> bound is "undefined"
+ zero_flops = [r.name for r in results if r.bound == "undefined"]
+ if zero_flops:
+ warnings.warn(
+ f"{len(zero_flops)} layer(s) have undefined roofline (zero FLOPs, bytes, "
+ f"or time). First: {zero_flops[:3]}"
+ )
+
+ self._results = results
+ return results
+
+ def summary(self, *, top: int = 10) -> None:
+ "Print a table of the slowest layers with their roofline metrics."
+ if not self._results:
+ raise RuntimeError("No results available. Call profile() first.")
+ print(_section("Roofline", 72))
+ header = f" {'name':32} {'type':14} {'FLOPs':>10} {'bytes':>10} {'AI':>8} {'GFLOPs/s':>10} {'bound':>9} {'util%':>7}"
+ print(header)
+ # Sort by measured time, descending (slowest first)
+ sorted_rows = sorted(self._results, key=lambda r: r.time_s, reverse=True)[:top]
+ for r in sorted_rows:
+ flops_str = f"{r.flops/1e6:>8.2f}M" if r.flops >= 1e6 else f"{r.flops:>10.0f}"
+ bytes_str = f"{r.bytes_moved/1e6:>8.2f}M" if r.bytes_moved >= 1e6 else f"{r.bytes_moved:>10.0f}"
+ ai_str = _fmt_float(r.arithmetic_intensity, width=8, decimals=2)
+ gf_str = _fmt_float(r.achieved_gflops, width=10, decimals=2)
+ util_str = _fmt_float(r.utilization_pct, width=6, decimals=1) + "%"
+ print(f" {r.name:32} {r.type:14} {flops_str} {bytes_str} {ai_str} {gf_str} {r.bound:>9} {util_str}")
+
+ def plot(
+ self,
+ *,
+ title: str = "Roofline", # figure title
+ ) -> go.Figure:
+ "Render the roofline plot with per-layer scatter points on a log-log grid. Markers are colored by bound classification."
+ if not self._results:
+ raise RuntimeError("No results available. Call profile() first.")
+ if self.peaks is None:
+ raise RuntimeError("No hardware peaks available.")
+
+ peak_flops = self.peaks.peak_flops
+ peak_bw = self.peaks.peak_bandwidth
+ ridge = self.peaks.ridge_point
+ peak_gflops = peak_flops / 1e9
+
+ # Build the roof curve: y = min(peak_flops, AI * peak_bw) / 1e9
+ valid = [r for r in self._results if r.bound != "undefined"]
+ if valid:
+ ai_min = max(min(r.arithmetic_intensity for r in valid) / 10.0, 1e-3)
+ ai_max = max(r.arithmetic_intensity for r in valid) * 10.0
+ else:
+ ai_min, ai_max = 1e-2, 1e3
+ ai_max = max(ai_max, ridge * 10.0)
+
+ ai_grid = np.logspace(math.log10(ai_min), math.log10(ai_max), 200)
+ roof_gflops = np.minimum(peak_gflops, ai_grid * peak_bw / 1e9)
+
+ fig = go.Figure()
+ fig.add_trace(go.Scatter(
+ x=ai_grid, y=roof_gflops, mode="lines",
+ line=dict(color="#008080", width=2),
+ name=f"Roof ({peak_gflops:.0f} GFLOPs/s, {peak_bw/1e9:.1f} GB/s)",
+ hoverinfo="skip",
+ ))
+ # Ridge point marker
+ fig.add_trace(go.Scatter(
+ x=[ridge], y=[peak_gflops], mode="markers",
+ marker=dict(color="#008080", size=10, symbol="diamond"),
+ name=f"Ridge point ({ridge:.1f} FLOP/byte)",
+ hovertemplate="Ridge: %{x:.2f} FLOP/byte",
+ ))
+
+ # Color palette for layers
+ color_map = {"memory": "#89d6c9", "compute": "#008080"}
+ for bound_label in ("memory", "compute"):
+ pts = [r for r in valid if r.bound == bound_label]
+ if not pts:
+ continue
+            hover = [
+                f"{r.name}<br>{r.type}<br>AI: {r.arithmetic_intensity:.3f} FLOP/byte"
+                f"<br>{r.achieved_gflops:.2f} GFLOPs/s<br>util: {r.utilization_pct:.1f}%"
+                for r in pts
+            ]
+ fig.add_trace(go.Scatter(
+ x=[r.arithmetic_intensity for r in pts],
+ y=[r.achieved_gflops for r in pts],
+ mode="markers",
+ marker=dict(color=color_map[bound_label], size=8, opacity=0.8,
+ line=dict(color="#008080", width=0.5)),
+ name=f"{bound_label}-bound",
+ text=hover,
+ hovertemplate="%{text}",
+ ))
+
+ fig.update_layout(
+ title=title,
+ xaxis=dict(title="Arithmetic intensity (FLOP/byte)", type="log"),
+ yaxis=dict(title="Achieved performance (GFLOPs/s)", type="log"),
+ paper_bgcolor="rgba(0,0,0,0)",
+ plot_bgcolor="rgba(0,0,0,0)",
+ legend=dict(bgcolor="rgba(0,0,0,0)"),
+ )
+ return fig
diff --git a/nbs/_quarto.yml b/nbs/_quarto.yml
index 8066f60..2369135 100644
--- a/nbs/_quarto.yml
+++ b/nbs/_quarto.yml
@@ -35,6 +35,7 @@ website:
contents:
- tutorials/benchmark.ipynb
- tutorials/profiling.ipynb
+ - tutorials/roofline.ipynb
- tutorials/report.ipynb
- section: Core
contents:
@@ -51,6 +52,7 @@ website:
contents:
- analysis/benchmark.ipynb
- analysis/profiling.ipynb
+ - analysis/roofline.ipynb
- analysis/report.ipynb
- section: Visualization
contents:
diff --git a/nbs/analysis/profiling.ipynb b/nbs/analysis/profiling.ipynb
index c600166..ca47f46 100644
--- a/nbs/analysis/profiling.ipynb
+++ b/nbs/analysis/profiling.ipynb
@@ -341,4 +341,4 @@
},
"nbformat": 4,
"nbformat_minor": 5
-}
\ No newline at end of file
+}
diff --git a/nbs/analysis/roofline.ipynb b/nbs/analysis/roofline.ipynb
new file mode 100644
index 0000000..b4d4024
--- /dev/null
+++ b/nbs/analysis/roofline.ipynb
@@ -0,0 +1,427 @@
+{
+ "cells": [
+ {
+ "cell_type": "raw",
+ "id": "frontmatter",
+ "metadata": {},
+ "source": [
+ "---\ndescription: Roofline analysis for arithmetic intensity vs achieved performance\noutput-file: roofline.html\ntitle: Roofline\nskip_showdoc: true\n---"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "default_exp",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#| default_exp roofline"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "setup_showdoc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#| include: false\n",
+ "from nbdev.showdoc import *"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "imports",
+ "metadata": {},
+ "outputs": [],
+ "source": "#| export\nfrom __future__ import annotations\n\nimport math\nimport time\nimport warnings\nfrom dataclasses import dataclass, asdict\nfrom contextlib import contextmanager\n\nimport numpy as np\nimport torch\nimport torch.nn as nn\nimport plotly.graph_objects as go\n\nfrom fasterbench.core import _device_ctx, _sync, _fmt_float, _section\nfrom fasterbench.profiling import _leaf_modules, _tensor_bytes, _output_bytes"
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "hardware_peaks",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#| export\n",
+ "@dataclass(slots=True)\n",
+ "class HardwarePeaks:\n",
+ " \"Empirically measured achievable peak compute and streaming bandwidth for a device.\"\n",
+ " peak_flops: float # achievable peak FLOPs/s\n",
+ " peak_bandwidth: float # achievable streaming bandwidth in bytes/s\n",
+ " ridge_point: float # FLOPs/byte = peak_flops / peak_bandwidth\n",
+ " device: str # e.g. \"cuda:0\", \"cpu\"\n",
+ " dtype: str # e.g. \"torch.float32\"\n",
+ " tf32_enabled: bool # whether matmul TF32 was on during probe\n",
+ " cudnn_benchmark: bool # whether cudnn.benchmark was on during probe\n",
+ "\n",
+ " def as_dict(self) -> dict:\n",
+ " return asdict(self)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "sd_hw_peaks",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "show_doc(HardwarePeaks)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "pinned_flags",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#| export\n",
+ "@contextmanager\n",
+ "def _pinned_benchmark_flags(tf32: bool = False):\n",
+ " \"Pin matmul TF32 and cudnn.benchmark during probe; restore on exit.\"\n",
+ " if torch.cuda.is_available():\n",
+ " prev_tf32 = torch.backends.cuda.matmul.allow_tf32\n",
+ " prev_cudnn_tf32 = torch.backends.cudnn.allow_tf32\n",
+ " prev_bench = torch.backends.cudnn.benchmark\n",
+ " torch.backends.cuda.matmul.allow_tf32 = tf32\n",
+ " torch.backends.cudnn.allow_tf32 = tf32\n",
+ " torch.backends.cudnn.benchmark = False\n",
+ " try:\n",
+ " yield (tf32, False)\n",
+ " finally:\n",
+ " torch.backends.cuda.matmul.allow_tf32 = prev_tf32\n",
+ " torch.backends.cudnn.allow_tf32 = prev_cudnn_tf32\n",
+ " torch.backends.cudnn.benchmark = prev_bench\n",
+ " else:\n",
+ " yield (False, False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "measure_peaks",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#| export\n",
+ "_PEAKS_CACHE: dict = {}\n",
+ "\n",
+ "\n",
+ "def measure_peaks(\n",
+ " device: str | torch.device = \"cuda\", # device to probe\n",
+ " *,\n",
+ " dtype: torch.dtype = torch.float32, # probe precision\n",
+ " matmul_size: int = 4096, # N for NxN matmul probe\n",
+ " bandwidth_mb: int = 256, # per-buffer size in MiB (auto-bumped above L3)\n",
+ " warmup: int = 5, # warmup iterations\n",
+ " steps: int = 20, # measurement iterations (report max)\n",
+ " allow_tf32: bool = False, # pin TF32 off by default for honest fp32 peak\n",
+ " cache: bool = True, # cache per (device, dtype, sizes)\n",
+ ") -> HardwarePeaks:\n",
+ " \"Empirically probe achievable peak FLOPs/s and streaming bandwidth.\"\n",
+ " dev = torch.device(device) if isinstance(device, str) else device\n",
+ " if dev.type == \"cuda\" and not torch.cuda.is_available():\n",
+ " warnings.warn(\"CUDA requested but not available - falling back to CPU\")\n",
+ " dev = torch.device(\"cpu\")\n",
+ " key = (str(dev), str(dtype), matmul_size, bandwidth_mb, allow_tf32)\n",
+ " if cache and key in _PEAKS_CACHE:\n",
+ " return _PEAKS_CACHE[key]\n",
+ "\n",
+ " with _pinned_benchmark_flags(tf32=allow_tf32) as (tf32_on, bench_on):\n",
+ " # --- peak FLOPs probe ---\n",
+ " N = matmul_size\n",
+ " a = torch.randn(N, N, device=dev, dtype=dtype)\n",
+ " b = torch.randn(N, N, device=dev, dtype=dtype)\n",
+ " for _ in range(warmup):\n",
+ " (a @ b)\n",
+ " _sync(dev)\n",
+ " flops_per_matmul = 2.0 * N * N * N\n",
+ " best_flops = 0.0\n",
+ " c = None\n",
+ " for _ in range(steps):\n",
+ " _sync(dev)\n",
+ " t0 = time.perf_counter()\n",
+ " c = a @ b\n",
+ " _sync(dev)\n",
+ " dt = time.perf_counter() - t0\n",
+ " if dt > 0:\n",
+ " best_flops = max(best_flops, flops_per_matmul / dt)\n",
+ " del a, b, c\n",
+ " if dev.type == \"cuda\":\n",
+ " torch.cuda.empty_cache()\n",
+ "\n",
+ " # --- bandwidth probe (cache-defeating) ---\n",
+ " # target buffer sized to blow past L3 and any GPU L2.\n",
+ " buf_bytes = max(bandwidth_mb * 1024 * 1024, 64 * 1024 * 1024)\n",
+ " n_elems = buf_bytes // dtype.itemsize\n",
+ " src = torch.empty(n_elems, device=dev, dtype=dtype).normal_()\n",
+ " dst = torch.empty(n_elems, device=dev, dtype=dtype).normal_()\n",
+ " for _ in range(warmup):\n",
+ " dst.copy_(src)\n",
+ " _sync(dev)\n",
+ " bytes_moved_per_copy = 2.0 * n_elems * dtype.itemsize # read + write\n",
+ " best_bw = 0.0\n",
+ " for i in range(steps):\n",
+ " # alternate direction to defeat any residency assumptions\n",
+ " s, d = (src, dst) if (i % 2 == 0) else (dst, src)\n",
+ " _sync(dev)\n",
+ " t0 = time.perf_counter()\n",
+ " d.copy_(s)\n",
+ " _sync(dev)\n",
+ " dt = time.perf_counter() - t0\n",
+ " if dt > 0:\n",
+ " best_bw = max(best_bw, bytes_moved_per_copy / dt)\n",
+ " del src, dst\n",
+ " if dev.type == \"cuda\":\n",
+ " torch.cuda.empty_cache()\n",
+ "\n",
+ " result = HardwarePeaks(\n",
+ " peak_flops=best_flops,\n",
+ " peak_bandwidth=best_bw,\n",
+ " ridge_point=best_flops / best_bw if best_bw > 0 else float(\"inf\"),\n",
+ " device=str(dev),\n",
+ " dtype=str(dtype),\n",
+ " tf32_enabled=tf32_on,\n",
+ " cudnn_benchmark=bench_on,\n",
+ " )\n",
+ " if cache:\n",
+ " _PEAKS_CACHE[key] = result\n",
+ " return result\n",
+ "\n",
+ "\n",
+ "def clear_peaks_cache() -> None:\n",
+ " \"Reset the measure_peaks() cache.\"\n",
+ " _PEAKS_CACHE.clear()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "sd_measure_peaks",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "show_doc(measure_peaks)"
+ ]
+ },
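+  {
+   "cell_type": "markdown",
+   "id": "measure_peaks_usage",
+   "metadata": {},
+   "source": [
+    "A minimal usage sketch (deliberately small probe sizes so it runs fast; keep the defaults for real measurements):\n",
+    "\n",
+    "```python\n",
+    "peaks = measure_peaks(\"cpu\", matmul_size=512, bandwidth_mb=64, warmup=2, steps=5)\n",
+    "print(f\"{peaks.peak_flops/1e9:.1f} GFLOPs/s, {peaks.peak_bandwidth/1e9:.1f} GB/s, ridge {peaks.ridge_point:.2f}\")\n",
+    "```"
+   ]
+  },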
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "roofline_point",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#| export\n",
+ "@dataclass(slots=True)\n",
+ "class RooflinePoint:\n",
+ " \"Per-layer roofline measurement. Bytes formula: weight_bytes + input_bytes + output_bytes (each counted once per forward call, per Williams 2009).\"\n",
+ " name: str\n",
+ " type: str\n",
+ " flops: float # total FLOPs (2 * MACs)\n",
+ " bytes_moved: float # weights + input + output bytes per forward call\n",
+ " time_s: float # measured wall time (mean over steps)\n",
+ " arithmetic_intensity: float # flops / bytes_moved\n",
+ " achieved_gflops: float # flops / time / 1e9\n",
+ " bound: str # \"memory\" | \"compute\" | \"undefined\"\n",
+ " utilization_pct: float # achieved / roof * 100\n",
+ "\n",
+ " def as_dict(self) -> dict:\n",
+ " return asdict(self)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "sd_roofline_point",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "show_doc(RooflinePoint)"
+ ]
+ },
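+  {
+   "cell_type": "markdown",
+   "id": "roofline_point_worked",
+   "metadata": {},
+   "source": [
+    "A worked example of the bytes formula for `nn.Conv2d(4, 8, kernel_size=3, padding=1, bias=False)` on a `(1, 4, 8, 8)` fp32 input:\n",
+    "\n",
+    "- weights: `8 * 4 * 3 * 3 * 4 = 1152` bytes\n",
+    "- input: `1 * 4 * 8 * 8 * 4 = 1024` bytes\n",
+    "- output: `1 * 8 * 8 * 8 * 4 = 2048` bytes\n",
+    "\n",
+    "So `bytes_moved = 4224`, `flops = 2 * (4 * 8 * 3 * 3) * (8 * 8) = 36864`, and `arithmetic_intensity = 36864 / 4224 ≈ 8.73` FLOPs/byte."
+   ]
+  },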
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "roofline_hooks",
+ "metadata": {},
+ "outputs": [],
+ "source": "#| export\ndef _layer_flops(module: nn.Module, inp, output) -> float:\n \"Estimate FLOPs for a single leaf module forward call. Returns 0 for layer types we do not model.\"\n # Covers the layer types where FLOPs actually dominate - conv, linear, matmul-adjacent.\n # Cheap element-wise ops (ReLU, BN, pooling) are intentionally assigned 0 FLOPs; they end up\n # in the \\\"undefined\\\" bucket (memory-bound trivially) and are surfaced via the warning.\n t_in = inp[0] if isinstance(inp, tuple) and len(inp) > 0 and isinstance(inp[0], torch.Tensor) else None\n t_out = output if isinstance(output, torch.Tensor) else None\n\n if isinstance(module, (nn.Conv1d, nn.Conv2d, nn.Conv3d)) and t_out is not None:\n cin = module.in_channels // module.groups\n cout = module.out_channels\n ksize = 1\n for k in (module.kernel_size if isinstance(module.kernel_size, tuple) else (module.kernel_size,)):\n ksize *= k\n spatial = 1\n for s in t_out.shape[2:]:\n spatial *= int(s)\n batch = int(t_out.shape[0]) if t_out.ndim > 0 else 1\n macs = batch * cin * cout * ksize * spatial\n flops = 2.0 * macs\n if module.bias is not None:\n flops += batch * cout * spatial # bias add\n return float(flops)\n\n if isinstance(module, (nn.ConvTranspose1d, nn.ConvTranspose2d, nn.ConvTranspose3d)) and t_in is not None:\n cin = module.in_channels // module.groups\n cout = module.out_channels\n ksize = 1\n for k in (module.kernel_size if isinstance(module.kernel_size, tuple) else (module.kernel_size,)):\n ksize *= k\n spatial = 1\n for s in t_in.shape[2:]:\n spatial *= int(s)\n batch = int(t_in.shape[0]) if t_in.ndim > 0 else 1\n return float(2.0 * batch * cin * cout * ksize * spatial)\n\n if isinstance(module, nn.Linear) and t_in is not None:\n in_features = module.in_features\n out_features = module.out_features\n # Everything before the last dim is treated as batch.\n batch = 1\n for s in t_in.shape[:-1]:\n batch *= int(s)\n flops = 2.0 * batch * in_features * out_features\n if module.bias is not None:\n flops += batch * out_features\n return float(flops)\n\n return 0.0\n\n\n#| export\ndef _setup_roofline_hooks(\n leaf_modules, # {name: module} for leaf modules\n bytes_state, # {name: []} to accumulate bytes per call\n time_state, # {name: []} to accumulate seconds per call\n flops_state, # {name: []} to accumulate FLOPs per call\n call_state, # {name: int} counter of forward calls\n device_type, # \"cuda\" or \"cpu\"\n):\n \"Register hooks to measure (bytes moved, time, FLOPs, call count) per layer.\"\n handles = []\n is_cuda = device_type == \"cuda\"\n\n def make_hooks(name: str, module: nn.Module):\n w_bytes = sum(_tensor_bytes(p) for p in module.parameters(recurse=False))\n if is_cuda:\n state = {\"start\": None, \"end\": None}\n def pre(mod, inp):\n state[\"start\"] = torch.cuda.Event(enable_timing=True)\n state[\"end\"] = torch.cuda.Event(enable_timing=True)\n state[\"start\"].record()\n in_b = sum(_tensor_bytes(t) for t in inp if isinstance(t, torch.Tensor))\n bytes_state[name].append(in_b + w_bytes)\n def post(mod, inp, output):\n state[\"end\"].record()\n out_b = _output_bytes(output)\n bytes_state[name][-1] += out_b\n torch.cuda.synchronize()\n time_state[name].append(state[\"start\"].elapsed_time(state[\"end\"]) / 1000.0)\n flops_state[name].append(_layer_flops(mod, inp, output))\n call_state[name] += 1\n else:\n state = {\"t0\": 0.0}\n def pre(mod, inp):\n state[\"t0\"] = time.perf_counter()\n in_b = sum(_tensor_bytes(t) for t in inp if isinstance(t, torch.Tensor))\n bytes_state[name].append(in_b + 
w_bytes)\n def post(mod, inp, output):\n dt = time.perf_counter() - state[\"t0\"]\n out_b = _output_bytes(output)\n bytes_state[name][-1] += out_b\n time_state[name].append(dt)\n flops_state[name].append(_layer_flops(mod, inp, output))\n call_state[name] += 1\n return pre, post\n\n for name, mod in leaf_modules.items():\n pre, post = make_hooks(name, mod)\n handles.append(mod.register_forward_pre_hook(pre))\n handles.append(mod.register_forward_hook(post))\n return handles"
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "roofline_analyzer",
+ "metadata": {},
+ "outputs": [],
+ "source": "#| export\nclass RooflineAnalyzer:\n \"Per-layer roofline analysis: measure arithmetic intensity and achieved GFLOPs/s against hardware peaks.\"\n\n def __init__(\n self,\n model: nn.Module, # model to analyze\n sample: torch.Tensor, # input tensor (with batch dimension)\n peaks: HardwarePeaks | None = None, # optional precomputed hardware peaks\n ):\n self.model = model\n self.sample = sample\n self.peaks = peaks\n self._results: list[RooflinePoint] = []\n\n @property\n def results(self) -> list[RooflinePoint]:\n \"Per-layer roofline measurements (populated after profile()).\"\n return self._results\n\n @torch.no_grad()\n def profile(\n self,\n *,\n device: str | torch.device = \"cuda\", # device to profile on\n warmup: int = 5, # warmup iterations\n steps: int = 20, # measurement iterations\n ) -> list[RooflinePoint]:\n \"Run per-layer profiling: collect FLOPs, bytes, and time, then classify against peaks.\"\n with _device_ctx(device) as dev:\n if dev.type == \"cuda\":\n torch.cuda.empty_cache()\n _sync(dev)\n\n # Ensure peaks are available\n if self.peaks is None:\n self.peaks = measure_peaks(dev, dtype=self.sample.dtype)\n\n model = self.model.eval().to(dev)\n sample = self.sample.to(dev)\n leaf_mods = _leaf_modules(model)\n\n bytes_state: dict[str, list] = {n: [] for n in leaf_mods}\n time_state: dict[str, list] = {n: [] for n in leaf_mods}\n flops_state: dict[str, list] = {n: [] for n in leaf_mods}\n call_state: dict[str, int] = {n: 0 for n in leaf_mods}\n\n handles = _setup_roofline_hooks(\n leaf_mods, bytes_state, time_state, flops_state, call_state, dev.type,\n )\n try:\n for _ in range(warmup):\n model(sample)\n # Reset accumulators after warmup\n for n in leaf_mods:\n bytes_state[n].clear()\n time_state[n].clear()\n flops_state[n].clear()\n call_state[n] = 0\n for _ in range(steps):\n model(sample)\n finally:\n for h in handles:\n h.remove()\n\n # Warn if any module was invoked >1x per forward pass (shared module)\n multi_call = [n for n, c in call_state.items() if steps > 0 and c > steps]\n if multi_call:\n warnings.warn(\n f\"{len(multi_call)} module(s) were called more than once per forward \"\n f\"pass; their bytes/time are summed across calls. 
First: {multi_call[:3]}\"\n )\n\n # --- Build RooflinePoint per layer ---\n peak_flops = self.peaks.peak_flops\n peak_bw = self.peaks.peak_bandwidth\n ridge = self.peaks.ridge_point\n\n results: list[RooflinePoint] = []\n for name, mod in leaf_mods.items():\n b_list = bytes_state[name]\n t_list = time_state[name]\n f_list = flops_state[name]\n bytes_moved = float(np.mean(b_list)) if b_list else 0.0\n time_s = float(np.mean(t_list)) if t_list else 0.0\n flops = float(np.mean(f_list)) if f_list else 0.0\n\n if flops == 0 or bytes_moved == 0 or time_s == 0:\n results.append(RooflinePoint(\n name=name, type=mod.__class__.__name__,\n flops=flops, bytes_moved=bytes_moved, time_s=time_s,\n arithmetic_intensity=0.0, achieved_gflops=0.0,\n bound=\"undefined\", utilization_pct=0.0,\n ))\n continue\n\n ai = flops / bytes_moved\n achieved_gflops = flops / time_s / 1e9\n roof_flops = min(peak_flops, ai * peak_bw)\n roof_gflops = roof_flops / 1e9\n bound = \"memory\" if ai < ridge else \"compute\"\n util = (achieved_gflops / roof_gflops * 100) if roof_gflops > 0 else 0.0\n results.append(RooflinePoint(\n name=name, type=mod.__class__.__name__,\n flops=flops, bytes_moved=bytes_moved, time_s=time_s,\n arithmetic_intensity=ai, achieved_gflops=achieved_gflops,\n bound=bound, utilization_pct=util,\n ))\n\n # Warn if any layer had zero flops/bytes/time -> bound is \"undefined\"\n zero_flops = [r.name for r in results if r.bound == \"undefined\"]\n if zero_flops:\n warnings.warn(\n f\"{len(zero_flops)} layer(s) have undefined roofline (zero FLOPs, bytes, \"\n f\"or time). First: {zero_flops[:3]}\"\n )\n\n self._results = results\n return results\n\n def summary(self, *, top: int = 10) -> None:\n \"Print a table of the slowest layers with their roofline metrics.\"\n if not self._results:\n raise RuntimeError(\"No results available. Call profile() first.\")\n print(_section(\"Roofline\", 72))\n header = f\" {'name':32} {'type':14} {'FLOPs':>10} {'bytes':>10} {'AI':>8} {'GFLOPs/s':>10} {'bound':>9} {'util%':>7}\"\n print(header)\n # Sort by measured time, descending (slowest first)\n sorted_rows = sorted(self._results, key=lambda r: r.time_s, reverse=True)[:top]\n for r in sorted_rows:\n flops_str = f\"{r.flops/1e6:>8.2f}M\" if r.flops >= 1e6 else f\"{r.flops:>10.0f}\"\n bytes_str = f\"{r.bytes_moved/1e6:>8.2f}M\" if r.bytes_moved >= 1e6 else f\"{r.bytes_moved:>10.0f}\"\n ai_str = _fmt_float(r.arithmetic_intensity, width=8, decimals=2)\n gf_str = _fmt_float(r.achieved_gflops, width=10, decimals=2)\n util_str = _fmt_float(r.utilization_pct, width=6, decimals=1) + \"%\"\n print(f\" {r.name:32} {r.type:14} {flops_str} {bytes_str} {ai_str} {gf_str} {r.bound:>9} {util_str}\")\n\n def plot(\n self,\n *,\n title: str = \"Roofline\", # figure title\n ) -> go.Figure:\n \"Render the roofline plot with per-layer scatter points on a log-log grid. Markers are colored by bound classification.\"\n if not self._results:\n raise RuntimeError(\"No results available. 
Call profile() first.\")\n if self.peaks is None:\n raise RuntimeError(\"No hardware peaks available.\")\n\n peak_flops = self.peaks.peak_flops\n peak_bw = self.peaks.peak_bandwidth\n ridge = self.peaks.ridge_point\n peak_gflops = peak_flops / 1e9\n\n # Build the roof curve: y = min(peak_flops, AI * peak_bw) / 1e9\n valid = [r for r in self._results if r.bound != \"undefined\"]\n if valid:\n ai_min = max(min(r.arithmetic_intensity for r in valid) / 10.0, 1e-3)\n ai_max = max(r.arithmetic_intensity for r in valid) * 10.0\n else:\n ai_min, ai_max = 1e-2, 1e3\n ai_max = max(ai_max, ridge * 10.0)\n\n ai_grid = np.logspace(math.log10(ai_min), math.log10(ai_max), 200)\n roof_gflops = np.minimum(peak_gflops, ai_grid * peak_bw / 1e9)\n\n fig = go.Figure()\n fig.add_trace(go.Scatter(\n x=ai_grid, y=roof_gflops, mode=\"lines\",\n line=dict(color=\"#008080\", width=2),\n name=f\"Roof ({peak_gflops:.0f} GFLOPs/s, {peak_bw/1e9:.1f} GB/s)\",\n hoverinfo=\"skip\",\n ))\n # Ridge point marker\n fig.add_trace(go.Scatter(\n x=[ridge], y=[peak_gflops], mode=\"markers\",\n marker=dict(color=\"#008080\", size=10, symbol=\"diamond\"),\n name=f\"Ridge point ({ridge:.1f} FLOP/byte)\",\n hovertemplate=\"Ridge: %{x:.2f} FLOP/byte\",\n ))\n\n # Color palette for layers\n color_map = {\"memory\": \"#89d6c9\", \"compute\": \"#008080\"}\n for bound_label in (\"memory\", \"compute\"):\n pts = [r for r in valid if r.bound == bound_label]\n if not pts:\n continue\n hover = [\n f\"{r.name}
{r.type}
AI: {r.arithmetic_intensity:.3f} FLOP/byte\"\n f\"
{r.achieved_gflops:.2f} GFLOPs/s
util: {r.utilization_pct:.1f}%\"\n for r in pts\n ]\n fig.add_trace(go.Scatter(\n x=[r.arithmetic_intensity for r in pts],\n y=[r.achieved_gflops for r in pts],\n mode=\"markers\",\n marker=dict(color=color_map[bound_label], size=8, opacity=0.8,\n line=dict(color=\"#008080\", width=0.5)),\n name=f\"{bound_label}-bound\",\n text=hover,\n hovertemplate=\"%{text}\",\n ))\n\n fig.update_layout(\n title=title,\n xaxis=dict(title=\"Arithmetic intensity (FLOP/byte)\", type=\"log\"),\n yaxis=dict(title=\"Achieved performance (GFLOPs/s)\", type=\"log\"),\n paper_bgcolor=\"rgba(0,0,0,0)\",\n plot_bgcolor=\"rgba(0,0,0,0)\",\n legend=dict(bgcolor=\"rgba(0,0,0,0)\"),\n )\n return fig"
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "sd_analyzer",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "show_doc(RooflineAnalyzer)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "sd_profile",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "show_doc(RooflineAnalyzer.profile)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "sd_summary",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "show_doc(RooflineAnalyzer.summary)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "sd_plot",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "show_doc(RooflineAnalyzer.plot)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "usage",
+ "metadata": {},
+ "source": [
+ "## Usage\n",
+ "\n",
+ "```python\n",
+ "from fasterbench.roofline import RooflineAnalyzer\n",
+ "\n",
+ "ra = RooflineAnalyzer(model, sample)\n",
+ "ra.profile(device=\"cuda\")\n",
+ "ra.summary()\n",
+ "fig = ra.plot()\n",
+ "fig.show()\n",
+ "```\n",
+ "\n",
+ "This is a measurement primitive. Downstream compression workflows (see fasterrecipes) can consume `ra.results` to make decisions - fasterbench itself never prescribes."
+ ]
+ },
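+  {
+   "cell_type": "markdown",
+   "id": "usage_downstream",
+   "metadata": {},
+   "source": [
+    "For instance, a downstream tool might rank the least-utilized memory-bound layers (a sketch of consuming `ra.results`, not a fasterbench API):\n",
+    "\n",
+    "```python\n",
+    "mem_bound = [r for r in ra.results if r.bound == \"memory\"]\n",
+    "for r in sorted(mem_bound, key=lambda r: r.utilization_pct)[:5]:\n",
+    "    print(f\"{r.name}: AI {r.arithmetic_intensity:.2f} FLOP/byte, util {r.utilization_pct:.1f}%\")\n",
+    "```"
+   ]
+  },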
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "test_peaks_basic",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#| hide\n",
+ "from fastcore.test import *\n",
+ "\n",
+ "_p = measure_peaks(\"cpu\", steps=3, warmup=1, matmul_size=256, bandwidth_mb=32, cache=False)\n",
+ "assert isinstance(_p, HardwarePeaks)\n",
+ "assert _p.peak_flops > 0\n",
+ "assert _p.peak_bandwidth > 0\n",
+ "test_close(_p.ridge_point, _p.peak_flops / _p.peak_bandwidth, eps=1e-6)\n",
+ "assert _p.device == \"cpu\"\n",
+ "assert _p.tf32_enabled is False\n",
+ "assert _p.cudnn_benchmark is False"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "test_peaks_cache",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#| hide\n",
+ "clear_peaks_cache()\n",
+ "_p1 = measure_peaks(\"cpu\", steps=2, warmup=1, matmul_size=128, bandwidth_mb=16, cache=True)\n",
+ "_p2 = measure_peaks(\"cpu\", steps=2, warmup=1, matmul_size=128, bandwidth_mb=16, cache=True)\n",
+ "assert _p1 is _p2 # cache hit returns same object\n",
+ "\n",
+ "clear_peaks_cache()\n",
+ "_p3 = measure_peaks(\"cpu\", steps=2, warmup=1, matmul_size=128, bandwidth_mb=16, cache=True)\n",
+ "assert _p3 is not _p1 # cache was cleared"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "test_conv_hand_computed",
+ "metadata": {},
+ "outputs": [],
+ "source": "#| hide\n# Hand-computed Conv2d test: wrap in Sequential so leaf name is non-empty\nimport torch\nimport torch.nn as nn\n\n_synth_peaks = HardwarePeaks(\n peak_flops=1e12, peak_bandwidth=1e11, ridge_point=10.0,\n device=\"cpu\", dtype=\"torch.float32\",\n tf32_enabled=False, cudnn_benchmark=False,\n)\n_conv = nn.Conv2d(4, 8, kernel_size=3, padding=1, bias=False)\n_x = torch.randn(1, 4, 8, 8)\n_model = nn.Sequential(_conv)\n_ra = RooflineAnalyzer(_model, _x, peaks=_synth_peaks)\n_res = _ra.profile(device=\"cpu\", warmup=1, steps=2)\nassert len(_res) == 1\n_r = _res[0]\n# Expected MACs = 4*8*3*3 * 8*8 = 18432; FLOPs = 2*MACs = 36864\ntest_eq(_r.flops, 36864.0)\n# Expected bytes: weights 4*8*3*3*4 = 1152, input 1*4*8*8*4 = 1024, output 1*8*8*8*4 = 2048\n# total = 4224\ntest_eq(_r.bytes_moved, 4224.0)\ntest_close(_r.arithmetic_intensity, 36864.0 / 4224.0, eps=1e-3)\nassert _r.bound in (\"memory\", \"compute\")\nassert math.isfinite(_r.utilization_pct)"
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "test_linear_stack",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#| hide\n",
+ "# Tiny Linear stack with synthetic peaks\n",
+ "_m = nn.Sequential(nn.Linear(32, 64), nn.ReLU(), nn.Linear(64, 16))\n",
+ "_x = torch.randn(1, 32)\n",
+ "_peaks = HardwarePeaks(\n",
+ " peak_flops=1e12, peak_bandwidth=1e11, ridge_point=10.0,\n",
+ " device=\"cpu\", dtype=\"torch.float32\",\n",
+ " tf32_enabled=False, cudnn_benchmark=False,\n",
+ ")\n",
+ "_ra = RooflineAnalyzer(_m, _x, peaks=_peaks)\n",
+ "_res = _ra.profile(device=\"cpu\", warmup=1, steps=2)\n",
+ "assert len(_res) > 0\n",
+ "for _r in _res:\n",
+ " assert _r.arithmetic_intensity >= 0\n",
+ " assert _r.bound in {\"memory\", \"compute\", \"undefined\"}\n",
+ " assert math.isfinite(_r.utilization_pct)\n",
+ "# summary and plot should run without error\n",
+ "_ra.summary(top=5)\n",
+ "_fig = _ra.plot()\n",
+ "assert isinstance(_fig, go.Figure)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "test_slow_resnet",
+ "metadata": {},
+ "outputs": [],
+ "source": "#| hide\n#| notest\nfrom torchvision.models import resnet18\n\n_model = resnet18()\n_sample = torch.randn(1, 3, 64, 64)\n_synth = HardwarePeaks(\n peak_flops=1e13, peak_bandwidth=5e11, ridge_point=20.0,\n device=\"cpu\", dtype=\"torch.float32\",\n tf32_enabled=False, cudnn_benchmark=False,\n)\n_ra = RooflineAnalyzer(_model, _sample, peaks=_synth)\n_results = _ra.profile(device=\"cpu\", warmup=2, steps=3)\nassert len(_results) > 0\nassert all(r.bound in {\"memory\", \"compute\", \"undefined\"} for r in _results)\n_ra.summary(top=5)\n_fig = _ra.plot()\nassert isinstance(_fig, go.Figure)"
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "test_slow_cuda",
+ "metadata": {},
+ "outputs": [],
+ "source": "#| hide\n#| notest\nif torch.cuda.is_available():\n _p = measure_peaks(device=\"cuda\", matmul_size=512, bandwidth_mb=64, steps=3, warmup=1, cache=False)\n assert _p.peak_flops > 0\n assert _p.peak_bandwidth > 0\n assert _p.ridge_point > 0"
+ },
+ {
+ "cell_type": "markdown",
+ "id": "see_also",
+ "metadata": {},
+ "source": "---\n\n## See Also\n\n- [Per-layer profiling](profiling.html) - Generic per-layer hook infrastructure reused here\n- [Compute metrics](../metrics/compute.html) - Model-level FLOPs counting\n- [Speed metrics](../metrics/speed.html) - Latency measurement"
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "python3",
+ "language": "python",
+ "name": "python3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/nbs/index.ipynb b/nbs/index.ipynb
index a6b7be0..9c8be32 100644
--- a/nbs/index.ipynb
+++ b/nbs/index.ipynb
@@ -37,47 +37,7 @@
"id": "8b6f8c52",
"metadata": {},
"outputs": [],
- "source": [
- "#| export\n",
- "from fasterbench.benchmark import benchmark, BenchmarkResult\n",
- "from fasterbench.size import SizeMetrics, compute_size, get_model_size, get_num_parameters\n",
- "from fasterbench.speed import (\n",
- " SpeedMetrics, compute_speed, compute_speed_multi, \n",
- " sweep_threads, sweep_latency, sweep_batch_sizes,\n",
- ")\n",
- "from fasterbench.profiling import LayerProfiler\n",
- "from fasterbench.compute import ComputeMetrics, compute_compute\n",
- "from fasterbench.memory import MemoryMetrics, compute_memory, compute_memory_multi\n",
- "from fasterbench.energy import EnergyMetrics, compute_energy, compute_energy_multi\n",
- "from fasterbench.plot import create_radar_plot, SPECS\n",
- "from fasterbench.utils import parse_metric_value\n",
- "\n",
- "__all__ = [\n",
- " # Main entry point\n",
- " 'benchmark', 'BenchmarkResult',\n",
- " # Size\n",
- " 'SizeMetrics', 'compute_size', 'get_model_size', 'get_num_parameters',\n",
- " # Speed\n",
- " 'SpeedMetrics', 'compute_speed', 'compute_speed_multi', \n",
- " 'sweep_threads', 'sweep_latency', 'sweep_batch_sizes',\n",
- " # Profiling\n",
- " 'LayerProfiler',\n",
- " # Compute\n",
- " 'ComputeMetrics', 'compute_compute',\n",
- " # Memory\n",
- " 'MemoryMetrics', 'compute_memory', 'compute_memory_multi',\n",
- " # Energy\n",
- " 'EnergyMetrics', 'compute_energy', 'compute_energy_multi',\n",
- " # Plot\n",
- " 'create_radar_plot', 'SPECS',\n",
- " # Report\n",
- " 'Report', 'ComparisonReport', 'ReportMetricDelta',\n",
- " # Utils\n",
- " 'parse_metric_value',\n",
- "]\n",
- "from fasterbench.report import Report, ComparisonReport, ReportMetricDelta\n",
- ""
- ]
+ "source": "#| export\nfrom fasterbench.benchmark import benchmark, BenchmarkResult\nfrom fasterbench.size import SizeMetrics, compute_size, get_model_size, get_num_parameters\nfrom fasterbench.speed import (\n SpeedMetrics, compute_speed, compute_speed_multi, \n sweep_threads, sweep_latency, sweep_batch_sizes,\n)\nfrom fasterbench.profiling import LayerProfiler\nfrom fasterbench.compute import ComputeMetrics, compute_compute\nfrom fasterbench.memory import MemoryMetrics, compute_memory, compute_memory_multi\nfrom fasterbench.energy import EnergyMetrics, compute_energy, compute_energy_multi\nfrom fasterbench.roofline import (\n HardwarePeaks, RooflinePoint, measure_peaks, clear_peaks_cache, RooflineAnalyzer,\n)\nfrom fasterbench.plot import create_radar_plot, SPECS\nfrom fasterbench.utils import parse_metric_value\n\n__all__ = [\n # Main entry point\n 'benchmark', 'BenchmarkResult',\n # Size\n 'SizeMetrics', 'compute_size', 'get_model_size', 'get_num_parameters',\n # Speed\n 'SpeedMetrics', 'compute_speed', 'compute_speed_multi', \n 'sweep_threads', 'sweep_latency', 'sweep_batch_sizes',\n # Profiling\n 'LayerProfiler',\n # Compute\n 'ComputeMetrics', 'compute_compute',\n # Memory\n 'MemoryMetrics', 'compute_memory', 'compute_memory_multi',\n # Energy\n 'EnergyMetrics', 'compute_energy', 'compute_energy_multi',\n # Roofline\n 'HardwarePeaks', 'RooflinePoint', 'measure_peaks', 'clear_peaks_cache', 'RooflineAnalyzer',\n # Plot\n 'create_radar_plot', 'SPECS',\n # Report\n 'Report', 'ComparisonReport', 'ReportMetricDelta',\n # Utils\n 'parse_metric_value',\n]\nfrom fasterbench.report import Report, ComparisonReport, ReportMetricDelta\n"
},
{
"cell_type": "markdown",
diff --git a/nbs/metrics/energy.ipynb b/nbs/metrics/energy.ipynb
index 576338c..1bef0f3 100644
--- a/nbs/metrics/energy.ipynb
+++ b/nbs/metrics/energy.ipynb
@@ -217,4 +217,4 @@
"metadata": {},
"nbformat": 4,
"nbformat_minor": 5
-}
\ No newline at end of file
+}
diff --git a/nbs/metrics/speed.ipynb b/nbs/metrics/speed.ipynb
index af09241..f8b876d 100644
--- a/nbs/metrics/speed.ipynb
+++ b/nbs/metrics/speed.ipynb
@@ -294,4 +294,4 @@
},
"nbformat": 4,
"nbformat_minor": 5
-}
\ No newline at end of file
+}
diff --git a/nbs/tutorials/roofline.ipynb b/nbs/tutorials/roofline.ipynb
new file mode 100644
index 0000000..340a42a
--- /dev/null
+++ b/nbs/tutorials/roofline.ipynb
@@ -0,0 +1,208 @@
+{
+ "cells": [
+ {
+ "cell_type": "raw",
+ "id": "frontmatter",
+ "metadata": {},
+ "source": [
+ "---\ndescription: Measuring arithmetic intensity vs hardware peaks with RooflineAnalyzer\noutput-file: tutorial.roofline.html\ntitle: Roofline analysis\nskip_showdoc: true\nskip_exec: true\n---"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "callout",
+ "metadata": {},
+ "source": [
+ "> This notebook demonstrates measurement primitives. For compression decisions based on roofline data, see fasterrecipes."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "intro",
+ "metadata": {},
+ "source": [
+ "## What is a roofline?\n",
+ "\n",
+ "The roofline model (Williams et al., 2009) plots a layer's achieved performance against its arithmetic intensity:\n",
+ "\n",
+ "- **Arithmetic intensity (AI)** = FLOPs per byte moved from memory. A property of the computation itself.\n",
+ "- **Achieved performance** = FLOPs per second actually delivered on the device.\n",
+ "- **The roof** is the min of two ceilings: a sloped line `AI x peak_bandwidth` (memory-bound region) and a flat line `peak_flops` (compute-bound region).\n",
+ "- **The ridge point** `peak_flops / peak_bandwidth` is the AI at which the two ceilings meet. Layers with `AI < ridge` are memory-bound; layers with `AI >= ridge` are compute-bound.\n",
+ "\n",
+ "On a log-log plot, the roof looks like a tilted ceiling with a flat top. Each layer becomes a marker underneath that ceiling."
+ ]
+ },
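+  {
+   "cell_type": "markdown",
+   "id": "roof_sketch",
+   "metadata": {},
+   "source": [
+    "A tiny numeric sketch of the two ceilings, with made-up peaks (this is just the formula, not a fasterbench API):\n",
+    "\n",
+    "```python\n",
+    "peak_flops, peak_bw = 8e12, 5e11   # hypothetical: 8 TFLOPs/s, 500 GB/s\n",
+    "ridge = peak_flops / peak_bw       # 16.0 FLOPs/byte\n",
+    "roof = lambda ai: min(peak_flops, ai * peak_bw)\n",
+    "print(roof(2.0) / 1e9)   # 1000.0 GFLOPs/s -> memory-bound (AI < ridge)\n",
+    "print(roof(32.0) / 1e9)  # 8000.0 GFLOPs/s -> compute-bound (AI >= ridge)\n",
+    "```"
+   ]
+  },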
+ {
+ "cell_type": "markdown",
+ "id": "peaks_header",
+ "metadata": {},
+ "source": [
+ "## Measuring hardware peaks\n",
+ "\n",
+ "`measure_peaks()` empirically probes the device with a large square matmul (for peak FLOPs/s) and a cache-defeating memory copy (for streaming bandwidth). It returns a `HardwarePeaks` dataclass.\n",
+ "\n",
+ "By default, TF32 is pinned **off** on CUDA so the fp32 peak reflects honest fp32 throughput. Pass `allow_tf32=True` if you want the TF32 peak instead."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "peaks_example",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import torch\n",
+ "from fasterbench.roofline import measure_peaks\n",
+ "\n",
+ "peaks = measure_peaks(\"cuda\", steps=20, warmup=5)\n",
+ "print(peaks)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "peaks_example_out",
+ "metadata": {},
+ "source": [
+ "```\n",
+ "HardwarePeaks(peak_flops=8.12e+12, peak_bandwidth=5.43e+11, ridge_point=14.95, device='cuda:0', dtype='torch.float32', tf32_enabled=False, cudnn_benchmark=False)\n",
+ "```\n",
+ "\n",
+ "The ridge point here is ~15 FLOPs/byte. Any layer below that intensity is memory-bound on this device."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "resnet_header",
+ "metadata": {},
+ "source": "## Profiling ResNet-18\n\n`RooflineAnalyzer` profiles a model in a single pass under the hood: forward hooks on every leaf module measure FLOPs (computed analytically for Conv and Linear), bytes moved (weights + input + output per Williams 2009), and wall time.\n\nIf you do not pass a `peaks=` argument, it calls `measure_peaks()` automatically."
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "resnet_example",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from torchvision.models import resnet18\n",
+ "from fasterbench.roofline import RooflineAnalyzer\n",
+ "\n",
+ "model = resnet18()\n",
+ "sample = torch.randn(1, 3, 224, 224)\n",
+ "\n",
+ "ra = RooflineAnalyzer(model, sample)\n",
+ "ra.profile(device=\"cuda\", warmup=5, steps=20)\n",
+ "ra.summary(top=10)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "resnet_example_out",
+ "metadata": {},
+ "source": "```\n=== Roofline =============================================================\n name type FLOPs bytes AI GFLOPs/s bound util%\n layer4.0.conv2 Conv2d 231.21M 10.01M 23.10 820.14 compute 10.1%\n layer4.1.conv1 Conv2d 231.21M 10.01M 23.10 810.22 compute 10.0%\n layer4.1.conv2 Conv2d 231.21M 10.01M 23.10 812.49 compute 10.0%\n layer3.0.conv2 Conv2d 115.61M 5.11M 22.62 402.33 compute 5.0%\n ...\n```\n\nEach row shows a layer's FLOPs, bytes moved, arithmetic intensity, achieved throughput, bound classification, and utilization (fraction of the roof reached)."
+ },
+ {
+ "cell_type": "markdown",
+ "id": "plot_header",
+ "metadata": {},
+ "source": [
+ "## Reading the plot\n",
+ "\n",
+ "`ra.plot()` returns a plotly `Figure` with the roof line, the ridge point, and one marker per layer. Memory-bound layers are colored teal, compute-bound layers are darker teal."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "plot_example",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "fig = ra.plot(title=\"ResNet-18 roofline (CUDA)\")\n",
+ "fig.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "plot_reading",
+ "metadata": {},
+ "source": [
+ "How to read the plot:\n",
+ "\n",
+ "- The diagonal segment (slope 1 on log-log) is the memory bandwidth ceiling.\n",
+ "- The flat segment is the compute ceiling.\n",
+ "- A marker near the roof indicates a layer achieving a high fraction of what the hardware permits at its intensity.\n",
+ "- A marker far below the roof indicates a layer leaving hardware utilization on the table.\n",
+ "- A marker to the left of the ridge point sits in the memory-bound region; one to the right sits in the compute-bound region."
+ ]
+ },
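+  {
+   "cell_type": "markdown",
+   "id": "util_sketch",
+   "metadata": {},
+   "source": [
+    "The distance below the roof is exactly the `utilization_pct` field: achieved throughput divided by the roof at that layer's intensity. A sketch using the made-up peaks from the earlier snippet:\n",
+    "\n",
+    "```python\n",
+    "ai, achieved_gflops = 2.0, 400.0                     # hypothetical memory-bound layer\n",
+    "roof_gflops = min(peak_flops, ai * peak_bw) / 1e9    # 1000.0\n",
+    "util = achieved_gflops / roof_gflops * 100           # 40.0%\n",
+    "```"
+   ]
+  },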
+ {
+ "cell_type": "markdown",
+ "id": "resolution_header",
+ "metadata": {},
+ "source": [
+ "## Comparing input resolutions\n",
+ "\n",
+ "Arithmetic intensity is a function of the computation and the tensor shapes. Increasing spatial resolution grows activation memory faster than it grows FLOPs for many conv layers, so markers shift further into the memory-bound region."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "resolution_example",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "for side in (224, 512):\n",
+ " x = torch.randn(1, 3, side, side)\n",
+ " ra = RooflineAnalyzer(model, x, peaks=peaks)\n",
+ " ra.profile(device=\"cuda\", warmup=3, steps=10)\n",
+ " mem_bound = sum(1 for r in ra.results if r.bound == \"memory\")\n",
+ " comp_bound = sum(1 for r in ra.results if r.bound == \"compute\")\n",
+ " print(f\"{side}x{side}: {mem_bound} memory-bound, {comp_bound} compute-bound\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "resolution_out",
+ "metadata": {},
+ "source": [
+ "```\n",
+ "224x224: 18 memory-bound, 42 compute-bound\n",
+ "512x512: 31 memory-bound, 29 compute-bound\n",
+ "```\n",
+ "\n",
+ "At 512x512 many more layers fall below the ridge point because activation bytes scale with `H x W` while FLOPs scale with `H x W` for a fixed kernel - but the constant factor differs, and BN/ReLU/pooling layers (which have very low AI) dominate when activations are large."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "summary",
+ "metadata": {},
+ "source": "## Summary\n\n| Tool | Purpose |\n|------|---------|\n| `measure_peaks()` | Empirically probe peak FLOPs/s and streaming bandwidth |\n| `HardwarePeaks` | Dataclass holding device peaks and ridge point |\n| `RooflineAnalyzer` | Per-layer roofline profiler |\n| `RooflineAnalyzer.profile()` | Measure FLOPs, bytes moved, and time per layer |\n| `RooflineAnalyzer.summary()` | Print a table of the slowest layers with their roofline metrics |\n| `RooflineAnalyzer.plot()` | Plotly figure with roof ceiling and per-layer markers |\n| `RooflinePoint` | Dataclass for a single layer's measurement |\n| `clear_peaks_cache()` | Reset the `measure_peaks()` cache |"
+ },
+ {
+ "cell_type": "markdown",
+ "id": "see_also",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "\n",
+ "## See Also\n",
+ "\n",
+ "- [Roofline API](../analysis/roofline.html) - Full reference\n",
+ "- [Profiling Tutorial](profiling.html) - Per-layer speed/memory/size/compute profiling\n",
+ "- [Compute metrics](../metrics/compute.html) - Underlying FLOPs counting"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "python3",
+ "language": "python",
+ "name": "python3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}