diff --git a/fasterbench/__init__.py b/fasterbench/__init__.py index 5e09119..ec82712 100644 --- a/fasterbench/__init__.py +++ b/fasterbench/__init__.py @@ -19,6 +19,9 @@ from .compute import ComputeMetrics, compute_compute from .memory import MemoryMetrics, compute_memory, compute_memory_multi from .energy import EnergyMetrics, compute_energy, compute_energy_multi +from fasterbench.roofline import ( + HardwarePeaks, RooflinePoint, measure_peaks, clear_peaks_cache, RooflineAnalyzer, +) from .plot import create_radar_plot, SPECS from .utils import parse_metric_value @@ -38,6 +41,8 @@ 'MemoryMetrics', 'compute_memory', 'compute_memory_multi', # Energy 'EnergyMetrics', 'compute_energy', 'compute_energy_multi', + # Roofline + 'HardwarePeaks', 'RooflinePoint', 'measure_peaks', 'clear_peaks_cache', 'RooflineAnalyzer', # Plot 'create_radar_plot', 'SPECS', # Report diff --git a/fasterbench/_modidx.py b/fasterbench/_modidx.py index 28cabd1..b5647f9 100644 --- a/fasterbench/_modidx.py +++ b/fasterbench/_modidx.py @@ -130,6 +130,36 @@ 'fasterbench.report._generate_css': ('analysis/report.html#_generate_css', 'fasterbench/report.py'), 'fasterbench.report._improvement_indicator': ( 'analysis/report.html#_improvement_indicator', 'fasterbench/report.py')}, + 'fasterbench.roofline': { 'fasterbench.roofline.HardwarePeaks': ( 'analysis/roofline.html#hardwarepeaks', + 'fasterbench/roofline.py'), + 'fasterbench.roofline.HardwarePeaks.as_dict': ( 'analysis/roofline.html#hardwarepeaks.as_dict', + 'fasterbench/roofline.py'), + 'fasterbench.roofline.RooflineAnalyzer': ( 'analysis/roofline.html#rooflineanalyzer', + 'fasterbench/roofline.py'), + 'fasterbench.roofline.RooflineAnalyzer.__init__': ( 'analysis/roofline.html#rooflineanalyzer.__init__', + 'fasterbench/roofline.py'), + 'fasterbench.roofline.RooflineAnalyzer.plot': ( 'analysis/roofline.html#rooflineanalyzer.plot', + 'fasterbench/roofline.py'), + 'fasterbench.roofline.RooflineAnalyzer.profile': ( 'analysis/roofline.html#rooflineanalyzer.profile', + 'fasterbench/roofline.py'), + 'fasterbench.roofline.RooflineAnalyzer.results': ( 'analysis/roofline.html#rooflineanalyzer.results', + 'fasterbench/roofline.py'), + 'fasterbench.roofline.RooflineAnalyzer.summary': ( 'analysis/roofline.html#rooflineanalyzer.summary', + 'fasterbench/roofline.py'), + 'fasterbench.roofline.RooflinePoint': ( 'analysis/roofline.html#rooflinepoint', + 'fasterbench/roofline.py'), + 'fasterbench.roofline.RooflinePoint.as_dict': ( 'analysis/roofline.html#rooflinepoint.as_dict', + 'fasterbench/roofline.py'), + 'fasterbench.roofline._layer_flops': ( 'analysis/roofline.html#_layer_flops', + 'fasterbench/roofline.py'), + 'fasterbench.roofline._pinned_benchmark_flags': ( 'analysis/roofline.html#_pinned_benchmark_flags', + 'fasterbench/roofline.py'), + 'fasterbench.roofline._setup_roofline_hooks': ( 'analysis/roofline.html#_setup_roofline_hooks', + 'fasterbench/roofline.py'), + 'fasterbench.roofline.clear_peaks_cache': ( 'analysis/roofline.html#clear_peaks_cache', + 'fasterbench/roofline.py'), + 'fasterbench.roofline.measure_peaks': ( 'analysis/roofline.html#measure_peaks', + 'fasterbench/roofline.py')}, 'fasterbench.size': { 'fasterbench.size.SizeMetrics': ('metrics/size.html#sizemetrics', 'fasterbench/size.py'), 'fasterbench.size.SizeMetrics.as_dict': ('metrics/size.html#sizemetrics.as_dict', 'fasterbench/size.py'), 'fasterbench.size.compute_size': ('metrics/size.html#compute_size', 'fasterbench/size.py'), diff --git a/fasterbench/roofline.py b/fasterbench/roofline.py new file mode 100644 index 
0000000..8ee82d4 --- /dev/null +++ b/fasterbench/roofline.py @@ -0,0 +1,479 @@ +# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/analysis/roofline.ipynb. + +# %% ../nbs/analysis/roofline.ipynb #imports +from __future__ import annotations + +import math +import time +import warnings +from dataclasses import dataclass, asdict +from contextlib import contextmanager + +import numpy as np +import torch +import torch.nn as nn +import plotly.graph_objects as go + +from .core import _device_ctx, _sync, _fmt_float, _section +from .profiling import _leaf_modules, _tensor_bytes, _output_bytes + +# %% auto #0 +__all__ = ['HardwarePeaks', 'measure_peaks', 'clear_peaks_cache', 'RooflinePoint', 'RooflineAnalyzer'] + +# %% ../nbs/analysis/roofline.ipynb #hardware_peaks +@dataclass(slots=True) +class HardwarePeaks: + "Empirically measured achievable peak compute and streaming bandwidth for a device." + peak_flops: float # achievable peak FLOPs/s + peak_bandwidth: float # achievable streaming bandwidth in bytes/s + ridge_point: float # FLOPs/byte = peak_flops / peak_bandwidth + device: str # e.g. "cuda:0", "cpu" + dtype: str # e.g. "torch.float32" + tf32_enabled: bool # whether matmul TF32 was on during probe + cudnn_benchmark: bool # whether cudnn.benchmark was on during probe + + def as_dict(self) -> dict: + return asdict(self) + +# %% ../nbs/analysis/roofline.ipynb #pinned_flags +@contextmanager +def _pinned_benchmark_flags(tf32: bool = False): + "Pin matmul TF32 and cudnn.benchmark during probe; restore on exit." + if torch.cuda.is_available(): + prev_tf32 = torch.backends.cuda.matmul.allow_tf32 + prev_cudnn_tf32 = torch.backends.cudnn.allow_tf32 + prev_bench = torch.backends.cudnn.benchmark + torch.backends.cuda.matmul.allow_tf32 = tf32 + torch.backends.cudnn.allow_tf32 = tf32 + torch.backends.cudnn.benchmark = False + try: + yield (tf32, False) + finally: + torch.backends.cuda.matmul.allow_tf32 = prev_tf32 + torch.backends.cudnn.allow_tf32 = prev_cudnn_tf32 + torch.backends.cudnn.benchmark = prev_bench + else: + yield (False, False) + +# %% ../nbs/analysis/roofline.ipynb #measure_peaks +_PEAKS_CACHE: dict = {} + + +def measure_peaks( + device: str | torch.device = "cuda", # device to probe + *, + dtype: torch.dtype = torch.float32, # probe precision + matmul_size: int = 4096, # N for NxN matmul probe + bandwidth_mb: int = 256, # per-buffer size in MiB (auto-bumped above L3) + warmup: int = 5, # warmup iterations + steps: int = 20, # measurement iterations (report max) + allow_tf32: bool = False, # pin TF32 off by default for honest fp32 peak + cache: bool = True, # cache per (device, dtype, sizes) +) -> HardwarePeaks: + "Empirically probe achievable peak FLOPs/s and streaming bandwidth." 
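+ # Illustrative probe math (hedged, values are made up): an NxN @ NxN matmul
+ # performs N*N*N multiply-accumulates, i.e. 2*N**3 FLOPs, so at the default
+ # N=4096 a single matmul is 2 * 4096**3 ~= 1.37e11 FLOPs; if the best timed
+ # iteration takes 20 ms, the reported peak would be ~6.9e12 FLOPs/s.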
+ dev = torch.device(device) if isinstance(device, str) else device + if dev.type == "cuda" and not torch.cuda.is_available(): + warnings.warn("CUDA requested but not available - falling back to CPU") + dev = torch.device("cpu") + key = (str(dev), str(dtype), matmul_size, bandwidth_mb, allow_tf32) + if cache and key in _PEAKS_CACHE: + return _PEAKS_CACHE[key] + + with _pinned_benchmark_flags(tf32=allow_tf32) as (tf32_on, bench_on): + # --- peak FLOPs probe --- + N = matmul_size + a = torch.randn(N, N, device=dev, dtype=dtype) + b = torch.randn(N, N, device=dev, dtype=dtype) + for _ in range(warmup): + (a @ b) + _sync(dev) + flops_per_matmul = 2.0 * N * N * N + best_flops = 0.0 + c = None + for _ in range(steps): + _sync(dev) + t0 = time.perf_counter() + c = a @ b + _sync(dev) + dt = time.perf_counter() - t0 + if dt > 0: + best_flops = max(best_flops, flops_per_matmul / dt) + del a, b, c + if dev.type == "cuda": + torch.cuda.empty_cache() + + # --- bandwidth probe (cache-defeating) --- + # target buffer sized to blow past L3 and any GPU L2. + buf_bytes = max(bandwidth_mb * 1024 * 1024, 64 * 1024 * 1024) + n_elems = buf_bytes // dtype.itemsize + src = torch.empty(n_elems, device=dev, dtype=dtype).normal_() + dst = torch.empty(n_elems, device=dev, dtype=dtype).normal_() + for _ in range(warmup): + dst.copy_(src) + _sync(dev) + bytes_moved_per_copy = 2.0 * n_elems * dtype.itemsize # read + write + best_bw = 0.0 + for i in range(steps): + # alternate direction to defeat any residency assumptions + s, d = (src, dst) if (i % 2 == 0) else (dst, src) + _sync(dev) + t0 = time.perf_counter() + d.copy_(s) + _sync(dev) + dt = time.perf_counter() - t0 + if dt > 0: + best_bw = max(best_bw, bytes_moved_per_copy / dt) + del src, dst + if dev.type == "cuda": + torch.cuda.empty_cache() + + result = HardwarePeaks( + peak_flops=best_flops, + peak_bandwidth=best_bw, + ridge_point=best_flops / best_bw if best_bw > 0 else float("inf"), + device=str(dev), + dtype=str(dtype), + tf32_enabled=tf32_on, + cudnn_benchmark=bench_on, + ) + if cache: + _PEAKS_CACHE[key] = result + return result + + +def clear_peaks_cache() -> None: + "Reset the measure_peaks() cache." + _PEAKS_CACHE.clear() + +# %% ../nbs/analysis/roofline.ipynb #roofline_point +@dataclass(slots=True) +class RooflinePoint: + "Per-layer roofline measurement. Bytes formula: weight_bytes + input_bytes + output_bytes (each counted once per forward call, per Williams 2009)." + name: str + type: str + flops: float # total FLOPs (2 * MACs) + bytes_moved: float # weights + input + output bytes per forward call + time_s: float # measured wall time (mean over steps) + arithmetic_intensity: float # flops / bytes_moved + achieved_gflops: float # flops / time / 1e9 + bound: str # "memory" | "compute" | "undefined" + utilization_pct: float # achieved / roof * 100 + + def as_dict(self) -> dict: + return asdict(self) + +# %% ../nbs/analysis/roofline.ipynb #roofline_hooks +def _layer_flops(module: nn.Module, inp, output) -> float: + "Estimate FLOPs for a single leaf module forward call. Returns 0 for layer types we do not model." + # Covers the layer types where FLOPs actually dominate - conv, linear, matmul-adjacent. + # Cheap element-wise ops (ReLU, BN, pooling) are intentionally assigned 0 FLOPs; they end up + # in the \"undefined\" bucket (memory-bound trivially) and are surfaced via the warning. 
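+ # Worked example (illustrative): Conv2d(4, 8, kernel_size=3, padding=1, bias=False)
+ # on a 1x4x8x8 input produces a 1x8x8x8 output, so MACs = 1 * 4 * 8 * (3*3) * (8*8)
+ # = 18432 and FLOPs = 2 * MACs = 36864 - the same numbers the hand-computed
+ # notebook test asserts.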
+ t_in = inp[0] if isinstance(inp, tuple) and len(inp) > 0 and isinstance(inp[0], torch.Tensor) else None + t_out = output if isinstance(output, torch.Tensor) else None + + if isinstance(module, (nn.Conv1d, nn.Conv2d, nn.Conv3d)) and t_out is not None: + cin = module.in_channels // module.groups + cout = module.out_channels + ksize = 1 + for k in (module.kernel_size if isinstance(module.kernel_size, tuple) else (module.kernel_size,)): + ksize *= k + spatial = 1 + for s in t_out.shape[2:]: + spatial *= int(s) + batch = int(t_out.shape[0]) if t_out.ndim > 0 else 1 + macs = batch * cin * cout * ksize * spatial + flops = 2.0 * macs + if module.bias is not None: + flops += batch * cout * spatial # bias add + return float(flops) + + if isinstance(module, (nn.ConvTranspose1d, nn.ConvTranspose2d, nn.ConvTranspose3d)) and t_in is not None: + cin = module.in_channels // module.groups + cout = module.out_channels + ksize = 1 + for k in (module.kernel_size if isinstance(module.kernel_size, tuple) else (module.kernel_size,)): + ksize *= k + spatial = 1 + for s in t_in.shape[2:]: + spatial *= int(s) + batch = int(t_in.shape[0]) if t_in.ndim > 0 else 1 + return float(2.0 * batch * cin * cout * ksize * spatial) + + if isinstance(module, nn.Linear) and t_in is not None: + in_features = module.in_features + out_features = module.out_features + # Everything before the last dim is treated as batch. + batch = 1 + for s in t_in.shape[:-1]: + batch *= int(s) + flops = 2.0 * batch * in_features * out_features + if module.bias is not None: + flops += batch * out_features + return float(flops) + + return 0.0 + + +#| export +def _setup_roofline_hooks( + leaf_modules, # {name: module} for leaf modules + bytes_state, # {name: []} to accumulate bytes per call + time_state, # {name: []} to accumulate seconds per call + flops_state, # {name: []} to accumulate FLOPs per call + call_state, # {name: int} counter of forward calls + device_type, # "cuda" or "cpu" +): + "Register hooks to measure (bytes moved, time, FLOPs, call count) per layer." 
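+ # Bytes convention (matches RooflinePoint): weights + inputs + outputs, each
+ # counted once per forward call. E.g. an fp32 Linear(32, 64) with bias
+ # contributes 32*64*4 + 64*4 = 8448 weight bytes to every call it serves.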
+ handles = [] + is_cuda = device_type == "cuda" + + def make_hooks(name: str, module: nn.Module): + w_bytes = sum(_tensor_bytes(p) for p in module.parameters(recurse=False)) + if is_cuda: + state = {"start": None, "end": None} + def pre(mod, inp): + state["start"] = torch.cuda.Event(enable_timing=True) + state["end"] = torch.cuda.Event(enable_timing=True) + state["start"].record() + in_b = sum(_tensor_bytes(t) for t in inp if isinstance(t, torch.Tensor)) + bytes_state[name].append(in_b + w_bytes) + def post(mod, inp, output): + state["end"].record() + out_b = _output_bytes(output) + bytes_state[name][-1] += out_b + torch.cuda.synchronize() + time_state[name].append(state["start"].elapsed_time(state["end"]) / 1000.0) + flops_state[name].append(_layer_flops(mod, inp, output)) + call_state[name] += 1 + else: + state = {"t0": 0.0} + def pre(mod, inp): + state["t0"] = time.perf_counter() + in_b = sum(_tensor_bytes(t) for t in inp if isinstance(t, torch.Tensor)) + bytes_state[name].append(in_b + w_bytes) + def post(mod, inp, output): + dt = time.perf_counter() - state["t0"] + out_b = _output_bytes(output) + bytes_state[name][-1] += out_b + time_state[name].append(dt) + flops_state[name].append(_layer_flops(mod, inp, output)) + call_state[name] += 1 + return pre, post + + for name, mod in leaf_modules.items(): + pre, post = make_hooks(name, mod) + handles.append(mod.register_forward_pre_hook(pre)) + handles.append(mod.register_forward_hook(post)) + return handles + +# %% ../nbs/analysis/roofline.ipynb #roofline_analyzer +class RooflineAnalyzer: + "Per-layer roofline analysis: measure arithmetic intensity and achieved GFLOPs/s against hardware peaks." + + def __init__( + self, + model: nn.Module, # model to analyze + sample: torch.Tensor, # input tensor (with batch dimension) + peaks: HardwarePeaks | None = None, # optional precomputed hardware peaks + ): + self.model = model + self.sample = sample + self.peaks = peaks + self._results: list[RooflinePoint] = [] + + @property + def results(self) -> list[RooflinePoint]: + "Per-layer roofline measurements (populated after profile())." + return self._results + + @torch.no_grad() + def profile( + self, + *, + device: str | torch.device = "cuda", # device to profile on + warmup: int = 5, # warmup iterations + steps: int = 20, # measurement iterations + ) -> list[RooflinePoint]: + "Run per-layer profiling: collect FLOPs, bytes, and time, then classify against peaks." 
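+ # Classification sketch with made-up peaks: at 10 TFLOPs/s and 500 GB/s the
+ # ridge point is 1e13 / 5e11 = 20 FLOP/byte. A layer at AI = 5 has a roof of
+ # 5 * 5e11 = 2.5e12 FLOPs/s and is memory-bound; its util% is taken against
+ # that sloped roof, not against peak_flops.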
+ with _device_ctx(device) as dev: + if dev.type == "cuda": + torch.cuda.empty_cache() + _sync(dev) + + # Ensure peaks are available + if self.peaks is None: + self.peaks = measure_peaks(dev, dtype=self.sample.dtype) + + model = self.model.eval().to(dev) + sample = self.sample.to(dev) + leaf_mods = _leaf_modules(model) + + bytes_state: dict[str, list] = {n: [] for n in leaf_mods} + time_state: dict[str, list] = {n: [] for n in leaf_mods} + flops_state: dict[str, list] = {n: [] for n in leaf_mods} + call_state: dict[str, int] = {n: 0 for n in leaf_mods} + + handles = _setup_roofline_hooks( + leaf_mods, bytes_state, time_state, flops_state, call_state, dev.type, + ) + try: + for _ in range(warmup): + model(sample) + # Reset accumulators after warmup + for n in leaf_mods: + bytes_state[n].clear() + time_state[n].clear() + flops_state[n].clear() + call_state[n] = 0 + for _ in range(steps): + model(sample) + finally: + for h in handles: + h.remove() + + # Warn if any module was invoked >1x per forward pass (shared module) + multi_call = [n for n, c in call_state.items() if steps > 0 and c > steps] + if multi_call: + warnings.warn( + f"{len(multi_call)} module(s) were called more than once per forward " + f"pass; their bytes/time are summed across calls. First: {multi_call[:3]}" + ) + + # --- Build RooflinePoint per layer --- + peak_flops = self.peaks.peak_flops + peak_bw = self.peaks.peak_bandwidth + ridge = self.peaks.ridge_point + + results: list[RooflinePoint] = [] + for name, mod in leaf_mods.items(): + b_list = bytes_state[name] + t_list = time_state[name] + f_list = flops_state[name] + bytes_moved = float(np.mean(b_list)) if b_list else 0.0 + time_s = float(np.mean(t_list)) if t_list else 0.0 + flops = float(np.mean(f_list)) if f_list else 0.0 + + if flops == 0 or bytes_moved == 0 or time_s == 0: + results.append(RooflinePoint( + name=name, type=mod.__class__.__name__, + flops=flops, bytes_moved=bytes_moved, time_s=time_s, + arithmetic_intensity=0.0, achieved_gflops=0.0, + bound="undefined", utilization_pct=0.0, + )) + continue + + ai = flops / bytes_moved + achieved_gflops = flops / time_s / 1e9 + roof_flops = min(peak_flops, ai * peak_bw) + roof_gflops = roof_flops / 1e9 + bound = "memory" if ai < ridge else "compute" + util = (achieved_gflops / roof_gflops * 100) if roof_gflops > 0 else 0.0 + results.append(RooflinePoint( + name=name, type=mod.__class__.__name__, + flops=flops, bytes_moved=bytes_moved, time_s=time_s, + arithmetic_intensity=ai, achieved_gflops=achieved_gflops, + bound=bound, utilization_pct=util, + )) + + # Warn if any layer had zero flops/bytes/time -> bound is "undefined" + zero_flops = [r.name for r in results if r.bound == "undefined"] + if zero_flops: + warnings.warn( + f"{len(zero_flops)} layer(s) have undefined roofline (zero FLOPs, bytes, " + f"or time). First: {zero_flops[:3]}" + ) + + self._results = results + return results + + def summary(self, *, top: int = 10) -> None: + "Print a table of the slowest layers with their roofline metrics." + if not self._results: + raise RuntimeError("No results available. 
Call profile() first.") + print(_section("Roofline", 72)) + header = f" {'name':32} {'type':14} {'FLOPs':>10} {'bytes':>10} {'AI':>8} {'GFLOPs/s':>10} {'bound':>9} {'util%':>7}" + print(header) + # Sort by measured time, descending (slowest first) + sorted_rows = sorted(self._results, key=lambda r: r.time_s, reverse=True)[:top] + for r in sorted_rows: + flops_str = f"{r.flops/1e6:>8.2f}M" if r.flops >= 1e6 else f"{r.flops:>10.0f}" + bytes_str = f"{r.bytes_moved/1e6:>8.2f}M" if r.bytes_moved >= 1e6 else f"{r.bytes_moved:>10.0f}" + ai_str = _fmt_float(r.arithmetic_intensity, width=8, decimals=2) + gf_str = _fmt_float(r.achieved_gflops, width=10, decimals=2) + util_str = _fmt_float(r.utilization_pct, width=6, decimals=1) + "%" + print(f" {r.name:32} {r.type:14} {flops_str} {bytes_str} {ai_str} {gf_str} {r.bound:>9} {util_str}") + + def plot( + self, + *, + title: str = "Roofline", # figure title + ) -> go.Figure: + "Render the roofline plot with per-layer scatter points on a log-log grid. Markers are colored by bound classification." + if not self._results: + raise RuntimeError("No results available. Call profile() first.") + if self.peaks is None: + raise RuntimeError("No hardware peaks available.") + + peak_flops = self.peaks.peak_flops + peak_bw = self.peaks.peak_bandwidth + ridge = self.peaks.ridge_point + peak_gflops = peak_flops / 1e9 + + # Build the roof curve: y = min(peak_flops, AI * peak_bw) / 1e9 + valid = [r for r in self._results if r.bound != "undefined"] + if valid: + ai_min = max(min(r.arithmetic_intensity for r in valid) / 10.0, 1e-3) + ai_max = max(r.arithmetic_intensity for r in valid) * 10.0 + else: + ai_min, ai_max = 1e-2, 1e3 + ai_max = max(ai_max, ridge * 10.0) + + ai_grid = np.logspace(math.log10(ai_min), math.log10(ai_max), 200) + roof_gflops = np.minimum(peak_gflops, ai_grid * peak_bw / 1e9) + + fig = go.Figure() + fig.add_trace(go.Scatter( + x=ai_grid, y=roof_gflops, mode="lines", + line=dict(color="#008080", width=2), + name=f"Roof ({peak_gflops:.0f} GFLOPs/s, {peak_bw/1e9:.1f} GB/s)", + hoverinfo="skip", + )) + # Ridge point marker + fig.add_trace(go.Scatter( + x=[ridge], y=[peak_gflops], mode="markers", + marker=dict(color="#008080", size=10, symbol="diamond"), + name=f"Ridge point ({ridge:.1f} FLOP/byte)", + hovertemplate="Ridge: %{x:.2f} FLOP/byte", + )) + + # Color palette for layers + color_map = {"memory": "#89d6c9", "compute": "#008080"} + for bound_label in ("memory", "compute"): + pts = [r for r in valid if r.bound == bound_label] + if not pts: + continue + hover = [ + f"{r.name}
<br>{r.type}<br>AI: {r.arithmetic_intensity:.3f} FLOP/byte" + f"<br>{r.achieved_gflops:.2f} GFLOPs/s<br>
util: {r.utilization_pct:.1f}%" + for r in pts + ] + fig.add_trace(go.Scatter( + x=[r.arithmetic_intensity for r in pts], + y=[r.achieved_gflops for r in pts], + mode="markers", + marker=dict(color=color_map[bound_label], size=8, opacity=0.8, + line=dict(color="#008080", width=0.5)), + name=f"{bound_label}-bound", + text=hover, + hovertemplate="%{text}", + )) + + fig.update_layout( + title=title, + xaxis=dict(title="Arithmetic intensity (FLOP/byte)", type="log"), + yaxis=dict(title="Achieved performance (GFLOPs/s)", type="log"), + paper_bgcolor="rgba(0,0,0,0)", + plot_bgcolor="rgba(0,0,0,0)", + legend=dict(bgcolor="rgba(0,0,0,0)"), + ) + return fig diff --git a/nbs/_quarto.yml b/nbs/_quarto.yml index 8066f60..2369135 100644 --- a/nbs/_quarto.yml +++ b/nbs/_quarto.yml @@ -35,6 +35,7 @@ website: contents: - tutorials/benchmark.ipynb - tutorials/profiling.ipynb + - tutorials/roofline.ipynb - tutorials/report.ipynb - section: Core contents: @@ -51,6 +52,7 @@ website: contents: - analysis/benchmark.ipynb - analysis/profiling.ipynb + - analysis/roofline.ipynb - analysis/report.ipynb - section: Visualization contents: diff --git a/nbs/analysis/profiling.ipynb b/nbs/analysis/profiling.ipynb index c600166..ca47f46 100644 --- a/nbs/analysis/profiling.ipynb +++ b/nbs/analysis/profiling.ipynb @@ -341,4 +341,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} diff --git a/nbs/analysis/roofline.ipynb b/nbs/analysis/roofline.ipynb new file mode 100644 index 0000000..b4d4024 --- /dev/null +++ b/nbs/analysis/roofline.ipynb @@ -0,0 +1,427 @@ +{ + "cells": [ + { + "cell_type": "raw", + "id": "frontmatter", + "metadata": {}, + "source": [ + "---\ndescription: Roofline analysis for arithmetic intensity vs achieved performance\noutput-file: roofline.html\ntitle: Roofline\nskip_showdoc: true\n---" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "default_exp", + "metadata": {}, + "outputs": [], + "source": [ + "#| default_exp roofline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "setup_showdoc", + "metadata": {}, + "outputs": [], + "source": [ + "#| include: false\n", + "from nbdev.showdoc import *" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "imports", + "metadata": {}, + "outputs": [], + "source": "#| export\nfrom __future__ import annotations\n\nimport math\nimport time\nimport warnings\nfrom dataclasses import dataclass, asdict\nfrom contextlib import contextmanager\n\nimport numpy as np\nimport torch\nimport torch.nn as nn\nimport plotly.graph_objects as go\n\nfrom fasterbench.core import _device_ctx, _sync, _fmt_float, _section\nfrom fasterbench.profiling import _leaf_modules, _tensor_bytes, _output_bytes" + }, + { + "cell_type": "code", + "execution_count": null, + "id": "hardware_peaks", + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "@dataclass(slots=True)\n", + "class HardwarePeaks:\n", + " \"Empirically measured achievable peak compute and streaming bandwidth for a device.\"\n", + " peak_flops: float # achievable peak FLOPs/s\n", + " peak_bandwidth: float # achievable streaming bandwidth in bytes/s\n", + " ridge_point: float # FLOPs/byte = peak_flops / peak_bandwidth\n", + " device: str # e.g. \"cuda:0\", \"cpu\"\n", + " dtype: str # e.g. 
\"torch.float32\"\n", + " tf32_enabled: bool # whether matmul TF32 was on during probe\n", + " cudnn_benchmark: bool # whether cudnn.benchmark was on during probe\n", + "\n", + " def as_dict(self) -> dict:\n", + " return asdict(self)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "sd_hw_peaks", + "metadata": {}, + "outputs": [], + "source": [ + "show_doc(HardwarePeaks)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "pinned_flags", + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "@contextmanager\n", + "def _pinned_benchmark_flags(tf32: bool = False):\n", + " \"Pin matmul TF32 and cudnn.benchmark during probe; restore on exit.\"\n", + " if torch.cuda.is_available():\n", + " prev_tf32 = torch.backends.cuda.matmul.allow_tf32\n", + " prev_cudnn_tf32 = torch.backends.cudnn.allow_tf32\n", + " prev_bench = torch.backends.cudnn.benchmark\n", + " torch.backends.cuda.matmul.allow_tf32 = tf32\n", + " torch.backends.cudnn.allow_tf32 = tf32\n", + " torch.backends.cudnn.benchmark = False\n", + " try:\n", + " yield (tf32, False)\n", + " finally:\n", + " torch.backends.cuda.matmul.allow_tf32 = prev_tf32\n", + " torch.backends.cudnn.allow_tf32 = prev_cudnn_tf32\n", + " torch.backends.cudnn.benchmark = prev_bench\n", + " else:\n", + " yield (False, False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "measure_peaks", + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "_PEAKS_CACHE: dict = {}\n", + "\n", + "\n", + "def measure_peaks(\n", + " device: str | torch.device = \"cuda\", # device to probe\n", + " *,\n", + " dtype: torch.dtype = torch.float32, # probe precision\n", + " matmul_size: int = 4096, # N for NxN matmul probe\n", + " bandwidth_mb: int = 256, # per-buffer size in MiB (auto-bumped above L3)\n", + " warmup: int = 5, # warmup iterations\n", + " steps: int = 20, # measurement iterations (report max)\n", + " allow_tf32: bool = False, # pin TF32 off by default for honest fp32 peak\n", + " cache: bool = True, # cache per (device, dtype, sizes)\n", + ") -> HardwarePeaks:\n", + " \"Empirically probe achievable peak FLOPs/s and streaming bandwidth.\"\n", + " dev = torch.device(device) if isinstance(device, str) else device\n", + " if dev.type == \"cuda\" and not torch.cuda.is_available():\n", + " warnings.warn(\"CUDA requested but not available - falling back to CPU\")\n", + " dev = torch.device(\"cpu\")\n", + " key = (str(dev), str(dtype), matmul_size, bandwidth_mb, allow_tf32)\n", + " if cache and key in _PEAKS_CACHE:\n", + " return _PEAKS_CACHE[key]\n", + "\n", + " with _pinned_benchmark_flags(tf32=allow_tf32) as (tf32_on, bench_on):\n", + " # --- peak FLOPs probe ---\n", + " N = matmul_size\n", + " a = torch.randn(N, N, device=dev, dtype=dtype)\n", + " b = torch.randn(N, N, device=dev, dtype=dtype)\n", + " for _ in range(warmup):\n", + " (a @ b)\n", + " _sync(dev)\n", + " flops_per_matmul = 2.0 * N * N * N\n", + " best_flops = 0.0\n", + " c = None\n", + " for _ in range(steps):\n", + " _sync(dev)\n", + " t0 = time.perf_counter()\n", + " c = a @ b\n", + " _sync(dev)\n", + " dt = time.perf_counter() - t0\n", + " if dt > 0:\n", + " best_flops = max(best_flops, flops_per_matmul / dt)\n", + " del a, b, c\n", + " if dev.type == \"cuda\":\n", + " torch.cuda.empty_cache()\n", + "\n", + " # --- bandwidth probe (cache-defeating) ---\n", + " # target buffer sized to blow past L3 and any GPU L2.\n", + " buf_bytes = max(bandwidth_mb * 1024 * 1024, 64 * 1024 * 1024)\n", + " n_elems = buf_bytes // 
dtype.itemsize\n", + " src = torch.empty(n_elems, device=dev, dtype=dtype).normal_()\n", + " dst = torch.empty(n_elems, device=dev, dtype=dtype).normal_()\n", + " for _ in range(warmup):\n", + " dst.copy_(src)\n", + " _sync(dev)\n", + " bytes_moved_per_copy = 2.0 * n_elems * dtype.itemsize # read + write\n", + " best_bw = 0.0\n", + " for i in range(steps):\n", + " # alternate direction to defeat any residency assumptions\n", + " s, d = (src, dst) if (i % 2 == 0) else (dst, src)\n", + " _sync(dev)\n", + " t0 = time.perf_counter()\n", + " d.copy_(s)\n", + " _sync(dev)\n", + " dt = time.perf_counter() - t0\n", + " if dt > 0:\n", + " best_bw = max(best_bw, bytes_moved_per_copy / dt)\n", + " del src, dst\n", + " if dev.type == \"cuda\":\n", + " torch.cuda.empty_cache()\n", + "\n", + " result = HardwarePeaks(\n", + " peak_flops=best_flops,\n", + " peak_bandwidth=best_bw,\n", + " ridge_point=best_flops / best_bw if best_bw > 0 else float(\"inf\"),\n", + " device=str(dev),\n", + " dtype=str(dtype),\n", + " tf32_enabled=tf32_on,\n", + " cudnn_benchmark=bench_on,\n", + " )\n", + " if cache:\n", + " _PEAKS_CACHE[key] = result\n", + " return result\n", + "\n", + "\n", + "def clear_peaks_cache() -> None:\n", + " \"Reset the measure_peaks() cache.\"\n", + " _PEAKS_CACHE.clear()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "sd_measure_peaks", + "metadata": {}, + "outputs": [], + "source": [ + "show_doc(measure_peaks)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "roofline_point", + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "@dataclass(slots=True)\n", + "class RooflinePoint:\n", + " \"Per-layer roofline measurement. Bytes formula: weight_bytes + input_bytes + output_bytes (each counted once per forward call, per Williams 2009).\"\n", + " name: str\n", + " type: str\n", + " flops: float # total FLOPs (2 * MACs)\n", + " bytes_moved: float # weights + input + output bytes per forward call\n", + " time_s: float # measured wall time (mean over steps)\n", + " arithmetic_intensity: float # flops / bytes_moved\n", + " achieved_gflops: float # flops / time / 1e9\n", + " bound: str # \"memory\" | \"compute\" | \"undefined\"\n", + " utilization_pct: float # achieved / roof * 100\n", + "\n", + " def as_dict(self) -> dict:\n", + " return asdict(self)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "sd_roofline_point", + "metadata": {}, + "outputs": [], + "source": [ + "show_doc(RooflinePoint)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "roofline_hooks", + "metadata": {}, + "outputs": [], + "source": "#| export\ndef _layer_flops(module: nn.Module, inp, output) -> float:\n \"Estimate FLOPs for a single leaf module forward call. 
Returns 0 for layer types we do not model.\"\n # Covers the layer types where FLOPs actually dominate - conv, linear, matmul-adjacent.\n # Cheap element-wise ops (ReLU, BN, pooling) are intentionally assigned 0 FLOPs; they end up\n # in the \\\"undefined\\\" bucket (memory-bound trivially) and are surfaced via the warning.\n t_in = inp[0] if isinstance(inp, tuple) and len(inp) > 0 and isinstance(inp[0], torch.Tensor) else None\n t_out = output if isinstance(output, torch.Tensor) else None\n\n if isinstance(module, (nn.Conv1d, nn.Conv2d, nn.Conv3d)) and t_out is not None:\n cin = module.in_channels // module.groups\n cout = module.out_channels\n ksize = 1\n for k in (module.kernel_size if isinstance(module.kernel_size, tuple) else (module.kernel_size,)):\n ksize *= k\n spatial = 1\n for s in t_out.shape[2:]:\n spatial *= int(s)\n batch = int(t_out.shape[0]) if t_out.ndim > 0 else 1\n macs = batch * cin * cout * ksize * spatial\n flops = 2.0 * macs\n if module.bias is not None:\n flops += batch * cout * spatial # bias add\n return float(flops)\n\n if isinstance(module, (nn.ConvTranspose1d, nn.ConvTranspose2d, nn.ConvTranspose3d)) and t_in is not None:\n cin = module.in_channels // module.groups\n cout = module.out_channels\n ksize = 1\n for k in (module.kernel_size if isinstance(module.kernel_size, tuple) else (module.kernel_size,)):\n ksize *= k\n spatial = 1\n for s in t_in.shape[2:]:\n spatial *= int(s)\n batch = int(t_in.shape[0]) if t_in.ndim > 0 else 1\n return float(2.0 * batch * cin * cout * ksize * spatial)\n\n if isinstance(module, nn.Linear) and t_in is not None:\n in_features = module.in_features\n out_features = module.out_features\n # Everything before the last dim is treated as batch.\n batch = 1\n for s in t_in.shape[:-1]:\n batch *= int(s)\n flops = 2.0 * batch * in_features * out_features\n if module.bias is not None:\n flops += batch * out_features\n return float(flops)\n\n return 0.0\n\n\n#| export\ndef _setup_roofline_hooks(\n leaf_modules, # {name: module} for leaf modules\n bytes_state, # {name: []} to accumulate bytes per call\n time_state, # {name: []} to accumulate seconds per call\n flops_state, # {name: []} to accumulate FLOPs per call\n call_state, # {name: int} counter of forward calls\n device_type, # \"cuda\" or \"cpu\"\n):\n \"Register hooks to measure (bytes moved, time, FLOPs, call count) per layer.\"\n handles = []\n is_cuda = device_type == \"cuda\"\n\n def make_hooks(name: str, module: nn.Module):\n w_bytes = sum(_tensor_bytes(p) for p in module.parameters(recurse=False))\n if is_cuda:\n state = {\"start\": None, \"end\": None}\n def pre(mod, inp):\n state[\"start\"] = torch.cuda.Event(enable_timing=True)\n state[\"end\"] = torch.cuda.Event(enable_timing=True)\n state[\"start\"].record()\n in_b = sum(_tensor_bytes(t) for t in inp if isinstance(t, torch.Tensor))\n bytes_state[name].append(in_b + w_bytes)\n def post(mod, inp, output):\n state[\"end\"].record()\n out_b = _output_bytes(output)\n bytes_state[name][-1] += out_b\n torch.cuda.synchronize()\n time_state[name].append(state[\"start\"].elapsed_time(state[\"end\"]) / 1000.0)\n flops_state[name].append(_layer_flops(mod, inp, output))\n call_state[name] += 1\n else:\n state = {\"t0\": 0.0}\n def pre(mod, inp):\n state[\"t0\"] = time.perf_counter()\n in_b = sum(_tensor_bytes(t) for t in inp if isinstance(t, torch.Tensor))\n bytes_state[name].append(in_b + w_bytes)\n def post(mod, inp, output):\n dt = time.perf_counter() - state[\"t0\"]\n out_b = _output_bytes(output)\n bytes_state[name][-1] += out_b\n 
time_state[name].append(dt)\n flops_state[name].append(_layer_flops(mod, inp, output))\n call_state[name] += 1\n return pre, post\n\n for name, mod in leaf_modules.items():\n pre, post = make_hooks(name, mod)\n handles.append(mod.register_forward_pre_hook(pre))\n handles.append(mod.register_forward_hook(post))\n return handles" + }, + { + "cell_type": "code", + "execution_count": null, + "id": "roofline_analyzer", + "metadata": {}, + "outputs": [], + "source": "#| export\nclass RooflineAnalyzer:\n \"Per-layer roofline analysis: measure arithmetic intensity and achieved GFLOPs/s against hardware peaks.\"\n\n def __init__(\n self,\n model: nn.Module, # model to analyze\n sample: torch.Tensor, # input tensor (with batch dimension)\n peaks: HardwarePeaks | None = None, # optional precomputed hardware peaks\n ):\n self.model = model\n self.sample = sample\n self.peaks = peaks\n self._results: list[RooflinePoint] = []\n\n @property\n def results(self) -> list[RooflinePoint]:\n \"Per-layer roofline measurements (populated after profile()).\"\n return self._results\n\n @torch.no_grad()\n def profile(\n self,\n *,\n device: str | torch.device = \"cuda\", # device to profile on\n warmup: int = 5, # warmup iterations\n steps: int = 20, # measurement iterations\n ) -> list[RooflinePoint]:\n \"Run per-layer profiling: collect FLOPs, bytes, and time, then classify against peaks.\"\n with _device_ctx(device) as dev:\n if dev.type == \"cuda\":\n torch.cuda.empty_cache()\n _sync(dev)\n\n # Ensure peaks are available\n if self.peaks is None:\n self.peaks = measure_peaks(dev, dtype=self.sample.dtype)\n\n model = self.model.eval().to(dev)\n sample = self.sample.to(dev)\n leaf_mods = _leaf_modules(model)\n\n bytes_state: dict[str, list] = {n: [] for n in leaf_mods}\n time_state: dict[str, list] = {n: [] for n in leaf_mods}\n flops_state: dict[str, list] = {n: [] for n in leaf_mods}\n call_state: dict[str, int] = {n: 0 for n in leaf_mods}\n\n handles = _setup_roofline_hooks(\n leaf_mods, bytes_state, time_state, flops_state, call_state, dev.type,\n )\n try:\n for _ in range(warmup):\n model(sample)\n # Reset accumulators after warmup\n for n in leaf_mods:\n bytes_state[n].clear()\n time_state[n].clear()\n flops_state[n].clear()\n call_state[n] = 0\n for _ in range(steps):\n model(sample)\n finally:\n for h in handles:\n h.remove()\n\n # Warn if any module was invoked >1x per forward pass (shared module)\n multi_call = [n for n, c in call_state.items() if steps > 0 and c > steps]\n if multi_call:\n warnings.warn(\n f\"{len(multi_call)} module(s) were called more than once per forward \"\n f\"pass; their bytes/time are summed across calls. 
First: {multi_call[:3]}\"\n )\n\n # --- Build RooflinePoint per layer ---\n peak_flops = self.peaks.peak_flops\n peak_bw = self.peaks.peak_bandwidth\n ridge = self.peaks.ridge_point\n\n results: list[RooflinePoint] = []\n for name, mod in leaf_mods.items():\n b_list = bytes_state[name]\n t_list = time_state[name]\n f_list = flops_state[name]\n bytes_moved = float(np.mean(b_list)) if b_list else 0.0\n time_s = float(np.mean(t_list)) if t_list else 0.0\n flops = float(np.mean(f_list)) if f_list else 0.0\n\n if flops == 0 or bytes_moved == 0 or time_s == 0:\n results.append(RooflinePoint(\n name=name, type=mod.__class__.__name__,\n flops=flops, bytes_moved=bytes_moved, time_s=time_s,\n arithmetic_intensity=0.0, achieved_gflops=0.0,\n bound=\"undefined\", utilization_pct=0.0,\n ))\n continue\n\n ai = flops / bytes_moved\n achieved_gflops = flops / time_s / 1e9\n roof_flops = min(peak_flops, ai * peak_bw)\n roof_gflops = roof_flops / 1e9\n bound = \"memory\" if ai < ridge else \"compute\"\n util = (achieved_gflops / roof_gflops * 100) if roof_gflops > 0 else 0.0\n results.append(RooflinePoint(\n name=name, type=mod.__class__.__name__,\n flops=flops, bytes_moved=bytes_moved, time_s=time_s,\n arithmetic_intensity=ai, achieved_gflops=achieved_gflops,\n bound=bound, utilization_pct=util,\n ))\n\n # Warn if any layer had zero flops/bytes/time -> bound is \"undefined\"\n zero_flops = [r.name for r in results if r.bound == \"undefined\"]\n if zero_flops:\n warnings.warn(\n f\"{len(zero_flops)} layer(s) have undefined roofline (zero FLOPs, bytes, \"\n f\"or time). First: {zero_flops[:3]}\"\n )\n\n self._results = results\n return results\n\n def summary(self, *, top: int = 10) -> None:\n \"Print a table of the slowest layers with their roofline metrics.\"\n if not self._results:\n raise RuntimeError(\"No results available. Call profile() first.\")\n print(_section(\"Roofline\", 72))\n header = f\" {'name':32} {'type':14} {'FLOPs':>10} {'bytes':>10} {'AI':>8} {'GFLOPs/s':>10} {'bound':>9} {'util%':>7}\"\n print(header)\n # Sort by measured time, descending (slowest first)\n sorted_rows = sorted(self._results, key=lambda r: r.time_s, reverse=True)[:top]\n for r in sorted_rows:\n flops_str = f\"{r.flops/1e6:>8.2f}M\" if r.flops >= 1e6 else f\"{r.flops:>10.0f}\"\n bytes_str = f\"{r.bytes_moved/1e6:>8.2f}M\" if r.bytes_moved >= 1e6 else f\"{r.bytes_moved:>10.0f}\"\n ai_str = _fmt_float(r.arithmetic_intensity, width=8, decimals=2)\n gf_str = _fmt_float(r.achieved_gflops, width=10, decimals=2)\n util_str = _fmt_float(r.utilization_pct, width=6, decimals=1) + \"%\"\n print(f\" {r.name:32} {r.type:14} {flops_str} {bytes_str} {ai_str} {gf_str} {r.bound:>9} {util_str}\")\n\n def plot(\n self,\n *,\n title: str = \"Roofline\", # figure title\n ) -> go.Figure:\n \"Render the roofline plot with per-layer scatter points on a log-log grid. Markers are colored by bound classification.\"\n if not self._results:\n raise RuntimeError(\"No results available. 
Call profile() first.\")\n if self.peaks is None:\n raise RuntimeError(\"No hardware peaks available.\")\n\n peak_flops = self.peaks.peak_flops\n peak_bw = self.peaks.peak_bandwidth\n ridge = self.peaks.ridge_point\n peak_gflops = peak_flops / 1e9\n\n # Build the roof curve: y = min(peak_flops, AI * peak_bw) / 1e9\n valid = [r for r in self._results if r.bound != \"undefined\"]\n if valid:\n ai_min = max(min(r.arithmetic_intensity for r in valid) / 10.0, 1e-3)\n ai_max = max(r.arithmetic_intensity for r in valid) * 10.0\n else:\n ai_min, ai_max = 1e-2, 1e3\n ai_max = max(ai_max, ridge * 10.0)\n\n ai_grid = np.logspace(math.log10(ai_min), math.log10(ai_max), 200)\n roof_gflops = np.minimum(peak_gflops, ai_grid * peak_bw / 1e9)\n\n fig = go.Figure()\n fig.add_trace(go.Scatter(\n x=ai_grid, y=roof_gflops, mode=\"lines\",\n line=dict(color=\"#008080\", width=2),\n name=f\"Roof ({peak_gflops:.0f} GFLOPs/s, {peak_bw/1e9:.1f} GB/s)\",\n hoverinfo=\"skip\",\n ))\n # Ridge point marker\n fig.add_trace(go.Scatter(\n x=[ridge], y=[peak_gflops], mode=\"markers\",\n marker=dict(color=\"#008080\", size=10, symbol=\"diamond\"),\n name=f\"Ridge point ({ridge:.1f} FLOP/byte)\",\n hovertemplate=\"Ridge: %{x:.2f} FLOP/byte\",\n ))\n\n # Color palette for layers\n color_map = {\"memory\": \"#89d6c9\", \"compute\": \"#008080\"}\n for bound_label in (\"memory\", \"compute\"):\n pts = [r for r in valid if r.bound == bound_label]\n if not pts:\n continue\n hover = [\n f\"{r.name}
<br>{r.type}<br>AI: {r.arithmetic_intensity:.3f} FLOP/byte\"\n                f\"<br>{r.achieved_gflops:.2f} GFLOPs/s<br>
util: {r.utilization_pct:.1f}%\"\n for r in pts\n ]\n fig.add_trace(go.Scatter(\n x=[r.arithmetic_intensity for r in pts],\n y=[r.achieved_gflops for r in pts],\n mode=\"markers\",\n marker=dict(color=color_map[bound_label], size=8, opacity=0.8,\n line=dict(color=\"#008080\", width=0.5)),\n name=f\"{bound_label}-bound\",\n text=hover,\n hovertemplate=\"%{text}\",\n ))\n\n fig.update_layout(\n title=title,\n xaxis=dict(title=\"Arithmetic intensity (FLOP/byte)\", type=\"log\"),\n yaxis=dict(title=\"Achieved performance (GFLOPs/s)\", type=\"log\"),\n paper_bgcolor=\"rgba(0,0,0,0)\",\n plot_bgcolor=\"rgba(0,0,0,0)\",\n legend=dict(bgcolor=\"rgba(0,0,0,0)\"),\n )\n return fig" + }, + { + "cell_type": "code", + "execution_count": null, + "id": "sd_analyzer", + "metadata": {}, + "outputs": [], + "source": [ + "show_doc(RooflineAnalyzer)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "sd_profile", + "metadata": {}, + "outputs": [], + "source": [ + "show_doc(RooflineAnalyzer.profile)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "sd_summary", + "metadata": {}, + "outputs": [], + "source": [ + "show_doc(RooflineAnalyzer.summary)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "sd_plot", + "metadata": {}, + "outputs": [], + "source": [ + "show_doc(RooflineAnalyzer.plot)" + ] + }, + { + "cell_type": "markdown", + "id": "usage", + "metadata": {}, + "source": [ + "## Usage\n", + "\n", + "```python\n", + "from fasterbench.roofline import RooflineAnalyzer\n", + "\n", + "ra = RooflineAnalyzer(model, sample)\n", + "ra.profile(device=\"cuda\")\n", + "ra.summary()\n", + "fig = ra.plot()\n", + "fig.show()\n", + "```\n", + "\n", + "This is a measurement primitive. Downstream compression workflows (see fasterrecipes) can consume `ra.results` to make decisions - fasterbench itself never prescribes." 
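, + "\n", + "A minimal sketch of downstream consumption (hypothetical code, not part of fasterbench): rank the memory-bound layers that reach the smallest fraction of their roof.\n", + "\n", + "```python\n", + "worst = sorted(\n", + "    (r for r in ra.results if r.bound == 'memory'),\n", + "    key=lambda r: r.utilization_pct,\n", + ")[:5]\n", + "for r in worst:\n", + "    print(f'{r.name}: {r.utilization_pct:.1f}% of roof')\n", + "```"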
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "test_peaks_basic", + "metadata": {}, + "outputs": [], + "source": [ + "#| hide\n", + "from fastcore.test import *\n", + "\n", + "_p = measure_peaks(\"cpu\", steps=3, warmup=1, matmul_size=256, bandwidth_mb=32, cache=False)\n", + "assert isinstance(_p, HardwarePeaks)\n", + "assert _p.peak_flops > 0\n", + "assert _p.peak_bandwidth > 0\n", + "test_close(_p.ridge_point, _p.peak_flops / _p.peak_bandwidth, eps=1e-6)\n", + "assert _p.device == \"cpu\"\n", + "assert _p.tf32_enabled is False\n", + "assert _p.cudnn_benchmark is False" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "test_peaks_cache", + "metadata": {}, + "outputs": [], + "source": [ + "#| hide\n", + "clear_peaks_cache()\n", + "_p1 = measure_peaks(\"cpu\", steps=2, warmup=1, matmul_size=128, bandwidth_mb=16, cache=True)\n", + "_p2 = measure_peaks(\"cpu\", steps=2, warmup=1, matmul_size=128, bandwidth_mb=16, cache=True)\n", + "assert _p1 is _p2 # cache hit returns same object\n", + "\n", + "clear_peaks_cache()\n", + "_p3 = measure_peaks(\"cpu\", steps=2, warmup=1, matmul_size=128, bandwidth_mb=16, cache=True)\n", + "assert _p3 is not _p1 # cache was cleared" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "test_conv_hand_computed", + "metadata": {}, + "outputs": [], + "source": "#| hide\n# Hand-computed Conv2d test: wrap in Sequential so leaf name is non-empty\nimport torch\nimport torch.nn as nn\n\n_synth_peaks = HardwarePeaks(\n peak_flops=1e12, peak_bandwidth=1e11, ridge_point=10.0,\n device=\"cpu\", dtype=\"torch.float32\",\n tf32_enabled=False, cudnn_benchmark=False,\n)\n_conv = nn.Conv2d(4, 8, kernel_size=3, padding=1, bias=False)\n_x = torch.randn(1, 4, 8, 8)\n_model = nn.Sequential(_conv)\n_ra = RooflineAnalyzer(_model, _x, peaks=_synth_peaks)\n_res = _ra.profile(device=\"cpu\", warmup=1, steps=2)\nassert len(_res) == 1\n_r = _res[0]\n# Expected MACs = 4*8*3*3 * 8*8 = 18432; FLOPs = 2*MACs = 36864\ntest_eq(_r.flops, 36864.0)\n# Expected bytes: weights 4*8*3*3*4 = 1152, input 1*4*8*8*4 = 1024, output 1*8*8*8*4 = 2048\n# total = 4224\ntest_eq(_r.bytes_moved, 4224.0)\ntest_close(_r.arithmetic_intensity, 36864.0 / 4224.0, eps=1e-3)\nassert _r.bound in (\"memory\", \"compute\")\nassert math.isfinite(_r.utilization_pct)" + }, + { + "cell_type": "code", + "execution_count": null, + "id": "test_linear_stack", + "metadata": {}, + "outputs": [], + "source": [ + "#| hide\n", + "# Tiny Linear stack with synthetic peaks\n", + "_m = nn.Sequential(nn.Linear(32, 64), nn.ReLU(), nn.Linear(64, 16))\n", + "_x = torch.randn(1, 32)\n", + "_peaks = HardwarePeaks(\n", + " peak_flops=1e12, peak_bandwidth=1e11, ridge_point=10.0,\n", + " device=\"cpu\", dtype=\"torch.float32\",\n", + " tf32_enabled=False, cudnn_benchmark=False,\n", + ")\n", + "_ra = RooflineAnalyzer(_m, _x, peaks=_peaks)\n", + "_res = _ra.profile(device=\"cpu\", warmup=1, steps=2)\n", + "assert len(_res) > 0\n", + "for _r in _res:\n", + " assert _r.arithmetic_intensity >= 0\n", + " assert _r.bound in {\"memory\", \"compute\", \"undefined\"}\n", + " assert math.isfinite(_r.utilization_pct)\n", + "# summary and plot should run without error\n", + "_ra.summary(top=5)\n", + "_fig = _ra.plot()\n", + "assert isinstance(_fig, go.Figure)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "test_slow_resnet", + "metadata": {}, + "outputs": [], + "source": "#| hide\n#| notest\nfrom torchvision.models import resnet18\n\n_model = resnet18()\n_sample = 
torch.randn(1, 3, 64, 64)\n_synth = HardwarePeaks(\n peak_flops=1e13, peak_bandwidth=5e11, ridge_point=20.0,\n device=\"cpu\", dtype=\"torch.float32\",\n tf32_enabled=False, cudnn_benchmark=False,\n)\n_ra = RooflineAnalyzer(_model, _sample, peaks=_synth)\n_results = _ra.profile(device=\"cpu\", warmup=2, steps=3)\nassert len(_results) > 0\nassert all(r.bound in {\"memory\", \"compute\", \"undefined\"} for r in _results)\n_ra.summary(top=5)\n_fig = _ra.plot()\nassert isinstance(_fig, go.Figure)" + }, + { + "cell_type": "code", + "execution_count": null, + "id": "test_slow_cuda", + "metadata": {}, + "outputs": [], + "source": "#| hide\n#| notest\nif torch.cuda.is_available():\n _p = measure_peaks(device=\"cuda\", matmul_size=512, bandwidth_mb=64, steps=3, warmup=1, cache=False)\n assert _p.peak_flops > 0\n assert _p.peak_bandwidth > 0\n assert _p.ridge_point > 0" + }, + { + "cell_type": "markdown", + "id": "see_also", + "metadata": {}, + "source": "---\n\n## See Also\n\n- [Per-layer profiling](profiling.html) - Generic per-layer hook infrastructure reused here\n- [Compute metrics](../metrics/compute.html) - Model-level FLOPs counting\n- [Speed metrics](../metrics/speed.html) - Latency measurement" + } + ], + "metadata": { + "kernelspec": { + "display_name": "python3", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/nbs/index.ipynb b/nbs/index.ipynb index a6b7be0..9c8be32 100644 --- a/nbs/index.ipynb +++ b/nbs/index.ipynb @@ -37,47 +37,7 @@ "id": "8b6f8c52", "metadata": {}, "outputs": [], - "source": [ - "#| export\n", - "from fasterbench.benchmark import benchmark, BenchmarkResult\n", - "from fasterbench.size import SizeMetrics, compute_size, get_model_size, get_num_parameters\n", - "from fasterbench.speed import (\n", - " SpeedMetrics, compute_speed, compute_speed_multi, \n", - " sweep_threads, sweep_latency, sweep_batch_sizes,\n", - ")\n", - "from fasterbench.profiling import LayerProfiler\n", - "from fasterbench.compute import ComputeMetrics, compute_compute\n", - "from fasterbench.memory import MemoryMetrics, compute_memory, compute_memory_multi\n", - "from fasterbench.energy import EnergyMetrics, compute_energy, compute_energy_multi\n", - "from fasterbench.plot import create_radar_plot, SPECS\n", - "from fasterbench.utils import parse_metric_value\n", - "\n", - "__all__ = [\n", - " # Main entry point\n", - " 'benchmark', 'BenchmarkResult',\n", - " # Size\n", - " 'SizeMetrics', 'compute_size', 'get_model_size', 'get_num_parameters',\n", - " # Speed\n", - " 'SpeedMetrics', 'compute_speed', 'compute_speed_multi', \n", - " 'sweep_threads', 'sweep_latency', 'sweep_batch_sizes',\n", - " # Profiling\n", - " 'LayerProfiler',\n", - " # Compute\n", - " 'ComputeMetrics', 'compute_compute',\n", - " # Memory\n", - " 'MemoryMetrics', 'compute_memory', 'compute_memory_multi',\n", - " # Energy\n", - " 'EnergyMetrics', 'compute_energy', 'compute_energy_multi',\n", - " # Plot\n", - " 'create_radar_plot', 'SPECS',\n", - " # Report\n", - " 'Report', 'ComparisonReport', 'ReportMetricDelta',\n", - " # Utils\n", - " 'parse_metric_value',\n", - "]\n", - "from fasterbench.report import Report, ComparisonReport, ReportMetricDelta\n", - "" - ] + "source": "#| export\nfrom fasterbench.benchmark import benchmark, BenchmarkResult\nfrom fasterbench.size import SizeMetrics, compute_size, get_model_size, get_num_parameters\nfrom fasterbench.speed import (\n SpeedMetrics, compute_speed, compute_speed_multi, \n sweep_threads, sweep_latency, 
sweep_batch_sizes,\n)\nfrom fasterbench.profiling import LayerProfiler\nfrom fasterbench.compute import ComputeMetrics, compute_compute\nfrom fasterbench.memory import MemoryMetrics, compute_memory, compute_memory_multi\nfrom fasterbench.energy import EnergyMetrics, compute_energy, compute_energy_multi\nfrom fasterbench.roofline import (\n HardwarePeaks, RooflinePoint, measure_peaks, clear_peaks_cache, RooflineAnalyzer,\n)\nfrom fasterbench.plot import create_radar_plot, SPECS\nfrom fasterbench.utils import parse_metric_value\n\n__all__ = [\n # Main entry point\n 'benchmark', 'BenchmarkResult',\n # Size\n 'SizeMetrics', 'compute_size', 'get_model_size', 'get_num_parameters',\n # Speed\n 'SpeedMetrics', 'compute_speed', 'compute_speed_multi', \n 'sweep_threads', 'sweep_latency', 'sweep_batch_sizes',\n # Profiling\n 'LayerProfiler',\n # Compute\n 'ComputeMetrics', 'compute_compute',\n # Memory\n 'MemoryMetrics', 'compute_memory', 'compute_memory_multi',\n # Energy\n 'EnergyMetrics', 'compute_energy', 'compute_energy_multi',\n # Roofline\n 'HardwarePeaks', 'RooflinePoint', 'measure_peaks', 'clear_peaks_cache', 'RooflineAnalyzer',\n # Plot\n 'create_radar_plot', 'SPECS',\n # Report\n 'Report', 'ComparisonReport', 'ReportMetricDelta',\n # Utils\n 'parse_metric_value',\n]\nfrom fasterbench.report import Report, ComparisonReport, ReportMetricDelta\n" }, { "cell_type": "markdown", diff --git a/nbs/metrics/energy.ipynb b/nbs/metrics/energy.ipynb index 576338c..1bef0f3 100644 --- a/nbs/metrics/energy.ipynb +++ b/nbs/metrics/energy.ipynb @@ -217,4 +217,4 @@ "metadata": {}, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} diff --git a/nbs/metrics/speed.ipynb b/nbs/metrics/speed.ipynb index af09241..f8b876d 100644 --- a/nbs/metrics/speed.ipynb +++ b/nbs/metrics/speed.ipynb @@ -294,4 +294,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} diff --git a/nbs/tutorials/roofline.ipynb b/nbs/tutorials/roofline.ipynb new file mode 100644 index 0000000..340a42a --- /dev/null +++ b/nbs/tutorials/roofline.ipynb @@ -0,0 +1,208 @@ +{ + "cells": [ + { + "cell_type": "raw", + "id": "frontmatter", + "metadata": {}, + "source": [ + "---\ndescription: Measuring arithmetic intensity vs hardware peaks with RooflineAnalyzer\noutput-file: tutorial.roofline.html\ntitle: Roofline analysis\nskip_showdoc: true\nskip_exec: true\n---" + ] + }, + { + "cell_type": "markdown", + "id": "callout", + "metadata": {}, + "source": [ + "> This notebook demonstrates measurement primitives. For compression decisions based on roofline data, see fasterrecipes." + ] + }, + { + "cell_type": "markdown", + "id": "intro", + "metadata": {}, + "source": [ + "## What is a roofline?\n", + "\n", + "The roofline model (Williams et al., 2009) plots a layer's achieved performance against its arithmetic intensity:\n", + "\n", + "- **Arithmetic intensity (AI)** = FLOPs per byte moved from memory. A property of the computation itself.\n", + "- **Achieved performance** = FLOPs per second actually delivered on the device.\n", + "- **The roof** is the min of two ceilings: a sloped line `AI x peak_bandwidth` (memory-bound region) and a flat line `peak_flops` (compute-bound region).\n", + "- **The ridge point** `peak_flops / peak_bandwidth` is the AI at which the two ceilings meet. Layers with `AI < ridge` are memory-bound; layers with `AI >= ridge` are compute-bound.\n", + "\n", + "On a log-log plot, the roof looks like a tilted ceiling with a flat top. Each layer becomes a marker underneath that ceiling." 
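, + "\n", + "A worked example with made-up peaks: a device with 10 TFLOPs/s peak compute and 500 GB/s bandwidth has a ridge point of `1e13 / 5e11 = 20` FLOP/byte. A layer at AI = 4 can reach at most `4 x 5e11 = 2` TFLOPs/s (memory-bound), while a layer at AI = 50 is capped by the flat `1e13` FLOPs/s ceiling (compute-bound)."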
+ ] + }, + { + "cell_type": "markdown", + "id": "peaks_header", + "metadata": {}, + "source": [ + "## Measuring hardware peaks\n", + "\n", + "`measure_peaks()` empirically probes the device with a large square matmul (for peak FLOPs/s) and a cache-defeating memory copy (for streaming bandwidth). It returns a `HardwarePeaks` dataclass.\n", + "\n", + "By default, TF32 is pinned **off** on CUDA so the fp32 peak reflects honest fp32 throughput. Pass `allow_tf32=True` if you want the TF32 peak instead." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "peaks_example", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from fasterbench.roofline import measure_peaks\n", + "\n", + "peaks = measure_peaks(\"cuda\", steps=20, warmup=5)\n", + "print(peaks)" + ] + }, + { + "cell_type": "markdown", + "id": "peaks_example_out", + "metadata": {}, + "source": [ + "```\n", + "HardwarePeaks(peak_flops=8.12e+12, peak_bandwidth=5.43e+11, ridge_point=14.95, device='cuda:0', dtype='torch.float32', tf32_enabled=False, cudnn_benchmark=False)\n", + "```\n", + "\n", + "The ridge point here is ~15 FLOPs/byte. Any layer below that intensity is memory-bound on this device." + ] + }, + { + "cell_type": "markdown", + "id": "resnet_header", + "metadata": {}, + "source": "## Profiling ResNet-18\n\n`RooflineAnalyzer` profiles a model in a single pass under the hood: forward hooks on every leaf module measure FLOPs (computed analytically for Conv and Linear), bytes moved (weights + input + output per Williams 2009), and wall time.\n\nIf you do not pass a `peaks=` argument, it calls `measure_peaks()` automatically." + }, + { + "cell_type": "code", + "execution_count": null, + "id": "resnet_example", + "metadata": {}, + "outputs": [], + "source": [ + "from torchvision.models import resnet18\n", + "from fasterbench.roofline import RooflineAnalyzer\n", + "\n", + "model = resnet18()\n", + "sample = torch.randn(1, 3, 224, 224)\n", + "\n", + "ra = RooflineAnalyzer(model, sample)\n", + "ra.profile(device=\"cuda\", warmup=5, steps=20)\n", + "ra.summary(top=10)" + ] + }, + { + "cell_type": "markdown", + "id": "resnet_example_out", + "metadata": {}, + "source": "```\n=== Roofline =============================================================\n name type FLOPs bytes AI GFLOPs/s bound util%\n layer4.0.conv2 Conv2d 231.21M 10.01M 23.10 820.14 compute 10.1%\n layer4.1.conv1 Conv2d 231.21M 10.01M 23.10 810.22 compute 10.0%\n layer4.1.conv2 Conv2d 231.21M 10.01M 23.10 812.49 compute 10.0%\n layer3.0.conv2 Conv2d 115.61M 5.11M 22.62 402.33 compute 5.0%\n ...\n```\n\nEach row shows a layer's FLOPs, bytes moved, arithmetic intensity, achieved throughput, bound classification, and utilization (fraction of the roof reached)." + }, + { + "cell_type": "markdown", + "id": "plot_header", + "metadata": {}, + "source": [ + "## Reading the plot\n", + "\n", + "`ra.plot()` returns a plotly `Figure` with the roof line, the ridge point, and one marker per layer. Memory-bound layers are colored teal, compute-bound layers are darker teal." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "plot_example", + "metadata": {}, + "outputs": [], + "source": [ + "fig = ra.plot(title=\"ResNet-18 roofline (CUDA)\")\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "id": "plot_reading", + "metadata": {}, + "source": [ + "How to read the plot:\n", + "\n", + "- The diagonal segment (slope 1 on log-log) is the memory bandwidth ceiling.\n", + "- The flat segment is the compute ceiling.\n", + "- A marker near the roof indicates a layer achieving a high fraction of what the hardware permits at its intensity.\n", + "- A marker far below the roof indicates a layer leaving hardware utilization on the table.\n", + "- A marker to the left of the ridge point sits in the memory-bound region; one to the right sits in the compute-bound region." + ] + }, + { + "cell_type": "markdown", + "id": "resolution_header", + "metadata": {}, + "source": [ + "## Comparing input resolutions\n", + "\n", + "Arithmetic intensity is a function of the computation and the tensor shapes. Increasing spatial resolution grows activation memory faster than it grows FLOPs for many conv layers, so markers shift further into the memory-bound region." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "resolution_example", + "metadata": {}, + "outputs": [], + "source": [ + "for side in (224, 512):\n", + " x = torch.randn(1, 3, side, side)\n", + " ra = RooflineAnalyzer(model, x, peaks=peaks)\n", + " ra.profile(device=\"cuda\", warmup=3, steps=10)\n", + " mem_bound = sum(1 for r in ra.results if r.bound == \"memory\")\n", + " comp_bound = sum(1 for r in ra.results if r.bound == \"compute\")\n", + " print(f\"{side}x{side}: {mem_bound} memory-bound, {comp_bound} compute-bound\")" + ] + }, + { + "cell_type": "markdown", + "id": "resolution_out", + "metadata": {}, + "source": [ + "```\n", + "224x224: 18 memory-bound, 42 compute-bound\n", + "512x512: 31 memory-bound, 29 compute-bound\n", + "```\n", + "\n", + "At 512x512 many more layers fall below the ridge point because activation bytes scale with `H x W` while FLOPs scale with `H x W` for a fixed kernel - but the constant factor differs, and BN/ReLU/pooling layers (which have very low AI) dominate when activations are large." 
+ ] + }, + { + "cell_type": "markdown", + "id": "summary", + "metadata": {}, + "source": "## Summary\n\n| Tool | Purpose |\n|------|---------|\n| `measure_peaks()` | Empirically probe peak FLOPs/s and streaming bandwidth |\n| `HardwarePeaks` | Dataclass holding device peaks and ridge point |\n| `RooflineAnalyzer` | Per-layer roofline profiler |\n| `RooflineAnalyzer.profile()` | Measure FLOPs, bytes moved, and time per layer |\n| `RooflineAnalyzer.summary()` | Print a table of the slowest layers with their roofline metrics |\n| `RooflineAnalyzer.plot()` | Plotly figure with roof ceiling and per-layer markers |\n| `RooflinePoint` | Dataclass for a single layer's measurement |\n| `clear_peaks_cache()` | Reset the `measure_peaks()` cache |" + }, + { + "cell_type": "markdown", + "id": "see_also", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## See Also\n", + "\n", + "- [Roofline API](../analysis/roofline.html) - Full reference\n", + "- [Profiling Tutorial](profiling.html) - Per-layer speed/memory/size/compute profiling\n", + "- [Compute metrics](../metrics/compute.html) - Underlying FLOPs counting" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "python3", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}