Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .github/workflows/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -276,3 +276,7 @@ python utils/summarize.py <results_directory>
```

Outputs GitHub-flavored markdown tables with metrics including TTFT, TPOT, interactivity, E2EL, and throughput per GPU for both single-node and multi-node results.

For multi-node/disaggregated results:
- `TPUT per GPU`, `Output TPUT per GPU`, and `Input TPUT per GPU` are cluster averages (divided by total GPUs in the run).
- `Output TPUT per Decode GPU` and `Input TPUT per Prefill GPU` are also emitted for role-specific analysis.
34 changes: 28 additions & 6 deletions utils/process_result.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,14 @@
from pathlib import Path


def safe_division(numerator, denominator, metric_name):
    """Return ``numerator / denominator`` after validating the denominator.

    Args:
        numerator: Dividend value.
        denominator: Divisor; must be strictly positive.
        metric_name: Metric label embedded in the error so CI logs point
            straight at the offending computation.

    Returns:
        The quotient ``numerator / denominator``.

    Raises:
        ValueError: If ``denominator`` is zero or negative.
    """
    invalid_denominator = denominator <= 0
    if invalid_denominator:
        message = f"{metric_name} requires a positive denominator, got {denominator}."
        raise ValueError(message)
    return numerator / denominator


def get_required_env_vars(required_vars):
"""Load and validate required environment variables."""
env_values = {}
Expand Down Expand Up @@ -57,6 +65,9 @@ def get_required_env_vars(required_vars):
}

is_multinode = os.environ.get('IS_MULTINODE', 'false').lower() == 'true'
total_token_throughput = float(bmk_result['total_token_throughput'])
output_throughput = float(bmk_result['output_throughput'])
input_throughput = total_token_throughput - output_throughput

if is_multinode:
# TODO: Eventually will have to have a separate condition in here for multinode disagg and
Expand All @@ -74,6 +85,12 @@ def get_required_env_vars(required_vars):
decode_tp = int(multinode_env['DECODE_TP'])
decode_ep = int(multinode_env['DECODE_EP'])
decode_dp_attn = multinode_env['DECODE_DP_ATTN']
total_gpus = prefill_gpus + decode_gpus

output_tput_per_decode_gpu = safe_division(
output_throughput, decode_gpus, 'output_tput_per_decode_gpu')
input_tput_per_prefill_gpu = safe_division(
input_throughput, prefill_gpus, 'input_tput_per_prefill_gpu')

multi_node_data = {
'is_multinode': True,
Expand All @@ -87,9 +104,14 @@ def get_required_env_vars(required_vars):
'decode_num_workers': decode_num_workers,
'num_prefill_gpu': prefill_gpus,
'num_decode_gpu': decode_gpus,
'tput_per_gpu': float(bmk_result['total_token_throughput']) / (prefill_gpus + decode_gpus),
'output_tput_per_gpu': float(bmk_result['output_throughput']) / decode_gpus,
'input_tput_per_gpu': (float(bmk_result['total_token_throughput']) - float(bmk_result['output_throughput'])) / prefill_gpus,
# For disaggregated runs, keep input/output throughput per GPU
# comparable to aggregated setups by averaging over the whole cluster.
'tput_per_gpu': safe_division(total_token_throughput, total_gpus, 'tput_per_gpu'),
'output_tput_per_gpu': safe_division(output_throughput, total_gpus, 'output_tput_per_gpu'),
'input_tput_per_gpu': safe_division(input_throughput, total_gpus, 'input_tput_per_gpu'),
# Keep role-specific metrics for deeper disaggregated analysis.
'output_tput_per_decode_gpu': output_tput_per_decode_gpu,
'input_tput_per_prefill_gpu': input_tput_per_prefill_gpu,
}

data = data | multi_node_data
Expand All @@ -107,9 +129,9 @@ def get_required_env_vars(required_vars):
'tp': tp_size,
'ep': ep_size,
'dp_attention': dp_attention,
'tput_per_gpu': float(bmk_result['total_token_throughput']) / tp_size,
'output_tput_per_gpu': float(bmk_result['output_throughput']) / tp_size,
'input_tput_per_gpu': (float(bmk_result['total_token_throughput']) - float(bmk_result['output_throughput'])) / tp_size,
'tput_per_gpu': safe_division(total_token_throughput, tp_size, 'tput_per_gpu'),
'output_tput_per_gpu': safe_division(output_throughput, tp_size, 'output_tput_per_gpu'),
'input_tput_per_gpu': safe_division(input_throughput, tp_size, 'input_tput_per_gpu'),
}

data = data | single_node_data
Expand Down
61 changes: 52 additions & 9 deletions utils/summarize.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@
TPUT_PER_GPU = "TPUT per GPU"
OUTPUT_TPUT_PER_GPU = "Output TPUT per GPU"
INPUT_TPUT_PER_GPU = "Input TPUT per GPU"
OUTPUT_TPUT_PER_GPU_CLUSTER = "Output TPUT per GPU (cluster avg)"
INPUT_TPUT_PER_GPU_CLUSTER = "Input TPUT per GPU (cluster avg)"
OUTPUT_TPUT_PER_DECODE_GPU = "Output TPUT per Decode GPU"
INPUT_TPUT_PER_PREFILL_GPU = "Input TPUT per Prefill GPU"
PREFILL_TP = "Prefill TP"
PREFILL_EP = "Prefill EP"
PREFILL_DP_ATTN = "Prefill DP Attn"
Expand Down Expand Up @@ -52,6 +56,40 @@ def load_json(path: Path) -> Optional[Dict[str, Any]]:
return None


def get_multinode_tput_metrics(result: Dict[str, Any]) -> tuple[float, float, float, float]:
    """Return normalized throughput metrics for multinode summaries.

    New results include both:
    - cluster-averaged IO throughput (`output_tput_per_gpu` / `input_tput_per_gpu`)
    - role-scoped IO throughput (`output_tput_per_decode_gpu` / `input_tput_per_prefill_gpu`)

    Older results only include role-scoped IO throughput in
    `output_tput_per_gpu` and `input_tput_per_gpu`. In that case we derive
    comparable cluster averages using prefill/decode GPU counts.

    Returns:
        A 4-tuple of floats: (cluster-avg output TPUT per GPU,
        cluster-avg input TPUT per GPU, output TPUT per decode GPU,
        input TPUT per prefill GPU).
    """
    cluster_output = float(result['output_tput_per_gpu'])
    cluster_input = float(result['input_tput_per_gpu'])
    decode_scoped = result.get('output_tput_per_decode_gpu')
    prefill_scoped = result.get('input_tput_per_prefill_gpu')

    # Legacy schema: role-scoped numbers were stored under the *_per_gpu keys.
    is_legacy = decode_scoped is None or prefill_scoped is None
    if is_legacy:
        decode_scoped = cluster_output
        prefill_scoped = cluster_input
        n_prefill = int(result.get('num_prefill_gpu', 0))
        n_decode = int(result.get('num_decode_gpu', 0))
        cluster_size = n_prefill + n_decode
        # Only rescale when GPU counts are usable; otherwise pass through.
        if cluster_size > 0:
            cluster_output = decode_scoped * (n_decode / cluster_size)
            cluster_input = prefill_scoped * (n_prefill / cluster_size)

    return cluster_output, cluster_input, float(decode_scoped), float(prefill_scoped)


def main():
if len(sys.argv) < 2:
print("Usage: python summarize.py <results_dir>")
Expand Down Expand Up @@ -114,11 +152,16 @@ def main():
MODEL, SERVED_MODEL, HARDWARE, FRAMEWORK, PRECISION, ISL, OSL,
PREFILL_TP, PREFILL_EP, PREFILL_DP_ATTN, PREFILL_WORKERS, PREFILL_GPUS,
DECODE_TP, DECODE_EP, DECODE_DP_ATTN, DECODE_WORKERS, DECODE_GPUS,
CONC, TTFT, TPOT, INTERACTIVITY, E2EL, TPUT_PER_GPU, OUTPUT_TPUT_PER_GPU, INPUT_TPUT_PER_GPU
CONC, TTFT, TPOT, INTERACTIVITY, E2EL, TPUT_PER_GPU,
OUTPUT_TPUT_PER_GPU_CLUSTER, INPUT_TPUT_PER_GPU_CLUSTER,
OUTPUT_TPUT_PER_DECODE_GPU, INPUT_TPUT_PER_PREFILL_GPU
]

multinode_rows = [
[
multinode_rows = []
for r in multinode_results:
output_tput_per_gpu, input_tput_per_gpu, output_tput_per_decode_gpu, input_tput_per_prefill_gpu = get_multinode_tput_metrics(
r)
multinode_rows.append([
r['infmax_model_prefix'],
r['model'],
r['hw'].upper(),
Expand All @@ -142,16 +185,16 @@ def main():
f"{r['median_intvty']:.4f}",
f"{r['median_e2el']:.4f}",
f"{r['tput_per_gpu']:.4f}",
f"{r['output_tput_per_gpu']:.4f}",
f"{r['input_tput_per_gpu']:.4f}",
]
for r in multinode_results
]
f"{output_tput_per_gpu:.4f}",
f"{input_tput_per_gpu:.4f}",
f"{output_tput_per_decode_gpu:.4f}",
f"{input_tput_per_prefill_gpu:.4f}",
])

print("## Multi-Node Results\n")
print("Only [InferenceX](https://github.com/SemiAnalysisAI/InferenceX) repo contains the Official InferenceX™ result, all other forks & repos are Unofficial. The benchmark setup & quality of machines/clouds in unofficial repos may be differ leading to subpar benchmarking. Unofficial must be explicitly labelled as Unofficial. Forks may not remove this disclaimer.\n")
print(tabulate(multinode_rows, headers=multinode_headers, tablefmt="github"))


if __name__ == "__main__":
main()
main()
34 changes: 29 additions & 5 deletions utils/test_process_result.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ def base_env_vars():
"OSL": "1024",
"DISAGG": "false",
"MODEL_PREFIX": "dsr1",
"IMAGE": "lmsysorg/sglang:test",
}


Expand Down Expand Up @@ -235,8 +236,10 @@ def test_multinode_processing(self, tmp_path, sample_benchmark_result, multinode
# Verify throughput calculations
total_gpus = 20 + 8 # prefill + decode
assert output_data["tput_per_gpu"] == pytest.approx(15000.5 / total_gpus)
assert output_data["output_tput_per_gpu"] == pytest.approx(12000.0 / 8) # decode gpus
assert output_data["input_tput_per_gpu"] == pytest.approx((15000.5 - 12000.0) / 20) # prefill gpus
assert output_data["output_tput_per_gpu"] == pytest.approx(12000.0 / total_gpus)
assert output_data["input_tput_per_gpu"] == pytest.approx((15000.5 - 12000.0) / total_gpus)
assert output_data["output_tput_per_decode_gpu"] == pytest.approx(12000.0 / 8)
assert output_data["input_tput_per_prefill_gpu"] == pytest.approx((15000.5 - 12000.0) / 20)

def test_missing_base_env_vars(self, tmp_path, sample_benchmark_result):
"""Test that missing base env vars causes failure."""
Expand Down Expand Up @@ -367,7 +370,7 @@ def test_throughput_per_gpu_multinode(self, tmp_path, multinode_env_vars):
"model_id": "test-model",
"max_concurrency": 64,
"total_token_throughput": 28000.0, # Will be divided by total GPUs
"output_throughput": 16000.0, # Will be divided by decode GPUs
"output_throughput": 16000.0, # Cluster-average field uses total GPUs
}

env = multinode_env_vars.copy()
Expand All @@ -379,8 +382,29 @@ def test_throughput_per_gpu_multinode(self, tmp_path, multinode_env_vars):

output_data = json.loads(result.stdout)
assert output_data["tput_per_gpu"] == pytest.approx(1000.0) # 28000 / 28
assert output_data["output_tput_per_gpu"] == pytest.approx(2000.0) # 16000 / 8
assert output_data["input_tput_per_gpu"] == pytest.approx(600.0) # (28000 - 16000) / 20
assert output_data["output_tput_per_gpu"] == pytest.approx(571.428571, rel=1e-6) # 16000 / 28
assert output_data["input_tput_per_gpu"] == pytest.approx(428.571429, rel=1e-6) # (28000 - 16000) / 28
assert output_data["output_tput_per_decode_gpu"] == pytest.approx(2000.0) # 16000 / 8
assert output_data["input_tput_per_prefill_gpu"] == pytest.approx(600.0) # (28000 - 16000) / 20
assert output_data["tput_per_gpu"] == pytest.approx(
output_data["output_tput_per_gpu"] + output_data["input_tput_per_gpu"], rel=1e-6)

def test_multinode_zero_decode_gpus_fails(self, tmp_path, multinode_env_vars):
    """Test multinode validation for zero decode GPU denominator."""
    # Minimal benchmark payload; the throughput values are arbitrary but
    # valid — only the GPU count below is made invalid.
    benchmark_result = {
        "model_id": "test-model",
        "max_concurrency": 64,
        "total_token_throughput": 28000.0,
        "output_throughput": 16000.0,
    }

    # Copy the shared fixture env so other tests are unaffected, then force
    # the zero denominator under test.
    env = multinode_env_vars.copy()
    env["DECODE_GPUS"] = "0"

    result = run_script(tmp_path, env, benchmark_result)

    # The script should abort (non-zero exit) with safe_division's
    # metric-named error on stderr rather than emitting bogus metrics.
    assert result.returncode != 0
    assert "output_tput_per_decode_gpu requires a positive denominator" in result.stderr


# =============================================================================
Expand Down
71 changes: 71 additions & 0 deletions utils/test_summarize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
"""Focused tests for summarize.py multinode throughput normalization."""

import importlib
import sys
import types

import pytest


# summarize.py depends on tabulate at import time. Stub it for unit tests.
# Only stub when tabulate is genuinely absent so a real install still wins.
if "tabulate" not in sys.modules:
    tabulate_stub = types.ModuleType("tabulate")
    # These tests never render tables, so an empty-string formatter suffices.
    tabulate_stub.tabulate = lambda *args, **kwargs: ""
    sys.modules["tabulate"] = tabulate_stub

summarize = importlib.import_module("summarize")


def test_get_multinode_tput_metrics_new_schema_passthrough():
    """New schema should preserve cluster + role-scoped values as-is."""
    sample = {
        "output_tput_per_gpu": 1911.56,
        "input_tput_per_gpu": 1910.71,
        "output_tput_per_decode_gpu": 2867.34,
        "input_tput_per_prefill_gpu": 5732.13,
        "num_prefill_gpu": 24,
        "num_decode_gpu": 48,
    }

    metrics = summarize.get_multinode_tput_metrics(sample)

    # Order: cluster output, cluster input, per-decode, per-prefill.
    expected = (1911.56, 1910.71, 2867.34, 5732.13)
    for actual, want in zip(metrics, expected):
        assert actual == pytest.approx(want)


def test_get_multinode_tput_metrics_legacy_schema_normalized_to_cluster_avg():
    """Legacy role-scoped fields should be normalized to cluster averages."""
    legacy = {
        # Legacy meaning: per-decode GPU and per-prefill GPU
        "output_tput_per_gpu": 2867.34,
        "input_tput_per_gpu": 5732.13,
        "num_prefill_gpu": 24,
        "num_decode_gpu": 48,
    }

    metrics = summarize.get_multinode_tput_metrics(legacy)

    cluster_size = 24 + 48
    # Cluster averages are the role-scoped values weighted by role share;
    # role-scoped values themselves pass through unchanged.
    expected = (
        2867.34 * (48 / cluster_size),
        5732.13 * (24 / cluster_size),
        2867.34,
        5732.13,
    )
    for actual, want in zip(metrics, expected):
        assert actual == pytest.approx(want)


def test_get_multinode_tput_metrics_legacy_schema_zero_gpu_count():
    """Legacy data with zero GPU counts should not divide by zero."""
    degenerate = {
        "output_tput_per_gpu": 1000.0,
        "input_tput_per_gpu": 500.0,
        "num_prefill_gpu": 0,
        "num_decode_gpu": 0,
    }

    metrics = summarize.get_multinode_tput_metrics(degenerate)

    # With no usable GPU counts, all four metrics fall back to the raw
    # legacy values instead of raising ZeroDivisionError.
    expected = (1000.0, 500.0, 1000.0, 500.0)
    for actual, want in zip(metrics, expected):
        assert actual == pytest.approx(want)