Skip to content

Commit a9a9e80

Browse files
committed
feat: add helpful error/investigation/fix for api response errors
Signed-off-by: lkomali <[email protected]>
1 parent 3ac7729 commit a9a9e80

File tree

3 files changed

+146
-0
lines changed

3 files changed

+146
-0
lines changed

src/aiperf/common/enums/data_exporter_enums.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66

77
class ConsoleExporterType(CaseInsensitiveStrEnum):
8+
API_ERRORS = "api_errors"
89
ERRORS = "errors"
910
EXPERIMENTAL_METRICS = "experimental_metrics"
1011
INTERNAL_METRICS = "internal_metrics"

src/aiperf/exporters/__init__.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,11 @@
88
## ⚠️ This file is auto-generated by mkinit ⚠️ ##
99
## ⚠️ Do not edit below this line ⚠️ ##
1010
########################################################################
11+
from aiperf.exporters.console_api_error_insight_exporter import (
12+
ConsoleApiErrorInsightExporter,
13+
ErrorInsight,
14+
MaxCompletionTokensDetector,
15+
)
1116
from aiperf.exporters.console_error_exporter import (
1217
ConsoleErrorExporter,
1318
)
@@ -55,15 +60,18 @@
5560
)
5661

5762
__all__ = [
63+
"ConsoleApiErrorInsightExporter",
5864
"ConsoleErrorExporter",
5965
"ConsoleExperimentalMetricsExporter",
6066
"ConsoleInternalMetricsExporter",
6167
"ConsoleMetricsExporter",
6268
"ConsoleUsageDiscrepancyExporter",
69+
"ErrorInsight",
6370
"ExporterConfig",
6471
"ExporterManager",
6572
"FileExportInfo",
6673
"GPUTelemetryConsoleExporter",
74+
"MaxCompletionTokensDetector",
6775
"MetricsBaseExporter",
6876
"MetricsCsvExporter",
6977
"MetricsJsonExporter",
Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
5+
import contextlib
import json
from dataclasses import dataclass

from rich.console import Console
from rich.panel import Panel

from aiperf.common.decorators import implements_protocol
from aiperf.common.enums import ConsoleExporterType
from aiperf.common.factories import ConsoleExporterFactory
from aiperf.common.mixins import AIPerfLoggerMixin
from aiperf.common.protocols import ConsoleExporterProtocol
from aiperf.exporters.exporter_config import ExporterConfig
17+
18+
19+
@dataclass
class ErrorInsight:
    """Model to describe a detected API error insight.

    Each field maps to one section of the console diagnostic panel
    rendered by ``ConsoleApiErrorInsightExporter``.
    """

    # Short headline used as the panel title.
    title: str
    # One-paragraph description of what went wrong.
    problem: str
    # Likely root causes, rendered as a bulleted list.
    causes: list[str]
    # Ordered steps the user can take to confirm the diagnosis.
    investigation: list[str]
    # Concrete remediations, rendered as a bulleted list.
    fixes: list[str]
35+
36+
37+
class MaxCompletionTokensDetector:
    """Detects backends that reject the ``max_completion_tokens`` field.

    Matches the pydantic-style ``extra_forbidden`` validation error that
    TRT-LLM-based endpoints return when the request carries this field.
    """

    @staticmethod
    def detect(error_summary) -> "ErrorInsight | None":
        """Scan the error summary for the max_completion_tokens rejection.

        Args:
            error_summary: list of error records; each record is expected to
                carry an ``error_details`` attribute with a ``message`` string
                (assumes aiperf's error-summary schema — TODO confirm).

        Returns:
            An :class:`ErrorInsight` describing the problem and its fixes,
            or ``None`` when the pattern is not present.
        """
        if not error_summary or not isinstance(error_summary, list):
            return None

        for item in error_summary:
            err = getattr(item, "error_details", None)
            if err is None:
                # Record has nothing to inspect; skip it quietly.
                continue

            raw_msg = err.message or ""

            # Backends often wrap the real validation error in a JSON
            # envelope; fall back to the raw text when parsing fails.
            parsed = None
            with contextlib.suppress(Exception):
                parsed = json.loads(raw_msg)

            backend_msg = parsed.get("message") if isinstance(parsed, dict) else None
            error_blob = str(backend_msg or raw_msg)

            # Require all three substrings to avoid false positives on
            # unrelated "extra input" validation failures.
            if (
                "extra_forbidden" in error_blob
                and "max_completion_tokens" in error_blob
                and "Extra inputs are not permitted" in error_blob
            ):
                return ErrorInsight(
                    title="Unsupported Parameter: max_completion_tokens",
                    problem=(
                        "The backend rejected 'max_completion_tokens'. "
                        "This backend only supports 'max_tokens'."
                    ),
                    causes=[
                        "AIPerf generated 'max_completion_tokens' due to --output-tokens-mean.",
                        "TRT-LLM rejects this field.",
                    ],
                    investigation=[
                        "Inspect request payloads in profile_export.jsonl.",
                        "Check TRT-LLM supported parameters.",
                    ],
                    fixes=[
                        "Remove --output-tokens-mean.",
                        'Or use --extra-inputs "max_tokens:<value>".',
                    ],
                )

        return None
86+
87+
88+
@implements_protocol(ConsoleExporterProtocol)
@ConsoleExporterFactory.register(ConsoleExporterType.API_ERRORS)
class ConsoleApiErrorInsightExporter(AIPerfLoggerMixin):
    """Displays helpful diagnostic panels for known API error patterns.

    Each detector in :attr:`DETECTORS` inspects the run's error summary;
    every match is rendered as a rich Panel containing the problem, the
    likely causes, investigation steps, and suggested fixes.
    """

    # Known error-pattern detectors, consulted in order on every export.
    DETECTORS = [
        MaxCompletionTokensDetector(),
    ]

    def __init__(self, exporter_config: ExporterConfig, **kwargs):
        super().__init__(**kwargs)
        # Profile results whose error_summary the detectors will scan.
        self._results = exporter_config.results

    async def export(self, console: Console) -> None:
        """Render one diagnostic panel per detector that matches the results."""
        error_summary = getattr(self._results, "error_summary", None)

        for detector in self.DETECTORS:
            insight = detector.detect(error_summary)
            if insight:
                panel = Panel(
                    self._format_text(insight),
                    title=insight.title,
                    border_style="bold yellow",
                    title_align="center",
                    padding=(0, 2),
                    expand=False,
                )
                console.print()
                console.print(panel)
                # Flush so the panel is visible even if the process exits soon after.
                console.file.flush()

    def _format_text(self, insight: ErrorInsight) -> str:
        """Build the rich-markup panel body for *insight*.

        Bugfix vs. the original: investigation steps were all joined with
        a literal "1." prefix, so every step was numbered 1; they are now
        numbered sequentially via enumerate.
        """
        causes = "\n".join(f" • {cause}" for cause in insight.causes)
        steps = "\n".join(
            f" {num}. {step}"
            for num, step in enumerate(insight.investigation, start=1)
        )
        fixes = "\n".join(f" • {fix}" for fix in insight.fixes)
        return (
            f"[bold]{insight.problem}[/bold]\n"
            "\n"
            "[bold]Possible Causes:[/bold]\n"
            f"{causes}\n"
            "\n"
            "[bold]Investigation Steps:[/bold]\n"
            f"{steps}\n"
            "\n"
            "[bold]Suggested Fixes:[/bold]\n"
            f"{fixes}"
        )

0 commit comments

Comments
 (0)