Skip to content

Commit 3ad9cc2

Browse files
committed
jax-toolbox-triage: minor usability/doc improvements
- Print the stdout/stderr of the first execution of the test case, which is supposed to fail, at INFO level along with a message encouraging the user to check that it is the correct failure. - Print the path to the DEBUG log file at INFO level and, therefore, to the console. - Expand the documentation.
1 parent bde47a4 commit 3ad9cc2

File tree

4 files changed

+72
-11
lines changed

4 files changed

+72
-11
lines changed

.github/triage/jax_toolbox_triage/logic.py

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,21 @@
1+
from dataclasses import dataclass
12
import datetime
23
import functools
34
import logging
45
import typing
56

67

8+
@dataclass
9+
class TestResult:
10+
"""
11+
Hold the result/stdout/stderr of a test execution
12+
"""
13+
14+
result: bool
15+
stdout: typing.Optional[str] = None
16+
stderr: typing.Optional[str] = None
17+
18+
719
def as_datetime(date: datetime.date) -> datetime.datetime:
820
return datetime.datetime.combine(date, datetime.time())
921

@@ -59,7 +71,7 @@ def adjust_date(
5971
def container_search(
6072
*,
6173
container_exists: typing.Callable[[datetime.date], bool],
62-
container_passes: typing.Callable[[datetime.date], bool],
74+
container_passes: typing.Callable[[datetime.date], TestResult],
6375
start_date: typing.Optional[datetime.date],
6476
end_date: typing.Optional[datetime.date],
6577
logger: logging.Logger,
@@ -88,8 +100,17 @@ def container_search(
88100
logger.info(f"Skipping check for end-of-range failure in {end_date}")
89101
else:
90102
logger.info(f"Checking end-of-range failure in {end_date}")
91-
if container_passes(end_date):
103+
test_end_date = container_passes(end_date)
104+
logger.info(f"stdout: {test_end_date.stdout}")
105+
logger.info(f"stderr: {test_end_date.stderr}")
106+
if test_end_date.result:
92107
raise Exception(f"Could not reproduce failure in {end_date}")
108+
logger.info(
109+
"IMPORTANT: you should check that the test output above shows the "
110+
f"*expected* failure of your test case in the {end_date} container. It is "
111+
"very easy to accidentally provide a test case that fails for the wrong "
112+
"reason, which will not triage the correct issue!"
113+
)
93114

94115
# Start the coarse, container-level, search for a starting point to the bisection range
95116
earliest_failure = end_date
@@ -127,7 +148,7 @@ def container_search(
127148
logger.info(f"Skipping check that the test passes on start_date={start_date}")
128149
else:
129150
# While condition prints an info message
130-
while not container_passes(search_date):
151+
while not container_passes(search_date).result:
131152
# Test failed on `search_date`, go further into the past
132153
earliest_failure = search_date
133154
new_search_date = adjust(
@@ -155,7 +176,7 @@ def container_search(
155176
if range_mid is None:
156177
# It wasn't possible to refine further.
157178
break
158-
result = container_passes(range_mid)
179+
result = container_passes(range_mid).result
159180
if result:
160181
range_start = range_mid
161182
else:

.github/triage/jax_toolbox_triage/main.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
from .args import parse_args
1010
from .docker import DockerContainer
11-
from .logic import commit_search, container_search
11+
from .logic import commit_search, container_search, TestResult
1212
from .utils import (
1313
container_exists as container_exists_base,
1414
container_url as container_url_base,
@@ -21,6 +21,10 @@ def main():
2121
args = parse_args()
2222
bazel_cache_mounts = prepare_bazel_cache_mounts(args.bazel_cache)
2323
logger = get_logger(args.output_prefix)
24+
logger.info(
25+
"Verbose output, including stdout/err of triage commands, will be written to "
26+
f'{(args.output_prefix / "debug.log").resolve()}'
27+
)
2428
container_url = functools.partial(container_url_base, container=args.container)
2529
container_exists = functools.partial(
2630
container_exists_base, container=args.container, logger=logger
@@ -75,7 +79,7 @@ def get_commit(container: DockerContainer, repo: str) -> typing.Tuple[str, str]:
7579
f"Could not extract commit of {repo} from {args.container} container {container}"
7680
)
7781

78-
def check_container(date: datetime.date) -> bool:
82+
def check_container(date: datetime.date) -> TestResult:
7983
"""
8084
See if the test passes in the given container.
8185
"""
@@ -100,7 +104,7 @@ def check_container(date: datetime.date) -> bool:
100104
"xla": xla_commit,
101105
},
102106
)
103-
return test_pass
107+
return TestResult(result=test_pass, stdout=result.stdout, stderr=result.stderr)
104108

105109
# Search through the published containers, narrowing down to a pair of dates with
106110
# the property that the test passed on `range_start` and fails on `range_end`.

.github/triage/tests/test_triage_logic.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import logging
44
import pytest
55
import random
6-
from jax_toolbox_triage.logic import commit_search, container_search
6+
from jax_toolbox_triage.logic import commit_search, container_search, TestResult
77

88

99
def wrap(b):
@@ -306,7 +306,7 @@ def test_container_search_limits(
306306
with pytest.raises(Exception, match=match_string):
307307
container_search(
308308
container_exists=lambda dt: dt in dates_that_exist,
309-
container_passes=lambda dt: False,
309+
container_passes=lambda dt: TestResult(result=False),
310310
start_date=start_date,
311311
end_date=end_date,
312312
logger=logger,
@@ -353,7 +353,7 @@ def test_container_search_checks(
353353
with pytest.raises(Exception, match=match_string):
354354
container_search(
355355
container_exists=lambda dt: True,
356-
container_passes=lambda dt: dt in dates_that_pass,
356+
container_passes=lambda dt: TestResult(result=dt in dates_that_pass),
357357
start_date=start_date,
358358
end_date=end_date,
359359
logger=logger,
@@ -374,7 +374,7 @@ def test_container_search(logger, start_date, days_of_failure, threshold_days):
374374
assert start_date is None or threshold_date >= start_date
375375
good_date, bad_date = container_search(
376376
container_exists=lambda dt: True,
377-
container_passes=lambda dt: dt < threshold_date,
377+
container_passes=lambda dt: TestResult(result=dt < threshold_date),
378378
start_date=start_date,
379379
end_date=end_date,
380380
logger=logger,

docs/triage-tool.md

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,10 +27,20 @@ The triage tool can be installed using `pip`:
2727
pip install git+https://github.com/NVIDIA/JAX-Toolbox.git#subdirectory=.github/triage
2828
```
2929
or directly from a checkout of the JAX-Toolbox repository.
30+
3031
Because the tool needs to orchestrate running commands in multiple containers, it is
3132
most convenient to install it in a virtual environment on the host system, rather than
3233
attempting to install it inside a container.
3334

35+
The recommended installation method is to install `virtualenv` natively on the host
36+
system, and then use that to create an isolated environment on the host system for the
37+
triage tool, *i.e.*:
38+
```bash
39+
virtualenv triage-venv
40+
./triage-venv/bin/pip install git+https://github.com/NVIDIA/JAX-Toolbox.git#subdirectory=.github/triage
41+
./triage-venv/bin/jax-toolbox-triage ...
42+
```
43+
3444
The tool should be invoked on a machine with `docker` available and whatever GPUs are
3545
needed to execute the test case.
3646

@@ -47,6 +57,10 @@ sure not to add excessive quotation marks (*i.e.* run
4757
`jax-toolbox-triage --container=jax test-jax.sh foo` not
4858
`jax-toolbox-triage --container=jax "test-jax.sh foo"`), and you should aim to make it
4959
as fast and targeted as possible.
60+
61+
If you want to run multiple commands, you might want to use something like
62+
`jax-toolbox-triage --container=jax sh -c "command1 && command2"`.
63+
5064
The expectation is that the test case will be executed successfully several times as
5165
part of the triage, so you may want to tune some parameters to reduce the execution
5266
time in the successful case.
@@ -55,6 +69,28 @@ probably reduce `--steps` to optimise execution time in the successful case.
5569

5670
A JSON status file and both info-level and debug-level logfiles are written to the
5771
directory given by `--output-prefix`.
72+
Info-level output is also written to the console, and includes the path to the debug
73+
log file.
74+
75+
You should pay attention to the first execution of your test case, to make sure it is
76+
failing for the correct reason. For example:
77+
```console
78+
$ jax-toolbox-triage --container jax command-you-forgot-to-install
79+
```
80+
will not immediately abort, because the tool is **expecting** the command to fail in
81+
the early stages of the triage:
82+
```
83+
[INFO] 2024-10-29 01:49:01 Verbose output, including stdout/err of triage commands, will be written to /home/olupton/JAX-Toolbox/triage-2024-10-29-01-49-01/debug.log
84+
[INFO] 2024-10-29 01:49:05 Checking end-of-range failure in 2024-10-27
85+
[INFO] 2024-10-29 01:49:05 Ran test case in 2024-10-27 in 0.4s, pass=False
86+
[INFO] 2024-10-29 01:49:05 stdout: OCI runtime exec failed: exec failed: unable to start container process: exec: "command-you-forgot-to-install": executable file not found in $PATH: unknown
87+
88+
[INFO] 2024-10-29 01:49:05 stderr:
89+
[INFO] 2024-10-29 01:49:05 IMPORTANT: you should check that the test output above shows the *expected* failure of your test case in the 2024-10-27 container. It is very easy to accidentally provide a test case that fails for the wrong reason, which will not triage the correct issue!
90+
[INFO] 2024-10-29 01:49:06 Starting coarse search with 2024-10-26 based on end_date=2024-10-27
91+
[INFO] 2024-10-29 01:49:06 Ran test case in 2024-10-26 in 0.4s, pass=False
92+
```
93+
where, notably, the triage search is continuing.
5894

5995
### Optimising container-level search performance
6096

0 commit comments

Comments
 (0)