NVIDIA
diff --git a/‎.github/triage/jax_toolbox_triage/args.py‎
Lines changed: 10 additions & 9 deletions b/‎.github/triage/jax_toolbox_triage/args.py‎
Lines changed: 10 additions & 9 deletions
diff --git a/‎.github/triage/jax_toolbox_triage/container.py‎
Lines changed: 59 additions & 0 deletions b/‎.github/triage/jax_toolbox_triage/container.py‎
Lines changed: 59 additions & 0 deletions
diff --git a/‎.github/triage/jax_toolbox_triage/docker.py‎
Lines changed: 3 additions & 15 deletions b/‎.github/triage/jax_toolbox_triage/docker.py‎
Lines changed: 3 additions & 15 deletions
diff --git a/‎.github/triage/jax_toolbox_triage/logic.py‎
Lines changed: 93 additions & 23 deletions b/‎.github/triage/jax_toolbox_triage/logic.py‎
Lines changed: 93 additions & 23 deletions
@@ -4,26 +4,28 @@
 import os
 import pathlib
 import tempfile
+import typing
 
 # Software we know may exist in the containers that we might be able to triage
 # We know how to recompile JAX/XLA, so it's OK that they include C++ code
 # TransformerEngine is intentionally excluded because build-te.sh is not plumbed yet.
 # Flax and MaxText are pure Python, so it's OK we don't have a way of compiling them,
 # but they are not always installed in containers we want to triage.
-compulsory_software = {"xla", "jax"}
-optional_software = {"flax", "maxtext"}
+# Note this is not a `set` for the sake of run-to-run determinism.
+compulsory_software = ["xla", "jax"]
+optional_software = ["flax", "maxtext"]
 
 
-def parse_commit_argument(s):
-    ret = {}
+def parse_commit_argument(s: str) -> typing.Dict[str, str]:
+    ret: typing.Dict[str, str] = {}
     for part in s.split(","):
         sw, commit = part.split(":", 1)
         assert sw not in ret, ret
         ret[sw] = commit
     return ret
 
 
-def parse_args(args=None):
+def parse_args(args=None) -> argparse.Namespace:
     parser = argparse.ArgumentParser(
         description="""
             Triage failures in JAX/XLA-related tests. The expectation is that the given
@@ -203,10 +205,9 @@ def parse_args(args=None):
             args.passing_container is not None or args.failing_container is not None
         ), "At least one of --passing-container and --failing-container must be passed."
         for prefix in ["passing", "failing"]:
-            assert (
-                getattr(args, f"{prefix}_container") is not None
-                or getattr(args, f"{prefix}_commits").keys() >= compulsory_software
-            ), (
+            assert getattr(args, f"{prefix}_container") is not None or getattr(
+                args, f"{prefix}_commits"
+            ).keys() >= set(compulsory_software), (
                 f"--{prefix}-commits must specify all of {compulsory_software} if "
                 f"--{prefix}-container is not specified"
             )
 
@@ -0,0 +1,59 @@
+from abc import ABC, abstractmethod
+import logging
+import subprocess
+import typing
+
+
+class Container(ABC):
+    def __init__(self, *, logger: logging.Logger):
+        self._logger = logger
+
+    @abstractmethod
+    def __enter__(self) -> "Container":
+        """
+        Launch the container instance
+        """
+        pass
+
+    @abstractmethod
+    def __exit__(self, *exc_info) -> None:
+        """
+        Shut down the container instance
+        """
+        pass
+
+    @abstractmethod
+    def __repr__(self) -> str:
+        pass
+
+    @abstractmethod
+    def exec(
+        self,
+        command: typing.List[str],
+        policy: typing.Literal["once", "once_per_container", "default"] = "default",
+        stderr: typing.Literal["interleaved", "separate"] = "interleaved",
+        workdir=None,
+    ) -> subprocess.CompletedProcess:
+        """
+        Run a command inside a persistent container.
+        """
+        pass
+
+    def check_exec(
+        self, cmd: typing.List[str], **kwargs
+    ) -> subprocess.CompletedProcess:
+        result = self.exec(cmd, **kwargs)
+        if result.returncode != 0:
+            self._logger.fatal(
+                f"{' '.join(cmd)} exited with return code {result.returncode}"
+            )
+            self._logger.fatal(result.stdout)
+            result.check_returncode()
+        return result
+
+    @abstractmethod
+    def exists(self) -> bool:
+        """
+        Check if the container exists.
+        """
+        pass
@@ -3,18 +3,19 @@
 import subprocess
 import typing
 
+from .container import Container
 from .utils import run_and_log
 
 
-class DockerContainer:
+class DockerContainer(Container):
     def __init__(
         self,
         url: str,
         *,
         logger: logging.Logger,
         mounts: typing.List[typing.Tuple[pathlib.Path, pathlib.Path]],
     ):
-        self._logger = logger
+        super().__init__(logger=logger)
         self._mount_args = []
         for src, dst in mounts:
             self._mount_args += ["-v", f"{src}:{dst}"]
@@ -74,19 +75,6 @@ def exec(
             stderr=stderr,
         )
 
-    def check_exec(
-        self, cmd: typing.List[str], **kwargs
-    ) -> subprocess.CompletedProcess:
-        result = self.exec(cmd, **kwargs)
-        if result.returncode != 0:
-            self._logger.fatal(
-                f"{' '.join(cmd)} exited with return code {result.returncode}"
-            )
-            self._logger.fatal(result.stdout)
-            self._logger.fatal(result.stderr)
-            result.check_returncode()
-        return result
-
     def exists(self) -> bool:
         """
         Check if the given container exists.
 
@@ -3,6 +3,7 @@
 import functools
 import itertools
 import logging
+import pathlib
 import typing
 
 from .utils import console_log_level
@@ -15,6 +16,7 @@ class TestResult:
     """
 
     __test__ = False  # stop pytest gathering this
+    host_output_directory: pathlib.Path
     result: bool
     stdouterr: str
 
@@ -80,7 +82,7 @@ def container_search(
     logger: logging.Logger,
     skip_precondition_checks: bool,
     threshold_days: int,
-):
+) -> typing.Tuple[datetime.date, datetime.date]:
     adjust = functools.partial(
         adjust_date, logger=logger, container_exists=container_exists
     )
@@ -197,36 +199,29 @@ def __call__(self, *, commits: typing.Dict[str, str]) -> TestResult:
         ...
 
 
-def _first(xs):
+T = typing.TypeVar("T")
+U = typing.TypeVar("U")
+FlatCommitDict = typing.Tuple[typing.Tuple[str, str], ...]
+
+
+def _first(xs: typing.Iterable[T]) -> T:
     return next(iter(xs))
 
 
-def _not_first(d):
+def _not_first(d: typing.Dict[T, U]) -> typing.Iterable[typing.Tuple[T, U]]:
     return itertools.islice(d.items(), 1, None)
 
 
-def commit_search(
+def _commit_search(
     *,
     commits: typing.OrderedDict[
         str, typing.Sequence[typing.Tuple[str, datetime.datetime]]
     ],
     build_and_test: BuildAndTest,
     logger: logging.Logger,
     skip_precondition_checks: bool,
-):
-    """
-    Bisect a failure back to a single commit.
-
-    Arguments:
-    commits: *ordered* dictionary of commit sequences for different software
-        packages, e.g. commits["jax"][0] is (hash, date) of the passing JAX
-        commit. The ordering of packages has implications for precisely how
-        the triage proceeds.
-    build_and_test: callable that tests if a given vector of commits passes
-    logger: instance to log output to
-    skip_precondition_checks: if True, some tests that should pass/fail by
-        construction are skipped
-    """
+    result_cache: typing.Dict[FlatCommitDict, TestResult],
+) -> typing.Tuple[typing.Dict[str, str], TestResult, typing.Optional[TestResult]]:
     assert all(len(commit_list) for commit_list in commits.values()), (
         "Not enough commits: need at least one commit for each package",
         commits,
@@ -236,6 +231,11 @@ def commit_search(
         commits,
     )
 
+    def _cache_key(
+        commits: typing.Dict[str, str],
+    ) -> FlatCommitDict:
+        return tuple(sorted(commits.items()))
+
     if skip_precondition_checks:
         logger.info("Skipping check that 'good' commits reproduce success")
     else:
@@ -245,6 +245,8 @@ def commit_search(
             package: commit_list[0][0] for package, commit_list in commits.items()
         }
         check_pass = build_and_test(commits=passing_commits)
+        assert _cache_key(passing_commits) not in result_cache
+        result_cache[_cache_key(passing_commits)] = check_pass
         if check_pass.result:
             logger.info("Verified test passes using 'good' commits")
         else:
@@ -264,6 +266,8 @@ def commit_search(
         # below is actionable without checking the debug logfile.
         with console_log_level(logger, logging.DEBUG):
             check_fail = build_and_test(commits=failing_commits)
+        assert _cache_key(failing_commits) not in result_cache
+        result_cache[_cache_key(failing_commits)] = check_fail
         if not check_fail.result:
             logger.info(
                 "Verified test failure using 'bad' commits. IMPORTANT: you should check "
@@ -303,9 +307,11 @@ def commit_search(
             bisect_commits[secondary] = commit
             log_msg += f", {len(commit_list)} remaining {secondary} commits"
         logger.info(log_msg)
-        bisect_result = build_and_test(commits=bisect_commits).result
+        bisect_result = build_and_test(commits=bisect_commits)
+        assert _cache_key(bisect_commits) not in result_cache
+        result_cache[_cache_key(bisect_commits)] = bisect_result
 
-        if bisect_result:
+        if bisect_result.result:
             # Test passed, continue searching in the second half
             for package, index in indices.items():
                 commits[package] = commits[package][index:]
@@ -336,7 +342,11 @@ def commit_search(
         f"Two {primary} commits remain, checking if {commits[primary][-1][0]} is the "
         "culprit"
     )
-    blame = build_and_test(commits=blame_commits)
+    # It's possible that this combination has already been tested at this point
+    blame = result_cache.get(_cache_key(blame_commits))
+    if blame is None:
+        blame = build_and_test(commits=blame_commits)
+        result_cache[_cache_key(blame_commits)] = blame
     if blame.result:
         # Test passed with {pX, sZ, tZ, ...} but was known to fail with
         # {pZ, sZ, tZ, ...}. Therefore pZ is the culprit commit.
@@ -349,18 +359,78 @@ def commit_search(
             f"{primary}_bad": bad_commit,
             f"{primary}_good": good_commit,
         }
+        first_known_bad = {primary: bad_commit}
         for secondary, secondary_commit in _not_first(blame_commits):
+            first_known_bad[secondary] = secondary_commit
             ret[f"{secondary}_ref"] = secondary_commit
-        return ret
+        # `blame` represents the last-known-good test result, first-known-bad was seen
+        # earlier, or possibly not at all e.g. if `skip_precondition_checks` is True
+        # and first-known-bad was the end of the search range.
+        first_known_bad_result = result_cache.get(_cache_key(first_known_bad))
+        if first_known_bad_result is None:
+            if skip_precondition_checks:
+                logger.info(
+                    "Did not find a cached result for the first-known-bad "
+                    f"configuration {first_known_bad}, this is probably due to "
+                    "--skip-precondition-checks having been passed."
+                )
+            else:
+                logger.error(
+                    "Did not find a cached result for the first-known-bad "
+                    f"configuration {first_known_bad}, this is unexpected!"
+                )
+        return ret, blame, first_known_bad_result
     else:
         # Test failed with both {pX, sZ, tZ, ...} and {pZ, sZ, tZ, ...}, so
         # we can fix the primary package to pX and recurse with the old
         # secondary package (s) as the new primary, and the old primary
         # package (p) moved to the end.
         commits[primary] = [commits.pop(primary)[0]]
-        return commit_search(
+        return _commit_search(
             build_and_test=build_and_test,
             commits=commits,
             logger=logger,
             skip_precondition_checks=True,
+            result_cache=result_cache,
         )
+
+
+def commit_search(
+    *,
+    commits: typing.OrderedDict[
+        str, typing.Sequence[typing.Tuple[str, datetime.datetime]]
+    ],
+    build_and_test: BuildAndTest,
+    logger: logging.Logger,
+    skip_precondition_checks: bool,
+) -> typing.Tuple[
+    typing.Dict[str, str],
+    TestResult,
+    typing.Optional[TestResult],
+]:
+    """
+    Bisect a failure back to a single commit.
+
+    Arguments:
+    commits: *ordered* dictionary of commit sequences for different software
+        packages, e.g. commits["jax"][0] is (hash, date) of the passing JAX
+        commit. The ordering of packages has implications for precisely how
+        the triage proceeds.
+    build_and_test: callable that tests if a given vector of commits passes
+    logger: instance to log output to
+    skip_precondition_checks: if True, some tests that should pass/fail by
+        construction are skipped
+
+    Returns a 3-tuple of (summary_dict, last_known_good, first_known_bad),
+    where the last element can be None if skip_precondition_checks=True. The
+    last two elements' .result fields will always be, respectively, True and
+    False, but the other fields can be used to obtain stdout+stderr and
+    output files from those test invocations.
+    """
+    return _commit_search(
+        commits=commits,
+        build_and_test=build_and_test,
+        logger=logger,
+        skip_precondition_checks=skip_precondition_checks,
+        result_cache={},
+    )