[RFC] Implement basic on disk caching

oulgen · oulgen · commit 7bd22d1362d5 · 2025-07-19T17:08:09.000-07:00
stack-info: PR: #336, branch: oulgen/stack/26
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
@@ -41,7 +41,7 @@ jobs:
 
       - name: Install PyTorch
         run: |
-          pip3 install torch --index-url https://download.pytorch.org/whl/cu128
+          pip3 install -U --pre torch --index-url https://download.pytorch.org/whl/nightly/cu128
 
       - name: Install lint dependencies
         run: ./lint.sh install
diff --git a/helion/_testing.py b/helion/_testing.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import collections
+import contextlib
 import importlib
 import inspect
 import operator
@@ -15,6 +16,7 @@
 import torch
 from triton.testing import do_bench
 
+from ._utils import counters
 from .runtime.config import Config
 from helion._compat import get_tensor_descriptor_fn_name
 
@@ -289,6 +291,20 @@ def tearDownClass(cls) -> None:
         super().tearDownClass()
         del cls._expected_journal
 
+    def setUp(self) -> None:
+        super().setUp()
+        self._test_stack = contextlib.ExitStack()
+
+        from torch._inductor.utils import fresh_cache
+
+        self._test_stack.enter_context(fresh_cache())
+
+        counters.clear()
+
+    def tearDown(self) -> None:
+        super().tearDown()
+        self._test_stack.close()
+
     def assertExpectedJournal(self, value: str) -> None:
         """
         Assert that the given value matches the expected output stored in <testfile>.expected.
diff --git a/helion/_utils.py b/helion/_utils.py
@@ -0,0 +1,7 @@
+from __future__ import annotations
+
+import collections
+
+counters: collections.defaultdict[str, collections.Counter[str]] = (
+    collections.defaultdict(collections.Counter)
+)
diff --git a/helion/autotuner/__init__.py b/helion/autotuner/__init__.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+from .cache import AutotuneCache as AutotuneCache
 from .config_fragment import BooleanFragment as BooleanFragment
 from .config_fragment import EnumFragment as EnumFragment
 from .config_fragment import IntegerFragment as IntegerFragment
diff --git a/helion/autotuner/cache.py b/helion/autotuner/cache.py
@@ -0,0 +1,121 @@
+from __future__ import annotations
+
+import dataclasses
+import functools
+import hashlib
+import inspect
+import logging
+import os
+from pathlib import Path
+import textwrap
+from typing import TYPE_CHECKING
+from typing import Sequence
+
+import torch
+
+from .._utils import counters
+from ..runtime.config import Config
+
+if TYPE_CHECKING:
+    from ..runtime.kernel import Kernel
+
+log: logging.Logger = logging.getLogger(__name__)
+
+"""
+TODO(oulgen)
+- Allow user defined cache keys that can be passed on @helion.kernel
+- Add import/export for set of configs
+"""
+
+
+@dataclasses.dataclass(frozen=True)
+class AutotuneCacheKey:
+    """
+    helion_key: Hash of source code of Helion
+    torch_key: Hash of source code of PyTorch
+    system_hash: Hash of system information,
+        including Triton, current device, cuda/rocm arch version
+    kernel_source_hash: Hash of source code of input Helion kernel
+    input_dtypes: dtypes of input tensors
+    input_shapes: shapes of input tensors
+    """
+
+    helion_key: str
+    torch_key: str
+    system_hash: str
+    kernel_source_hash: str
+    input_dtypes: list[tuple[int, torch.dtype]]
+    input_shapes: list[tuple[int, torch.Size]]
+
+    def stable_hash(self) -> str:
+        return hashlib.sha256(repr(self).encode("utf-8")).hexdigest()
+
+
+class AutotuneCache:
+    def __init__(self, kernel: Kernel, args: Sequence[object]) -> None:
+        self.key: AutotuneCacheKey = AutotuneCache._generate_key(kernel, args)
+
+    @staticmethod
+    def _generate_key(kernel: Kernel, args: Sequence[object]) -> AutotuneCacheKey:
+        from torch._inductor.codecache import CacheBase
+        from torch._inductor.codecache import torch_key
+
+        kernel_source = textwrap.dedent(inspect.getsource(kernel.fn))
+        kernel_source_hash = hashlib.sha256(kernel_source.encode("utf-8")).hexdigest()
+
+        input_dtypes = []
+        input_shapes = []
+
+        for idx, a in enumerate(args):
+            if isinstance(a, torch.Tensor):
+                input_dtypes.append((idx, a.dtype))
+                input_shapes.append((idx, a.shape))
+
+        return AutotuneCacheKey(
+            helion_key=helion_key(),
+            torch_key=torch_key().hex(),
+            system_hash=CacheBase.get_system()["hash"],
+            kernel_source_hash=kernel_source_hash,
+            input_dtypes=input_dtypes,
+            input_shapes=input_shapes,
+        )
+
+    def _get_cache_key(self) -> str:
+        return self.key.stable_hash()
+
+    def _get_local_cache_path(self) -> Path:
+        from torch._inductor.runtime.cache_dir_utils import (
+            cache_dir,  # pyright: ignore[reportPrivateImportUsage]
+        )
+
+        return Path(cache_dir()) / "helion" / f"{self._get_cache_key()}.best_config"
+
+    def get(self) -> Config | None:
+        path = self._get_local_cache_path()
+        try:
+            config = Config.load(path)
+            log.debug("Cache hit on config at %s", path)
+            counters["autotune"]["cache_hit"] += 1
+            return config
+        except Exception:
+            log.debug("Cache miss on config at %s", path)
+            counters["autotune"]["cache_miss"] += 1
+            return None
+
+    def put(self, config: Config) -> None:
+        path = self._get_local_cache_path()
+        config.save(path)
+        log.debug("Cache write of config at %s", path)
+        counters["autotune"]["cache_put"] += 1
+
+
+@functools.cache
+def helion_key() -> str:
+    from torch._inductor.codecache import build_code_hash
+
+    here = os.path.abspath(__file__)
+    helion_path = os.path.dirname(os.path.dirname(here))
+
+    combined_hash = hashlib.sha256()
+    build_code_hash([helion_path], "", combined_hash)
+    return combined_hash.hexdigest()
diff --git a/helion/runtime/config.py b/helion/runtime/config.py
@@ -118,6 +118,7 @@ def from_json(cls, json_str: str) -> Config:
 
     def save(self, path: str | Path) -> None:
         """Save the config to a JSON file."""
+        Path(path).parent.mkdir(parents=True, exist_ok=True)
         Path(path).write_text(self.to_json())
 
     @classmethod
diff --git a/helion/runtime/kernel.py b/helion/runtime/kernel.py
@@ -438,13 +438,21 @@ def autotune(
         else:
             self.settings.check_autotuning_disabled()
 
-            from ..autotuner import DifferentialEvolutionSearch
+            from ..autotuner import AutotuneCache
 
-            config = DifferentialEvolutionSearch(
-                self,
-                args,
-                **kwargs,  # pyright: ignore[reportArgumentType]
-            ).autotune()
+            cache = AutotuneCache(self.kernel, args)
+            config = cache.get()
+
+            if config is None:
+                from ..autotuner import DifferentialEvolutionSearch
+
+                config = DifferentialEvolutionSearch(
+                    self,
+                    args,
+                    **kwargs,  # pyright: ignore[reportArgumentType]
+                ).autotune()
+
+                cache.put(config)
         self.set_config(config)
         return config
 
diff --git a/test/test_cache.py b/test/test_cache.py
@@ -0,0 +1,59 @@
+from __future__ import annotations
+
+from pathlib import Path
+import unittest
+from unittest import mock
+
+import torch
+
+from helion._testing import DEVICE
+from helion._testing import TestCase
+from helion._testing import import_path
+from helion._utils import counters
+
+datadir = Path(__file__).parent / "data"
+basic_kernels = import_path(datadir / "basic_kernels.py")
+
+
+class BasicSearch:
+    def __init__(self, bound_kernel, *args, **kwargs):
+        self.bound_kernel = bound_kernel
+
+    def autotune(self):
+        return self.bound_kernel.config_spec.default_config()
+
+
+class TestCache(TestCase):
+    @mock.patch("helion.autotuner.DifferentialEvolutionSearch", new=BasicSearch)
+    def test_basic(self):
+        a = torch.randn(16, device=DEVICE, dtype=torch.bfloat16)
+        b = torch.randn(16, device=DEVICE, dtype=torch.float16)
+
+        result = basic_kernels.add(a, a)
+        torch.testing.assert_close(result, a + a)
+
+        self.assertEqual(counters["autotune"]["cache_miss"], 1)
+        self.assertEqual(counters["autotune"]["cache_hit"], 0)
+        self.assertEqual(counters["autotune"]["cache_put"], 1)
+
+        basic_kernels.add.reset()
+
+        result = basic_kernels.add(a, a)
+        torch.testing.assert_close(result, a + a)
+
+        self.assertEqual(counters["autotune"]["cache_miss"], 1)
+        self.assertEqual(counters["autotune"]["cache_hit"], 1)
+        self.assertEqual(counters["autotune"]["cache_put"], 1)
+
+        basic_kernels.add.reset()
+
+        result = basic_kernels.add(b, b)
+        torch.testing.assert_close(result, b + b)
+
+        self.assertEqual(counters["autotune"]["cache_miss"], 2)
+        self.assertEqual(counters["autotune"]["cache_hit"], 1)
+        self.assertEqual(counters["autotune"]["cache_put"], 2)
+
+
+if __name__ == "__main__":
+    unittest.main()