jdinalt · jdinalt · May 15, 2026 · May 15, 2026 · May 15, 2026
diff --git a/docs/guides/evaluating-models.md b/docs/guides/evaluating-models.md
@@ -120,14 +120,12 @@ same mechanism `forgather train` uses). Pass an explicit path with
 `AutoModelForCausalLM.from_pretrained` on the model directory.
 
 **Quantized models** (artifacts produced by `forgather finalize --quantize`)
-are autodetected: if `config.json` has a `quantization_config` block, eval
-forces the `from_pretrained()` path and ignores both `--checkpoint` and
-`--no-checkpoint`. An explicit `--checkpoint PATH` logs a warning before
-being overridden. The checkpoint-resume path uses `from_config()` +
-`load_state_dict()` which has no quantizer hook and fails on quantized
-tensor subclasses; `from_pretrained()` runs HF's `TorchAoHfQuantizer`
-pre-process so the right linear modules are in place before the weights
-load. See [QAT Training § Evaluating Quantized
+load transparently through the standard checkpoint-resume path. Forgather's
+native loader (`forgather.ml.sharded_checkpoint.load_checkpoint`) detects
+torchao quantization from `config.json`'s `quantization_config` block (or
+falls back to scanning the saved state_dict) and installs the matching
+quantized linear modules before `load_state_dict` runs. No extra flag or
+caller-side recipe argument is needed. See [QAT Training § Evaluating Quantized
 Models](../trainers/qat-training.md#evaluating-quantized-models).
 
 The tokenizer is always loaded directly from `--model` via

diff --git a/docs/guides/finalize-model.md b/docs/guides/finalize-model.md
@@ -123,11 +123,13 @@ which the safetensors writer requires. If `--safetensors` is passed
 alongside `--quantize`, it is silently disabled with a warning.
 
 Finalize also writes a `quantization_config` block into `config.json`
-with the recipe. This makes HF `AutoModelForCausalLM.from_pretrained()`
-auto-detect the quantization on reload and run the
-`TorchAoHfQuantizer` pre-process path — so `forgather eval` and the
-inference server load the artifact correctly without any caller-side
-flag. See [Evaluating Quantized
+with the recipe. Forgather's native checkpoint loader consumes this
+hint (with a state_dict scan as fallback) and installs the matching
+quantized linear modules before the weights load — so `forgather eval`,
+the inference server, and any other tool using the native loader
+handle the artifact transparently with no caller-side flag. The same
+block also enables HF `AutoModelForCausalLM.from_pretrained()`
+auto-detection for non-Forgather consumers. See [Evaluating Quantized
 Models](../trainers/qat-training.md#evaluating-quantized-models).
 
 ### Misc

diff --git a/docs/trainers/qat-training.md b/docs/trainers/qat-training.md
@@ -271,26 +271,32 @@ numbers on Tiny Llama.
 
 ```bash
 # Same invocation as for a bf16 model — eval autodetects the quantized
-# artifact and routes through HF `from_pretrained()` so the
-# TorchAoHfQuantizer pre-process path installs the right linear modules.
+# artifact and installs the right linear modules via the native loader.
 forgather -p examples/tutorials/tiny_llama eval test tinystories \
     -M /path/to/quantized_model
 ```
 
 How it works: at finalize time, `--quantize` writes a
-`quantization_config` block into `config.json` with the recipe. At eval
-time, `scripts/eval_script.py:resolve_checkpoint()` reads that field
-and forces the `from_pretrained()` load path, **ignoring** any
-`--checkpoint` / `--no-checkpoint` flag (an explicit `--checkpoint
-PATH` logs a warning before being overridden). The normal
-checkpoint-resume path uses `from_config()` + `load_state_dict()`,
-which does not run any quantizer hook and fails when the saved tensors
-are torchao quantized subclasses. Same mechanism applies to the
-inference server — no caller-side changes needed.
-
-The check is purely additive: bf16 models keep using the
-checkpoint-resume path. Pass `--no-checkpoint` to opt into
-`from_pretrained()` manually for non-quantized models.
+`quantization_config` block into `config.json` with the recipe.
+Forgather's native checkpoint loader
+(`forgather.ml.sharded_checkpoint.load_checkpoint`) reads that block,
+or — as a fallback when the block is absent — scans the first shard for
+torchao tensor subclasses. When quantization is detected, the loader
+installs the matching quantized linear modules (`quantize_(model,
+QATConfig(base_config, step="convert"))`) on the constructed model
+*before* `load_state_dict` runs, so the saved tensor subclasses land in
+slots that know how to hold them.
+
+This is built into the native loader, so it applies uniformly to every
+tool that loads via Forgather checkpoints (`-c`):
+
+- `forgather eval test ... -M <dir>` (and its `--checkpoint PATH` variant)
+- `forgather inf server -m <dir> --from-checkpoint`
+- Trainer resume (`resume_from_checkpoint`)
+
+No caller-side recipe flag or marker file is needed. The check is purely
+additive — bf16 models load through the exact same path, just without
+the quantization step.
 
 ### Three-Way Comparison: bf16 / PTQ / QAT
 

diff --git a/scripts/eval_script.py b/scripts/eval_script.py
@@ -129,23 +129,6 @@ def init_model():
     return init_model
 
 
-def _model_is_quantized(model_path: str) -> bool:
-    """True iff ``<model_path>/config.json`` carries a ``quantization_config`` block.
-
-    Written by ``forgather finalize --quantize``; HF's
-    ``from_pretrained()`` consumes it to install the TorchAoHfQuantizer
-    pre-process path. Unreadable or malformed config.json -> False.
-    """
-    cfg_path = os.path.join(model_path, "config.json")
-    if not os.path.isfile(cfg_path):
-        return False
-    try:
-        with open(cfg_path) as f:
-            return "quantization_config" in json.load(f)
-    except (OSError, ValueError):
-        return False
-
-
 def resolve_checkpoint(args):
     """Return (checkpoint_arg, use_checkpoint) for the trainer.
 
@@ -154,33 +137,20 @@ def resolve_checkpoint(args):
     - str path: explicit
     - False: do not resume
 
-    Quantized models force ``(False, False)`` regardless of flags: only HF
-    `from_pretrained()` runs the TorchAoHfQuantizer pre-process that
-    installs the right quantized linear modules before weight load. The
-    checkpoint-resume path uses `from_config()` + `load_state_dict()`,
-    which has no quantizer hook and crashes with `'Parameter' object has
-    no attribute 'tensor_data_names'` on torchao tensor subclasses.
-
     Cache the result on ``args`` so repeated calls don't re-log.
+
+    Quantization is handled transparently downstream: when the native
+    loader at ``forgather.ml.sharded_checkpoint.load_checkpoint`` detects
+    torchao quantization (via ``config.json`` or a state_dict scan), it
+    installs the matching quantized linear modules before
+    ``load_state_dict``. Eval doesn't need to special-case quantized
+    models here.
     """
     cached = getattr(args, "_resolved_checkpoint", None)
     if cached is not None:
         return cached
 
-    if _model_is_quantized(args.model):
-        if args.checkpoint:
-            logger.warning(
-                "Detected quantization_config in model config; ignoring "
-                "--checkpoint %r and loading via from_pretrained().",
-                args.checkpoint,
-            )
-        else:
-            logger.info(
-                "Detected quantization_config in model config; "
-                "loading via from_pretrained() (overrides default checkpoint resume)."
-            )
-        result = (False, False)
-    elif args.no_checkpoint:
+    if args.no_checkpoint:
         result = (False, False)
     elif args.checkpoint:
         result = (args.checkpoint, True)