100 changes: 93 additions & 7 deletions Sources/CoreMLLLM/ChunkedEngine.swift
@@ -104,6 +104,24 @@ final class ChunkedEngine {
let mlConfig = MLModelConfiguration()
mlConfig.computeUnits = computeUnits

// Phase D pipelining (item 11d): when CHUNK_PIPELINE_ENABLED=1, load
// chunk3 on .cpuAndGPU so it goes through a distinct driver queue from
// the ANE-resident chunks. This is a prerequisite for predictStep's
// pipelined variant (see predictStepPipelined). Default off; zero
// behaviour change when the env var is unset. See also PR #77 spike
// (CHUNK_PIPELINE_SPIKE=1 / COMPUTE_UNIT_SPLIT=1), which stays as a
// diagnostic in main.
let pipelineEnv = ProcessInfo.processInfo.environment["CHUNK_PIPELINE_ENABLED"] == "1"
let pipelineGPUConfig: MLModelConfiguration = {
let c = MLModelConfiguration()
c.computeUnits = .cpuAndGPU
return c
}()
if pipelineEnv {
print("[Pipeline] CHUNK_PIPELINE_ENABLED=1 — chunk3 on .cpuAndGPU, " +
"predictStep uses async c3 dispatch")
}

func findModel(_ name: String) -> URL? {
let compiled = directory.appendingPathComponent("\(name).mlmodelc")
if FileManager.default.fileExists(atPath: compiled.path) { return compiled }
@@ -117,9 +135,11 @@ final class ChunkedEngine {
throw CoreMLLLMError.modelNotFound(name)
}
let t0 = CFAbsoluteTimeGetCurrent()
let m = try MLModel(contentsOf: url, configuration: mlConfig)
let cfg = (pipelineEnv && name == "chunk3") ? pipelineGPUConfig : mlConfig
let m = try MLModel(contentsOf: url, configuration: cfg)
let dt = CFAbsoluteTimeGetCurrent() - t0
print("[Load] \(name) done in \(String(format: "%.1f", dt))s")
print("[Load] \(name) done in \(String(format: "%.1f", dt))s" +
(pipelineEnv && name == "chunk3" ? " (.cpuAndGPU)" : ""))
return m
}

@@ -315,7 +335,8 @@ final class ChunkedEngine {
kFull1: ioSurfaceArray(slots: 1, seqLen: ctx), vFull1: ioSurfaceArray(slots: 1, seqLen: ctx),
kSliding2: ioSurfaceArray(slots: 5, seqLen: W), vSliding2: ioSurfaceArray(slots: 5, seqLen: W),
kFull2: ioSurfaceArray(slots: 2, seqLen: ctx), vFull2: ioSurfaceArray(slots: 2, seqLen: ctx),
config: config, prefillN: prefillN)
config: config, prefillN: prefillN,
pipeliningEnabled: pipelineEnv)

// ANE pipeline prewarm (Phase 0b): four dummy decode steps at load
// time force the ANE compiler to finalize dispatch schedules and
@@ -332,6 +353,14 @@
return engine
}

/// True when chunk3 is loaded on .cpuAndGPU and predictStep should route
/// through the pipelined variant (overlap c3 GPU with CPU-side work).
private let pipeliningEnabled: Bool
/// Serial queue for c3 async dispatch. Defined even when pipelining is
/// off so property initialisation stays simple; unused in that case.
private let c3Queue = DispatchQueue(label: "coreml-llm.chunk3.gpu",
qos: .userInitiated)

private init(chunk1: MLModel, chunk2: MLModel, chunk3: MLModel, chunk4: MLModel,
prefillChunk1: MLModel?, prefillChunk2: MLModel?,
prefillChunk3: MLModel?, prefillChunk4: MLModel?,
@@ -346,7 +375,8 @@
kFull1: MLMultiArray, vFull1: MLMultiArray,
kSliding2: MLMultiArray, vSliding2: MLMultiArray,
kFull2: MLMultiArray, vFull2: MLMultiArray,
config: ModelConfig, prefillN: Int) {
config: ModelConfig, prefillN: Int,
pipeliningEnabled: Bool = false) {
self.chunk1 = chunk1; self.chunk2 = chunk2
self.chunk3 = chunk3; self.chunk4 = chunk4
self.prefillChunk1 = prefillChunk1; self.prefillChunk2 = prefillChunk2
@@ -363,8 +393,12 @@
self.kSliding2 = kSliding2; self.vSliding2 = vSliding2
self.kFull2 = kFull2; self.vFull2 = vFull2
self.config = config; self.prefillN = prefillN
self.pipeliningEnabled = pipeliningEnabled
}

/// Runtime-readable pipelining state (ChunkedEngine -> CoreMLLLM).
var isPipeliningEnabled: Bool { pipeliningEnabled }

// MARK: - Reset

func reset() {
@@ -488,10 +522,56 @@ final class ChunkedEngine {
"kv14_k": MLFeatureValue(multiArray: kv14_k), "kv14_v": MLFeatureValue(multiArray: kv14_v),
]

// Chunk 3
// Chunk 3 (+ chunk 4)
//
// Pipelined path: when chunk3 is on .cpuAndGPU (distinct driver from
// the ANE-resident chunk4), dispatch c3 asynchronously on c3Queue so
// the GPU submission and the CPU-side d4 dict construction / Swift
// runtime work on this thread proceed concurrently. Empirically the
// overlap opportunity inside a single step is small (d4 build is
// microseconds) — the real gain from this flag comes from PR #77's
// spike probe, which showed kernel-level overlap between ANE and GPU
// drivers. See docs/PHASE_D_PIPELINING_IMPL.md.
//
// Serial path: the original c3 → c4 back-to-back call on the caller
// thread. Keeps behaviour bit-identical when pipelining is off.
let tC3Start = CFAbsoluteTimeGetCurrent()
var d3 = shared; d3["hidden_states"] = MLFeatureValue(multiArray: h2)
let h3 = try chunk3.prediction(from: MLDictionaryFeatureProvider(dictionary: d3))
let h3: MLMultiArray
if pipeliningEnabled {
let d3Provider = try MLDictionaryFeatureProvider(dictionary: d3)
var h3Result: MLMultiArray?
var h3Error: Error?
let sem = DispatchSemaphore(value: 0)
c3Queue.async { [chunk3] in
do {
let o3 = try chunk3.prediction(from: d3Provider)
h3Result = o3.featureValue(for: "hidden_states_out")!.multiArrayValue!
} catch {
h3Error = error
}
sem.signal()
}
// CPU-side overlap window: build c4's feature dict base while GPU
// is busy with c3. The dict build is ~microseconds; this mostly
// exists so the async architecture is in place for future
// restructuring (e.g., decoupled c4 that no longer depends on h3).
var d4Base = shared
sem.wait()
if let err = h3Error { throw err }
h3 = h3Result!
let tC3End = CFAbsoluteTimeGetCurrent()
profileC3 += (tC3End - tC3Start)

let tC4Start = CFAbsoluteTimeGetCurrent()
d4Base["hidden_states"] = MLFeatureValue(multiArray: h3)
let out4 = try chunk4.prediction(from:
MLDictionaryFeatureProvider(dictionary: d4Base))
let tC4End = CFAbsoluteTimeGetCurrent()
profileC4 += (tC4End - tC4Start)
return try finishStep(out4: out4, t1: t1)
}
h3 = try chunk3.prediction(from: MLDictionaryFeatureProvider(dictionary: d3))
.featureValue(for: "hidden_states_out")!.multiArrayValue!
let tC3End = CFAbsoluteTimeGetCurrent()
profileC3 += (tC3End - tC3Start)
@@ -503,6 +583,13 @@
let tC4End = CFAbsoluteTimeGetCurrent()
profileC4 += (tC4End - tC4Start)

return try finishStep(out4: out4, t1: t1)
}

/// Shared post-chunk4 bookkeeping: update profile counters, emit periodic
/// log line, return argmax token_id. Called from both the serial and the
/// pipelined predictStep paths.
private func finishStep(out4: MLFeatureProvider, t1: CFAbsoluteTime) throws -> Int {
profilePredict += (CFAbsoluteTimeGetCurrent() - t1)
profileCount += 1
if profileCount == 1 || profileCount % 10 == 0 {
@@ -520,7 +607,6 @@
eMs, mMs, c1, c2, c3, c4, c1 + c2 + c3 + c4,
pMs, eMs + pMs, 1000.0 / (eMs + pMs)))
}

return out4.featureValue(for: "token_id")!.multiArrayValue![0].intValue
}

12 changes: 12 additions & 0 deletions Sources/CoreMLLLM/CoreMLLLM.swift
@@ -81,6 +81,18 @@ public final class CoreMLLLM: @unchecked Sendable {
/// Takes precedence over crossVocabEnabled when both are true.
public var drafterUnionEnabled: Bool = false

/// Phase D (item 11d) — enable the pipelined decode path: chunk3 runs on
/// .cpuAndGPU (its own driver queue) with async dispatch, intended to
/// overlap with ANE-resident chunks. Built on top of the PR #77 spike's
/// compute-unit split. The load-time flip requires setting the env var
/// CHUNK_PIPELINE_ENABLED=1 before `load(from:)` is called; this
/// in-memory property is informational (reports whether the engine
/// actually loaded chunk3 on GPU). Defaults OFF on main until iPhone
/// validation, per the same merge discipline as drafterUnionEnabled.
public var chunkPipeliningEnabled: Bool {
chunkedEngine?.isPipeliningEnabled ?? false
}

// Generation metrics
public private(set) var tokensPerSecond: Double = 0
public var mtpAcceptanceRate: Double { mtpEngine?.acceptanceRate ?? 0 }
19 changes: 19 additions & 0 deletions Sources/CoreMLLLMSmoke/main.swift
@@ -47,6 +47,20 @@ struct Smoke {
print("[smoke] prompt: \(prompt)")
print("[smoke] max_tokens=\(maxTokens)")

// Phase D pipelining: disable the drafters so predictStep's
// serial-vs-pipelined path is the clean axis under test. Also
// record emitted token IDs to a file for bit-exact diff between
// the two modes (see docs/PHASE_D_PIPELINING_IMPL.md).
let pipelineTrip = ProcessInfo.processInfo.environment["CHUNK_PIPELINE_ENABLED"] == "1"
if pipelineTrip {
llm.mtpEnabled = false
llm.drafterUnionEnabled = false
llm.crossVocabEnabled = false
print("[smoke] CHUNK_PIPELINE_ENABLED=1 — drafters disabled; " +
"pipeliningEnabled=\(llm.chunkPipeliningEnabled)")
}
let tokenIDDumpPath = ProcessInfo.processInfo.environment["DUMP_TOKEN_IDS"]

var collected = ""
let stream = try await llm.stream(prompt, maxTokens: maxTokens)
for await tok in stream {
@@ -58,6 +72,11 @@
print("[smoke] output length = \(collected.count) chars")
print("[smoke] mtp accept = \(String(format: "%.2f", llm.mtpAcceptanceRate))")
print("[smoke] cross-vocab accept = \(String(format: "%.2f", llm.crossVocabAcceptanceRate))")
if let path = tokenIDDumpPath {
let ids = llm.lastEmittedTokenIDs.map { String($0) }.joined(separator: "\n")
try ids.write(toFile: path, atomically: true, encoding: .utf8)
print("[smoke] dumped \(llm.lastEmittedTokenIDs.count) token IDs to \(path)")
}
exit(0)
} catch {
fputs("[smoke] error: \(error)\n", stderr)
179 changes: 179 additions & 0 deletions docs/PHASE_D_PIPELINING_IMPL.md
@@ -0,0 +1,179 @@
# Phase D1b pipelining — implementation attempt (negative result)

Date: 2026-04-15. Branch: `feat/chunk-pipelining-d1b` (built on main @ 2851faa, post PR #74).
Follow-up to PR #77 (`spike/d1b-compute-unit-split`) which proved cross-compute-unit
kernel overlap (factor 0.87–0.99) between ANE and GPU drivers.

## TL;DR

**STOP condition hit.** The implemented pipelined decode path (chunk3 async on
.cpuAndGPU via a dedicated `DispatchQueue`, main thread awaiting before c4)
**regresses tok/s by ~24% on every category** — identical to PR #77's measured
regression — because the Gemma-4 chunk graph has **no within-step overlap
opportunity** that this pipelining pattern can exploit. Per the task's
guardrail ("If the overlap fails to produce ≥ +15% tok/s on any prompt, STOP
and report."), this PR is filed as a negative result with the plumbing wired
up so a future structural redesign can reuse it.

Default OFF on main (per merge discipline). No production caller sees any
change until they set `CHUNK_PIPELINE_ENABLED=1`.

## Measured results (Mac Studio, 128-token decode, drafters OFF)

| Category | baseline tok/s | pipeline tok/s | Δ | bit-exact |
|----------|---------------:|---------------:|---:|:---------:|
| chat | 32.80 | 25.21 | −23 % | PASS (28 tok) |
| code | 33.24 | 25.50 | −23 % | PASS (127 tok) |
| qa | 33.15 | 24.86 | −25 % | PASS (7 tok) |
| summary | 33.02 | 25.43 | −23 % | FAIL at tok 50 |

Per-chunk timings match PR #77 exactly: c3 on .cpuAndGPU = ~16.4 ms vs ~7.5 ms
on ANE (2.2× slower); predictStep sum goes 30.0 ms → 39.2 ms. No overlap is
captured because c4 blocks on c3's `hidden_states_out`.

### Bit-exact divergence on summary (expected)

Summary diverges at token 50 (`669 15644` vs `3143 6417`) because moving c3 from
ANE to GPU changes the fp16 arithmetic path (different rounding, different fused
ops). At tie-break positions in argmax, the ordering flips. This is the same
failure mode documented in PR #74's B.3 refutation for CV verify chunks. It is
**not a sync bug** in the pipeline implementation — the three cleaner prompts
are bit-exact byte-for-byte, confirming the dispatch logic is correct.
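
A toy sketch of the mechanism (logit values invented; only the one-ulp flip is
the point): when two logits are near-tied, a single fp16 rounding difference
between the ANE and GPU arithmetic paths is enough to change which token id
argmax emits.

```swift
// Hypothetical logits at a tie-break position. The values are made up;
// the mechanism (one fp16 ulp flips the winner) is what the summary
// divergence at token 50 comes down to.
let logitsANE: [Float16] = [10.40, 10.39, 9.75]   // id 0 wins on ANE
let logitsGPU: [Float16] = [10.39, 10.40, 9.75]   // different fused-op
                                                  // rounding: id 1 wins
func argmaxID(_ logits: [Float16]) -> Int {
    logits.indices.max { logits[$0] < logits[$1] }!
}
assert(argmaxID(logitsANE) == 0 && argmaxID(logitsGPU) == 1)
```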

## Root cause — why pipelining can't help

The chunked Gemma-4 decode step is a strict linear chain:

```
token@N-1 -> c1@N -> c2@N -> c3@N -> c4@N -> token@N
```

Every edge is a hard data dependency:

- c4 takes `hidden_states_out` from c3 (the only producer of h3)
- c3 takes `hidden_states_out` from c2 (and `kv13/kv14` also from c2)
- c1 takes `hiddenIn` = `embed_tokens[token@N-1]`, which comes from c4@N-1

The PR #77 projection `max(c1+c2+c4, c1+c3_GPU)` = 22.8 ms assumed c3 and c4
run in parallel. **That requires c4 NOT to depend on c3's h3, which is not
true in the current model.** No within-step parallelism is possible without a
model-topology change.
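
The arithmetic, for concreteness (the c1/c2/c4 split below is hypothetical;
the measurements only pin down c3 and the ~22.5 ms sum of the rest):

```swift
// Hypothetical per-chunk decode timings in ms; only c3ANE, c3GPU and the
// totals are measured. c1/c2/c4 are an assumed split consistent with them.
let c1 = 6.4, c2 = 9.0, c4 = 7.1
let c3ANE = 7.5, c3GPU = 16.4

let baseline  = c1 + c2 + c3ANE + c4           // ≈ 30.0 ms, measured serial step
let projected = max(c1 + c2 + c4, c1 + c3GPU)  // ≈ 22.8 ms: PR #77's projection,
                                               // valid only if c3 and c4 overlap
let actual    = c1 + c2 + c3GPU + c4           // ≈ 38.9 ms: what the hard
                                               // c3 -> c4 edge forces instead
```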

### What the 2-stage pipeline in the task description would require

- **Decoupled c4**: c4 would need to take `h2` (hidden after layer 14) and the
layer-15+ hidden directly, re-computing layers 25-34 independent of c3's
output. That is a conversion-side change, not a runtime change.
- **Speculative c4**: run c4 with a predicted h3; accept/reject after c3
completes. This is speculative decoding on the hidden axis, a research
project of its own.
- **Cross-step lookahead**: run c3@N and c1/c2@N+1 concurrently. Requires
token@N which comes from c4@N which requires c3@N — circular.

None are low-hanging fruit; none fit the "net-added Swift ≤ 200 lines" budget.

## Design (as implemented)

Minimal pipelined variant wired as a clean opt-in:

- `CHUNK_PIPELINE_ENABLED=1` at load time → `chunk3` loads with an
  `MLModelConfiguration` whose `computeUnits` is `.cpuAndGPU`; the other chunks
  inherit the caller's compute units (`.cpuAndNeuralEngine` in the smoke CLI).
- `ChunkedEngine.predictStep` takes a pipelined branch when
`pipeliningEnabled == true`: submits c3 to a serial `DispatchQueue` (label
`coreml-llm.chunk3.gpu`), main thread builds c4's input dict base, then
awaits c3 via `DispatchSemaphore`, then runs c4.
- Public read-only property `CoreMLLLM.chunkPipeliningEnabled` reflects the
loaded state (for observability in the smoke CLI and future UI toggles).
- `finishStep(out4:t1:)` helper hoists the post-c4 profile/return bookkeeping
out of `predictStep` so both branches share one exit path.
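
Condensed, the pipelined branch looks like this (a sketch using the diff's
identifiers; error handling trimmed, not a drop-in replacement):

```swift
import CoreML
import Dispatch

// Sketch of the submit / overlap / join pattern from predictStep above.
func pipelinedC3C4(chunk3: MLModel, chunk4: MLModel,
                   c3Queue: DispatchQueue,
                   d3Provider: MLFeatureProvider,
                   shared: [String: MLFeatureValue]) throws -> MLFeatureProvider {
    let sem = DispatchSemaphore(value: 0)
    var h3: MLMultiArray?
    var c3Error: Error?
    c3Queue.async {
        defer { sem.signal() }                 // always unblock the caller
        do {
            let o3 = try chunk3.prediction(from: d3Provider)   // ~16 ms on GPU
            h3 = o3.featureValue(for: "hidden_states_out")!.multiArrayValue!
        } catch { c3Error = error }
    }
    var d4 = shared                            // ~1 µs of CPU work overlaps here
    sem.wait()                                 // join: c4 has a hard h3 dependency
    if let error = c3Error { throw error }
    d4["hidden_states"] = MLFeatureValue(multiArray: h3!)
    return try chunk4.prediction(from: MLDictionaryFeatureProvider(dictionary: d4))
}
```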

Why the overlap window is microseconds in practice:
- c4's dict build is ~1 µs of Swift dictionary hashing + MLFeatureValue boxing.
- `kv13/kv14` references are already held before c3 launches (they come from
c2's output provider).
- Mask / RoPE / embed for the _current_ step were produced before c1; prep
for the _next_ step requires token@N which requires c4.

So the async dispatch overlaps ~1 µs of CPU work with ~16 ms of GPU compute,
then joins. Net: pure regression matching the c3-GPU deficit.

## Correctness verification (protocol for future attempts)

```bash
MODEL=~/Downloads/coreml-llm-artifacts/staging-2k-fast-prefill/gemma4-e2b
for cat in chat code qa summary; do
PROMPT=... # per category
DUMP_TOKEN_IDS=/tmp/base_${cat}.txt \
.build/release/coreml-llm-smoke "$MODEL" "$PROMPT" 128
CHUNK_PIPELINE_ENABLED=1 DUMP_TOKEN_IDS=/tmp/pipe_${cat}.txt \
.build/release/coreml-llm-smoke "$MODEL" "$PROMPT" 128
diff /tmp/base_${cat}.txt /tmp/pipe_${cat}.txt
done
```

3 of 4 prompts bit-exact; summary diverges at token 50 (fp16-rounding
sensitivity, not a sync bug — prompts that produce cleaner logits stay
identical for 127 tokens).

## Merge discipline

- `chunkPipeliningEnabled` reports the engine's loaded state but cannot toggle
  it post-load (the flag must be set via the `CHUNK_PIPELINE_ENABLED` env var
  before `CoreMLLLM.load` because it changes compute units at model
  instantiation); see the opt-in sketch after this list.
- Default OFF on main. Matches `drafterUnionEnabled` pattern — production
callers opt in only after on-device validation.
- Do **not** merge as the production decode path. Keep as plumbing-only so a
future structural fix (decoupled c4, speculative h3, or conversion-side
re-chunking) can ship without reinventing the dispatch scaffolding.
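
The opt-in, end to end (sketch; assumes `CoreMLLLM.load(from:)` is the async
throwing entry point and that `modelDir` points at the chunked artifacts):

```swift
import Foundation

// Must happen before load(from:), because the flag decides chunk3's
// compute units at model instantiation time. No effect after load.
setenv("CHUNK_PIPELINE_ENABLED", "1", 1)
let llm = try await CoreMLLLM.load(from: modelDir)
// Read-only: reports whether chunk3 actually landed on .cpuAndGPU.
print("pipelining enabled: \(llm.chunkPipeliningEnabled)")
```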

## Known limitations

1. c3 on GPU is 2.2× slower in absolute terms (7.5 ms → 16.4 ms). Realising
   any gain requires overlap ≥ 0.55 to break even and ≥ 0.70 for +15%; the
   current dependency graph allows ~0.00 (break-even arithmetic after this list).
2. fp16 divergence between ANE and GPU produces non-bit-exact tokens on
logit-tight prompts (1 of 4 categories tested).
3. Prefill path is untouched (`runPrefill` still runs the prefill_chunkN
   models on their default compute units).
4. iPhone validation out of scope for this PR.
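
Break-even arithmetic for limitation 1, using the measured numbers:

```swift
// f = fraction of c3's GPU time hidden behind concurrent work (ms throughout).
let c3ANE = 7.5, c3GPU = 16.4, rest = 22.5     // rest = c1 + c2 + c4
// Pipelined step time as a function of f: rest + (1 - f) * c3GPU.
// Break-even against the 30.0 ms ANE baseline:
let fBreakEven = (c3GPU - c3ANE) / c3GPU       // ≈ 0.54, hence the "≥ 0.55" above
// The current dependency graph delivers f ≈ 0.00: only the ~1 µs dict build
// overlaps, so the full 8.9 ms c3 deficit lands on every step.
```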

## Files touched

- `Sources/CoreMLLLM/ChunkedEngine.swift` — env-gated `.cpuAndGPU` branch in
`load()`, `c3Queue` / `pipeliningEnabled` in init, pipelined branch in
`predictStep`, `finishStep(out4:t1:)` hoist.
- `Sources/CoreMLLLM/CoreMLLLM.swift` — public read-only
`chunkPipeliningEnabled` property.
- `Sources/CoreMLLLMSmoke/main.swift` — `CHUNK_PIPELINE_ENABLED` drafter
override + `DUMP_TOKEN_IDS` for bit-exact diff.
- `docs/PHASE_D_PIPELINING_IMPL.md` (this file).

Net diff ≈ 110 lines Swift + this doc.

## Next steps (if anyone picks this up)

Non-speculative decode gains on CoreML/ANE+GPU are bottlenecked by the
serial-chunk topology, not by dispatch or driver boundaries. The three viable
structural fixes, roughly in increasing cost:

1. **Re-chunk** so c3/c4 can compute independent sub-streams of a residual
split (conversion-side, needs a parallel residual path in the model). ~1 week.
2. **Speculative c4 with predicted h3** (research project, ~1 month).
3. **Full MLX-Swift port** (previously rejected in
`rejected_approaches.md`; unchanged by this result).

Alternatively, concede the ~33 tok/s ceiling and focus remaining effort on
speculative decode (MTP / Union) where the accept-rate headroom is the
governing factor, not the per-step compute budget.

## Related

- PR #77 (`spike/d1b-compute-unit-split`) — feasibility spike, overlap probe.
Stays in main as a diagnostic via `COMPUTE_UNIT_SPLIT=1`.
- PR #75 (`spike/d1-chunk-pipelining`) — earlier pure-ANE pipelining
spike, also negative.
- `docs/BASELINE_SPEED_AUDIT.md` — per-chunk ms breakdown motivating
the split target.
- `docs/PHASE_D_COMPUTE_UNIT_SPLIT_SPIKE.md` (PR #77) — overlap
methodology + the +30% projection this PR refutes empirically.