diff --git a/Sources/CoreMLLLM/ChunkedEngine.swift b/Sources/CoreMLLLM/ChunkedEngine.swift
index 44046ff..788013f 100644
--- a/Sources/CoreMLLLM/ChunkedEngine.swift
+++ b/Sources/CoreMLLLM/ChunkedEngine.swift
@@ -104,6 +104,24 @@ final class ChunkedEngine {
         let mlConfig = MLModelConfiguration()
         mlConfig.computeUnits = computeUnits
 
+        // Phase D pipelining (item 11d): when CHUNK_PIPELINE_ENABLED=1, load
+        // chunk3 on .cpuAndGPU so it goes through a distinct driver queue from
+        // the ANE-resident chunks. This is a prerequisite for predictStep's
+        // pipelined variant (see predictStepPipelined). Default off; zero
+        // behaviour change when the env var is unset. See also PR #77 spike
+        // (CHUNK_PIPELINE_SPIKE=1 / COMPUTE_UNIT_SPLIT=1), which stays as a
+        // diagnostic in main.
+        let pipelineEnv = ProcessInfo.processInfo.environment["CHUNK_PIPELINE_ENABLED"] == "1"
+        let pipelineGPUConfig: MLModelConfiguration = {
+            let c = MLModelConfiguration()
+            c.computeUnits = .cpuAndGPU
+            return c
+        }()
+        if pipelineEnv {
+            print("[Pipeline] CHUNK_PIPELINE_ENABLED=1 — chunk3 on .cpuAndGPU, "
+                + "predictStep uses async c3 dispatch")
+        }
+
         func findModel(_ name: String) -> URL? {
             let compiled = directory.appendingPathComponent("\(name).mlmodelc")
             if FileManager.default.fileExists(atPath: compiled.path) { return compiled }
@@ -117,9 +135,11 @@ final class ChunkedEngine {
             throw CoreMLLLMError.modelNotFound(name)
         }
         let t0 = CFAbsoluteTimeGetCurrent()
-        let m = try MLModel(contentsOf: url, configuration: mlConfig)
+        let cfg = (pipelineEnv && name == "chunk3") ? pipelineGPUConfig : mlConfig
+        let m = try MLModel(contentsOf: url, configuration: cfg)
         let dt = CFAbsoluteTimeGetCurrent() - t0
-        print("[Load] \(name) done in \(String(format: "%.1f", dt))s")
+        print("[Load] \(name) done in \(String(format: "%.1f", dt))s"
+            + (pipelineEnv && name == "chunk3" ? " (.cpuAndGPU)" : ""))
         return m
     }
 
@@ -315,7 +335,8 @@ final class ChunkedEngine {
             kFull1: ioSurfaceArray(slots: 1, seqLen: ctx), vFull1: ioSurfaceArray(slots: 1, seqLen: ctx),
             kSliding2: ioSurfaceArray(slots: 5, seqLen: W), vSliding2: ioSurfaceArray(slots: 5, seqLen: W),
             kFull2: ioSurfaceArray(slots: 2, seqLen: ctx), vFull2: ioSurfaceArray(slots: 2, seqLen: ctx),
-            config: config, prefillN: prefillN)
+            config: config, prefillN: prefillN,
+            pipeliningEnabled: pipelineEnv)
 
         // ANE pipeline prewarm (Phase 0b): four dummy decode steps at load
         // time force the ANE compiler to finalize dispatch schedules and
@@ -332,6 +353,14 @@
         return engine
     }
 
+    /// True when chunk3 is loaded on .cpuAndGPU and predictStep should route
+    /// through the pipelined variant (overlap c3 GPU with CPU-side work).
+    private let pipeliningEnabled: Bool
+    /// Serial queue for c3 async dispatch. Defined even when pipelining is
+    /// off so property initialisation stays simple; unused in that case.
+    private let c3Queue = DispatchQueue(label: "coreml-llm.chunk3.gpu",
+                                        qos: .userInitiated)
+
     private init(chunk1: MLModel, chunk2: MLModel, chunk3: MLModel, chunk4: MLModel,
                  prefillChunk1: MLModel?, prefillChunk2: MLModel?,
                  prefillChunk3: MLModel?, prefillChunk4: MLModel?,
@@ -346,7 +375,8 @@ final class ChunkedEngine {
                  kFull1: MLMultiArray, vFull1: MLMultiArray,
                  kSliding2: MLMultiArray, vSliding2: MLMultiArray,
                  kFull2: MLMultiArray, vFull2: MLMultiArray,
-                 config: ModelConfig, prefillN: Int) {
+                 config: ModelConfig, prefillN: Int,
+                 pipeliningEnabled: Bool = false) {
         self.chunk1 = chunk1; self.chunk2 = chunk2
         self.chunk3 = chunk3; self.chunk4 = chunk4
         self.prefillChunk1 = prefillChunk1; self.prefillChunk2 = prefillChunk2
@@ -363,8 +393,12 @@ final class ChunkedEngine {
         self.kSliding2 = kSliding2; self.vSliding2 = vSliding2
         self.kFull2 = kFull2; self.vFull2 = vFull2
         self.config = config; self.prefillN = prefillN
+        self.pipeliningEnabled = pipeliningEnabled
     }
 
+    /// Runtime-readable pipelining state (ChunkedEngine -> CoreMLLLM).
+    var isPipeliningEnabled: Bool { pipeliningEnabled }
+
     // MARK: - Reset
 
     func reset() {
@@ -488,10 +522,56 @@ final class ChunkedEngine {
             "kv14_k": MLFeatureValue(multiArray: kv14_k),
             "kv14_v": MLFeatureValue(multiArray: kv14_v),
         ]
-        // Chunk 3
+        // Chunk 3 (+ chunk 4)
+        //
+        // Pipelined path: when chunk3 is on .cpuAndGPU (distinct driver from
+        // the ANE-resident chunk4), dispatch c3 asynchronously on c3Queue so
+        // the GPU submission and the CPU-side d4 dict construction / Swift
+        // runtime work on this thread proceed concurrently. Empirically the
+        // overlap opportunity inside a single step is small (d4 build is
+        // microseconds) — the motivation for this flag is PR #77's spike
+        // probe, which showed kernel-level overlap between ANE and GPU
+        // drivers is achievable. See docs/PHASE_D_PIPELINING_IMPL.md.
+        //
+        // Serial path: the original c3 → c4 back-to-back call on the caller
+        // thread. Keeps behaviour bit-identical when pipelining is off.
         let tC3Start = CFAbsoluteTimeGetCurrent()
         var d3 = shared; d3["hidden_states"] = MLFeatureValue(multiArray: h2)
-        let h3 = try chunk3.prediction(from: MLDictionaryFeatureProvider(dictionary: d3))
+        let h3: MLMultiArray
+        if pipeliningEnabled {
+            let d3Provider = try MLDictionaryFeatureProvider(dictionary: d3)
+            var h3Result: MLMultiArray?
+            var h3Error: Error?
+            let sem = DispatchSemaphore(value: 0)
+            c3Queue.async { [chunk3] in
+                do {
+                    let o3 = try chunk3.prediction(from: d3Provider)
+                    h3Result = o3.featureValue(for: "hidden_states_out")!.multiArrayValue!
+                } catch {
+                    h3Error = error
+                }
+                sem.signal()
+            }
+            // CPU-side overlap window: build c4's feature dict base while GPU
+            // is busy with c3. The dict build is ~microseconds; this mostly
+            // exists so the async architecture is in place for future
+            // restructuring (e.g., decoupled c4 that no longer depends on h3).
+            var d4Base = shared
+            sem.wait()
+            if let err = h3Error { throw err }
+            h3 = h3Result!
+            let tC3End = CFAbsoluteTimeGetCurrent()
+            profileC3 += (tC3End - tC3Start)
+
+            let tC4Start = CFAbsoluteTimeGetCurrent()
+            d4Base["hidden_states"] = MLFeatureValue(multiArray: h3)
+            let out4 = try chunk4.prediction(from:
+                MLDictionaryFeatureProvider(dictionary: d4Base))
+            let tC4End = CFAbsoluteTimeGetCurrent()
+            profileC4 += (tC4End - tC4Start)
+            return try finishStep(out4: out4, t1: t1)
+        }
+        h3 = try chunk3.prediction(from: MLDictionaryFeatureProvider(dictionary: d3))
             .featureValue(for: "hidden_states_out")!.multiArrayValue!
         let tC3End = CFAbsoluteTimeGetCurrent()
         profileC3 += (tC3End - tC3Start)
@@ -503,6 +583,13 @@ final class ChunkedEngine {
         let tC4End = CFAbsoluteTimeGetCurrent()
         profileC4 += (tC4End - tC4Start)
 
+        return try finishStep(out4: out4, t1: t1)
+    }
+
+    /// Shared post-chunk4 bookkeeping: update profile counters, emit periodic
+    /// log line, return argmax token_id. Called from both the serial and the
+    /// pipelined predictStep paths.
+    private func finishStep(out4: MLFeatureProvider, t1: CFAbsoluteTime) throws -> Int {
         profilePredict += (CFAbsoluteTimeGetCurrent() - t1)
         profileCount += 1
         if profileCount == 1 || profileCount % 10 == 0 {
@@ -520,7 +607,6 @@
                 eMs, mMs, c1, c2, c3, c4, c1 + c2 + c3 + c4, pMs, eMs + pMs, 1000.0 / (eMs + pMs)))
         }
-
         return out4.featureValue(for: "token_id")!.multiArrayValue![0].intValue
     }
diff --git a/Sources/CoreMLLLM/CoreMLLLM.swift b/Sources/CoreMLLLM/CoreMLLLM.swift
index 7c47dca..681937b 100644
--- a/Sources/CoreMLLLM/CoreMLLLM.swift
+++ b/Sources/CoreMLLLM/CoreMLLLM.swift
@@ -81,6 +81,18 @@ public final class CoreMLLLM: @unchecked Sendable {
     /// Takes precedence over crossVocabEnabled when both are true.
     public var drafterUnionEnabled: Bool = false
 
+    /// Phase D (item 11d) — enable the pipelined decode path: chunk3 runs on
+    /// .cpuAndGPU (its own driver queue) with async dispatch, intended to
+    /// overlap with ANE-resident chunks. Built on top of the PR #77 spike's
+    /// compute-unit split. The load-time flip requires setting the env var
+    /// CHUNK_PIPELINE_ENABLED=1 before `load(from:)` is called; this
+    /// in-memory property is informational (reports whether the engine
+    /// actually loaded chunk3 on GPU). Defaults OFF on main until iPhone
+    /// validation, per the same merge discipline as drafterUnionEnabled.
+    public var chunkPipeliningEnabled: Bool {
+        chunkedEngine?.isPipeliningEnabled ?? false
+    }
+
     // Generation metrics
     public private(set) var tokensPerSecond: Double = 0
     public var mtpAcceptanceRate: Double { mtpEngine?.acceptanceRate ?? 0 }
diff --git a/Sources/CoreMLLLMSmoke/main.swift b/Sources/CoreMLLLMSmoke/main.swift
index 9318520..1edb055 100644
--- a/Sources/CoreMLLLMSmoke/main.swift
+++ b/Sources/CoreMLLLMSmoke/main.swift
@@ -47,6 +47,20 @@ struct Smoke {
         print("[smoke] prompt: \(prompt)")
         print("[smoke] max_tokens=\(maxTokens)")
 
+        // Phase D pipelining: disable the drafters so predictStep's
+        // serial-vs-pipelined path is the clean axis under test. Also
+        // record emitted token IDs to a file for bit-exact diff between
+        // the two modes (see docs/PHASE_D_PIPELINING_IMPL.md).
+        let pipelineTrip = ProcessInfo.processInfo.environment["CHUNK_PIPELINE_ENABLED"] == "1"
+        if pipelineTrip {
+            llm.mtpEnabled = false
+            llm.drafterUnionEnabled = false
+            llm.crossVocabEnabled = false
+            print("[smoke] CHUNK_PIPELINE_ENABLED=1 — drafters disabled; "
+                + "pipeliningEnabled=\(llm.chunkPipeliningEnabled)")
+        }
+        let tokenIDDumpPath = ProcessInfo.processInfo.environment["DUMP_TOKEN_IDS"]
+
         var collected = ""
         let stream = try await llm.stream(prompt, maxTokens: maxTokens)
         for await tok in stream {
@@ -58,6 +72,11 @@ struct Smoke {
         print("[smoke] output length = \(collected.count) chars")
         print("[smoke] mtp accept = \(String(format: "%.2f", llm.mtpAcceptanceRate))")
         print("[smoke] cross-vocab accept = \(String(format: "%.2f", llm.crossVocabAcceptanceRate))")
+        if let path = tokenIDDumpPath {
+            let ids = llm.lastEmittedTokenIDs.map { String($0) }.joined(separator: "\n")
+            try ids.write(toFile: path, atomically: true, encoding: .utf8)
+            print("[smoke] dumped \(llm.lastEmittedTokenIDs.count) token IDs to \(path)")
+        }
         exit(0)
     } catch {
         fputs("[smoke] error: \(error)\n", stderr)
diff --git a/docs/PHASE_D_PIPELINING_IMPL.md b/docs/PHASE_D_PIPELINING_IMPL.md
new file mode 100644
index 0000000..c8983ef
--- /dev/null
+++ b/docs/PHASE_D_PIPELINING_IMPL.md
@@ -0,0 +1,179 @@

# Phase D1b pipelining — implementation attempt (negative result)

Date: 2026-04-15. Branch: `feat/chunk-pipelining-d1b` (built on main @ 2851faa, post PR #74).
Follow-up to PR #77 (`spike/d1b-compute-unit-split`) which proved cross-compute-unit
kernel overlap (factor 0.87–0.99) between ANE and GPU drivers.

## TL;DR

**STOP condition hit.** The implemented pipelined decode path (chunk3 async on
.cpuAndGPU via a dedicated `DispatchQueue`, main thread awaiting before c4)
**regresses tok/s by ~24% on every category** — identical to PR #77's measured
regression — because the Gemma-4 chunk graph has **no within-step overlap
opportunity** that this pipelining pattern can exploit. Per the task's
guardrail ("If the overlap fails to produce ≥ +15% tok/s on any prompt, STOP
and report."), this PR is filed as a negative result with the plumbing wired
up so a future structural redesign can reuse it.

Default OFF on main (per merge discipline). No production caller sees any
change until they set `CHUNK_PIPELINE_ENABLED=1`.

## Measured results (Mac Studio, 128-token decode, drafters OFF)

| Category | baseline tok/s | pipeline tok/s | Δ | bit-exact |
|----------|---------------:|---------------:|---:|:---------:|
| chat     | 32.80 | 25.21 | −23 % | PASS (28 tok) |
| code     | 33.24 | 25.50 | −23 % | PASS (127 tok) |
| qa       | 33.15 | 24.86 | −25 % | PASS (7 tok) |
| summary  | 33.02 | 25.43 | −23 % | FAIL at tok 50 |

Per-chunk timings match PR #77 exactly: c3 on .cpuAndGPU = ~16.4 ms vs ~7.5 ms
on ANE (2.2× slower); predictStep sum goes 30.0 ms → 39.2 ms. No overlap is
captured because c4 blocks on c3's `hidden_states_out`.

### Bit-exact divergence on summary (expected)

Summary diverges at token 50 (`669 15644` vs `3143 6417`) because moving c3 from
ANE to GPU changes the fp16 arithmetic path (different rounding, different fused
ops). At tie-break positions in argmax, the ordering flips. This is the same
failure mode documented in PR #74's B.3 refutation for CV verify chunks. It is
**not a sync bug** in the pipeline implementation — the three cleaner prompts
are bit-exact byte-for-byte, confirming the dispatch logic is correct.
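
The tie-break mechanics, reduced to a toy Swift sketch (illustration only:
the logit values are invented, and the real flip happens at some near-tie
pair inside the full logit vector):

```swift
// Two backends compute the same pair of near-tie logits, but different
// fp16 rounding/fusion shifts one value by 1 ulp. Greedy argmax then
// emits different tokens from the same model and prompt.
import Foundation

let a = Float16(12.34)
let b = a.nextDown                    // 1 ulp below a: a near-tie

let aneLogits: [Float16] = [a, b]     // one rounding order: index 0 wins
let gpuLogits: [Float16] = [b, a]     // the other: index 1 wins

func argmax(_ xs: [Float16]) -> Int {
    xs.indices.max { xs[$0] < xs[$1] }!
}
print(argmax(aneLogits), argmax(gpuLogits))   // 0 1
```

One flipped token re-seeds the whole remaining sequence, which is why the
verification protocol below diffs raw token IDs and treats everything after
the first divergence as incomparable.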
## Root cause — why pipelining can't help

The chunked Gemma-4 decode step is a strict linear chain:

```
token@N-1 -> c1@N -> c2@N -> c3@N -> c4@N -> token@N
```

Every edge is a hard data dependency:

- c4 takes `hidden_states_out` from c3 (the only producer of h3)
- c3 takes `hidden_states_out` from c2 (and `kv13/kv14` also from c2)
- c1 takes `hiddenIn` = `embed_tokens[token@N-1]`, which comes from c4@N-1

The PR #77 projection `max(c1+c2+c4, c1+c3_GPU)` = 22.8 ms assumed c3 and c4
run in parallel. **That requires c4 NOT to depend on c3's h3, which is not
true in the current model.** No within-step parallelism is possible without a
model-topology change.

### What the 2-stage pipeline in the task description would require

- **Decoupled c4**: c4 would need to take `h2` (the hidden state after layer
  14) and derive the layer-15+ hidden itself, computing layers 25–34
  independently of c3's output. That is a conversion-side change, not a
  runtime change.
- **Speculative c4**: run c4 with a predicted h3; accept/reject after c3
  completes. This is speculative decoding on the hidden axis, a research
  project of its own.
- **Cross-step lookahead**: run c3@N and c1/c2@N+1 concurrently. Requires
  token@N, which comes from c4@N, which requires c3@N — circular.

None are low-hanging fruit; none fit the "net-added Swift ≤ 200 lines" budget.

## Design (as implemented)

Minimal pipelined variant wired as a clean opt-in:

- `CHUNK_PIPELINE_ENABLED=1` at load time → `chunk3` loads on
  `MLModelConfiguration(.cpuAndGPU)`; other chunks inherit the caller's
  compute units (`.cpuAndNeuralEngine` in the smoke CLI).
- `ChunkedEngine.predictStep` takes a pipelined branch when
  `pipeliningEnabled == true`: submits c3 to a serial `DispatchQueue` (label
  `coreml-llm.chunk3.gpu`), main thread builds c4's input dict base, then
  awaits c3 via `DispatchSemaphore`, then runs c4.
- Public read-only property `CoreMLLLM.chunkPipeliningEnabled` reflects the
  loaded state (for observability in the smoke CLI and future UI toggles).
- `finishStep(out4:t1:)` helper hoists the post-c4 profile/return bookkeeping
  out of `predictStep` so both branches share one exit path.

Why the overlap window is microseconds in practice:

- c4's dict build is ~1 µs of Swift dictionary hashing + MLFeatureValue boxing.
- `kv13/kv14` references are already held before c3 launches (they come from
  c2's output provider).
- Mask / RoPE / embed for the _current_ step were produced before c1; prep
  for the _next_ step requires token@N, which requires c4.

So the async dispatch overlaps ~1 µs of CPU work with ~16 ms of GPU compute,
then joins. Net: pure regression matching the c3-GPU deficit.

## Correctness verification (protocol for future attempts)

```bash
MODEL=~/Downloads/coreml-llm-artifacts/staging-2k-fast-prefill/gemma4-e2b
for cat in chat code qa summary; do
  PROMPT=... # per category
  DUMP_TOKEN_IDS=/tmp/base_${cat}.txt \
    .build/release/coreml-llm-smoke "$MODEL" "$PROMPT" 128
  CHUNK_PIPELINE_ENABLED=1 DUMP_TOKEN_IDS=/tmp/pipe_${cat}.txt \
    .build/release/coreml-llm-smoke "$MODEL" "$PROMPT" 128
  diff /tmp/base_${cat}.txt /tmp/pipe_${cat}.txt
done
```

3 of 4 prompts bit-exact; summary diverges at token 50 (fp16-rounding
sensitivity, not a sync bug — prompts that produce cleaner logits stay
identical for their full outputs).
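
For future attempts that reuse this scaffolding, here is the dispatch pattern
from "Design (as implemented)" reduced to a self-contained sketch (simplified
from the real `predictStep` branch: profiling dropped, queue and inputs passed
in; only the feature names match the actual code):

```swift
import CoreML

// Condensed pipelined c3 -> c4: submit c3 on its own serial queue, do
// whatever CPU work does not need h3, then join on a semaphore.
func pipelinedC3C4(chunk3: MLModel, chunk4: MLModel, c3Queue: DispatchQueue,
                   d3: MLDictionaryFeatureProvider,
                   d4Base: [String: MLFeatureValue]) throws -> MLFeatureProvider {
    var h3: MLMultiArray?
    var c3Error: Error?
    let sem = DispatchSemaphore(value: 0)
    c3Queue.async {
        do {
            let o3 = try chunk3.prediction(from: d3)
            h3 = o3.featureValue(for: "hidden_states_out")?.multiArrayValue
        } catch { c3Error = error }
        sem.signal()
    }
    var d4 = d4Base        // the CPU-side "overlap window": today ~1 µs of work
    sem.wait()             // join: c4 cannot start without h3
    if let error = c3Error { throw error }
    d4["hidden_states"] = MLFeatureValue(multiArray: h3!)
    return try chunk4.prediction(from: MLDictionaryFeatureProvider(dictionary: d4))
}
```

The semaphore join is the structural problem in miniature: until something
useful can sit between `async` and `wait()`, the pattern only buys the
c3-on-GPU slowdown.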
## Merge discipline

- `chunkPipeliningEnabled` reports the engine's loaded state but cannot toggle
  it post-load (the flag must be set via the `CHUNK_PIPELINE_ENABLED` env var
  before `CoreMLLLM.load` because it changes the compute unit on model
  instantiation).
- Default OFF on main. Matches the `drafterUnionEnabled` pattern — production
  callers opt in only after on-device validation.
- Do **not** merge as the production decode path. Keep as plumbing-only so a
  future structural fix (decoupled c4, speculative h3, or conversion-side
  re-chunking) can ship without reinventing the dispatch scaffolding.

## Known limitations

1. c3 on GPU is 2.2× slower in absolute terms (7.5 ms → 16.4 ms). Realising
   any gain requires overlap ≥ 0.55 to break even, ≥ 0.70 for +15%. The
   current dep graph allows ~0.00.
2. fp16 divergence between ANE and GPU produces non-bit-exact tokens on
   logit-tight prompts (1 of 4 categories tested).
3. Prefill path is untouched (`runPrefill` still uses prefill_chunkN on
   their default compute units).
4. iPhone validation is out of scope for this PR.

## Files touched

- `Sources/CoreMLLLM/ChunkedEngine.swift` — env-gated `.cpuAndGPU` branch in
  `load()`, `c3Queue` / `pipeliningEnabled` in init, pipelined branch in
  `predictStep`, `finishStep(out4:t1:)` hoist.
- `Sources/CoreMLLLM/CoreMLLLM.swift` — public read-only
  `chunkPipeliningEnabled` property.
- `Sources/CoreMLLLMSmoke/main.swift` — `CHUNK_PIPELINE_ENABLED` drafter
  override + `DUMP_TOKEN_IDS` for bit-exact diff.
- `docs/PHASE_D_PIPELINING_IMPL.md` (this file).

Net diff ≈ 110 lines Swift + this doc.

## Next steps (if anyone picks this up)

Non-speculative decode gains on CoreML/ANE+GPU are bottlenecked by the
serial-chunk topology, not by dispatch or driver boundaries. The three viable
structural fixes, roughly in increasing cost:

1. **Re-chunk** so c3/c4 can compute independent sub-streams of a residual
   split (conversion-side, needs a parallel residual path in the model). ~1 week.
2. **Speculative c4 with predicted h3** (research project, ~1 month).
3. **Full MLX-Swift port** (previously rejected in
   `rejected_approaches.md`; unchanged by this result).

Alternatively, concede the ~33 tok/s ceiling and focus remaining effort on
speculative decode (MTP / Union), where the accept-rate headroom is the
governing factor, not the per-step compute budget.

## Related

- PR #77 (`spike/d1b-compute-unit-split`) — feasibility spike, overlap probe.
  Stays in main as a diagnostic via `COMPUTE_UNIT_SPLIT=1`.
- PR #75 (`spike/d1-chunk-pipelining`) — earlier pure-ANE pipelining
  spike, also negative.
- `docs/BASELINE_SPEED_AUDIT.md` — per-chunk ms breakdown motivating
  the split target.
- `docs/PHASE_D_COMPUTE_UNIT_SPLIT_SPIKE.md` (PR #77) — overlap
  methodology + the +30% projection this PR refutes empirically.
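
## Appendix: break-even arithmetic (limitation 1)

A back-of-envelope check on the overlap threshold cited under "Known
limitations", using only the measured numbers in this doc. (The +15% figure
additionally depends on the embed/sampling overhead folded into tok/s, so only
the break-even point is derived here.)

```swift
// Measured: c3 ≈ 7.5 ms on ANE, ≈ 16.4 ms on .cpuAndGPU; serial step ≈ 30.0 ms.
let c3ANE = 7.5, c3GPU = 16.4, serialStep = 30.0            // ms

// With overlap fraction f, only (1 - f) of c3's GPU time stays exposed.
func pipelinedStep(_ f: Double) -> Double {
    (serialStep - c3ANE) + (1 - f) * c3GPU
}

let noOverlap = pipelinedStep(0.0)    // ≈ 38.9 ms, near the measured 39.2 ms
                                      // (the gap is presumably dispatch/join cost)
let breakEven = 1 - c3ANE / c3GPU     // ≈ 0.54, i.e. the "overlap ≥ 0.55" above;
                                      // the current dep graph allows f ≈ 0.00
```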