From fa80b24041ca801982f384bcf1c5cc2d7aa7ef3d Mon Sep 17 00:00:00 2001
From: John Rocky <john-rocky@users.noreply.github.com>
Date: Sat, 18 Apr 2026 16:03:19 +0900
Subject: [PATCH] feat(chat): compute-unit picker + ANE-vs-GPU A/B benchmark

- LLMRunner: expose computeUnits; loadModel accepts override
- verifyANEPlacement now uses the active computeUnits (and mirrors
  GPU_PREFILL env var for prefill chunks) so the audit matches reality
- Add runABBenchmark: sequential reload+bench for each compute unit,
  60s cool-down between sides, restores original CU at end
- ChatView: toolbar picker (ANE / GPU / All) that triggers a reload,
  and "Compare ANE vs GPU" menu producing side-by-side tok/s, drain,
  thermal results with a faster/lower-drain summary
---
 .../CoreMLLLMChat/ChatView.swift              | 148 ++++++++++++++++++
 .../CoreMLLLMChat/LLMRunner.swift             | 127 +++++++++++++--
 2 files changed, 266 insertions(+), 9 deletions(-)

diff --git a/Examples/CoreMLLLMChat/CoreMLLLMChat/ChatView.swift b/Examples/CoreMLLLMChat/CoreMLLLMChat/ChatView.swift
index b053dc4..210f83b 100644
--- a/Examples/CoreMLLLMChat/CoreMLLLMChat/ChatView.swift
+++ b/Examples/CoreMLLLMChat/CoreMLLLMChat/ChatView.swift
@@ -1,9 +1,25 @@
 import SwiftUI
 import PhotosUI
+import CoreML
 import CoreMLLLM
 
+/// Toolbar compute-unit option; the raw value doubles as the menu label.
+enum ComputeChoice: String, CaseIterable, Identifiable {
+    case ane = "ANE", gpu = "GPU", all = "All"
+    var id: String { rawValue }
+    /// Core ML compute units requested when the model is loaded under this choice.
+    var mlValue: MLComputeUnits {
+        switch self {
+        case .ane: return .cpuAndNeuralEngine
+        case .gpu: return .cpuAndGPU
+        case .all: return .all
+        }
+    }
+}
+
 struct ChatView: View {
     @State private var runner = LLMRunner()
+    @State private var computeChoice: ComputeChoice = .ane
     @State private var messages: [ChatMessage] = []
     @State private var inputText = ""
     @State private var showModelPicker = false
@@ -210,8 +226,31 @@ struct ChatView: View {
                             Button("5 min")  { startBenchmark(minutes: 5) }
                             Button("10 min") { startBenchmark(minutes: 10) }
                             Button("30 min") { startBenchmark(minutes: 30) }
+                            Divider()
+                            Section("Compare ANE vs GPU") {
+                                Button("2 min each")  { startABBenchmark(minutesPerSide: 2) }
+                                Button("5 min each")  { startABBenchmark(minutesPerSide: 5) }
+                                Button("10 min each") { startABBenchmark(minutesPerSide: 10) }
+                            }
+                        }
+                        .disabled(runner.isGenerating || benchmarkRunning)
+                    }
+                }
+                if runner.isLoaded {
+                    ToolbarItem(placement: .topBarTrailing) {
+                        Menu {
+                            Picker("Compute", selection: $computeChoice) {
+                                ForEach(ComputeChoice.allCases) { c in
+                                    Text(c.rawValue).tag(c)
+                                }
+                            }
+                        } label: {
+                            Text(computeChoice.rawValue)
                         }
                         .disabled(runner.isGenerating || benchmarkRunning)
+                        .onChange(of: computeChoice) { _, new in
+                            reloadWithCompute(new)
+                        }
                     }
                 }
                 if runner.hasAudio {
@@ -556,6 +595,115 @@ struct ChatView: View {
         }
     }
 
+    /// Reloads the current model folder under `choice`, logging start and outcome to the chat.
+    private func reloadWithCompute(_ choice: ComputeChoice) {
+        guard let folder = runner.modelFolderURL else { return }
+        let units = choice.mlValue
+        let label = LLMRunner.computeUnitsString(units)
+        let modelURL = folder.appendingPathComponent("model.mlpackage")
+        messages.append(ChatMessage(role: .system, content: "Reloading model on \(label)…"))
+        Task.detached(priority: .userInitiated) {
+            let outcome: String
+            do {
+                try await runner.loadModel(from: modelURL, computeUnits: units)
+                outcome = "Loaded on \(label)."
+            } catch {
+                outcome = "Reload failed: \(error.localizedDescription)"
+            }
+            await MainActor.run {
+                messages.append(ChatMessage(role: .system, content: outcome))
+            }
+        }
+    }
+
+    /// Runs the ANE-vs-GPU A/B benchmark (`minutesPerSide` minutes per side) and posts the report to chat.
+    private func startABBenchmark(minutesPerSide: Int) {
+        UIDevice.current.isBatteryMonitoringEnabled = true
+        let pluggedInStates: [UIDevice.BatteryState] = [.charging, .full]
+        if pluggedInStates.contains(UIDevice.current.batteryState) {
+            messages.append(ChatMessage(role: .system, content: "[A/B] Device is charging — unplug for accurate SoC drain measurement."))
+        }
+        benchmarkRunning = true
+        benchmarkStatus = "A/B starting… (\(minutesPerSide) min per side)"
+        messages.append(ChatMessage(role: .system,
+            content: "[A/B] Running ANE then GPU for \(minutesPerSide) min each. Reload between sides takes ~1 min; 60s cool-down in between."))
+        UIApplication.shared.isIdleTimerDisabled = true
+
+        Task {
+            defer { UIApplication.shared.isIdleTimerDisabled = false }
+            let report: LLMRunner.ABBenchmarkResult
+            do {
+                report = try await runner.runABBenchmark(
+                    durationPerSide: TimeInterval(minutesPerSide * 60),
+                    onPhase: { benchmarkStatus = $0 },
+                    onProgress: { progress in
+                        let socNow = progress.batteryNow >= 0 ? Int(progress.batteryNow * 100) : -1
+                        let thermal = LLMRunner.thermalString(progress.thermal) as NSString
+                        benchmarkStatus = String(
+                            format: "[A/B] %ds  %d tok  avg %.1f tok/s  SoC %d%%  %@",
+                            Int(progress.elapsed), progress.totalTokens, progress.avgTokPerSec,
+                            socNow, thermal)
+                    }
+                )
+            } catch {
+                benchmarkRunning = false
+                benchmarkStatus = ""
+                messages.append(ChatMessage(role: .system, content: "[A/B] Failed: \(error.localizedDescription)"))
+                return
+            }
+            benchmarkRunning = false
+            benchmarkStatus = "A/B done. See chat for result."
+
+            // One chat section per side: either its failure text or its metrics.
+            var sections = ["[A/B RESULT] \(minutesPerSide) min per side"]
+            for entry in report.entries {
+                let sideLabel = LLMRunner.computeUnitsString(entry.units)
+                if let failure = entry.error {
+                    sections.append("\(sideLabel): \(failure)")
+                    continue
+                }
+                guard let r = entry.result else { continue }
+                let socStart = r.batteryStart >= 0 ? Int(r.batteryStart * 100) : -1
+                let socEnd = r.batteryEnd >= 0 ? Int(r.batteryEnd * 100) : -1
+                let abortNote = r.abortedThermal ? "  (aborted .serious)" : ""
+                sections.append("""
+                \(sideLabel):
+                  tok/s avg   : \(String(format: "%.2f", r.avgTokPerSec))
+                  tokens      : \(r.totalTokens)  (rounds \(r.rounds))
+                  battery     : \(socStart)% → \(socEnd)%  (Δ \(String(format: "%.2f", r.drainedPercent))%)
+                  drain/min   : \(String(format: "%.3f", r.drainedPerMinute))%/min
+                  tokens/%SoC : \(String(format: "%.0f", r.tokensPerPercent))
+                  thermal     : \(LLMRunner.thermalString(r.thermalStart)) → \(LLMRunner.thermalString(r.thermalEnd))\(abortNote)
+                """)
+            }
+            // Head-to-head summary over the first two sides that produced a result.
+            let finished = report.entries.compactMap { entry in
+                entry.result.map { (units: entry.units, result: $0) }
+            }
+            if finished.count >= 2 {
+                let (a, b) = (finished[0].result, finished[1].result)
+                let labelA = LLMRunner.computeUnitsString(finished[0].units)
+                let labelB = LLMRunner.computeUnitsString(finished[1].units)
+                let fasterLabel = a.avgTokPerSec >= b.avgTokPerSec ? labelA : labelB
+                let bothProducedTokens = a.avgTokPerSec > 0 && b.avgTokPerSec > 0
+                let speedRatio = bothProducedTokens
+                    ? max(a.avgTokPerSec, b.avgTokPerSec) / min(a.avgTokPerSec, b.avgTokPerSec)
+                    : 1.0
+                // A non-positive drain on either side means no winner can be named.
+                let drainMeasured = a.drainedPerMinute > 0 && b.drainedPerMinute > 0
+                let lowerDrainLabel = !drainMeasured
+                    ? "n/a (charging or unmeasured)"
+                    : (a.drainedPerMinute <= b.drainedPerMinute ? labelA : labelB)
+                sections.append("""
+                Summary:
+                  faster       : \(fasterLabel)  (×\(String(format: "%.2f", speedRatio)))
+                  lower drain  : \(lowerDrainLabel)
+                """)
+            }
+            messages.append(ChatMessage(role: .system, content: sections.joined(separator: "\n\n")))
+        }
+    }
+
     private func verifyANE() {
         messages.append(ChatMessage(role: .system, content: "Checking MLComputePlan device placement..."))
         Task.detached(priority: .userInitiated) {
diff --git a/Examples/CoreMLLLMChat/CoreMLLLMChat/LLMRunner.swift b/Examples/CoreMLLLMChat/CoreMLLLMChat/LLMRunner.swift
index e041dc6..e9722c9 100644
--- a/Examples/CoreMLLLMChat/CoreMLLLMChat/LLMRunner.swift
+++ b/Examples/CoreMLLLMChat/CoreMLLLMChat/LLMRunner.swift
@@ -20,6 +20,12 @@ final class LLMRunner {
     var hasAudio = false
     var maxAudioDuration: TimeInterval = 10.0
 
+    /// Active compute unit selection. Applied on the next `loadModel` call —
+    /// changing this on a loaded model does NOT migrate weights between ANE
+    /// and GPU; you must reload. Vision/audio submodels are intentionally
+    /// pinned to `.cpuAndGPU` upstream and ignore this setting.
+    var computeUnits: MLComputeUnits = .cpuAndNeuralEngine
+
     // MTP speculation metrics
     var mtpAcceptanceRate: Double = 0
     var mtpTokensPerRound: Double = 0
@@ -30,11 +36,25 @@ final class LLMRunner {
     var crossVocabTokensPerCycle: Double = 0
 
     private var llm: CoreMLLLM?
-    private var modelFolderURL: URL?
+    /// Folder of the most recently loaded model. Exposed (read-only) so the
+    /// app can re-issue `loadModel` with a different compute unit (A/B
+    /// benchmark, picker change) without re-prompting the user.
+    private(set) var modelFolderURL: URL?
+
+    static func computeUnitsString(_ cu: MLComputeUnits) -> String {
+        switch cu {  // short display label for each compute-unit setting
+        case .cpuAndNeuralEngine: return "CPU+ANE"
+        case .cpuAndGPU: return "CPU+GPU"
+        case .cpuOnly: return "CPU"
+        case .all: return "All (CPU+GPU+ANE)"
+        @unknown default: return "?"  // a value this switch does not list
+        }
+    }
 
     // MARK: - Loading
 
-    func loadModel(from url: URL) async throws {
+    func loadModel(from url: URL, computeUnits: MLComputeUnits? = nil) async throws {
+        if let cu = computeUnits { self.computeUnits = cu }
         let folder = url.deletingLastPathComponent()
 
         // Release the previous model BEFORE allocating the new one — otherwise
@@ -62,11 +82,13 @@ final class LLMRunner {
         modelFolderURL = folder
         loadingStatus = "Loading..."
 
-        llm = try await CoreMLLLM.load(from: folder) { [weak self] status in
+        let cu = self.computeUnits
+        llm = try await CoreMLLLM.load(from: folder, computeUnits: cu) { [weak self] status in
             Task { @MainActor in
                 self?.loadingStatus = status
             }
         }
+        print("[LLMRunner] computeUnits=\(Self.computeUnitsString(cu))")
 
         modelName = llm!.modelName
         hasVision = llm!.supportsVision
@@ -243,6 +265,77 @@ final class LLMRunner {
     }
     #endif
 
+    // MARK: - A/B compute-unit comparison
+
+    struct ABBenchmarkResult {  // value returned by `runABBenchmark`
+        var perSideDuration: TimeInterval  // benchmark duration requested for each side, in seconds
+        var entries: [(units: MLComputeUnits, result: BenchmarkResult?, error: String?)]  // one per unit tried: `result` on success, otherwise `error` text
+    }
+
+    #if os(iOS)
+    /// Reloads the current model under each entry of `units`, in order, and runs
+    /// `runBenchmark` for `durationPerSide` seconds on every side that loads.
+    /// A failing load or benchmark is recorded as that side's `error`; later sides
+    /// still run. Sleeps 60 s between sides and finally restores the compute units
+    /// active on entry (best-effort). Reloads are slow (first-run ANE compile ≈
+    /// 1–2 min), so `onPhase` distinguishes load / run / cool-down. Vision/audio
+    /// submodels are pinned to `.cpuAndGPU` upstream and unaffected by the side.
+    /// - Throws: `NSError` (code 2) if no model was loaded yet (folder unknown).
+    @MainActor
+    func runABBenchmark(
+        durationPerSide: TimeInterval,
+        units: [MLComputeUnits] = [.cpuAndNeuralEngine, .cpuAndGPU],
+        onPhase: @escaping (String) -> Void,
+        onProgress: @escaping (BenchmarkProgress) -> Void
+    ) async throws -> ABBenchmarkResult {
+        guard let folder = modelFolderURL else {
+            throw NSError(domain: "LLMRunner", code: 2,
+                          userInfo: [NSLocalizedDescriptionKey: "Load a model first"])
+        }
+        let modelURL = folder.appendingPathComponent("model.mlpackage")
+        let originalCU = self.computeUnits
+        // Units of the model that is actually resident; nil after a failed load.
+        var residentCU: MLComputeUnits? = originalCU
+
+        var entries: [(units: MLComputeUnits, result: BenchmarkResult?, error: String?)] = []
+        for (index, cu) in units.enumerated() {
+            let label = Self.computeUnitsString(cu)
+            onPhase("[\(label)] reloading model…")
+            do {
+                try await loadModel(from: modelURL, computeUnits: cu)
+                residentCU = cu
+            } catch {
+                residentCU = nil
+                entries.append((cu, nil, "load failed: \(error.localizedDescription)"))
+                continue
+            }
+            onPhase("[\(label)] benchmarking \(Int(durationPerSide))s…")
+            do {
+                let r = try await runBenchmark(duration: durationPerSide,
+                                               onProgress: onProgress)
+                entries.append((cu, r, nil))
+            } catch {
+                entries.append((cu, nil, "bench failed: \(error.localizedDescription)"))
+            }
+            // Cool down so one side's heat doesn't skew the next (60s is a compromise).
+            // "Last side" is decided by position: `units` may repeat a unit (ANE, GPU, ANE).
+            if index < units.count - 1 {
+                onPhase("Cooling down 60s before next side…")
+                try? await Task.sleep(nanoseconds: 60_000_000_000)
+            }
+        }
+
+        // Best-effort restore, keyed on what is actually loaded: a failed final
+        // load still triggers a reload even when that side equals the original.
+        if residentCU != originalCU {
+            onPhase("Restoring original compute units (\(Self.computeUnitsString(originalCU)))…")
+            try? await loadModel(from: modelURL, computeUnits: originalCU)
+        }
+
+        return ABBenchmarkResult(perSideDuration: durationPerSide, entries: entries)
+    }
+    #endif
+
     static func thermalString(_ s: ProcessInfo.ThermalState) -> String {
         switch s {
         case .nominal:  return "nominal"
@@ -253,8 +346,12 @@ final class LLMRunner {
         }
     }
 
-    // MARK: - ANE placement verification
+    // MARK: - Compute placement verification
 
+    /// Reports MLComputePlan placement for the currently-loaded model, using
+    /// the runner's active `computeUnits` for the LLM chunks (so the audit
+    /// matches reality, not a hardcoded ANE config). Vision stays on
+    /// `.cpuAndGPU` because that is what the package always uses for it.
     @available(iOS 17.0, macOS 14.0, *)
     func verifyANEPlacement() async -> String {
         guard let folder = modelFolderURL else {
@@ -262,25 +359,37 @@ final class LLMRunner {
         }
 
         let cfg = MLModelConfiguration()
-        cfg.computeUnits = .cpuAndNeuralEngine
+        cfg.computeUnits = self.computeUnits
         let visionCfg = MLModelConfiguration()
         visionCfg.computeUnits = .cpuAndGPU
 
+        // Prefill chunks may have been forced to GPU via the GPU_PREFILL
+        // env var — see ChunkedEngine.swift. Mirror that here so the audit
+        // matches what was actually loaded.
+        let useGPUPrefill = ProcessInfo.processInfo.environment["GPU_PREFILL"] == "1"
+        let prefillCfg = MLModelConfiguration()
+        prefillCfg.computeUnits = useGPUPrefill ? .cpuAndGPU : self.computeUnits
+
         struct Entry { let label: String; let url: URL; let cfg: MLModelConfiguration }
         var entries: [Entry] = []
-        let names = ["chunk1", "chunk2", "chunk3", "chunk4",
-                     "prefill_chunk1", "prefill_chunk2", "prefill_chunk3", "prefill_chunk4"]
-        for name in names {
+        let decodeNames = ["chunk1", "chunk2", "chunk3", "chunk4"]
+        let prefillNames = ["prefill_chunk1", "prefill_chunk2", "prefill_chunk3", "prefill_chunk4"]
+        for name in decodeNames {
             if let u = findModel(in: folder, name: name) {
                 entries.append(Entry(label: name, url: u, cfg: cfg))
             }
         }
+        for name in prefillNames {
+            if let u = findModel(in: folder, name: name) {
+                entries.append(Entry(label: name, url: u, cfg: prefillCfg))
+            }
+        }
         if let u = findModel(in: folder, name: "vision") {
             entries.append(Entry(label: "vision", url: u, cfg: visionCfg))
         }
         if entries.isEmpty { return "No chunks found." }
 
-        var lines: [String] = ["MLComputePlan placement:"]
+        var lines: [String] = ["MLComputePlan placement (cfg=\(Self.computeUnitsString(self.computeUnits))):"]
         var tAll = 0, aAll = 0, gAll = 0, cAll = 0
         for e in entries {
             do {