/// Compute-unit options offered by the chat toolbar picker.
///
/// The raw value is the short label shown in the UI; `mlValue` is the
/// Core ML setting passed to `LLMRunner.loadModel(from:computeUnits:)`.
enum ComputeChoice: String, CaseIterable, Identifiable {
    case ane = "ANE"
    case gpu = "GPU"
    case all = "All"

    /// The raw label doubles as the identity, so every case is its own ID.
    var id: String { rawValue }

    /// Core ML compute units that correspond to this choice.
    var mlValue: MLComputeUnits {
        switch self {
        case .ane: .cpuAndNeuralEngine
        case .gpu: .cpuAndGPU
        case .all: .all
        }
    }
}
/// Reloads the current model on the compute units picked in the toolbar.
///
/// Returns immediately when the runner has no model folder recorded. The
/// start, success, and failure of the reload are each reported as a system
/// message in the chat.
///
/// - Parameter choice: Picker option whose `mlValue` is passed to `loadModel`.
private func reloadWithCompute(_ choice: ComputeChoice) {
    guard let folder = runner.modelFolderURL else { return }
    let modelURL = folder.appendingPathComponent("model.mlpackage")
    let unitsLabel = LLMRunner.computeUnitsString(choice.mlValue)
    messages.append(ChatMessage(role: .system,
                                content: "Reloading model on \(unitsLabel)…"))
    Task.detached(priority: .userInitiated) {
        do {
            try await runner.loadModel(from: modelURL, computeUnits: choice.mlValue)
            await MainActor.run {
                messages.append(ChatMessage(role: .system,
                                            content: "Loaded on \(unitsLabel)."))
            }
        } catch {
            await MainActor.run {
                messages.append(ChatMessage(role: .system,
                                            content: "Reload failed: \(error.localizedDescription)"))
            }
        }
    }
}

/// Runs the ANE-vs-GPU comparison and posts a side-by-side report to the chat.
///
/// Enables battery monitoring, warns when the device is charging or full,
/// keeps the screen awake for the duration of the run, and mirrors phase and
/// progress updates into `benchmarkStatus`. When at least two sides finish,
/// a head-to-head summary (faster side, lower-drain side) is appended.
///
/// - Parameter minutesPerSide: Benchmark duration for each compute unit, in minutes.
private func startABBenchmark(minutesPerSide: Int) {
    UIDevice.current.isBatteryMonitoringEnabled = true
    let state = UIDevice.current.batteryState
    if state == .charging || state == .full {
        messages.append(ChatMessage(role: .system, content: "[A/B] Device is charging — unplug for accurate SoC drain measurement."))
    }
    benchmarkRunning = true
    benchmarkStatus = "A/B starting… (\(minutesPerSide) min per side)"
    messages.append(ChatMessage(role: .system,
        content: "[A/B] Running ANE then GPU for \(minutesPerSide) min each. Reload between sides takes ~1 min; 60s cool-down in between."))
    UIApplication.shared.isIdleTimerDisabled = true

    Task {
        // Re-enable auto-lock on every exit path, including failures.
        defer { UIApplication.shared.isIdleTimerDisabled = false }
        do {
            let ab = try await runner.runABBenchmark(
                durationPerSide: TimeInterval(minutesPerSide * 60),
                onPhase: { phase in
                    benchmarkStatus = phase
                },
                onProgress: { prog in
                    // A negative battery level means "unknown"; it is shown as -1.
                    let batNow = prog.batteryNow >= 0 ? Int(prog.batteryNow * 100) : -1
                    benchmarkStatus = String(
                        format: "[A/B] %ds %d tok avg %.1f tok/s SoC %d%% %@",
                        Int(prog.elapsed), prog.totalTokens, prog.avgTokPerSec,
                        batNow,
                        LLMRunner.thermalString(prog.thermal) as NSString)
                }
            )
            benchmarkRunning = false
            benchmarkStatus = "A/B done. See chat for result."

            var out = ["[A/B RESULT] \(minutesPerSide) min per side"]
            for e in ab.entries {
                let label = LLMRunner.computeUnitsString(e.units)
                if let err = e.error {
                    out.append("\(label): \(err)")
                    continue
                }
                guard let r = e.result else { continue }
                let bs = r.batteryStart >= 0 ? Int(r.batteryStart * 100) : -1
                let be = r.batteryEnd >= 0 ? Int(r.batteryEnd * 100) : -1
                out.append("""
                \(label):
                tok/s avg : \(String(format: "%.2f", r.avgTokPerSec))
                tokens : \(r.totalTokens) (rounds \(r.rounds))
                battery : \(bs)% → \(be)% (Δ \(String(format: "%.2f", r.drainedPercent))%)
                drain/min : \(String(format: "%.3f", r.drainedPerMinute))%/min
                tokens/%SoC : \(String(format: "%.0f", r.tokensPerPercent))
                thermal : \(LLMRunner.thermalString(r.thermalStart)) → \(LLMRunner.thermalString(r.thermalEnd))\(r.abortedThermal ? " (aborted .serious)" : "")
                """)
            }

            // Head-to-head delta on tok/s and drain if both sides finished.
            let done = ab.entries.compactMap { e -> (MLComputeUnits, LLMRunner.BenchmarkResult)? in
                if let r = e.result { return (e.units, r) } else { return nil }
            }
            if done.count >= 2 {
                let (u0, r0) = done[0]
                let (u1, r1) = done[1]
                let name0 = LLMRunner.computeUnitsString(u0)
                let name1 = LLMRunner.computeUnitsString(u1)

                // Only quote a speed ratio when both sides produced tokens.
                // A zero-throughput side used to yield a made-up "×1.00", and
                // two zero sides still named the first one as "faster".
                let fasterLine: String
                switch (r0.avgTokPerSec > 0, r1.avgTokPerSec > 0) {
                case (true, true):
                    let winner = r0.avgTokPerSec >= r1.avgTokPerSec ? name0 : name1
                    let speedRatio = max(r0.avgTokPerSec, r1.avgTokPerSec)
                        / min(r0.avgTokPerSec, r1.avgTokPerSec)
                    fasterLine = "\(winner) (×\(String(format: "%.2f", speedRatio)))"
                case (true, false):
                    fasterLine = "\(name0) (other side produced no tokens)"
                case (false, true):
                    fasterLine = "\(name1) (other side produced no tokens)"
                case (false, false):
                    fasterLine = "n/a (no tokens generated)"
                }

                // Drain is only comparable when both sides measured a positive drop.
                let coolerLabel: String
                if r0.drainedPerMinute > 0 && r1.drainedPerMinute > 0 {
                    coolerLabel = r0.drainedPerMinute <= r1.drainedPerMinute ? name0 : name1
                } else {
                    coolerLabel = "n/a (charging or unmeasured)"
                }
                out.append("""
                Summary:
                faster : \(fasterLine)
                lower drain : \(coolerLabel)
                """)
            }
            messages.append(ChatMessage(role: .system, content: out.joined(separator: "\n\n")))
        } catch {
            benchmarkRunning = false
            benchmarkStatus = ""
            messages.append(ChatMessage(role: .system, content: "[A/B] Failed: \(error.localizedDescription)"))
        }
    }
}
/// Short human-readable label for a Core ML compute-unit setting.
static func computeUnitsString(_ cu: MLComputeUnits) -> String {
    switch cu {
    case .cpuOnly: return "CPU"
    case .cpuAndGPU: return "CPU+GPU"
    case .all: return "All (CPU+GPU+ANE)"
    case .cpuAndNeuralEngine: return "CPU+ANE"
    @unknown default: return "?"
    }
}

// MARK: - A/B compute-unit comparison

/// Outcome of `runABBenchmark`.
struct ABBenchmarkResult {
    /// Benchmark duration that was requested for every side.
    var perSideDuration: TimeInterval
    /// One entry per attempted compute unit, in run order. `result` is set
    /// when the side finished; otherwise `error` says whether the load or
    /// the benchmark failed.
    var entries: [(units: MLComputeUnits, result: BenchmarkResult?, error: String?)]
}

/// Reload the current model under each requested compute unit and run
/// `runBenchmark` on each. The same prompt, same duration, sequential.
/// Restores the original compute unit at the end (best-effort).
///
/// Caveats baked in:
/// - Reloading is slow (first-run ANE compile ≈ 1–2 min). The phase
///   callback distinguishes load vs run.
/// - Vision/audio submodels are pinned to `.cpuAndGPU` upstream and are
///   not affected by the side under test.
/// - The model must already have been loaded once so we know its folder.
///
/// - Parameters:
///   - durationPerSide: Benchmark duration for each compute unit, in seconds.
///   - units: Compute units to test, in order. The same unit may appear
///     more than once (e.g. A/B/A).
///   - onPhase: Receives a status line whenever the run changes phase
///     (reload, benchmark, cool-down, restore).
///   - onProgress: Forwarded unchanged to `runBenchmark`.
/// - Returns: Per-side results or errors. A load or benchmark failure on one
///   side is recorded in its entry and does not abort the remaining sides.
/// - Throws: An `NSError` (domain `LLMRunner`, code 2) when no model has
///   been loaded yet.
#if os(iOS)
@MainActor
func runABBenchmark(
    durationPerSide: TimeInterval,
    units: [MLComputeUnits] = [.cpuAndNeuralEngine, .cpuAndGPU],
    onPhase: @escaping (String) -> Void,
    onProgress: @escaping (BenchmarkProgress) -> Void
) async throws -> ABBenchmarkResult {
    guard let folder = modelFolderURL else {
        throw NSError(domain: "LLMRunner", code: 2,
                      userInfo: [NSLocalizedDescriptionKey: "Load a model first"])
    }
    let modelURL = folder.appendingPathComponent("model.mlpackage")
    let originalCU = computeUnits

    // Compute unit of the model that is actually loaded right now, or nil
    // after a failed load. NOTE(review): assumes a failed `loadModel` leaves
    // the runner without a usable model (it releases the previous model
    // before loading) — confirm against `loadModel`.
    var loadedCU: MLComputeUnits? = originalCU

    var entries: [(units: MLComputeUnits, result: BenchmarkResult?, error: String?)] = []
    for (index, cu) in units.enumerated() {
        let label = Self.computeUnitsString(cu)
        onPhase("[\(label)] reloading model…")
        do {
            try await loadModel(from: modelURL, computeUnits: cu)
            loadedCU = cu
        } catch {
            loadedCU = nil
            entries.append((cu, nil, "load failed: \(error.localizedDescription)"))
            continue
        }
        onPhase("[\(label)] benchmarking \(Int(durationPerSide))s…")
        do {
            let r = try await runBenchmark(duration: durationPerSide,
                                           onProgress: onProgress)
            entries.append((cu, r, nil))
        } catch {
            entries.append((cu, nil, "bench failed: \(error.localizedDescription)"))
        }
        // Cool-down between sides so thermal state from side A doesn't
        // bleed into side B's measurements. 60s is a compromise — not
        // enough to hit nominal from .serious, but enough to drop
        // surface temperature noticeably on most devices.
        // Decided by position, not value, so a unit that appears more than
        // once in `units` still gets its cool-down.
        if index < units.count - 1 {
            onPhase("Cooling down 60s before next side…")
            do {
                try await Task.sleep(nanoseconds: 60_000_000_000)
            } catch {
                // The sleep only throws when the task is cancelled. Stop here
                // rather than start the next side with no cool-down.
                break
            }
        }
    }

    // Restore original compute units (best-effort — caller may want a
    // specific side left loaded; they can reload after if needed). Also
    // retried when the last load failed, so the runner is not left without
    // a model just because the failing side matched the original unit.
    if loadedCU != originalCU {
        onPhase("Restoring original compute units (\(Self.computeUnitsString(originalCU)))…")
        do {
            try await loadModel(from: modelURL, computeUnits: originalCU)
        } catch {
            // Still best-effort, but no longer silent.
            onPhase("Restoring original compute units failed: \(error.localizedDescription)")
        }
    }

    return ABBenchmarkResult(perSideDuration: durationPerSide, entries: entries)
}
#endif
.cpuAndGPU : self.computeUnits + struct Entry { let label: String; let url: URL; let cfg: MLModelConfiguration } var entries: [Entry] = [] - let names = ["chunk1", "chunk2", "chunk3", "chunk4", - "prefill_chunk1", "prefill_chunk2", "prefill_chunk3", "prefill_chunk4"] - for name in names { + let decodeNames = ["chunk1", "chunk2", "chunk3", "chunk4"] + let prefillNames = ["prefill_chunk1", "prefill_chunk2", "prefill_chunk3", "prefill_chunk4"] + for name in decodeNames { if let u = findModel(in: folder, name: name) { entries.append(Entry(label: name, url: u, cfg: cfg)) } } + for name in prefillNames { + if let u = findModel(in: folder, name: name) { + entries.append(Entry(label: name, url: u, cfg: prefillCfg)) + } + } if let u = findModel(in: folder, name: "vision") { entries.append(Entry(label: "vision", url: u, cfg: visionCfg)) } if entries.isEmpty { return "No chunks found." } - var lines: [String] = ["MLComputePlan placement:"] + var lines: [String] = ["MLComputePlan placement (cfg=\(Self.computeUnitsString(self.computeUnits))):"] var tAll = 0, aAll = 0, gAll = 0, cAll = 0 for e in entries { do {