Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
148 changes: 148 additions & 0 deletions Examples/CoreMLLLMChat/CoreMLLLMChat/ChatView.swift
Original file line number Diff line number Diff line change
@@ -1,9 +1,25 @@
import SwiftUI
import PhotosUI
import CoreML
import CoreMLLLM

/// User-facing compute-unit selection shown in the model picker.
/// The raw value doubles as the toolbar label.
enum ComputeChoice: String, CaseIterable, Identifiable {
    case ane = "ANE"
    case gpu = "GPU"
    case all = "All"

    /// Stable identity for SwiftUI `ForEach` — the display label itself.
    var id: String { self.rawValue }

    /// The Core ML configuration value this choice maps to. ANE and GPU
    /// both keep the CPU available as a fallback execution unit.
    var mlValue: MLComputeUnits {
        switch self {
        case .ane:
            return .cpuAndNeuralEngine
        case .gpu:
            return .cpuAndGPU
        case .all:
            return .all
        }
    }
}

struct ChatView: View {
@State private var runner = LLMRunner()
@State private var computeChoice: ComputeChoice = .ane
@State private var messages: [ChatMessage] = []
@State private var inputText = ""
@State private var showModelPicker = false
Expand Down Expand Up @@ -210,8 +226,31 @@ struct ChatView: View {
Button("5 min") { startBenchmark(minutes: 5) }
Button("10 min") { startBenchmark(minutes: 10) }
Button("30 min") { startBenchmark(minutes: 30) }
Divider()
Section("Compare ANE vs GPU") {
Button("2 min each") { startABBenchmark(minutesPerSide: 2) }
Button("5 min each") { startABBenchmark(minutesPerSide: 5) }
Button("10 min each") { startABBenchmark(minutesPerSide: 10) }
}
}
.disabled(runner.isGenerating || benchmarkRunning)
}
}
if runner.isLoaded {
ToolbarItem(placement: .topBarTrailing) {
Menu {
Picker("Compute", selection: $computeChoice) {
ForEach(ComputeChoice.allCases) { c in
Text(c.rawValue).tag(c)
}
}
} label: {
Text(computeChoice.rawValue)
}
.disabled(runner.isGenerating || benchmarkRunning)
.onChange(of: computeChoice) { _, new in
reloadWithCompute(new)
}
}
}
if runner.hasAudio {
Expand Down Expand Up @@ -556,6 +595,115 @@ struct ChatView: View {
}
}

/// Tears down and reloads the current model so `choice` takes effect —
/// compute units cannot be changed on an already-loaded model, only at load.
private func reloadWithCompute(_ choice: ComputeChoice) {
// Nothing to reload until a model has been loaded at least once.
guard let modelFolder = runner.modelFolderURL else { return }
let packageURL = modelFolder.appendingPathComponent("model.mlpackage")
// Compute the label once; it is reused in both status messages.
let unitsLabel = LLMRunner.computeUnitsString(choice.mlValue)
messages.append(ChatMessage(role: .system,
content: "Reloading model on \(unitsLabel)…"))
// Load away from the main actor; chat updates hop back via MainActor.run.
Task.detached(priority: .userInitiated) {
do {
try await runner.loadModel(from: packageURL, computeUnits: choice.mlValue)
await MainActor.run {
messages.append(ChatMessage(role: .system,
content: "Loaded on \(unitsLabel)."))
}
} catch {
await MainActor.run {
messages.append(ChatMessage(role: .system,
content: "Reload failed: \(error.localizedDescription)"))
}
}
}
}

/// Kicks off the ANE-vs-GPU comparison: `minutesPerSide` minutes of
/// generation per compute unit via `runner.runABBenchmark`, then posts a
/// formatted multi-section result into the chat as system messages.
private func startABBenchmark(minutesPerSide: Int) {
// Enable monitoring so battery reads are meaningful; unmeasured levels
// surface as negative values and are rendered as -1 below.
UIDevice.current.isBatteryMonitoringEnabled = true
let state = UIDevice.current.batteryState
if state == .charging || state == .full {
// Charging masks drain — warn, but still allow the run (tok/s is valid).
messages.append(ChatMessage(role: .system, content: "[A/B] Device is charging — unplug for accurate SoC drain measurement."))
}
benchmarkRunning = true
benchmarkStatus = "A/B starting… (\(minutesPerSide) min per side)"
messages.append(ChatMessage(role: .system,
content: "[A/B] Running ANE then GPU for \(minutesPerSide) min each. Reload between sides takes ~1 min; 60s cool-down in between."))
// Keep the screen awake for the whole run; restored in the defer below.
UIApplication.shared.isIdleTimerDisabled = true

Task {
defer { UIApplication.shared.isIdleTimerDisabled = false }
do {
let ab = try await runner.runABBenchmark(
durationPerSide: TimeInterval(minutesPerSide * 60),
onPhase: { phase in
benchmarkStatus = phase
},
onProgress: { prog in
// batteryNow < 0 means "unmeasured" (e.g. simulator / monitoring off).
let batNow = prog.batteryNow >= 0 ? Int(prog.batteryNow * 100) : -1
benchmarkStatus = String(
format: "[A/B] %ds %d tok avg %.1f tok/s SoC %d%% %@",
Int(prog.elapsed), prog.totalTokens, prog.avgTokPerSec,
batNow,
LLMRunner.thermalString(prog.thermal) as NSString)
}
)
benchmarkRunning = false
benchmarkStatus = "A/B done. See chat for result."

// Build one chat message: header, one section per side, optional summary.
var out = ["[A/B RESULT] \(minutesPerSide) min per side"]
for e in ab.entries {
let label = LLMRunner.computeUnitsString(e.units)
// A side carries either an error string or a result, never both.
if let err = e.error {
out.append("\(label): \(err)")
continue
}
guard let r = e.result else { continue }
let bs = r.batteryStart >= 0 ? Int(r.batteryStart * 100) : -1
let be = r.batteryEnd >= 0 ? Int(r.batteryEnd * 100) : -1
out.append("""
\(label):
tok/s avg : \(String(format: "%.2f", r.avgTokPerSec))
tokens : \(r.totalTokens) (rounds \(r.rounds))
battery : \(bs)% → \(be)% (Δ \(String(format: "%.2f", r.drainedPercent))%)
drain/min : \(String(format: "%.3f", r.drainedPerMinute))%/min
tokens/%SoC : \(String(format: "%.0f", r.tokensPerPercent))
thermal : \(LLMRunner.thermalString(r.thermalStart)) → \(LLMRunner.thermalString(r.thermalEnd))\(r.abortedThermal ? " (aborted .serious)" : "")
""")
}
// Head-to-head delta on tok/s and drain if both sides finished.
let done = ab.entries.compactMap { e -> (MLComputeUnits, LLMRunner.BenchmarkResult)? in
if let r = e.result { return (e.units, r) } else { return nil }
}
if done.count >= 2 {
let (u0, r0) = done[0]
let (u1, r1) = done[1]
let fasterLabel = r0.avgTokPerSec >= r1.avgTokPerSec
? LLMRunner.computeUnitsString(u0)
: LLMRunner.computeUnitsString(u1)
// Ratio is always ≥ 1 (faster over slower); 1.0 when either side is zero.
let speedRatio = r0.avgTokPerSec > 0 && r1.avgTokPerSec > 0
? max(r0.avgTokPerSec, r1.avgTokPerSec) / min(r0.avgTokPerSec, r1.avgTokPerSec)
: 1.0
let coolerLabel: String
// Non-positive drain means charging or unmeasured — no winner to pick.
if r0.drainedPerMinute > 0 && r1.drainedPerMinute > 0 {
coolerLabel = r0.drainedPerMinute <= r1.drainedPerMinute
? LLMRunner.computeUnitsString(u0)
: LLMRunner.computeUnitsString(u1)
} else {
coolerLabel = "n/a (charging or unmeasured)"
}
out.append("""
Summary:
faster : \(fasterLabel) (×\(String(format: "%.2f", speedRatio)))
lower drain : \(coolerLabel)
""")
}
messages.append(ChatMessage(role: .system, content: out.joined(separator: "\n\n")))
} catch {
// Reset UI state on failure; the defer above re-enables the idle timer.
benchmarkRunning = false
benchmarkStatus = ""
messages.append(ChatMessage(role: .system, content: "[A/B] Failed: \(error.localizedDescription)"))
}
}
}

private func verifyANE() {
messages.append(ChatMessage(role: .system, content: "Checking MLComputePlan device placement..."))
Task.detached(priority: .userInitiated) {
Expand Down
127 changes: 118 additions & 9 deletions Examples/CoreMLLLMChat/CoreMLLLMChat/LLMRunner.swift
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,12 @@ final class LLMRunner {
var hasAudio = false
var maxAudioDuration: TimeInterval = 10.0

/// Active compute unit selection. Applied on the next `loadModel` call —
/// changing this on a loaded model does NOT migrate weights between ANE
/// and GPU; you must reload. Vision/audio submodels are intentionally
/// pinned to `.cpuAndGPU` upstream and ignore this setting.
var computeUnits: MLComputeUnits = .cpuAndNeuralEngine

// MTP speculation metrics
var mtpAcceptanceRate: Double = 0
var mtpTokensPerRound: Double = 0
Expand All @@ -30,11 +36,25 @@ final class LLMRunner {
var crossVocabTokensPerCycle: Double = 0

private var llm: CoreMLLLM?
private var modelFolderURL: URL?
/// Folder of the most recently loaded model. Exposed (read-only) so the
/// app can re-issue `loadModel` with a different compute unit (A/B
/// benchmark, picker change) without re-prompting the user.
private(set) var modelFolderURL: URL?

/// Short human-readable label for a Core ML compute-unit setting, used in
/// chat messages, toolbar labels and benchmark reports.
static func computeUnitsString(_ cu: MLComputeUnits) -> String {
    let label: String
    switch cu {
    case .cpuOnly:
        label = "CPU"
    case .cpuAndNeuralEngine:
        label = "CPU+ANE"
    case .cpuAndGPU:
        label = "CPU+GPU"
    case .all:
        label = "All (CPU+GPU+ANE)"
    @unknown default:
        // Future compute-unit cases render as a placeholder.
        label = "?"
    }
    return label
}

// MARK: - Loading

func loadModel(from url: URL) async throws {
func loadModel(from url: URL, computeUnits: MLComputeUnits? = nil) async throws {
if let cu = computeUnits { self.computeUnits = cu }
let folder = url.deletingLastPathComponent()

// Release the previous model BEFORE allocating the new one — otherwise
Expand Down Expand Up @@ -62,11 +82,13 @@ final class LLMRunner {
modelFolderURL = folder
loadingStatus = "Loading..."

llm = try await CoreMLLLM.load(from: folder) { [weak self] status in
let cu = self.computeUnits
llm = try await CoreMLLLM.load(from: folder, computeUnits: cu) { [weak self] status in
Task { @MainActor in
self?.loadingStatus = status
}
}
print("[LLMRunner] computeUnits=\(Self.computeUnitsString(cu))")

modelName = llm!.modelName
hasVision = llm!.supportsVision
Expand Down Expand Up @@ -243,6 +265,77 @@ final class LLMRunner {
}
#endif

// MARK: - A/B compute-unit comparison

/// Outcome of one A/B compute-unit comparison run.
struct ABBenchmarkResult {
// Wall-clock benchmark time requested for each side, in seconds.
var perSideDuration: TimeInterval
// One entry per requested compute unit, in run order. Exactly one of
// `result` / `error` is non-nil: `result` on success, `error` with a
// short human-readable message when the load or the benchmark failed.
var entries: [(units: MLComputeUnits, result: BenchmarkResult?, error: String?)]
}

/// Reload the current model under each requested compute unit and run
/// `runBenchmark` on each. The same prompt, same duration, sequential.
/// Restores the original compute unit at the end (best-effort).
///
/// Caveats baked in:
/// - Reloading is slow (first-run ANE compile ≈ 1–2 min). The phase
///   callback distinguishes load vs run.
/// - Vision/audio submodels are pinned to `.cpuAndGPU` upstream and are
///   not affected by the side under test.
/// - The model must already have been loaded once so we know its folder.
///
/// - Parameters:
///   - durationPerSide: Benchmark wall-clock time per compute unit, seconds.
///   - units: Compute units to test, in order. Defaults to ANE then GPU.
///   - onPhase: Coarse status text (reloading / benchmarking / cooling).
///   - onProgress: Per-tick progress, forwarded from `runBenchmark`.
/// - Returns: One entry per requested unit, in order, each holding either a
///   `BenchmarkResult` or an error string (never both).
/// - Throws: Only when no model has been loaded yet; per-side failures are
///   captured in the returned entries instead of being thrown.
#if os(iOS)
@MainActor
func runABBenchmark(
    durationPerSide: TimeInterval,
    units: [MLComputeUnits] = [.cpuAndNeuralEngine, .cpuAndGPU],
    onPhase: @escaping (String) -> Void,
    onProgress: @escaping (BenchmarkProgress) -> Void
) async throws -> ABBenchmarkResult {
    guard let folder = modelFolderURL else {
        throw NSError(domain: "LLMRunner", code: 2,
                      userInfo: [NSLocalizedDescriptionKey: "Load a model first"])
    }
    let modelURL = folder.appendingPathComponent("model.mlpackage")
    let originalCU = self.computeUnits

    var entries: [(MLComputeUnits, BenchmarkResult?, String?)] = []
    for (index, cu) in units.enumerated() {
        // Position-based last-side check: comparing `cu != units.last`
        // misfires when the same compute unit appears more than once in
        // `units` (every matching occurrence would skip the cool-down).
        let isLastSide = index == units.count - 1
        let label = Self.computeUnitsString(cu)
        onPhase("[\(label)] reloading model…")
        do {
            try await loadModel(from: modelURL, computeUnits: cu)
        } catch {
            entries.append((cu, nil, "load failed: \(error.localizedDescription)"))
            continue
        }
        onPhase("[\(label)] benchmarking \(Int(durationPerSide))s…")
        do {
            let r = try await runBenchmark(duration: durationPerSide,
                                           onProgress: onProgress)
            entries.append((cu, r, nil))
        } catch {
            entries.append((cu, nil, "bench failed: \(error.localizedDescription)"))
        }
        // Cool-down between sides so thermal state from side A doesn't
        // bleed into side B's measurements. 60s is a compromise — not
        // enough to hit nominal from .serious, but enough to drop
        // surface temperature noticeably on most devices.
        if !isLastSide {
            onPhase("Cooling down 60s before next side…")
            try? await Task.sleep(nanoseconds: 60_000_000_000)
        }
    }

    // Restore original compute units (best-effort — caller may want a
    // specific side left loaded; they can reload after if needed). Skip
    // the slow reload when the last tested side already matches, and when
    // `units` was empty (the model is still loaded with `originalCU`).
    if let lastUnit = units.last, lastUnit != originalCU {
        onPhase("Restoring original compute units (\(Self.computeUnitsString(originalCU)))…")
        try? await loadModel(from: modelURL, computeUnits: originalCU)
    }

    return ABBenchmarkResult(perSideDuration: durationPerSide, entries: entries)
}
#endif

static func thermalString(_ s: ProcessInfo.ThermalState) -> String {
switch s {
case .nominal: return "nominal"
Expand All @@ -253,34 +346,50 @@ final class LLMRunner {
}
}

// MARK: - ANE placement verification
// MARK: - Compute placement verification

/// Reports MLComputePlan placement for the currently-loaded model, using
/// the runner's active `computeUnits` for the LLM chunks (so the audit
/// matches reality, not a hardcoded ANE config). Vision stays on
/// `.cpuAndGPU` because that is what the package always uses for it.
@available(iOS 17.0, macOS 14.0, *)
func verifyANEPlacement() async -> String {
guard let folder = modelFolderURL else {
return "No model folder (load a model first)."
}

let cfg = MLModelConfiguration()
cfg.computeUnits = .cpuAndNeuralEngine
cfg.computeUnits = self.computeUnits
let visionCfg = MLModelConfiguration()
visionCfg.computeUnits = .cpuAndGPU

// Prefill chunks may have been forced to GPU via the GPU_PREFILL
// env var — see ChunkedEngine.swift. Mirror that here so the audit
// matches what was actually loaded.
let useGPUPrefill = ProcessInfo.processInfo.environment["GPU_PREFILL"] == "1"
let prefillCfg = MLModelConfiguration()
prefillCfg.computeUnits = useGPUPrefill ? .cpuAndGPU : self.computeUnits

struct Entry { let label: String; let url: URL; let cfg: MLModelConfiguration }
var entries: [Entry] = []
let names = ["chunk1", "chunk2", "chunk3", "chunk4",
"prefill_chunk1", "prefill_chunk2", "prefill_chunk3", "prefill_chunk4"]
for name in names {
let decodeNames = ["chunk1", "chunk2", "chunk3", "chunk4"]
let prefillNames = ["prefill_chunk1", "prefill_chunk2", "prefill_chunk3", "prefill_chunk4"]
for name in decodeNames {
if let u = findModel(in: folder, name: name) {
entries.append(Entry(label: name, url: u, cfg: cfg))
}
}
for name in prefillNames {
if let u = findModel(in: folder, name: name) {
entries.append(Entry(label: name, url: u, cfg: prefillCfg))
}
}
if let u = findModel(in: folder, name: "vision") {
entries.append(Entry(label: "vision", url: u, cfg: visionCfg))
}
if entries.isEmpty { return "No chunks found." }

var lines: [String] = ["MLComputePlan placement:"]
var lines: [String] = ["MLComputePlan placement (cfg=\(Self.computeUnitsString(self.computeUnits))):"]
var tAll = 0, aAll = 0, gAll = 0, cAll = 0
for e in entries {
do {
Expand Down