diff --git a/.claude/skills/release/SKILL.md b/.claude/skills/release/SKILL.md
new file mode 100644
index 00000000..d608ae46
--- /dev/null
+++ b/.claude/skills/release/SKILL.md
@@ -0,0 +1,106 @@
+---
+name: release
+description: Release flutter_gemma — rebuild JAR, update all version numbers, checksums, CHANGELOG, upload to GitHub release
+user_invocable: true
+---
+
+# Flutter Gemma Release
+
+Complete release checklist for flutter_gemma plugin. Run as `/release <VERSION>` (e.g. `/release 0.14.0`).
+
+## Pre-flight
+
+Before starting, verify you're on the correct branch and all changes are committed:
+```bash
+git status
+git log --oneline -5
+```
+
+## Step 1: Update version numbers
+
+All files that contain the version:
+
+| File | Variable/Field | Example |
+|------|---------------|---------|
+| `pubspec.yaml` | `version:` | `version: <VERSION>` |
+| `ios/flutter_gemma.podspec` | `s.version` | `s.version = '<VERSION>'` |
+| `litertlm-server/build.gradle.kts` | `version =` | `version = "<VERSION>"` |
+| `CLAUDE.md` | `Current Version:` | `- **Current Version**: <VERSION>` |
+| `macos/scripts/setup_desktop.sh:61` | `JAR_VERSION=` | `JAR_VERSION="<VERSION>"` |
+| `macos/scripts/prepare_resources.sh:42` | `JAR_VERSION=` | `JAR_VERSION="<VERSION>"` |
+| `linux/scripts/setup_desktop.sh:62` | `JAR_VERSION=` | `JAR_VERSION="<VERSION>"` |
+| `windows/scripts/setup_desktop.ps1:90` | `$JarVersion =` | `$JarVersion = "<VERSION>"` |
+
+> JAR_URL is auto-derived from JAR_VERSION in all scripts — no separate update needed.
+
+## Step 2: Update CHANGELOG.md
+
+Add new section at top with all changes. Categories: features, fixes, breaking changes.
+
+## Step 3: Build JAR
+
+```bash
+cd litertlm-server && ./gradlew fatJar
+```
+
+Verify build success.
JAR output: `litertlm-server/build/libs/litertlm-server-<VERSION>-all.jar`
+
+## Step 4: Compute new SHA256
+
+```bash
+shasum -a 256 litertlm-server/build/libs/litertlm-server-*-all.jar
+```
+
+## Step 5: Update JAR checksums in all 4 scripts
+
+| File | Variable |
+|------|----------|
+| `macos/scripts/setup_desktop.sh:63` | `JAR_CHECKSUM="<SHA256>"` |
+| `macos/scripts/prepare_resources.sh:44` | `JAR_CHECKSUM="<SHA256>"` |
+| `linux/scripts/setup_desktop.sh:64` | `JAR_CHECKSUM="<SHA256>"` |
+| `windows/scripts/setup_desktop.ps1:92` | `$JarChecksum = "<SHA256>"` |
+
+JAR is cross-platform (JVM bytecode) — same checksum for all platforms.
+
+## Step 6: Verify
+
+```bash
+flutter analyze # 0 errors
+flutter test # all pass
+dart pub publish --dry-run # 0 warnings
+```
+
+**NEVER publish without dry-run first.** Publishing is IRREVERSIBLE.
+
+## Step 7: Create/update GitHub release
+
+```bash
+# Create new release
+gh release create v<VERSION> \
+  litertlm-server/build/libs/litertlm-server-<VERSION>-all.jar \
+  --title "v<VERSION>" \
+  --notes-file CHANGELOG_EXCERPT.md
+
+# OR update existing release (delete old JAR first)
+gh release delete-asset v<VERSION> litertlm-server.jar --yes 2>/dev/null
+gh release upload v<VERSION> litertlm-server/build/libs/litertlm-server-<VERSION>-all.jar
+```
+
+Verify JAR URL returns 200:
+```bash
+curl -sI "https://github.com/DenisovAV/flutter_gemma/releases/download/v<VERSION>/litertlm-server.jar" | head -1
+```
+
+## Step 8: Commit & PR
+
+- Author: `--author="Sasha Denisov <EMAIL>"` (email was stripped from this document — restore the real address)
+- No AI attribution in commit messages
+- No "Co-Authored-By" or "Generated with Claude" footers
+- Create PR via `gh pr create`
+
+## Step 9: After merge — publish
+
+```bash
+dart pub publish --dry-run # verify one more time
+dart pub publish # only after user approval!
+``` diff --git a/CHANGELOG.md b/CHANGELOG.md index aa0f7f75..c023af2d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,9 @@ +## 0.13.1 +- **LiteRT-LM 0.10.0**: Updated Android and JVM SDK from 0.9.0 to 0.10.0 +- **Gemma 4 Thinking Mode**: `isThinking: true` now works with Gemma 4 E2B/E4B models (Android, iOS, Desktop; not Web) +- **Fix cancel download**: Cancel download now works correctly (#196) +- **Fix `large_file_handler` platform support**: Conditional imports for pub.dev platform analysis compatibility + ## 0.13.0 - **Gemma 4 E2B/E4B**: Added support for next-gen multimodal models (text + image + audio) - **systemInstruction**: New parameter in `createChat()` and `createSession()` for setting system-level context diff --git a/CLAUDE.md b/CLAUDE.md index fce52938..9d74e576 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -69,7 +69,7 @@ final token = const String.fromEnvironment('HF_TOKEN'); - ๐Ÿ”ฅ **Local AI Inference** - Run Gemma models directly on device - ๐Ÿ–ผ๏ธ **Multimodal Support** - Text + Image input with Gemma 3 Nano - ๐Ÿ› ๏ธ **Function Calling** - Enable models to call external functions -- ๐Ÿง  **Thinking Mode** - View reasoning process of DeepSeek models +- ๐Ÿง  **Thinking Mode** - View reasoning process of DeepSeek and Gemma 4 models - ๐Ÿ“ฑ **Cross-Platform** - Android, iOS, Web, macOS, Windows, Linux - โšก **GPU Acceleration** - Hardware-accelerated inference - ๐Ÿ”ง **LoRA Support** - Efficient fine-tuning weights @@ -401,6 +401,8 @@ Future close() async { | Model Family | Function Calling | Thinking Mode | Multimodal | Platform Support | |--------------|------------------|---------------|------------|------------------| +| Gemma 4 E2B | โœ… | โœ… ยน | โœ… | Android, iOS, Web, Desktop | +| Gemma 4 E4B | โœ… | โœ… ยน | โœ… | Android, iOS, Web, Desktop | | Gemma 3 Nano | โœ… | โŒ | โœ… | Android, iOS, Web | | Gemma 3 270M | โŒ | โŒ | โŒ | Android, iOS, Web | | Gemma-3 1B | โœ… | โŒ | โŒ | Android, iOS, Web | @@ -411,6 +413,8 @@ Future 
close() async { | Qwen2.5 | โœ… | โŒ | โŒ | Android, iOS, Web | | Phi-4 | โŒ | โŒ | โŒ | Android, iOS, Web | +> ยน Thinking Mode for Gemma 4: Android, iOS, Desktop only. Web (MediaPipe) does not support `extraContext`. + ### Platform Limitations | Platform | Vision/Multimodal | Audio | Embeddings | Notes | @@ -457,10 +461,10 @@ dev_dependencies: ### MediaPipe GenAI Integration -- **Current Version Web**: v0.10.26 +- **Current Version Web**: v0.10.27 - **Current Version Android**: v0.10.33 - **Current Version iOS**: v0.10.33 -- **Web CDN**: `https://cdn.jsdelivr.net/npm/@mediapipe/tasks-genai@0.10.26` +- **Web CDN**: `https://cdn.jsdelivr.net/npm/@mediapipe/tasks-genai@0.10.27` - **iOS/Android**: Integrated via CocoaPods/Gradle ## Development Best Practices @@ -623,7 +627,7 @@ Log.w(TAG, "sizeInTokens: LiteRT-LM does not support token counting. " + **Dependency (build.gradle):** ```gradle -implementation 'com.google.ai.edge.litertlm:litertlm-android:0.9.0-beta' +implementation 'com.google.ai.edge.litertlm:litertlm-android:0.10.0' ``` **Usage (Dart - no changes required):** @@ -642,7 +646,7 @@ await FlutterGemma.installModel(modelType: ModelType.gemmaIt) ```html @@ -1243,7 +1247,7 @@ flutter_gemma/ - **GitHub**: https://github.com/DenisovAV/flutter_gemma - **Pub.dev**: https://pub.dev/packages/flutter_gemma -- **Current Version**: 0.13.0 +- **Current Version**: 0.13.1 - **License**: Check repository for license details - **Issues**: Report bugs via GitHub Issues - **Changelog**: See `CHANGELOG.md` for version history \ No newline at end of file diff --git a/README.md b/README.md index f9167ce5..553a8901 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ **The plugin supports not only Gemma, but also other models. 
Here's the full list of supported models:** [Gemma 4 E2B/E4B](https://huggingface.co/google/gemma-4-E2B-it-litert-lm), [Gemma3n E2B/E4B](https://huggingface.co/google/gemma-3n-E2B-it-litert-preview), [FastVLM 0.5B](https://huggingface.co/litert-community/FastVLM-0.5B), [Gemma-3 1B](https://huggingface.co/litert-community/Gemma3-1B-IT), [Gemma 3 270M](https://huggingface.co/litert-community/gemma-3-270m-it), [FunctionGemma 270M](https://huggingface.co/sasha-denisov/function-gemma-270M-it), [Qwen3 0.6B](https://huggingface.co/litert-community/Qwen3-0.6B), [Qwen 2.5](https://huggingface.co/litert-community/Qwen2.5-1.5B-Instruct), [Phi-4 Mini](https://huggingface.co/litert-community/Phi-4-mini-instruct), [DeepSeek R1](https://huggingface.co/litert-community/DeepSeek-R1-Distill-Qwen-1.5B), [SmolLM 135M](https://huggingface.co/litert-community/SmolLM-135M-Instruct). -*Note: The flutter_gemma plugin supports Gemma3n (with **multimodal vision and audio support**), FastVLM (vision), Gemma-3, FunctionGemma, Qwen3, Qwen 2.5, Phi-4, DeepSeek R1 and SmolLM. Desktop platforms (macOS, Windows, Linux) require `.litertlm` model format. +*Note: The flutter_gemma plugin supports Gemma 4 and Gemma3n (with **multimodal vision and audio support**), FastVLM (vision), Gemma-3, FunctionGemma, Qwen3, Qwen 2.5, Phi-4, DeepSeek R1 and SmolLM. Desktop platforms (macOS, Windows, Linux) require `.litertlm` model format. 
[Gemma](https://ai.google.dev/gemma) is a family of lightweight, state-of-the art open models built from the same research and technology used to create the Gemini models @@ -32,7 +32,7 @@ There is an example of using: - **๐Ÿ–ผ๏ธ Multimodal Support:** Text + Image input with Gemma3n vision models - **๐ŸŽ™๏ธ Audio Input:** Record and send audio messages with Gemma3n E2B/E4B models (Android, Desktop - LiteRT-LM engine) - **๐Ÿ› ๏ธ Function Calling:** Enable your models to call external functions and integrate with other services (supported by select models) -- **๐Ÿง  Thinking Mode:** View the reasoning process of DeepSeek models with blocks +- **๐Ÿง  Thinking Mode:** View the reasoning process of DeepSeek and Gemma 4 models with thinking blocks - **๐Ÿ›‘ Stop Generation:** Cancel text generation mid-process on Android, Web, and Desktop - **โš™๏ธ Backend Switching:** Choose between CPU and GPU backends for each model individually in the example app - **๐Ÿ” Advanced Model Filtering:** Filter models by features (Multimodal, Function Calls, Thinking) with expandable UI @@ -72,8 +72,8 @@ The example app offers a curated list of models, each suited for different tasks | Model Family | Best For | Function Calling | Thinking Mode | Vision | Languages | Size | |---|---|:---:|:---:|:---:|---|---| -| **Gemma 4 E2B** | Next-gen multimodal chat โ€” text, image, audio | โœ… | โŒ | โœ… | Multilingual | 2.4GB | -| **Gemma 4 E4B** | Next-gen multimodal chat โ€” text, image, audio | โœ… | โŒ | โœ… | Multilingual | 4.3GB | +| **Gemma 4 E2B** | Next-gen multimodal chat โ€” text, image, audio | โœ… | โœ… | โœ… | Multilingual | 2.4GB | +| **Gemma 4 E4B** | Next-gen multimodal chat โ€” text, image, audio | โœ… | โœ… | โœ… | Multilingual | 4.3GB | | **Gemma3n** | On-device multimodal chat and image analysis | โœ… | โŒ | โœ… | Multilingual | 3-6GB | | **FastVLM 0.5B** | Fast vision-language inference | โŒ | โŒ | โœ… | Multilingual | 0.5GB | | **Phi-4 Mini** | Advanced reasoning and 
instruction following | โœ… | โŒ | โŒ | Multilingual | 3.9GB | @@ -1544,11 +1544,11 @@ FunctionGemma uses a special format (different from JSON-based function calling) The `flutter_gemma` plugin handles this format automatically via `FunctionCallParser`. -9. **๐Ÿง  Thinking Mode (DeepSeek Models)** +9. **๐Ÿง  Thinking Mode (DeepSeek & Gemma 4 Models)** -DeepSeek models support "thinking mode" where you can see the model's reasoning process before it generates the final response. This provides transparency into how the model approaches problems. +DeepSeek and Gemma 4 (E2B/E4B) models support "thinking mode" where you can see the model's reasoning process before it generates the final response. This provides transparency into how the model approaches problems. -**Enable Thinking Mode:** +**Enable Thinking Mode (DeepSeek):** ```dart final chat = await inferenceModel.createChat( @@ -1559,7 +1559,6 @@ final chat = await inferenceModel.createChat( modelType: ModelType.deepSeek, // Required for DeepSeek models supportsFunctionCalls: true, // DeepSeek also supports function calls tools: _tools, // Optional: add tools for function calling - // tokenBuffer: 256, // Token buffer for context management ); ``` @@ -1586,12 +1585,25 @@ chat.generateChatResponseAsync().listen((response) { }); ``` +**Enable Thinking Mode (Gemma 4):** + +```dart +final chat = await inferenceModel.createChat( + temperature: 1.0, + topK: 64, + topP: 0.95, + isThinking: true, // Enable thinking mode + modelType: ModelType.gemmaIt, // Gemma 4 E2B/E4B +); +// <|think|> is auto-injected into systemInstruction โ€” no manual prompt needed. 
+``` + **Thinking Mode Features:** - โœ… **Transparent Reasoning**: See how the model thinks through problems - โœ… **Interactive UI**: Show/hide thinking bubbles with expandable content - โœ… **Streaming Support**: Thinking content streams in real-time - โœ… **Function Integration**: Models can think before calling functions -- โœ… **DeepSeek Optimized**: Designed specifically for DeepSeek model architecture +- โœ… **Supported Models**: DeepSeek R1 and Gemma 4 E2B/E4B **Example Thinking Flow:** 1. User asks: "Change the background to blue and explain why blue is calming" @@ -2096,7 +2108,7 @@ Function calling is currently supported by the following models: | **Image Input (Multimodal)** | โœ… Full | โœ… Full | โœ… Full | โš ๏ธ Broken (#684) | macOS: model hallucinates | | **Audio Input** | โœ… Full | โœ… Full | โŒ Not supported | โœ… Full | Gemma3n E2B/E4B | | **Function Calling** | โœ… Full | โœ… Full | โœ… Full | โŒ Not supported | LiteRT-LM limitation | -| **Thinking Mode** | โœ… Full | โœ… Full | โœ… Full | โŒ Not supported | DeepSeek models | +| **Thinking Mode** | โœ… Full | โœ… Full | โœ… Full | โœ… Full | DeepSeek & Gemma 4 | | **Stop Generation** | โœ… Full | โœ… Full | โœ… Full | โœ… Full | Cancel mid-process | | **GPU Acceleration** | โœ… Full | โœ… Full | โœ… Full | โš ๏ธ Partial | macOS GPU broken | | **NPU Acceleration** | โœ… Full | โŒ Not supported | โŒ Not supported | โŒ Not supported | Android only (.litertlm) | @@ -2264,13 +2276,14 @@ import 'package:flutter_gemma/core/extensions.dart'; // Clean response based on model type String cleanedResponse = ModelThinkingFilter.cleanResponse( - rawResponse, + rawResponse, ModelType.deepSeek ); // The filter automatically removes model-specific tokens like: // - tags (Gemma models) -// - Special DeepSeek tokens +// - ... blocks (DeepSeek) +// - <|channel>thought\n... 
blocks (Gemma 4 E2B/E4B) // - Extra whitespace and formatting ``` diff --git a/android/build.gradle b/android/build.gradle index a01e18b3..21fc9117 100644 --- a/android/build.gradle +++ b/android/build.gradle @@ -76,7 +76,7 @@ dependencies { implementation 'org.jetbrains.kotlinx:kotlinx-coroutines-guava:1.9.0' // LiteRT-LM Engine for .litertlm model files - implementation 'com.google.ai.edge.litertlm:litertlm-android:0.9.0' + implementation 'com.google.ai.edge.litertlm:litertlm-android:0.10.0' implementation 'androidx.core:core-ktx:1.12.0' implementation 'androidx.lifecycle:lifecycle-runtime-ktx:2.7.0' diff --git a/android/src/main/kotlin/dev/flutterberlin/flutter_gemma/FlutterGemmaPlugin.kt b/android/src/main/kotlin/dev/flutterberlin/flutter_gemma/FlutterGemmaPlugin.kt index 70637a10..3263579a 100644 --- a/android/src/main/kotlin/dev/flutterberlin/flutter_gemma/FlutterGemmaPlugin.kt +++ b/android/src/main/kotlin/dev/flutterberlin/flutter_gemma/FlutterGemmaPlugin.kt @@ -82,8 +82,8 @@ private class PlatformServiceImpl( private val engineLock = Any() // Lock for thread-safe engine access // NEW: Use InferenceEngine abstraction instead of InferenceModel - private var engine: InferenceEngine? = null - private var session: InferenceSession? = null + @Volatile private var engine: InferenceEngine? = null + @Volatile private var session: InferenceSession? = null // RAG components private var embeddingModel: EmbeddingModel? 
= null @@ -130,6 +130,9 @@ private class PlatformServiceImpl( // Only now clear old state and swap in new engine (thread-safe) synchronized(engineLock) { + // Cancel stale stream collector before replacing engine + streamJob?.cancel() + streamJob = null session?.cancelGeneration() try { session?.close() @@ -176,6 +179,7 @@ private class PlatformServiceImpl( enableVisionModality: Boolean?, enableAudioModality: Boolean?, systemInstruction: String?, + enableThinking: Boolean?, callback: (Result) -> Unit ) { scope.launch { @@ -193,6 +197,7 @@ private class PlatformServiceImpl( enableVisionModality = enableVisionModality, enableAudioModality = enableAudioModality, systemInstruction = systemInstruction, + enableThinking = enableThinking ?: false, ) session?.close() diff --git a/android/src/main/kotlin/dev/flutterberlin/flutter_gemma/PigeonInterface.g.kt b/android/src/main/kotlin/dev/flutterberlin/flutter_gemma/PigeonInterface.g.kt index d758bd63..7ec66918 100644 --- a/android/src/main/kotlin/dev/flutterberlin/flutter_gemma/PigeonInterface.g.kt +++ b/android/src/main/kotlin/dev/flutterberlin/flutter_gemma/PigeonInterface.g.kt @@ -203,7 +203,7 @@ private open class PigeonInterfacePigeonCodec : StandardMessageCodec() { interface PlatformService { fun createModel(maxTokens: Long, modelPath: String, loraRanks: List?, preferredBackend: PreferredBackend?, maxNumImages: Long?, supportAudio: Boolean?, callback: (Result) -> Unit) fun closeModel(callback: (Result) -> Unit) - fun createSession(temperature: Double, randomSeed: Long, topK: Long, topP: Double?, loraPath: String?, enableVisionModality: Boolean?, enableAudioModality: Boolean?, systemInstruction: String?, callback: (Result) -> Unit) + fun createSession(temperature: Double, randomSeed: Long, topK: Long, topP: Double?, loraPath: String?, enableVisionModality: Boolean?, enableAudioModality: Boolean?, systemInstruction: String?, enableThinking: Boolean?, callback: (Result) -> Unit) fun closeSession(callback: (Result) -> Unit) 
fun sizeInTokens(prompt: String, callback: (Result) -> Unit) fun addQueryChunk(prompt: String, callback: (Result) -> Unit) @@ -315,7 +315,8 @@ interface PlatformService { val enableVisionModalityArg = args[5] as Boolean? val enableAudioModalityArg = args[6] as Boolean? val systemInstructionArg = args[7] as String? - api.createSession(temperatureArg, randomSeedArg, topKArg, topPArg, loraPathArg, enableVisionModalityArg, enableAudioModalityArg, systemInstructionArg) { result: Result -> + val enableThinkingArg = args[8] as Boolean? + api.createSession(temperatureArg, randomSeedArg, topKArg, topPArg, loraPathArg, enableVisionModalityArg, enableAudioModalityArg, systemInstructionArg, enableThinkingArg) { result: Result -> val error = result.exceptionOrNull() if (error != null) { reply.reply(wrapError(error)) diff --git a/android/src/main/kotlin/dev/flutterberlin/flutter_gemma/engines/EngineConfig.kt b/android/src/main/kotlin/dev/flutterberlin/flutter_gemma/engines/EngineConfig.kt index 292ca1f9..1dc60047 100644 --- a/android/src/main/kotlin/dev/flutterberlin/flutter_gemma/engines/EngineConfig.kt +++ b/android/src/main/kotlin/dev/flutterberlin/flutter_gemma/engines/EngineConfig.kt @@ -28,6 +28,7 @@ data class SessionConfig( val enableVisionModality: Boolean? = null, val enableAudioModality: Boolean? = null, val systemInstruction: String? 
= null, + val enableThinking: Boolean = false, ) /** diff --git a/android/src/main/kotlin/dev/flutterberlin/flutter_gemma/engines/litertlm/LiteRtLmSession.kt b/android/src/main/kotlin/dev/flutterberlin/flutter_gemma/engines/litertlm/LiteRtLmSession.kt index cdece1fd..90b5b8f8 100644 --- a/android/src/main/kotlin/dev/flutterberlin/flutter_gemma/engines/litertlm/LiteRtLmSession.kt +++ b/android/src/main/kotlin/dev/flutterberlin/flutter_gemma/engines/litertlm/LiteRtLmSession.kt @@ -31,6 +31,13 @@ class LiteRtLmSession( private val conversation: Conversation + // Extra context for thinking mode (Gemma 4 via Jinja template variable) + private val extraContext: Map = if (config.enableThinking) { + mapOf("enable_thinking" to true) + } else { + emptyMap() + } + // Chunk buffering (MediaPipe compatibility) - thread-safe access private val pendingPrompt = StringBuilder() private val promptLock = Any() @@ -84,11 +91,23 @@ class LiteRtLmSession( Log.d(TAG, "Generating sync response for message: ${message.toString().length} chars") return try { - val response = conversation.sendMessage(message) - response.toString() + val response = if (extraContext.isNotEmpty()) { + conversation.sendMessage(message, extraContext) + } else { + conversation.sendMessage(message) + } + val thinking = response.channels["thought"] + val text = response.toString() + if (!thinking.isNullOrEmpty()) { + "<|channel>thought\n$thinking$text" + } else { + text + } } catch (e: Exception) { Log.e(TAG, "Error generating response", e) - errorFlow.tryEmit(e) + if (!errorFlow.tryEmit(e)) { + Log.w(TAG, "Error emission dropped (buffer full): ${e.message}") + } throw e } } @@ -97,27 +116,49 @@ class LiteRtLmSession( val message = buildAndConsumeMessage() Log.d(TAG, "Generating async response for message: ${message.toString().length} chars") - try { - // Use callback-based API - conversation.sendMessageAsync(message, object : MessageCallback { - override fun onMessage(message: Message) { - val text = 
message.toString() - resultFlow.tryEmit(text to false) + val callback = object : MessageCallback { + override fun onMessage(msg: Message) { + // Combine thinking + text into single emission to prevent DROP_OLDEST loss + // (buffer=1, two rapid tryEmit calls would drop the first) + val thinking = msg.channels["thought"] + val text = msg.toString() + val combined = buildString { + if (!thinking.isNullOrEmpty()) { + append("<|channel>thought\n$thinking") + } + if (text.isNotEmpty()) { + append(text) + } } - - override fun onDone() { - resultFlow.tryEmit("" to true) + if (combined.isNotEmpty()) { + resultFlow.tryEmit(combined to false) } + } - override fun onError(throwable: Throwable) { - Log.e(TAG, "Async generation error", throwable) - errorFlow.tryEmit(throwable) - resultFlow.tryEmit("" to true) + override fun onDone() { + resultFlow.tryEmit("" to true) + } + + override fun onError(throwable: Throwable) { + Log.e(TAG, "Async generation error", throwable) + if (!errorFlow.tryEmit(throwable)) { + Log.w(TAG, "Error emission dropped (buffer full): ${throwable.message}") } - }) + resultFlow.tryEmit("" to true) + } + } + + try { + if (extraContext.isNotEmpty()) { + conversation.sendMessageAsync(message, callback, extraContext) + } else { + conversation.sendMessageAsync(message, callback) + } } catch (e: Exception) { Log.e(TAG, "Failed to start async generation", e) - errorFlow.tryEmit(e) + if (!errorFlow.tryEmit(e)) { + Log.w(TAG, "Error emission dropped (buffer full): ${e.message}") + } resultFlow.tryEmit("" to true) } } diff --git a/android/src/main/kotlin/dev/flutterberlin/flutter_gemma/engines/mediapipe/MediaPipeEngine.kt b/android/src/main/kotlin/dev/flutterberlin/flutter_gemma/engines/mediapipe/MediaPipeEngine.kt index 07285555..91b868a3 100644 --- a/android/src/main/kotlin/dev/flutterberlin/flutter_gemma/engines/mediapipe/MediaPipeEngine.kt +++ b/android/src/main/kotlin/dev/flutterberlin/flutter_gemma/engines/mediapipe/MediaPipeEngine.kt @@ -24,7 +24,7 @@ class 
MediaPipeEngine( override var isInitialized: Boolean = false private set - override val capabilities = EngineCapabilities( + override var capabilities = EngineCapabilities( supportsVision = true, supportsAudio = false, // Audio is LiteRT-LM only (not supported by MediaPipe SDK) supportsFunctionCalls = true, // Manual via chat templates @@ -74,6 +74,10 @@ class MediaPipeEngine( val options = optionsBuilder.build() llmInference = LlmInference.createFromOptions(context, options) isInitialized = true + // Update audio capability if audio was successfully configured + if (config.supportAudio == true) { + capabilities = capabilities.copy(supportsAudio = true) + } } catch (e: Exception) { throw RuntimeException("Failed to initialize MediaPipe LlmInference: ${e.message}", e) } diff --git a/android/src/main/kotlin/dev/flutterberlin/flutter_gemma/engines/mediapipe/MediaPipeSession.kt b/android/src/main/kotlin/dev/flutterberlin/flutter_gemma/engines/mediapipe/MediaPipeSession.kt index d06af38a..3004a51b 100644 --- a/android/src/main/kotlin/dev/flutterberlin/flutter_gemma/engines/mediapipe/MediaPipeSession.kt +++ b/android/src/main/kotlin/dev/flutterberlin/flutter_gemma/engines/mediapipe/MediaPipeSession.kt @@ -50,6 +50,11 @@ class MediaPipeSession( val sessionOptions = sessionOptionsBuilder.build() session = LlmInferenceSession.createFromOptions(llmInference, sessionOptions) + + if (config.enableThinking) { + Log.w(TAG, "enableThinking=true is not supported by MediaPipe engine. 
" + + "Use LiteRT-LM (.litertlm) models for thinking mode.") + } } override fun addQueryChunk(prompt: String) { @@ -93,7 +98,11 @@ class MediaPipeSession( } override fun cancelGeneration() { - session.cancelGenerateResponseAsync() + try { + session.cancelGenerateResponseAsync() + } catch (e: Exception) { + Log.w(TAG, "cancelGeneration failed", e) + } } override fun close() { diff --git a/example/integration_test/desktop_thinking_test.dart b/example/integration_test/desktop_thinking_test.dart new file mode 100644 index 00000000..3a29ed9f --- /dev/null +++ b/example/integration_test/desktop_thinking_test.dart @@ -0,0 +1,165 @@ +// Integration test: Gemma 4 thinking mode on Desktop (macOS/Windows/Linux) +// Run with: cd example && flutter test integration_test/desktop_thinking_test.dart -d macos +// +// Prerequisites: +// Copy gemma-4-E2B-it.litertlm to the app sandbox container: +// cp ~/Downloads/gemma-4-E2B-it.litertlm \ +// ~/Library/Containers/dev.flutterberlin.flutterGemmaExample55/Data/Documents/ + +import 'dart:io'; + +import 'package:flutter_test/flutter_test.dart'; +import 'package:integration_test/integration_test.dart'; +import 'package:flutter_gemma/flutter_gemma.dart'; + +const _modelFileName = 'gemma-4-E2B-it.litertlm'; + +String _resolveModelPath() { + // Inside macOS sandbox, HOME already points to the container: + // ~/Library/Containers//Data + final home = Platform.environment['HOME'] ?? 
''; + return '$home/Documents/$_modelFileName'; +} + +void main() { + IntegrationTestWidgetsFlutterBinding.ensureInitialized(); + + late String modelPath; + + group('Desktop Gemma 4 Thinking Mode', () { + setUpAll(() { + if (!Platform.isMacOS && !Platform.isWindows && !Platform.isLinux) { + fail('Test requires desktop platform'); + } + modelPath = _resolveModelPath(); + if (!File(modelPath).existsSync()) { + fail('Model not found: $modelPath'); + } + }); + + testWidgets('thinking_stream', (tester) async { + print('=== Initializing ==='); + await FlutterGemma.initialize(); + + print('=== Installing model from file ==='); + await FlutterGemma.installModel( + modelType: ModelType.gemmaIt, + fileType: ModelFileType.litertlm, + ).fromFile(modelPath).install(); + + expect(FlutterGemma.hasActiveModel(), isTrue); + print('Model installed'); + + final model = await FlutterGemma.getActiveModel( + maxTokens: 2048, + preferredBackend: PreferredBackend.gpu, + ); + + try { + final chat = await model.createChat( + temperature: 1.0, + topK: 64, + topP: 0.95, + isThinking: true, + modelType: ModelType.gemmaIt, + ); + + await chat.addQuery( + const Message(text: 'Explain why the sky is blue. 
Think step by step.', isUser: true), + ); + + final responses = []; + await tester.runAsync(() async { + await for (final response in chat.generateChatResponseAsync()) { + responses.add(response); + } + }); + + final thinkingTokens = responses + .whereType() + .map((r) => r.content) + .join(); + final textTokens = responses + .whereType() + .map((r) => r.token) + .join(); + + print('[Gemma 4 E2B Desktop] Thinking tokens: ${thinkingTokens.length} chars'); + print('[Gemma 4 E2B Desktop] Text tokens: ${textTokens.length} chars'); + + // Should have thinking content + expect(thinkingTokens.isNotEmpty, isTrue, + reason: 'Expected non-empty thinking content'); + + // Should have text content + expect(textTokens.isNotEmpty, isTrue, + reason: 'Expected non-empty text response'); + + // Thinking should come before text in stream order + final firstThinkingIdx = responses.indexWhere((r) => r is ThinkingResponse); + final firstTextIdx = responses.indexWhere((r) => r is TextResponse); + + if (firstThinkingIdx >= 0 && firstTextIdx >= 0) { + expect(firstThinkingIdx, lessThan(firstTextIdx), + reason: 'First thinking should appear before first text'); + } + + print('[Gemma 4 E2B Desktop] thinking_stream PASSED'); + } finally { + await model.close(); + } + }, timeout: const Timeout(Duration(minutes: 10))); + + testWidgets('no_thinking', (tester) async { + await FlutterGemma.initialize(); + + await FlutterGemma.installModel( + modelType: ModelType.gemmaIt, + fileType: ModelFileType.litertlm, + ).fromFile(modelPath).install(); + + final model = await FlutterGemma.getActiveModel( + maxTokens: 2048, + preferredBackend: PreferredBackend.gpu, + ); + + try { + final chat = await model.createChat( + temperature: 1.0, + topK: 64, + topP: 0.95, + isThinking: false, + modelType: ModelType.gemmaIt, + ); + + await chat.addQuery( + const Message(text: 'What is 2+2?', isUser: true), + ); + + final responses = []; + await tester.runAsync(() async { + await for (final response in 
chat.generateChatResponseAsync()) { + responses.add(response); + } + }); + + // Without thinking enabled, no ThinkingResponse should appear + final thinkingResponses = responses.whereType().toList(); + expect(thinkingResponses, isEmpty, + reason: 'No ThinkingResponse expected with isThinking=false'); + + // Should still have text content + final textTokens = responses + .whereType() + .map((r) => r.token) + .join(); + expect(textTokens.isNotEmpty, isTrue, + reason: 'Expected non-empty text response'); + + print('[Gemma 4 E2B Desktop] no_thinking PASSED'); + } finally { + await model.close(); + } + }, timeout: const Timeout(Duration(minutes: 10))); + }); +} diff --git a/example/integration_test/sequential_gemma4_test.dart b/example/integration_test/sequential_gemma4_test.dart new file mode 100644 index 00000000..7d0f9b1f --- /dev/null +++ b/example/integration_test/sequential_gemma4_test.dart @@ -0,0 +1,67 @@ +// Integration test: Sequential inference with Gemma 4 E2B .litertlm +// Reproduces issue #209 โ€” SIGSEGV crash on second sendMessage +// +// Run: +// cd example +// flutter test integration_test/sequential_gemma4_test.dart -d + +import 'dart:io'; + +import 'package:flutter_test/flutter_test.dart'; +import 'package:integration_test/integration_test.dart'; +import 'package:flutter_gemma/flutter_gemma.dart'; + +const _modelPath = '/data/local/tmp/flutter_gemma_test/gemma-4-E2B-it.litertlm'; + +void main() { + IntegrationTestWidgetsFlutterBinding.ensureInitialized(); + + setUpAll(() { + if (!Platform.isAndroid) { + fail('Test requires Android with .litertlm models'); + } + if (!File(_modelPath).existsSync()) { + fail('Model not found: $_modelPath\nPush it first: adb push $_modelPath'); + } + }); + + testWidgets('Gemma 4 E2B: two sequential queries on same chat', (tester) async { + await FlutterGemma.initialize(); + + await FlutterGemma.installModel( + modelType: ModelType.gemmaIt, + fileType: ModelFileType.litertlm, + ).fromFile(_modelPath).install(); + + // No 
preferredBackend = CPU (default), matching issue #209 reporter's code + final model = await FlutterGemma.getActiveModel( + maxTokens: 2048, + ); + + try { + final chat = await model.createChat(modelType: ModelType.gemmaIt); + + // First query + await chat.addQueryChunk( + const Message(text: 'What is 2+2? Answer with just the number.', isUser: true), + ); + final r1 = await chat.generateChatResponse(); + expect(r1, isA()); + final text1 = (r1 as TextResponse).token; + print('[Gemma4] First response: "$text1"'); + expect(text1, isNotEmpty); + + // Second query โ€” crash point in issue #209 + await chat.addQueryChunk( + const Message(text: 'What is 3+3? Answer with just the number.', isUser: true), + ); + final r2 = await chat.generateChatResponse(); + expect(r2, isA()); + final text2 = (r2 as TextResponse).token; + print('[Gemma4] Second response: "$text2"'); + expect(text2, isNotEmpty); + } finally { + await model.close(); + } + }, timeout: const Timeout(Duration(minutes: 20))); +} diff --git a/example/integration_test/sequential_litertlm_test.dart b/example/integration_test/sequential_litertlm_test.dart new file mode 100644 index 00000000..c2ddc9d1 --- /dev/null +++ b/example/integration_test/sequential_litertlm_test.dart @@ -0,0 +1,171 @@ +// Integration test: Sequential inference with .litertlm models +// Reproduces issue #209 โ€” SIGSEGV crash on second sendMessage +// +// Prerequisites: +// adb push /path/to/gemma-4-E2B-it.litertlm /data/local/tmp/flutter_gemma_test/ +// adb push /path/to/Qwen3-0.6B.litertlm /data/local/tmp/flutter_gemma_test/ +// +// Run: +// cd example +// flutter test integration_test/sequential_litertlm_test.dart -d + +import 'dart:io'; + +import 'package:flutter_test/flutter_test.dart'; +import 'package:integration_test/integration_test.dart'; +import 'package:flutter_gemma/flutter_gemma.dart'; + +const _deviceDir = '/data/local/tmp/flutter_gemma_test'; + +const _models = <({String path, String name, ModelType modelType})>[ + ( + path: 
'$_deviceDir/gemma-3n-E2B-it-int4.litertlm', + name: 'Gemma 3n E2B', + modelType: ModelType.gemmaIt, + ), + ( + path: '$_deviceDir/gemma-4-E2B-it.litertlm', + name: 'Gemma 4 E2B', + modelType: ModelType.gemmaIt, + ), +]; + +Future _installAndLoad(String path, ModelType modelType) async { + await FlutterGemma.initialize(); + + await FlutterGemma.installModel( + modelType: modelType, + fileType: ModelFileType.litertlm, + ).fromFile(path).install(); + + return FlutterGemma.getActiveModel( + maxTokens: 2048, + preferredBackend: PreferredBackend.gpu, + ); +} + +void main() { + IntegrationTestWidgetsFlutterBinding.ensureInitialized(); + + for (final (:path, :name, :modelType) in _models) { + group('Sequential inference [$name]', () { + setUpAll(() { + if (!Platform.isAndroid) { + fail('Test requires Android with .litertlm models'); + } + if (!File(path).existsSync()) { + fail('Model not found: $path\nPush it first: adb push $path'); + } + }); + + // --- Test 1: Two sequential queries on same chat (issue #209 core repro) --- + testWidgets('two sequential queries on same chat', (tester) async { + final model = await _installAndLoad(path, modelType); + try { + final chat = await model.createChat(modelType: modelType); + + // First query โ€” should work + await chat.addQueryChunk( + const Message(text: 'What is 2+2? Answer with just the number.', isUser: true), + ); + final r1 = await chat.generateChatResponse(); + expect(r1, isA()); + final text1 = (r1 as TextResponse).token; + print('[$name] First response: "$text1"'); + expect(text1, isNotEmpty); + + // Second query โ€” crashes with SIGSEGV in issue #209 + await chat.addQueryChunk( + const Message(text: 'What is 3+3? 
Answer with just the number.', isUser: true), + ); + final r2 = await chat.generateChatResponse(); + expect(r2, isA()); + final text2 = (r2 as TextResponse).token; + print('[$name] Second response: "$text2"'); + expect(text2, isNotEmpty); + } finally { + await model.close(); + } + }, timeout: const Timeout(Duration(minutes: 10))); + + // --- Test 2: Three sequential queries (longer conversation) --- + testWidgets('three sequential queries on same chat', (tester) async { + final model = await _installAndLoad(path, modelType); + try { + final chat = await model.createChat(modelType: modelType); + + for (var i = 1; i <= 3; i++) { + await chat.addQueryChunk( + Message(text: 'What is ${i}+${i}? Answer briefly.', isUser: true), + ); + final r = await chat.generateChatResponse(); + expect(r, isA()); + final text = (r as TextResponse).token; + print('[$name] Query $i response: "$text"'); + expect(text, isNotEmpty); + } + } finally { + await model.close(); + } + }, timeout: const Timeout(Duration(minutes: 15))); + + // --- Test 3: Streaming sequential queries --- + testWidgets('two sequential streaming queries on same chat', (tester) async { + final model = await _installAndLoad(path, modelType); + try { + final chat = await model.createChat(modelType: modelType); + + // First streaming query + await chat.addQueryChunk( + const Message(text: 'Say hello in one word.', isUser: true), + ); + final chunks1 = []; + await tester.runAsync(() async { + await for (final r in chat.generateChatResponseAsync()) { + if (r is TextResponse) chunks1.add(r.token); + } + }); + final text1 = chunks1.join(); + print('[$name] First streaming response: "$text1"'); + expect(text1, isNotEmpty); + + // Second streaming query โ€” issue #209 crash point + await chat.addQueryChunk( + const Message(text: 'Say goodbye in one word.', isUser: true), + ); + final chunks2 = []; + await tester.runAsync(() async { + await for (final r in chat.generateChatResponseAsync()) { + if (r is TextResponse) 
chunks2.add(r.token); + } + }); + final text2 = chunks2.join(); + print('[$name] Second streaming response: "$text2"'); + expect(text2, isNotEmpty); + } finally { + await model.close(); + } + }, timeout: const Timeout(Duration(minutes: 10))); + + // --- Test 4: New chat per query (workaround test) --- + testWidgets('new chat per query works', (tester) async { + final model = await _installAndLoad(path, modelType); + try { + for (var i = 1; i <= 2; i++) { + final chat = await model.createChat(modelType: modelType); + await chat.addQueryChunk( + Message(text: 'What is ${i * 10}? Answer briefly.', isUser: true), + ); + final r = await chat.generateChatResponse(); + expect(r, isA()); + final text = (r as TextResponse).token; + print('[$name] New chat #$i response: "$text"'); + expect(text, isNotEmpty); + } + } finally { + await model.close(); + } + }, timeout: const Timeout(Duration(minutes: 10))); + }); + } +} diff --git a/example/integration_test/thinking_mode_test.dart b/example/integration_test/thinking_mode_test.dart new file mode 100644 index 00000000..d815a499 --- /dev/null +++ b/example/integration_test/thinking_mode_test.dart @@ -0,0 +1,208 @@ +// Integration test: thinking mode across DeepSeek and Gemma 4 models. 
+// Run on Android: flutter test integration_test/thinking_mode_test.dart -d +// +// Prerequisites: +// Push models to device: +// adb push deepseek_q8_ekv1280.task /data/local/tmp/flutter_gemma_test/ +// adb push gemma-4-E2B-it.litertlm /data/local/tmp/flutter_gemma_test/ +// +// Tests per model: +// - install: model loads from device file +// - thinking_stream: async stream verifies ThinkingResponse + TextResponse ordering +// - no_thinking: isThinking: false produces only TextResponse + +import 'dart:io'; + +import 'package:flutter_test/flutter_test.dart'; +import 'package:integration_test/integration_test.dart'; +import 'package:flutter_gemma/flutter_gemma.dart'; + +const _deviceModelDir = '/data/local/tmp/flutter_gemma_test'; + +/// Test model configuration for thinking mode tests. +class ThinkingTestModel { + final String name; + final String filePath; + final ModelType modelType; + final ModelFileType fileType; + final double temperature; + final int topK; + final double topP; + final int maxTokens; + + const ThinkingTestModel({ + required this.name, + required this.filePath, + required this.modelType, + this.fileType = ModelFileType.task, + this.temperature = 1.0, + this.topK = 64, + this.topP = 0.95, + this.maxTokens = 1024, + }); +} + +const _testModels = [ + ThinkingTestModel( + name: 'DeepSeek R1 1.5B', + filePath: '$_deviceModelDir/deepseek_q8_ekv1280.task', + modelType: ModelType.deepSeek, + temperature: 0.6, + topK: 40, + topP: 0.7, + ), + ThinkingTestModel( + name: 'Gemma 4 E2B', + filePath: '$_deviceModelDir/gemma-4-E2B-it.litertlm', + modelType: ModelType.gemmaIt, + fileType: ModelFileType.litertlm, + maxTokens: 2048, + ), +]; + +Future _ensureModelInstalled(ThinkingTestModel model) async { + await FlutterGemma.installModel( + modelType: model.modelType, + fileType: model.fileType, + ).fromFile(model.filePath).install(); +} + +void main() { + IntegrationTestWidgetsFlutterBinding.ensureInitialized(); + + for (final model in _testModels) { + 
group(model.name, () { + setUpAll(() { + if (!Platform.isAndroid) { + fail('Test requires Android with .litertlm/.task models'); + } + if (!File(model.filePath).existsSync()) { + fail('Model not found: ${model.filePath}\n' + 'Push it first: adb push ${model.filePath}'); + } + }); + + testWidgets('install', (tester) async { + await FlutterGemma.initialize(); + + print('[${model.name}] Installing from file: ${model.filePath}'); + await _ensureModelInstalled(model); + + expect(FlutterGemma.hasActiveModel(), isTrue); + print('[${model.name}] Installed successfully'); + }, timeout: const Timeout(Duration(minutes: 5))); + + testWidgets('thinking_stream', (tester) async { + await FlutterGemma.initialize(); + await _ensureModelInstalled(model); + + final inferenceModel = await FlutterGemma.getActiveModel( + maxTokens: model.maxTokens, + preferredBackend: PreferredBackend.cpu, + ); + + try { + final chat = await inferenceModel.createChat( + temperature: model.temperature, + topK: model.topK, + topP: model.topP, + isThinking: true, + modelType: model.modelType, + ); + + await chat.addQuery( + const Message(text: 'Explain why the sky is blue. 
Think step by step.', isUser: true), + ); + + final responses = []; + await tester.runAsync(() async { + await for (final response in chat.generateChatResponseAsync()) { + responses.add(response); + } + }); + + final thinkingTokens = responses + .whereType() + .map((r) => r.content) + .join(); + final textTokens = responses + .whereType() + .map((r) => r.token) + .join(); + + print('[${model.name}] Thinking tokens: ${thinkingTokens.length} chars'); + print('[${model.name}] Text tokens: ${textTokens.length} chars'); + + // Should have thinking content + expect(thinkingTokens.isNotEmpty, isTrue, + reason: '${model.name}: Expected non-empty thinking content'); + + // Should have text content + expect(textTokens.isNotEmpty, isTrue, + reason: '${model.name}: Expected non-empty text response'); + + // Thinking should come before text in stream order + final firstThinkingIdx = responses.indexWhere((r) => r is ThinkingResponse); + final firstTextIdx = responses.indexWhere((r) => r is TextResponse); + + if (firstThinkingIdx >= 0 && firstTextIdx >= 0) { + expect(firstThinkingIdx, lessThan(firstTextIdx), + reason: '${model.name}: First thinking should appear before first text'); + } + + print('[${model.name}] thinking_stream PASSED'); + } finally { + await inferenceModel.close(); + } + }, timeout: const Timeout(Duration(minutes: 5))); + + testWidgets('no_thinking', (tester) async { + await FlutterGemma.initialize(); + await _ensureModelInstalled(model); + + final inferenceModel = await FlutterGemma.getActiveModel( + maxTokens: model.maxTokens, + preferredBackend: PreferredBackend.cpu, + ); + + try { + final chat = await inferenceModel.createChat( + temperature: model.temperature, + topK: model.topK, + topP: model.topP, + isThinking: false, + modelType: model.modelType, + ); + + await chat.addQuery( + const Message(text: 'What is 2+2?', isUser: true), + ); + + final responses = []; + await tester.runAsync(() async { + await for (final response in 
chat.generateChatResponseAsync()) { + responses.add(response); + } + }); + + // Without thinking enabled, no ThinkingResponse should appear + final thinkingResponses = responses.whereType().toList(); + expect(thinkingResponses, isEmpty, + reason: '${model.name}: No ThinkingResponse expected with isThinking=false'); + + // Should still have text content + final textTokens = responses + .whereType() + .map((r) => r.token) + .join(); + expect(textTokens.isNotEmpty, isTrue, + reason: '${model.name}: Expected non-empty text response'); + + print('[${model.name}] no_thinking PASSED'); + } finally { + await inferenceModel.close(); + } + }, timeout: const Timeout(Duration(minutes: 5))); + }); + } +} diff --git a/example/lib/models/model.dart b/example/lib/models/model.dart index e4a8378a..536b15e7 100644 --- a/example/lib/models/model.dart +++ b/example/lib/models/model.dart @@ -39,6 +39,7 @@ enum Model implements InferenceModelInterface { supportAudio: true, maxTokens: 4096, maxNumImages: 1, + isThinking: true, ), gemma4_E4B( baseUrl: @@ -62,6 +63,7 @@ enum Model implements InferenceModelInterface { supportAudio: true, maxTokens: 4096, maxNumImages: 1, + isThinking: true, ), // Gemma 3 Nano models (Multimodal + Function Calls) diff --git a/example/pubspec.lock b/example/pubspec.lock index ade2a6de..cb1f33a9 100644 --- a/example/pubspec.lock +++ b/example/pubspec.lock @@ -209,7 +209,7 @@ packages: path: ".." 
relative: true source: path - version: "0.13.0" + version: "0.13.1" flutter_lints: dependency: "direct dev" description: diff --git a/ios/Classes/FlutterGemmaPlugin.swift b/ios/Classes/FlutterGemmaPlugin.swift index a1adcae6..691b4fba 100644 --- a/ios/Classes/FlutterGemmaPlugin.swift +++ b/ios/Classes/FlutterGemmaPlugin.swift @@ -97,6 +97,7 @@ class PlatformServiceImpl : NSObject, PlatformService, FlutterStreamHandler { enableVisionModality: Bool?, enableAudioModality: Bool?, systemInstruction: String?, + enableThinking: Bool?, completion: @escaping (Result) -> Void ) { guard let inference = model?.inference else { @@ -104,6 +105,11 @@ class PlatformServiceImpl : NSObject, PlatformService, FlutterStreamHandler { return } + if enableThinking == true { + print("[FlutterGemma] Warning: enableThinking=true is not supported on iOS (MediaPipe). " + + "Use Android or Desktop with .litertlm models for Gemma 4 thinking mode.") + } + DispatchQueue.global(qos: .userInitiated).async { do { let newSession = try InferenceSession( diff --git a/ios/Classes/PigeonInterface.g.swift b/ios/Classes/PigeonInterface.g.swift index ac8fda75..a70156ee 100644 --- a/ios/Classes/PigeonInterface.g.swift +++ b/ios/Classes/PigeonInterface.g.swift @@ -231,7 +231,7 @@ class PigeonInterfacePigeonCodec: FlutterStandardMessageCodec, @unchecked Sendab protocol PlatformService { func createModel(maxTokens: Int64, modelPath: String, loraRanks: [Int64]?, preferredBackend: PreferredBackend?, maxNumImages: Int64?, supportAudio: Bool?, completion: @escaping (Result) -> Void) func closeModel(completion: @escaping (Result) -> Void) - func createSession(temperature: Double, randomSeed: Int64, topK: Int64, topP: Double?, loraPath: String?, enableVisionModality: Bool?, enableAudioModality: Bool?, systemInstruction: String?, completion: @escaping (Result) -> Void) + func createSession(temperature: Double, randomSeed: Int64, topK: Int64, topP: Double?, loraPath: String?, enableVisionModality: Bool?, 
enableAudioModality: Bool?, systemInstruction: String?, enableThinking: Bool?, completion: @escaping (Result) -> Void) func closeSession(completion: @escaping (Result) -> Void) func sizeInTokens(prompt: String, completion: @escaping (Result) -> Void) func addQueryChunk(prompt: String, completion: @escaping (Result) -> Void) @@ -332,7 +332,8 @@ class PlatformServiceSetup { let enableVisionModalityArg: Bool? = nilOrValue(args[5]) let enableAudioModalityArg: Bool? = nilOrValue(args[6]) let systemInstructionArg: String? = nilOrValue(args[7]) - api.createSession(temperature: temperatureArg, randomSeed: randomSeedArg, topK: topKArg, topP: topPArg, loraPath: loraPathArg, enableVisionModality: enableVisionModalityArg, enableAudioModality: enableAudioModalityArg, systemInstruction: systemInstructionArg) { result in + let enableThinkingArg: Bool? = nilOrValue(args[8]) + api.createSession(temperature: temperatureArg, randomSeed: randomSeedArg, topK: topKArg, topP: topPArg, loraPath: loraPathArg, enableVisionModality: enableVisionModalityArg, enableAudioModality: enableAudioModalityArg, systemInstruction: systemInstructionArg, enableThinking: enableThinkingArg) { result in switch result { case .success: reply(wrapResult(nil)) diff --git a/ios/flutter_gemma.podspec b/ios/flutter_gemma.podspec index 7695f83b..671a3e2a 100644 --- a/ios/flutter_gemma.podspec +++ b/ios/flutter_gemma.podspec @@ -4,7 +4,7 @@ # Pod::Spec.new do |s| s.name = 'flutter_gemma' - s.version = '0.13.0' + s.version = '0.13.1' s.summary = 'Flutter plugin for running Gemma AI models locally with Gemma 3 Nano support.' s.description = <<-DESC The plugin allows running the Gemma AI model locally on a device from a Flutter application. 
diff --git a/lib/core/di/service_registry.dart b/lib/core/di/service_registry.dart index 8947c912..f5005f72 100644 --- a/lib/core/di/service_registry.dart +++ b/lib/core/di/service_registry.dart @@ -22,7 +22,8 @@ import 'package:flutter_gemma/core/handlers/web_file_source_handler_stub.dart' import 'package:flutter_gemma/core/handlers/source_handler_registry.dart'; import 'package:flutter_gemma/core/infrastructure/platform_file_system_service.dart'; import 'package:flutter_gemma/core/infrastructure/web_file_system_service.dart'; -import 'package:flutter_gemma/core/infrastructure/flutter_asset_loader.dart'; +import 'package:flutter_gemma/core/infrastructure/flutter_asset_loader_stub.dart' + if (dart.library.io) 'package:flutter_gemma/core/infrastructure/flutter_asset_loader.dart'; import 'package:flutter_gemma/core/infrastructure/shared_preferences_model_repository.dart'; import 'package:flutter_gemma/core/infrastructure/in_memory_model_repository.dart'; import 'package:flutter_gemma/core/services/vector_store_repository.dart'; diff --git a/lib/core/extensions.dart b/lib/core/extensions.dart index 8bac2744..5f5db09a 100644 --- a/lib/core/extensions.dart +++ b/lib/core/extensions.dart @@ -190,8 +190,8 @@ extension MessageExtension on Message { // Filter class for thinking models class ModelThinkingFilter { - /// Filters ModelResponse stream for models with thinking support - /// Only supports DeepSeek models with ... blocks + /// Filters ModelResponse stream for models with thinking support. + /// Supports DeepSeek (`...`) and Gemma 4 (`<|channel>thought\n...`) models. static Stream filterThinkingStream(Stream originalStream, {required ModelType modelType}) async* { switch (modelType) { @@ -244,8 +244,69 @@ class ModelThinkingFilter { } break; - case ModelType.general: case ModelType.gemmaIt: + // Gemma 4 E2B/E4B: <|channel>thought\n... 
+ const startMarker = '<|channel>thought\n'; + const endMarker = ''; + bool gemmaInsideThinking = false; + String gemmaBuffer = ''; + + await for (final response in originalStream) { + if (response is TextResponse) { + gemmaBuffer += response.token; + + while (gemmaBuffer.isNotEmpty) { + if (gemmaInsideThinking) { + final endIdx = gemmaBuffer.indexOf(endMarker); + if (endIdx >= 0) { + final thinkingContent = gemmaBuffer.substring(0, endIdx); + if (thinkingContent.isNotEmpty) { + yield ThinkingResponse(thinkingContent); + } + gemmaBuffer = gemmaBuffer.substring(endIdx + endMarker.length); + gemmaInsideThinking = false; + } else { + // Check for partial end marker at tail + final partial = _findPartialSuffix(gemmaBuffer, endMarker); + final safe = gemmaBuffer.substring(0, gemmaBuffer.length - partial); + if (safe.isNotEmpty) { + yield ThinkingResponse(safe); + } + gemmaBuffer = gemmaBuffer.substring(gemmaBuffer.length - partial); + break; + } + } else { + final startIdx = gemmaBuffer.indexOf(startMarker); + if (startIdx >= 0) { + final textBefore = gemmaBuffer.substring(0, startIdx); + if (textBefore.isNotEmpty) { + yield TextResponse(textBefore); + } + gemmaBuffer = gemmaBuffer.substring(startIdx + startMarker.length); + gemmaInsideThinking = true; + } else { + // Check for partial start marker at tail + final partial = _findPartialSuffix(gemmaBuffer, startMarker); + final safe = gemmaBuffer.substring(0, gemmaBuffer.length - partial); + if (safe.isNotEmpty) { + yield TextResponse(safe); + } + gemmaBuffer = gemmaBuffer.substring(gemmaBuffer.length - partial); + break; + } + } + } + } else { + yield response; + } + } + // Flush remaining buffer + if (gemmaBuffer.isNotEmpty) { + yield gemmaInsideThinking ? 
ThinkingResponse(gemmaBuffer) : TextResponse(gemmaBuffer); + } + break; + + case ModelType.general: case ModelType.qwen: case ModelType.llama: case ModelType.hammer: @@ -258,8 +319,9 @@ class ModelThinkingFilter { } } - /// Removes thinking blocks from final text - /// Only supports DeepSeek (...) models + /// Removes thinking blocks from final text. + /// Supports DeepSeek (`...`) and Gemma 4 (`<|channel>thought\n...`) models. + /// Note: For streaming thinking output, use [filterThinkingStream] with generateChatResponseAsync() instead. static String removeThinkingFromText(String text, {required ModelType modelType}) { switch (modelType) { case ModelType.deepSeek: @@ -267,8 +329,12 @@ class ModelThinkingFilter { RegExp thinkingRegex = RegExp(r'.*?', dotAll: true); return text.replaceAll(thinkingRegex, '').trim(); - case ModelType.general: case ModelType.gemmaIt: + // Remove all <|channel>thought\n... blocks (Gemma 4 E2B/E4B) + return text.replaceAll( + RegExp(r'<\|channel>thought\n.*?', dotAll: true), '').trim(); + + case ModelType.general: case ModelType.qwen: case ModelType.llama: case ModelType.hammer: @@ -326,4 +392,14 @@ class ModelThinkingFilter { return cleaned.trim(); } } + + /// Returns length of the longest suffix of [text] that is a prefix of [marker]. 
+ static int _findPartialSuffix(String text, String marker) { + for (int i = marker.length.clamp(0, text.length); i >= 1; i--) { + if (text.endsWith(marker.substring(0, i))) { + return i; + } + } + return 0; + } } diff --git a/lib/core/handlers/asset_source_handler.dart b/lib/core/handlers/asset_source_handler.dart index aaf02a09..d499769b 100644 --- a/lib/core/handlers/asset_source_handler.dart +++ b/lib/core/handlers/asset_source_handler.dart @@ -4,7 +4,8 @@ import 'package:flutter_gemma/core/model_management/cancel_token.dart'; import 'package:flutter_gemma/core/services/asset_loader.dart'; import 'package:flutter_gemma/core/services/file_system_service.dart'; import 'package:flutter_gemma/core/services/model_repository.dart'; -import 'package:flutter_gemma/core/infrastructure/flutter_asset_loader.dart'; +import 'package:flutter_gemma/core/infrastructure/flutter_asset_loader_stub.dart' + if (dart.library.io) 'package:flutter_gemma/core/infrastructure/flutter_asset_loader.dart'; import 'package:path/path.dart' as path; /// Handles installation of models from Flutter assets diff --git a/lib/core/infrastructure/flutter_asset_loader_stub.dart b/lib/core/infrastructure/flutter_asset_loader_stub.dart new file mode 100644 index 00000000..a2e655ab --- /dev/null +++ b/lib/core/infrastructure/flutter_asset_loader_stub.dart @@ -0,0 +1,19 @@ +/// Stub implementation for platforms where dart:io is not available (web) +/// This file is used when large_file_handler cannot be imported +library; + +import 'dart:typed_data'; +import 'package:flutter_gemma/core/services/asset_loader.dart'; + +/// Stub class - should never be instantiated on web platform +class FlutterAssetLoader implements AssetLoader { + @override + Future loadAsset(String path) => + throw UnsupportedError('FlutterAssetLoader is not available on this platform'); + + Future copyAssetToFile(String assetPath, String targetPath) => + throw UnsupportedError('FlutterAssetLoader is not available on this platform'); + + 
Stream copyAssetToFileWithProgress(String assetPath, String targetPath) => + throw UnsupportedError('FlutterAssetLoader is not available on this platform'); +} diff --git a/lib/core/infrastructure/hnsw_vector_index.dart b/lib/core/infrastructure/hnsw_vector_index.dart index ef6cd761..868d5980 100644 --- a/lib/core/infrastructure/hnsw_vector_index.dart +++ b/lib/core/infrastructure/hnsw_vector_index.dart @@ -16,7 +16,7 @@ import 'package:local_hnsw/local_hnsw.item.dart'; /// /// **Key Design Decisions:** /// - In-memory index: Rebuilt on initialize() from SQLite data -/// - Generic string IDs: LocalHNSW stores document IDs directly +/// - Generic string IDs: `LocalHNSW` stores document IDs directly /// - Cosine metric: Matches SQLite brute-force implementation /// - Exact similarity: Recalculated from cached embeddings during search /// diff --git a/lib/core/infrastructure/web_opfs_interop.dart b/lib/core/infrastructure/web_opfs_interop.dart index 53c9b70d..ea28ebc0 100644 --- a/lib/core/infrastructure/web_opfs_interop.dart +++ b/lib/core/infrastructure/web_opfs_interop.dart @@ -14,13 +14,13 @@ extension type OPFSInterop._(JSObject _) implements JSObject { /// Check if a model is cached in OPFS /// /// @param filename Model filename (cache key) - /// @returns Promise + /// @returns `Promise` external JSPromise isModelCached(JSString filename); /// Get the size of a cached model file /// /// @param filename Model filename - /// @returns Promise Size in bytes, or null if not found + /// @returns `Promise` Size in bytes, or null if not found external JSPromise getCachedModelSize(JSString filename); /// Download a model to OPFS with progress tracking and cancellation support @@ -30,7 +30,7 @@ extension type OPFSInterop._(JSObject _) implements JSObject { /// @param authToken Optional authentication token (HuggingFace, etc.) 
/// @param onProgress Progress callback (receives 0-100) /// @param abortSignal Optional AbortSignal for cancellation - /// @returns Promise True on success + /// @returns `Promise` True on success /// @throws Error on download failure, quota exceeded, or cancellation external JSPromise downloadToOPFS( JSString url, @@ -45,14 +45,14 @@ extension type OPFSInterop._(JSObject _) implements JSObject { /// This is passed to MediaPipe's modelAssetBuffer parameter. /// /// @param filename Model filename in OPFS - /// @returns Promise + /// @returns `Promise` /// @throws Error if file not found external JSPromise getStreamReader(JSString filename); /// Delete a model from OPFS /// /// @param filename Model filename to delete - /// @returns Promise + /// @returns `Promise` external JSPromise deleteModel(JSString filename); /// Get current storage statistics @@ -62,7 +62,7 @@ extension type OPFSInterop._(JSObject _) implements JSObject { /// Clear all files from OPFS (for testing/development) /// - /// @returns Promise Number of files deleted + /// @returns `Promise` Number of files deleted external JSPromise clearAll(); } diff --git a/lib/core/model_management/constants/preferences_keys.dart b/lib/core/model_management/constants/preferences_keys.dart index 888ccf63..e6515367 100644 --- a/lib/core/model_management/constants/preferences_keys.dart +++ b/lib/core/model_management/constants/preferences_keys.dart @@ -9,16 +9,16 @@ class PreferencesKeys { // Multi-model lists (NEW system - supports multiple models) // ============================================================================ - /// List of installed inference model files + /// `List` of installed inference model files static const String installedModels = 'installed_models'; - /// List of installed LoRA files + /// `List` of installed LoRA files static const String installedLoras = 'installed_loras'; - /// List of installed embedding model files + /// `List` of installed embedding model files static const String 
installedEmbeddingModels = 'installed_embedding_models'; - /// List of installed tokenizer files + /// `List` of installed tokenizer files static const String installedTokenizers = 'installed_tokenizers'; // ============================================================================ diff --git a/lib/desktop/desktop_inference_model.dart b/lib/desktop/desktop_inference_model.dart index 41841c60..d7a4be5a 100644 --- a/lib/desktop/desktop_inference_model.dart +++ b/lib/desktop/desktop_inference_model.dart @@ -39,6 +39,7 @@ class DesktopInferenceModel extends InferenceModel { bool? enableVisionModality, bool? enableAudioModality, String? systemInstruction, + bool enableThinking = false, }) async { if (_isClosed) { throw StateError('Model is closed. Create a new instance to use it again'); @@ -65,6 +66,7 @@ class DesktopInferenceModel extends InferenceModel { fileType: fileType, supportImage: enableVisionModality ?? supportImage, supportAudio: enableAudioModality ?? supportAudio, + enableThinking: enableThinking, onClose: () { _session = null; _createCompleter = null; @@ -107,6 +109,7 @@ class DesktopInferenceModel extends InferenceModel { enableVisionModality: supportImage ?? this.supportImage, enableAudioModality: supportAudio ?? 
this.supportAudio, systemInstruction: systemInstruction, + enableThinking: isThinking, ), maxTokens: maxTokens, tokenBuffer: tokenBuffer, @@ -158,6 +161,7 @@ class DesktopInferenceModelSession extends InferenceModelSession { required this.fileType, required this.supportImage, required this.supportAudio, + this.enableThinking = false, required this.onClose, }); @@ -166,6 +170,7 @@ class DesktopInferenceModelSession extends InferenceModelSession { final ModelFileType fileType; final bool supportImage; final bool supportAudio; + final bool enableThinking; final VoidCallback onClose; final StringBuffer _queryBuffer = StringBuffer(); @@ -213,15 +218,15 @@ class DesktopInferenceModelSession extends InferenceModelSession { final buffer = StringBuffer(); if (audio != null) { - await for (final token in grpcClient.chatWithAudio(text, audio)) { + await for (final token in grpcClient.chatWithAudio(text, audio, enableThinking: enableThinking)) { buffer.write(token); } } else if (image != null) { - await for (final token in grpcClient.chatWithImage(text, image)) { + await for (final token in grpcClient.chatWithImage(text, image, enableThinking: enableThinking)) { buffer.write(token); } } else { - await for (final token in grpcClient.chat(text)) { + await for (final token in grpcClient.chat(text, enableThinking: enableThinking)) { buffer.write(token); } } @@ -247,13 +252,13 @@ class DesktopInferenceModelSession extends InferenceModelSession { if (audio != null) { debugPrint('[DesktopSession] Calling chatWithAudio: audio=${audio.length} bytes'); - yield* grpcClient.chatWithAudio(text, audio); + yield* grpcClient.chatWithAudio(text, audio, enableThinking: enableThinking); } else if (image != null) { debugPrint('[DesktopSession] Calling chatWithImage: image=${image.length} bytes'); - yield* grpcClient.chatWithImage(text, image); + yield* grpcClient.chatWithImage(text, image, enableThinking: enableThinking); } else { debugPrint('[DesktopSession] Calling chat (no image/audio)'); - 
yield* grpcClient.chat(text); + yield* grpcClient.chat(text, enableThinking: enableThinking); } } diff --git a/lib/desktop/generated/litertlm.pb.dart b/lib/desktop/generated/litertlm.pb.dart index a2bd35e2..4c503b3a 100644 --- a/lib/desktop/generated/litertlm.pb.dart +++ b/lib/desktop/generated/litertlm.pb.dart @@ -431,10 +431,12 @@ class ChatRequest extends $pb.GeneratedMessage { factory ChatRequest({ $core.String? conversationId, $core.String? text, + $core.bool? enableThinking, }) { final result = create(); if (conversationId != null) result.conversationId = conversationId; if (text != null) result.text = text; + if (enableThinking != null) result.enableThinking = enableThinking; return result; } @@ -453,6 +455,7 @@ class ChatRequest extends $pb.GeneratedMessage { createEmptyInstance: create) ..aOS(1, _omitFieldNames ? '' : 'conversationId') ..aOS(2, _omitFieldNames ? '' : 'text') + ..aOB(3, _omitFieldNames ? '' : 'enableThinking') ..hasRequiredFields = false; @$core.Deprecated('See https://github.com/google/protobuf.dart/issues/998.') @@ -491,6 +494,15 @@ class ChatRequest extends $pb.GeneratedMessage { $core.bool hasText() => $_has(1); @$pb.TagNumber(2) void clearText() => $_clearField(2); + + @$pb.TagNumber(3) + $core.bool get enableThinking => $_getBF(2); + @$pb.TagNumber(3) + set enableThinking($core.bool value) => $_setBool(2, value); + @$pb.TagNumber(3) + $core.bool hasEnableThinking() => $_has(2); + @$pb.TagNumber(3) + void clearEnableThinking() => $_clearField(3); } class ChatWithImageRequest extends $pb.GeneratedMessage { @@ -498,11 +510,13 @@ class ChatWithImageRequest extends $pb.GeneratedMessage { $core.String? conversationId, $core.String? text, $core.List<$core.int>? image, + $core.bool? 
enableThinking, }) { final result = create(); if (conversationId != null) result.conversationId = conversationId; if (text != null) result.text = text; if (image != null) result.image = image; + if (enableThinking != null) result.enableThinking = enableThinking; return result; } @@ -523,6 +537,7 @@ class ChatWithImageRequest extends $pb.GeneratedMessage { ..aOS(2, _omitFieldNames ? '' : 'text') ..a<$core.List<$core.int>>( 3, _omitFieldNames ? '' : 'image', $pb.PbFieldType.OY) + ..aOB(4, _omitFieldNames ? '' : 'enableThinking') ..hasRequiredFields = false; @$core.Deprecated('See https://github.com/google/protobuf.dart/issues/998.') @@ -570,6 +585,15 @@ class ChatWithImageRequest extends $pb.GeneratedMessage { $core.bool hasImage() => $_has(2); @$pb.TagNumber(3) void clearImage() => $_clearField(3); + + @$pb.TagNumber(4) + $core.bool get enableThinking => $_getBF(3); + @$pb.TagNumber(4) + set enableThinking($core.bool value) => $_setBool(3, value); + @$pb.TagNumber(4) + $core.bool hasEnableThinking() => $_has(3); + @$pb.TagNumber(4) + void clearEnableThinking() => $_clearField(4); } class ChatWithAudioRequest extends $pb.GeneratedMessage { @@ -577,11 +601,13 @@ class ChatWithAudioRequest extends $pb.GeneratedMessage { $core.String? conversationId, $core.String? text, $core.List<$core.int>? audio, + $core.bool? enableThinking, }) { final result = create(); if (conversationId != null) result.conversationId = conversationId; if (text != null) result.text = text; if (audio != null) result.audio = audio; + if (enableThinking != null) result.enableThinking = enableThinking; return result; } @@ -602,6 +628,7 @@ class ChatWithAudioRequest extends $pb.GeneratedMessage { ..aOS(2, _omitFieldNames ? '' : 'text') ..a<$core.List<$core.int>>( 3, _omitFieldNames ? '' : 'audio', $pb.PbFieldType.OY) + ..aOB(4, _omitFieldNames ? 
'' : 'enableThinking') ..hasRequiredFields = false; @$core.Deprecated('See https://github.com/google/protobuf.dart/issues/998.') @@ -649,6 +676,15 @@ class ChatWithAudioRequest extends $pb.GeneratedMessage { $core.bool hasAudio() => $_has(2); @$pb.TagNumber(3) void clearAudio() => $_clearField(3); + + @$pb.TagNumber(4) + $core.bool get enableThinking => $_getBF(3); + @$pb.TagNumber(4) + set enableThinking($core.bool value) => $_setBool(3, value); + @$pb.TagNumber(4) + $core.bool hasEnableThinking() => $_has(3); + @$pb.TagNumber(4) + void clearEnableThinking() => $_clearField(4); } class ChatResponse extends $pb.GeneratedMessage { @@ -656,11 +692,13 @@ class ChatResponse extends $pb.GeneratedMessage { $core.String? text, $core.bool? done, $core.String? error, + $core.String? thinking, }) { final result = create(); if (text != null) result.text = text; if (done != null) result.done = done; if (error != null) result.error = error; + if (thinking != null) result.thinking = thinking; return result; } @@ -680,6 +718,7 @@ class ChatResponse extends $pb.GeneratedMessage { ..aOS(1, _omitFieldNames ? '' : 'text') ..aOB(2, _omitFieldNames ? '' : 'done') ..aOS(3, _omitFieldNames ? '' : 'error') + ..aOS(4, _omitFieldNames ? 
'' : 'thinking') ..hasRequiredFields = false; @$core.Deprecated('See https://github.com/google/protobuf.dart/issues/998.') @@ -727,6 +766,15 @@ class ChatResponse extends $pb.GeneratedMessage { $core.bool hasError() => $_has(2); @$pb.TagNumber(3) void clearError() => $_clearField(3); + + @$pb.TagNumber(4) + $core.String get thinking => $_getSZ(3); + @$pb.TagNumber(4) + set thinking($core.String value) => $_setString(3, value); + @$pb.TagNumber(4) + $core.bool hasThinking() => $_has(3); + @$pb.TagNumber(4) + void clearThinking() => $_clearField(4); } class CancelGenerationRequest extends $pb.GeneratedMessage { diff --git a/lib/desktop/generated/litertlm.pbjson.dart b/lib/desktop/generated/litertlm.pbjson.dart index 16879590..125b74f1 100644 --- a/lib/desktop/generated/litertlm.pbjson.dart +++ b/lib/desktop/generated/litertlm.pbjson.dart @@ -108,13 +108,15 @@ const ChatRequest$json = { '2': [ {'1': 'conversation_id', '3': 1, '4': 1, '5': 9, '10': 'conversationId'}, {'1': 'text', '3': 2, '4': 1, '5': 9, '10': 'text'}, + {'1': 'enable_thinking', '3': 3, '4': 1, '5': 8, '10': 'enableThinking'}, ], }; /// Descriptor for `ChatRequest`. Decode as a `google.protobuf.DescriptorProto`. final $typed_data.Uint8List chatRequestDescriptor = $convert.base64Decode( 'CgtDaGF0UmVxdWVzdBInCg9jb252ZXJzYXRpb25faWQYASABKAlSDmNvbnZlcnNhdGlvbklkEh' - 'IKBHRleHQYAiABKAlSBHRleHQ='); + 'IKBHRleHQYAiABKAlSBHRleHQSJwoPZW5hYmxlX3RoaW5raW5nGAMgASgIUg5lbmFibGVUaGlu' + 'a2luZw=='); @$core.Deprecated('Use chatWithImageRequestDescriptor instead') const ChatWithImageRequest$json = { @@ -123,13 +125,15 @@ const ChatWithImageRequest$json = { {'1': 'conversation_id', '3': 1, '4': 1, '5': 9, '10': 'conversationId'}, {'1': 'text', '3': 2, '4': 1, '5': 9, '10': 'text'}, {'1': 'image', '3': 3, '4': 1, '5': 12, '10': 'image'}, + {'1': 'enable_thinking', '3': 4, '4': 1, '5': 8, '10': 'enableThinking'}, ], }; /// Descriptor for `ChatWithImageRequest`. Decode as a `google.protobuf.DescriptorProto`. 
final $typed_data.Uint8List chatWithImageRequestDescriptor = $convert.base64Decode( 'ChRDaGF0V2l0aEltYWdlUmVxdWVzdBInCg9jb252ZXJzYXRpb25faWQYASABKAlSDmNvbnZlcn' - 'NhdGlvbklkEhIKBHRleHQYAiABKAlSBHRleHQSFAoFaW1hZ2UYAyABKAxSBWltYWdl'); + 'NhdGlvbklkEhIKBHRleHQYAiABKAlSBHRleHQSFAoFaW1hZ2UYAyABKAxSBWltYWdlEicKD2Vu' + 'YWJsZV90aGlua2luZxgEIAEoCFIOZW5hYmxlVGhpbmtpbmc='); @$core.Deprecated('Use chatWithAudioRequestDescriptor instead') const ChatWithAudioRequest$json = { @@ -138,13 +142,15 @@ const ChatWithAudioRequest$json = { {'1': 'conversation_id', '3': 1, '4': 1, '5': 9, '10': 'conversationId'}, {'1': 'text', '3': 2, '4': 1, '5': 9, '10': 'text'}, {'1': 'audio', '3': 3, '4': 1, '5': 12, '10': 'audio'}, + {'1': 'enable_thinking', '3': 4, '4': 1, '5': 8, '10': 'enableThinking'}, ], }; /// Descriptor for `ChatWithAudioRequest`. Decode as a `google.protobuf.DescriptorProto`. final $typed_data.Uint8List chatWithAudioRequestDescriptor = $convert.base64Decode( 'ChRDaGF0V2l0aEF1ZGlvUmVxdWVzdBInCg9jb252ZXJzYXRpb25faWQYASABKAlSDmNvbnZlcn' - 'NhdGlvbklkEhIKBHRleHQYAiABKAlSBHRleHQSFAoFYXVkaW8YAyABKAxSBWF1ZGlv'); + 'NhdGlvbklkEhIKBHRleHQYAiABKAlSBHRleHQSFAoFYXVkaW8YAyABKAxSBWF1ZGlvEicKD2Vu' + 'YWJsZV90aGlua2luZxgEIAEoCFIOZW5hYmxlVGhpbmtpbmc='); @$core.Deprecated('Use chatResponseDescriptor instead') const ChatResponse$json = { @@ -153,13 +159,14 @@ const ChatResponse$json = { {'1': 'text', '3': 1, '4': 1, '5': 9, '10': 'text'}, {'1': 'done', '3': 2, '4': 1, '5': 8, '10': 'done'}, {'1': 'error', '3': 3, '4': 1, '5': 9, '10': 'error'}, + {'1': 'thinking', '3': 4, '4': 1, '5': 9, '10': 'thinking'}, ], }; /// Descriptor for `ChatResponse`. Decode as a `google.protobuf.DescriptorProto`. 
final $typed_data.Uint8List chatResponseDescriptor = $convert.base64Decode( 'CgxDaGF0UmVzcG9uc2USEgoEdGV4dBgBIAEoCVIEdGV4dBISCgRkb25lGAIgASgIUgRkb25lEh' - 'QKBWVycm9yGAMgASgJUgVlcnJvcg=='); + 'QKBWVycm9yGAMgASgJUgVlcnJvchIaCgh0aGlua2luZxgEIAEoCVIIdGhpbmtpbmc='); @$core.Deprecated('Use cancelGenerationRequestDescriptor instead') const CancelGenerationRequest$json = { diff --git a/lib/desktop/grpc_client.dart b/lib/desktop/grpc_client.dart index ac171909..6f1f7739 100644 --- a/lib/desktop/grpc_client.dart +++ b/lib/desktop/grpc_client.dart @@ -111,7 +111,7 @@ class LiteRtLmClient { static const _streamTimeout = Duration(minutes: 5); /// Send a chat message and get streaming response - Stream chat(String text, {String? conversationId}) async* { + Stream chat(String text, {String? conversationId, bool enableThinking = false}) async* { _assertInitialized(); final convId = conversationId ?? _currentConversationId; @@ -121,7 +121,8 @@ class LiteRtLmClient { final request = ChatRequest() ..conversationId = convId - ..text = text; + ..text = text + ..enableThinking = enableThinking; // Add timeout to prevent infinite hanging await for (final response in _client!.chat(request).timeout( @@ -137,7 +138,11 @@ class LiteRtLmClient { throw Exception('Chat error: ${response.error}'); } - if (response.hasText()) { + if (response.hasThinking() && response.thinking.isNotEmpty) { + yield '<|channel>thought\n${response.thinking}'; + } + + if (response.hasText() && response.text.isNotEmpty) { yield response.text; } } @@ -148,6 +153,7 @@ class LiteRtLmClient { String text, Uint8List imageBytes, { String? 
conversationId, + bool enableThinking = false, }) async* { _assertInitialized(); debugPrint('[LiteRtLmClient] chatWithImage: text=${text.length} chars, image=${imageBytes.length} bytes'); @@ -160,7 +166,8 @@ class LiteRtLmClient { final request = ChatWithImageRequest() ..conversationId = convId ..text = text - ..image = imageBytes; + ..image = imageBytes + ..enableThinking = enableThinking; // Add timeout to prevent infinite hanging await for (final response in _client!.chatWithImage(request).timeout( @@ -176,7 +183,11 @@ class LiteRtLmClient { throw Exception('Chat error: ${response.error}'); } - if (response.hasText()) { + if (response.hasThinking() && response.thinking.isNotEmpty) { + yield '<|channel>thought\n${response.thinking}'; + } + + if (response.hasText() && response.text.isNotEmpty) { yield response.text; } } @@ -187,6 +198,7 @@ class LiteRtLmClient { String text, Uint8List imageBytes, { String? conversationId, + bool enableThinking = false, }) async { _assertInitialized(); @@ -198,7 +210,8 @@ class LiteRtLmClient { final request = ChatWithImageRequest() ..conversationId = convId ..text = text - ..image = imageBytes; + ..image = imageBytes + ..enableThinking = enableThinking; final response = await _client!.chatWithImageSync(request); @@ -214,6 +227,7 @@ class LiteRtLmClient { String text, Uint8List audioBytes, { String? 
conversationId, + bool enableThinking = false, }) async* { _assertInitialized(); @@ -225,7 +239,8 @@ class LiteRtLmClient { final request = ChatWithAudioRequest() ..conversationId = convId ..text = text - ..audio = audioBytes; + ..audio = audioBytes + ..enableThinking = enableThinking; // Add timeout to prevent infinite hanging await for (final response in _client!.chatWithAudio(request).timeout( @@ -241,7 +256,11 @@ class LiteRtLmClient { throw Exception('Chat error: ${response.error}'); } - if (response.hasText()) { + if (response.hasThinking() && response.thinking.isNotEmpty) { + yield '<|channel>thought\n${response.thinking}'; + } + + if (response.hasText() && response.text.isNotEmpty) { yield response.text; } } diff --git a/lib/flutter_gemma_interface.dart b/lib/flutter_gemma_interface.dart index 59d48f1f..059bb1eb 100644 --- a/lib/flutter_gemma_interface.dart +++ b/lib/flutter_gemma_interface.dart @@ -136,6 +136,7 @@ abstract class InferenceModel { bool? enableVisionModality, // Add vision modality support bool? enableAudioModality, // Add audio modality support (Gemma 3n E4B) String? systemInstruction, + bool enableThinking = false, // Enable thinking mode (Gemma 4 via extraContext) }); Future createChat({ @@ -164,6 +165,7 @@ abstract class InferenceModel { enableVisionModality: supportImage ?? false, enableAudioModality: supportAudio ?? false, systemInstruction: systemInstruction, + enableThinking: isThinking, ), maxTokens: maxTokens, tokenBuffer: tokenBuffer, @@ -171,10 +173,10 @@ abstract class InferenceModel { supportAudio: supportAudio ?? false, supportsFunctionCalls: supportsFunctionCalls ?? false, tools: tools, - isThinking: isThinking, // Pass isThinking parameter - modelType: modelType ?? ModelType.gemmaIt, // Use provided modelType or default - fileType: fileType, // Pass fileType from model - toolChoice: toolChoice, // Pass tool calling mode + isThinking: isThinking, + modelType: modelType ?? 
ModelType.gemmaIt, + fileType: fileType, + toolChoice: toolChoice, systemInstruction: systemInstruction, ); await chat!.initSession(); diff --git a/lib/mobile/flutter_gemma_mobile_inference_model.dart b/lib/mobile/flutter_gemma_mobile_inference_model.dart index a460164d..596262d0 100644 --- a/lib/mobile/flutter_gemma_mobile_inference_model.dart +++ b/lib/mobile/flutter_gemma_mobile_inference_model.dart @@ -43,6 +43,7 @@ class MobileInferenceModel extends InferenceModel { enableVisionModality: supportImage ?? false, enableAudioModality: supportAudio ?? this.supportAudio, systemInstruction: systemInstruction, + enableThinking: isThinking, ), maxTokens: maxTokens, tokenBuffer: tokenBuffer, @@ -86,6 +87,7 @@ class MobileInferenceModel extends InferenceModel { bool? enableVisionModality, bool? enableAudioModality, String? systemInstruction, + bool enableThinking = false, }) async { if (_isClosed) { throw StateError('Model is closed. Create a new instance to use it again'); @@ -109,6 +111,7 @@ class MobileInferenceModel extends InferenceModel { // Enable audio modality if the model supports it (Gemma 3n E4B) enableAudioModality: enableAudioModality ?? supportAudio, systemInstruction: systemInstruction, + enableThinking: enableThinking, ); final session = _session = MobileInferenceModelSession( diff --git a/lib/pigeon.g.dart b/lib/pigeon.g.dart index 03682ef4..ea24405d 100644 --- a/lib/pigeon.g.dart +++ b/lib/pigeon.g.dart @@ -231,7 +231,7 @@ class PlatformService { } } - Future createSession({required double temperature, required int randomSeed, required int topK, double? topP, String? loraPath, bool? enableVisionModality, bool? enableAudioModality, String? systemInstruction, }) async { + Future createSession({required double temperature, required int randomSeed, required int topK, double? topP, String? loraPath, bool? enableVisionModality, bool? enableAudioModality, String? systemInstruction, bool? 
enableThinking, }) async { final String pigeonVar_channelName = 'dev.flutter.pigeon.flutter_gemma.PlatformService.createSession$pigeonVar_messageChannelSuffix'; final BasicMessageChannel pigeonVar_channel = BasicMessageChannel( pigeonVar_channelName, @@ -239,7 +239,7 @@ class PlatformService { binaryMessenger: pigeonVar_binaryMessenger, ); final List? pigeonVar_replyList = - await pigeonVar_channel.send([temperature, randomSeed, topK, topP, loraPath, enableVisionModality, enableAudioModality, systemInstruction]) as List?; + await pigeonVar_channel.send([temperature, randomSeed, topK, topP, loraPath, enableVisionModality, enableAudioModality, systemInstruction, enableThinking]) as List?; if (pigeonVar_replyList == null) { throw _createConnectionError(pigeonVar_channelName); } else if (pigeonVar_replyList.length > 1) { @@ -741,7 +741,7 @@ class PlatformService { /// /// **Performance:** /// - Returns all documents in single call - /// - Embeddings as List (decoded from BLOB) + /// - Embeddings as `List` (decoded from BLOB) /// /// Returns empty list if no documents stored. Future> getAllDocumentsWithEmbeddings() async { diff --git a/lib/web/flutter_gemma_web.dart b/lib/web/flutter_gemma_web.dart index 5d61ba95..2bf71e66 100644 --- a/lib/web/flutter_gemma_web.dart +++ b/lib/web/flutter_gemma_web.dart @@ -391,7 +391,16 @@ class WebInferenceModel extends InferenceModel { bool? enableVisionModality, // Enabling vision modality support bool? enableAudioModality, // Enabling audio modality support (Gemma 3n E4B) String? systemInstruction, + bool enableThinking = false, // Not supported on Web (MediaPipe) }) async { + // Thinking mode not supported on Web (MediaPipe has no extraContext/channels API) + if (enableThinking) { + if (kDebugMode) { + debugPrint('Warning: enableThinking is not supported on Web (MediaPipe). 
' + 'Use Android or Desktop with .litertlm models for Gemma 4 thinking mode.'); + } + } + // TODO: Implement vision modality for web if (enableVisionModality == true) { if (kDebugMode) { @@ -504,6 +513,7 @@ class WebInferenceModel extends InferenceModel { Future close() async { await session?.close(); session = null; + _initCompleter = null; onClose(); } } @@ -740,6 +750,7 @@ class WebModelSession extends InferenceModelSession { debugPrint('โŒ getResponse: Exception caught: $e'); debugPrint('โŒ getResponse: Stack trace: $stackTrace'); } + _promptParts.clear(); rethrow; } } @@ -750,6 +761,8 @@ class WebModelSession extends InferenceModelSession { debugPrint('๐ŸŒŠ getResponseAsync: Starting async response generation'); } + // Close previous controller to prevent leak if called again before completion + _controller?.close(); _controller = StreamController(); try { @@ -834,10 +847,17 @@ class WebModelSession extends InferenceModelSession { @override Future stopGeneration() async { - llmInference.cancelProcessing(); - _controller?.close(); - _controller = null; - _promptParts.clear(); + try { + llmInference.cancelProcessing(); + } catch (e) { + if (kDebugMode) { + debugPrint('[WebModelSession] cancelProcessing error: $e'); + } + } finally { + _controller?.close(); + _controller = null; + _promptParts.clear(); + } } @override diff --git a/lib/web/litert_web_embeddings.dart b/lib/web/litert_web_embeddings.dart index 85f6142e..82bbbded 100644 --- a/lib/web/litert_web_embeddings.dart +++ b/lib/web/litert_web_embeddings.dart @@ -87,7 +87,7 @@ class LiteRTWebEmbeddings { /// /// [text] - Text to embed /// - /// Returns [List] - Embedding vector (768 dimensions) + /// Returns `List` - Embedding vector (768 dimensions) /// /// Throws [Exception] if not initialized or generation fails static Future> generateEmbedding(String text) async { @@ -122,7 +122,7 @@ class LiteRTWebEmbeddings { /// /// [text] - Text to embed /// - /// Returns [List] - Embedding vector (768 dimensions) + 
/// Returns `List` - Embedding vector (768 dimensions) /// /// Throws [Exception] if not initialized or generation fails static Future> generateDocumentEmbedding(String text) async { @@ -154,7 +154,7 @@ class LiteRTWebEmbeddings { /// /// [texts] - List of texts to embed /// - /// Returns [List>] - List of embedding vectors + /// Returns `List>` - List of embedding vectors /// /// Throws [Exception] if not initialized or generation fails static Future>> generateEmbeddings(List texts) async { diff --git a/lib/web/vector_store_web.dart b/lib/web/vector_store_web.dart index cd93f5be..1c0feb6a 100644 --- a/lib/web/vector_store_web.dart +++ b/lib/web/vector_store_web.dart @@ -67,8 +67,8 @@ extension type SQLiteVectorStore._(JSObject _) implements JSObject { /// Add document with embedding (Dart-friendly API) /// /// Type conversions: - /// - Dart List โ†’ JS Array - /// - Dart String? โ†’ JS String | null + /// - Dart `List` โ†’ JS `Array` + /// - Dart `String?` โ†’ JS `String | null` /// /// Throws: /// - Dimension mismatch @@ -86,10 +86,10 @@ extension type SQLiteVectorStore._(JSObject _) implements JSObject { /// Search for similar documents (Dart-friendly API) /// /// Type conversions: - /// - Dart List โ†’ JS Array + /// - Dart `List` โ†’ JS `Array` /// - Dart int โ†’ JS Number /// - Dart double โ†’ JS Number - /// - JS Array โ†’ Dart List + /// - JS `Array` โ†’ Dart `List` /// /// Returns: /// - List sorted by similarity (descending) diff --git a/linux/scripts/setup_desktop.sh b/linux/scripts/setup_desktop.sh index 6e3732b7..4ec05c86 100755 --- a/linux/scripts/setup_desktop.sh +++ b/linux/scripts/setup_desktop.sh @@ -59,9 +59,9 @@ JRE_URL="https://cdn.azul.com/zulu/bin/${JRE_ARCHIVE}" # JAR settings JAR_NAME="litertlm-server.jar" -JAR_VERSION="0.13.0" +JAR_VERSION="0.13.1" JAR_URL="https://github.com/DenisovAV/flutter_gemma/releases/download/v${JAR_VERSION}/${JAR_NAME}" -JAR_CHECKSUM="61191862ae56f130366f5539e0a2d36adc9cb4ea99fe6568fb9a7b7cd2e88f02" 
+JAR_CHECKSUM="97e01020f921c098f7cfc0a9509e4b207b8bc326703ae2f26bbce3c11b957430" # Plugin root (parent of linux/) PLUGIN_ROOT=$(dirname "$PLUGIN_DIR") @@ -81,8 +81,8 @@ verify_checksum() { elif command -v shasum &> /dev/null; then actual=$(shasum -a 256 "$file" | awk '{print $1}') else - echo "WARNING: No sha256sum or shasum available, skipping checksum verification" - return 0 + echo "ERROR: No sha256sum or shasum available, cannot verify checksum" + return 1 fi if [ "$actual" != "$expected" ]; then @@ -177,6 +177,15 @@ setup_jar() { if [ -z "$JAR_SOURCE" ]; then local CACHED_JAR="$CACHE_DIR/jar/$JAR_NAME" + if [ -f "$CACHED_JAR" ] && [ -n "$JAR_CHECKSUM" ]; then + # Verify cached JAR checksum + echo "Verifying cached JAR checksum..." + if ! verify_checksum "$CACHED_JAR" "$JAR_CHECKSUM"; then + echo "Cached JAR checksum mismatch, re-downloading..." + rm -f "$CACHED_JAR" + fi + fi + if [ ! -f "$CACHED_JAR" ]; then echo "Downloading JAR from $JAR_URL..." curl -L --progress-bar -o "$CACHED_JAR" "$JAR_URL" || { @@ -185,7 +194,7 @@ setup_jar() { exit 1 } - # Verify checksum (skip if not yet available for this version) + # Verify checksum if [ -n "$JAR_CHECKSUM" ]; then echo "Verifying JAR checksum..." if ! 
verify_checksum "$CACHED_JAR" "$JAR_CHECKSUM"; then @@ -196,7 +205,7 @@ setup_jar() { echo "WARNING: JAR checksum not set, skipping verification" fi else - echo "Using cached JAR" + echo "Using cached JAR (checksum verified)" fi JAR_SOURCE="$CACHED_JAR" diff --git a/litertlm-server/build.gradle.kts b/litertlm-server/build.gradle.kts index 5729f814..61c77315 100644 --- a/litertlm-server/build.gradle.kts +++ b/litertlm-server/build.gradle.kts @@ -7,7 +7,7 @@ plugins { } group = "dev.flutterberlin" -version = "0.13.0" +version = "0.13.1" repositories { mavenCentral() @@ -16,7 +16,7 @@ repositories { dependencies { // LiteRT-LM JVM (only version with Contents API for multimodal) - implementation("com.google.ai.edge.litertlm:litertlm-jvm:0.9.0") + implementation("com.google.ai.edge.litertlm:litertlm-jvm:0.10.0") // gRPC + Protobuf implementation("io.grpc:grpc-kotlin-stub:1.4.1") diff --git a/litertlm-server/src/main/kotlin/dev/flutterberlin/litertlm/LiteRtLmServiceImpl.kt b/litertlm-server/src/main/kotlin/dev/flutterberlin/litertlm/LiteRtLmServiceImpl.kt index 32a9248f..c5c56137 100644 --- a/litertlm-server/src/main/kotlin/dev/flutterberlin/litertlm/LiteRtLmServiceImpl.kt +++ b/litertlm-server/src/main/kotlin/dev/flutterberlin/litertlm/LiteRtLmServiceImpl.kt @@ -203,21 +203,23 @@ class LiteRtLmServiceImpl : LiteRtLmServiceGrpcKt.LiteRtLmServiceCoroutineImplBa logger.info("=== CHAT REQUEST ===") logger.info("conversationId: '${request.conversationId}'") logger.info("text: '${request.text}' (length=${request.text.length})") - logger.info("text bytes: ${request.text.toByteArray().take(20).map { it.toInt() and 0xFF }}") + logger.info("enableThinking: ${request.enableThinking}") // Use Contents format (like Android does) val message = Contents.of(listOf(Content.Text(request.text))) - logger.info("Created Contents: $message") - // Use callback-based API (like Android does) - conversation.sendMessageAsync(message, object : MessageCallback { + val extraContext = if 
(request.enableThinking) mapOf("enable_thinking" to true) else emptyMap() + + val messageCallback = object : MessageCallback { override fun onMessage(msg: Message) { - trySend( - ChatResponse.newBuilder() - .setText(msg.toString()) - .setDone(false) - .build() - ) + val builder = ChatResponse.newBuilder() + .setText(msg.toString()) + .setDone(false) + val thinking = msg.channels["thought"] + if (!thinking.isNullOrEmpty()) { + builder.setThinking(thinking) + } + trySend(builder.build()) } override fun onDone() { @@ -240,7 +242,14 @@ class LiteRtLmServiceImpl : LiteRtLmServiceGrpcKt.LiteRtLmServiceCoroutineImplBa ) close(throwable) } - }) + } + + // Use callback-based API (like Android does) + if (extraContext.isNotEmpty()) { + conversation.sendMessageAsync(message, messageCallback, extraContext) + } else { + conversation.sendMessageAsync(message, messageCallback) + } } catch (e: Exception) { logger.error("Error starting chat", e) trySend( @@ -267,16 +276,25 @@ class LiteRtLmServiceImpl : LiteRtLmServiceGrpcKt.LiteRtLmServiceCoroutineImplBa logger.info("ChatWithImageSync: text='${request.text.take(50)}', imageBytes=${imageBytes.size}") val message = buildContents(request.text, imageBytes = imageBytes) + val extraContext = if (request.enableThinking) mapOf("enable_thinking" to true) else emptyMap() logger.info("Calling SYNC sendMessage...") - val response = conversation.sendMessage(message) + val response = if (extraContext.isNotEmpty()) { + conversation.sendMessage(message, extraContext) + } else { + conversation.sendMessage(message) + } val responseText = response.toString() + val thinking = response.channels["thought"] logger.info("Sync response (${responseText.length} chars): ${responseText.take(200)}") - ChatResponse.newBuilder() + val builder = ChatResponse.newBuilder() .setText(responseText) .setDone(true) - .build() + if (!thinking.isNullOrEmpty()) { + builder.setThinking(thinking) + } + builder.build() } catch (e: Exception) { logger.error("Error during sync 
chat with image", e) ChatResponse.newBuilder() @@ -323,23 +341,25 @@ class LiteRtLmServiceImpl : LiteRtLmServiceGrpcKt.LiteRtLmServiceCoroutineImplBa logger.info("Image header: $header (JPEG=FFD8, PNG=89504E47)") } val message = buildContents(request.text, imageBytes = imageBytes) + val extraContext = if (request.enableThinking) mapOf("enable_thinking" to true) else emptyMap() logger.info("Sending message to conversation...") var responseCount = 0 - // Use callback-based API (like Android does) - conversation.sendMessageAsync(message, object : MessageCallback { + val messageCallback = object : MessageCallback { override fun onMessage(msg: Message) { responseCount++ if (responseCount <= 3) { logger.info("Response chunk $responseCount: '${msg.toString().take(100)}'") } - trySend( - ChatResponse.newBuilder() - .setText(msg.toString()) - .setDone(false) - .build() - ) + val builder = ChatResponse.newBuilder() + .setText(msg.toString()) + .setDone(false) + val thinking = msg.channels["thought"] + if (!thinking.isNullOrEmpty()) { + builder.setThinking(thinking) + } + trySend(builder.build()) } override fun onDone() { @@ -362,7 +382,14 @@ class LiteRtLmServiceImpl : LiteRtLmServiceGrpcKt.LiteRtLmServiceCoroutineImplBa ) close(throwable) } - }) + } + + // Use callback-based API (like Android does) + if (extraContext.isNotEmpty()) { + conversation.sendMessageAsync(message, messageCallback, extraContext) + } else { + conversation.sendMessageAsync(message, messageCallback) + } } catch (e: Exception) { logger.error("Error starting chat with image", e) trySend( @@ -413,24 +440,26 @@ class LiteRtLmServiceImpl : LiteRtLmServiceGrpcKt.LiteRtLmServiceCoroutineImplBa } val message = buildContents(request.text, audioBytes = audioBytes) + val extraContext = if (request.enableThinking) mapOf("enable_thinking" to true) else emptyMap() logger.info("Sending message to conversation...") var responseCount = 0 - // Use callback-based API (like Android does) - 
conversation.sendMessageAsync(message, object : MessageCallback { + val messageCallback = object : MessageCallback { override fun onMessage(msg: Message) { responseCount++ val responseText = msg.toString() if (responseCount <= 3) { logger.info("Response chunk $responseCount: '${responseText.take(100)}'") } - trySend( - ChatResponse.newBuilder() - .setText(responseText) - .setDone(false) - .build() - ) + val builder = ChatResponse.newBuilder() + .setText(responseText) + .setDone(false) + val thinking = msg.channels["thought"] + if (!thinking.isNullOrEmpty()) { + builder.setThinking(thinking) + } + trySend(builder.build()) } override fun onDone() { @@ -453,7 +482,14 @@ class LiteRtLmServiceImpl : LiteRtLmServiceGrpcKt.LiteRtLmServiceCoroutineImplBa ) close(throwable) } - }) + } + + // Use callback-based API (like Android does) + if (extraContext.isNotEmpty()) { + conversation.sendMessageAsync(message, messageCallback, extraContext) + } else { + conversation.sendMessageAsync(message, messageCallback) + } } catch (e: Exception) { logger.error("Error starting chat with audio", e) trySend( @@ -569,10 +605,7 @@ class LiteRtLmServiceImpl : LiteRtLmServiceGrpcKt.LiteRtLmServiceCoroutineImplBa // Read image (JPEG, PNG, BMP, etc.) 
val inputStream = ByteArrayInputStream(imageBytes) val bufferedImage = ImageIO.read(inputStream) - if (bufferedImage == null) { - logger.warn("Failed to read image, returning original bytes") - return imageBytes - } + ?: throw IllegalArgumentException("Failed to read image: unsupported format or corrupt data") logger.info("Read image: ${bufferedImage.width}x${bufferedImage.height}, type=${bufferedImage.type}") @@ -589,8 +622,7 @@ class LiteRtLmServiceImpl : LiteRtLmServiceGrpcKt.LiteRtLmServiceCoroutineImplBa pngBytes } catch (e: Exception) { - logger.error("Failed to convert image to PNG: ${e.message}", e) - imageBytes // Return original on error + throw IllegalArgumentException("Image conversion to PNG failed: ${e.message}", e) } } } diff --git a/litertlm-server/src/main/proto/litertlm.proto b/litertlm-server/src/main/proto/litertlm.proto index 5108161b..fe0cf2c2 100644 --- a/litertlm-server/src/main/proto/litertlm.proto +++ b/litertlm-server/src/main/proto/litertlm.proto @@ -74,24 +74,28 @@ message CreateConversationResponse { message ChatRequest { string conversation_id = 1; string text = 2; + bool enable_thinking = 3; } message ChatWithImageRequest { string conversation_id = 1; string text = 2; bytes image = 3; // Image bytes (JPEG/PNG) + bool enable_thinking = 4; } message ChatWithAudioRequest { string conversation_id = 1; string text = 2; bytes audio = 3; // Audio bytes (PCM 16kHz, 16-bit, mono) + bool enable_thinking = 4; } message ChatResponse { string text = 1; // Partial or complete text bool done = 2; // Is generation complete string error = 3; + string thinking = 4; // Thinking channel content (Gemma 4) } message CancelGenerationRequest { diff --git a/macos/scripts/prepare_resources.sh b/macos/scripts/prepare_resources.sh index 8b593ba1..8002d53d 100755 --- a/macos/scripts/prepare_resources.sh +++ b/macos/scripts/prepare_resources.sh @@ -39,9 +39,9 @@ TFLITE_CACHE_DIR="$HOME/Library/Caches/flutter_gemma/tflite" # JAR settings 
JAR_NAME="litertlm-server.jar" -JAR_VERSION="0.13.0" +JAR_VERSION="0.13.1" JAR_URL="https://github.com/DenisovAV/flutter_gemma/releases/download/v${JAR_VERSION}/${JAR_NAME}" -JAR_CHECKSUM="61191862ae56f130366f5539e0a2d36adc9cb4ea99fe6568fb9a7b7cd2e88f02" +JAR_CHECKSUM="97e01020f921c098f7cfc0a9509e4b207b8bc326703ae2f26bbce3c11b957430" JAR_CACHE_DIR="$HOME/Library/Caches/flutter_gemma/jar" # Create Resources directory @@ -147,9 +147,23 @@ download_jar() { local cached_jar="$JAR_CACHE_DIR/$JAR_NAME" if [[ -f "$cached_jar" ]]; then - echo "Using cached JAR" >&2 - echo "$cached_jar" - return 0 + # Verify cached JAR checksum before reuse + if [[ -n "$JAR_CHECKSUM" ]]; then + local actual_checksum + actual_checksum=$(shasum -a 256 "$cached_jar" | awk '{print $1}') + if [[ "$actual_checksum" != "$JAR_CHECKSUM" ]]; then + echo "Cached JAR checksum mismatch, re-downloading..." >&2 + rm -f "$cached_jar" + else + echo "Using cached JAR (checksum verified)" >&2 + echo "$cached_jar" + return 0 + fi + else + echo "Using cached JAR" >&2 + echo "$cached_jar" + return 0 + fi fi if ! curl -L -o "$cached_jar" "$JAR_URL" --fail --retry 3 --progress-bar; then diff --git a/macos/scripts/setup_desktop.sh b/macos/scripts/setup_desktop.sh index 16bd2262..77f16ecc 100755 --- a/macos/scripts/setup_desktop.sh +++ b/macos/scripts/setup_desktop.sh @@ -58,9 +58,9 @@ JRE_CHECKSUM_X64="4a36280b411db58952bc97a26f96b184222b23d36ea5008a6ee34744989ff9 # JAR settings JAR_NAME="litertlm-server.jar" -JAR_VERSION="0.12.6" +JAR_VERSION="0.13.1" JAR_URL="https://github.com/DenisovAV/flutter_gemma/releases/download/v${JAR_VERSION}/${JAR_NAME}" -JAR_CHECKSUM="fefc53d076533de164b5ce07c65f9aedc4739f83efc93e67625f0d90029ae5b7" +JAR_CHECKSUM="97e01020f921c098f7cfc0a9509e4b207b8bc326703ae2f26bbce3c11b957430" JAR_CACHE_DIR="$HOME/Library/Caches/flutter_gemma/jar" echo "Plugin root: $PLUGIN_ROOT" @@ -313,10 +313,20 @@ setup_jar() { # 3. 
Download as fallback if [[ -z "$jar_source" ]]; then - # Check cache first + # Check cache first (with checksum verification) local cached_jar="$JAR_CACHE_DIR/$JAR_NAME" if [[ -f "$cached_jar" ]]; then - echo "Using cached JAR" + if [[ -n "$JAR_CHECKSUM" ]]; then + local actual_checksum + actual_checksum=$(shasum -a 256 "$cached_jar" | awk '{print $1}') + if [[ "$actual_checksum" != "$JAR_CHECKSUM" ]]; then + echo "Cached JAR checksum mismatch, re-downloading..." >&2 + rm -f "$cached_jar" + fi + fi + fi + if [[ -f "$cached_jar" ]]; then + echo "Using cached JAR (checksum verified)" jar_source="$cached_jar" else if jar_source=$(download_jar); then diff --git a/pigeon.dart b/pigeon.dart index d9a76507..8d5f0316 100644 --- a/pigeon.dart +++ b/pigeon.dart @@ -53,6 +53,8 @@ abstract class PlatformService { bool? enableAudioModality, // System instruction for LiteRT-LM native support String? systemInstruction, + // Enable thinking mode (Gemma 4 via extraContext) + bool? enableThinking, }); @async diff --git a/pubspec.yaml b/pubspec.yaml index db1aa1c3..dcc6fcae 100644 --- a/pubspec.yaml +++ b/pubspec.yaml @@ -1,6 +1,6 @@ name: flutter_gemma description: "The plugin allows running the Gemma AI model locally on a device from a Flutter application. Includes support for Gemma 3 Nano models with optimized MediaPipe GenAI v0.10.33." 
-version: 0.13.0 +version: 0.13.1 homepage: https://github.com/DenisovAV/flutter_gemma repository: https://github.com/DenisovAV/flutter_gemma diff --git a/test/core/gemma4_thinking_test.dart b/test/core/gemma4_thinking_test.dart new file mode 100644 index 00000000..6d004bc0 --- /dev/null +++ b/test/core/gemma4_thinking_test.dart @@ -0,0 +1,200 @@ +import 'package:flutter_test/flutter_test.dart'; +import 'package:flutter_gemma/core/extensions.dart'; +import 'package:flutter_gemma/core/model.dart'; +import 'package:flutter_gemma/core/model_response.dart'; + +void main() { + group('Gemma 4 thinking - filterThinkingStream', () { + Stream makeStream(List chunks) { + return Stream.fromIterable(chunks.map((c) => TextResponse(c))); + } + + test('complete block in single chunk yields ThinkingResponse + TextResponse', () async { + final stream = makeStream([ + '<|channel>thought\nI need to think about this.The answer is 42.', + ]); + + final results = await ModelThinkingFilter.filterThinkingStream( + stream, + modelType: ModelType.gemmaIt, + ).toList(); + + expect(results, [ + const ThinkingResponse('I need to think about this.'), + const TextResponse('The answer is 42.'), + ]); + }); + + test('thinking split across multiple chunks buffers correctly', () async { + final stream = makeStream([ + '<|channel>thought\nI am ', + 'thinking hard', + 'Final answer.', + ]); + + final results = await ModelThinkingFilter.filterThinkingStream( + stream, + modelType: ModelType.gemmaIt, + ).toList(); + + // Intermediate thinking chunks are yielded as they arrive + final thinkingParts = results.whereType().map((r) => r.content).join(); + final textParts = results.whereType().map((r) => r.token).join(); + + expect(thinkingParts, 'I am thinking hard'); + expect(textParts, 'Final answer.'); + }); + + test('no thinking block passes through as TextResponse', () async { + final stream = makeStream([ + 'Hello, ', + 'world!', + ]); + + final results = await ModelThinkingFilter.filterThinkingStream( 
+ stream, + modelType: ModelType.gemmaIt, + ).toList(); + + final text = results.whereType<TextResponse>().map((r) => r.token).join(); + expect(text, 'Hello, world!'); + expect(results.whereType<ThinkingResponse>(), isEmpty); + }); + + test('multiple thinking blocks in one response', () async { + final stream = makeStream([ + '<|channel>thought\nFirst thought.Text between.<|channel>thought\nSecond thought.Final text.', + ]); + + final results = await ModelThinkingFilter.filterThinkingStream( + stream, + modelType: ModelType.gemmaIt, + ).toList(); + + final thinking = results.whereType<ThinkingResponse>().map((r) => r.content).toList(); + final text = results.whereType<TextResponse>().map((r) => r.token).toList(); + + expect(thinking, ['First thought.', 'Second thought.']); + expect(text, ['Text between.', 'Final text.']); + }); + + test('partial start marker at stream end is flushed as text', () async { + final stream = makeStream([ + 'Some text<|chan', + ]); + + final results = await ModelThinkingFilter.filterThinkingStream( + stream, + modelType: ModelType.gemmaIt, + ).toList(); + + final text = results.whereType<TextResponse>().map((r) => r.token).join(); + expect(text, 'Some text<|chan'); + }); + + test('partial end marker at stream end is flushed as thinking', () async { + final stream = makeStream([ + '<|channel>thought\nThinking content().map((r) => r.content).join(); + expect(thinking, 'Thinking contentthought\nThe answer.', + ]); + + final results = await ModelThinkingFilter.filterThinkingStream( + stream, + modelType: ModelType.gemmaIt, + ).toList(); + + expect(results.whereType<ThinkingResponse>(), isEmpty); + expect(results.whereType<TextResponse>().map((r) => r.token).join(), 'The answer.'); + }); + + test('start marker split across chunks', () async { + final stream = makeStream([ + 'Hello <|channel>', + 'thought\nThinking.Done.', + ]); + + final results = await ModelThinkingFilter.filterThinkingStream( + stream, + modelType: ModelType.gemmaIt, + ).toList(); + + final thinking = results.whereType<ThinkingResponse>().map((r) => r.content).join(); + final text =
results.whereType<TextResponse>().map((r) => r.token).join(); + + expect(thinking, 'Thinking.'); + expect(text, 'Hello Done.'); + }); + }); + + group('Gemma 4 thinking - removeThinkingFromText', () { + test('strips thinking blocks from text', () { + const input = 'Prefix <|channel>thought\nSome reasoning. Suffix'; + final result = ModelThinkingFilter.removeThinkingFromText( + input, + modelType: ModelType.gemmaIt, + ); + expect(result, 'Prefix Suffix'); + }); + + test('strips multiple thinking blocks', () { + const input = '<|channel>thought\nAText<|channel>thought\nBEnd'; + final result = ModelThinkingFilter.removeThinkingFromText( + input, + modelType: ModelType.gemmaIt, + ); + expect(result, 'TextEnd'); + }); + + test('no thinking blocks returns text unchanged', () { + const input = 'Just regular text'; + final result = ModelThinkingFilter.removeThinkingFromText( + input, + modelType: ModelType.gemmaIt, + ); + expect(result, 'Just regular text'); + }); + + test('multiline thinking content is stripped', () { + const input = '<|channel>thought\nLine 1\nLine 2\nLine 3Answer.'; + final result = ModelThinkingFilter.removeThinkingFromText( + input, + modelType: ModelType.gemmaIt, + ); + expect(result, 'Answer.'); + }); + }); + + group('DeepSeek thinking still works', () { + test('filterThinkingStream handles DeepSeek format', () async { + final stream = Stream.fromIterable([ + const TextResponse('<think>I think '), + const TextResponse('about this</think>'), + const TextResponse('The answer.'), + ]); + + final results = await ModelThinkingFilter.filterThinkingStream( + stream, + modelType: ModelType.deepSeek, + ).toList(); + + final thinking = results.whereType<ThinkingResponse>().map((r) => r.content).join(); + final text = results.whereType<TextResponse>().map((r) => r.token).join(); + + expect(thinking.contains('I think '), isTrue); + expect(text, 'The answer.'); + }); + }); +} diff --git a/windows/scripts/setup_desktop.ps1 b/windows/scripts/setup_desktop.ps1 index 10102a06..d1b78817 100644 ---
a/windows/scripts/setup_desktop.ps1 +++ b/windows/scripts/setup_desktop.ps1 @@ -87,9 +87,9 @@ $JreChecksums = @{ # JAR settings $JarName = "litertlm-server.jar" -$JarVersion = "0.13.0" +$JarVersion = "0.13.1" $JarUrl = "https://github.com/DenisovAV/flutter_gemma/releases/download/v$JarVersion/$JarName" -$JarChecksum = "61191862ae56f130366f5539e0a2d36adc9cb4ea99fe6568fb9a7b7cd2e88f02" +$JarChecksum = "97e01020f921c098f7cfc0a9509e4b207b8bc326703ae2f26bbce3c11b957430" $JarCacheDir = "$env:LOCALAPPDATA\flutter_gemma\jar" $PluginRoot = Split-Path -Parent $PluginDir @@ -386,9 +386,23 @@ function Setup-Jar { # Check cache first $cachedJar = "$JarCacheDir\$JarName" if (Test-Path $cachedJar) { - Write-Host "Using cached JAR" -ForegroundColor Green - $jarSource = $cachedJar - } else { + # Verify cached JAR checksum before reuse + if ($JarChecksum) { + $actualChecksum = (Get-FileHash -Path $cachedJar -Algorithm SHA256).Hash.ToLower() + if ($actualChecksum -ne $JarChecksum.ToLower()) { + Write-Host "Cached JAR checksum mismatch, re-downloading..." -ForegroundColor Yellow + Remove-Item -Path $cachedJar -Force + } else { + Write-Host "Using cached JAR (checksum verified)" -ForegroundColor Green + $jarSource = $cachedJar + } + } else { + Write-Host "Using cached JAR" -ForegroundColor Green + $jarSource = $cachedJar + } + } + # Download if no valid cached JAR (missing or checksum mismatch) + if (-not $jarSource) { $jarSource = Download-Jar if (-not $jarSource) { Write-Error "Could not obtain JAR (build failed, download failed)"