diff --git a/.claude/skills/release/SKILL.md b/.claude/skills/release/SKILL.md
new file mode 100644
index 00000000..d608ae46
--- /dev/null
+++ b/.claude/skills/release/SKILL.md
@@ -0,0 +1,106 @@
+---
+name: release
+description: Release flutter_gemma — rebuild JAR, update all version numbers, checksums, CHANGELOG, upload to GitHub release
+user_invocable: true
+---
+
+# Flutter Gemma Release
+
+Complete release checklist for flutter_gemma plugin. Run as `/release <VERSION>` (e.g. `/release 0.14.0`).
+
+## Pre-flight
+
+Before starting, verify you're on the correct branch and all changes are committed:
+```bash
+git status
+git log --oneline -5
+```
+
+## Step 1: Update version numbers
+
+All files that contain the version:
+
+| File | Variable/Field | Example |
+|------|---------------|---------|
+| `pubspec.yaml` | `version:` | `version: <VERSION>` |
+| `ios/flutter_gemma.podspec` | `s.version` | `s.version = '<VERSION>'` |
+| `litertlm-server/build.gradle.kts` | `version =` | `version = "<VERSION>"` |
+| `CLAUDE.md` | `Current Version:` | `- **Current Version**: <VERSION>` |
+| `macos/scripts/setup_desktop.sh:61` | `JAR_VERSION=` | `JAR_VERSION="<VERSION>"` |
+| `macos/scripts/prepare_resources.sh:42` | `JAR_VERSION=` | `JAR_VERSION="<VERSION>"` |
+| `linux/scripts/setup_desktop.sh:62` | `JAR_VERSION=` | `JAR_VERSION="<VERSION>"` |
+| `windows/scripts/setup_desktop.ps1:90` | `$JarVersion =` | `$JarVersion = "<VERSION>"` |
+
+> JAR_URL is auto-derived from JAR_VERSION in all scripts — no separate update needed.
+
+## Step 2: Update CHANGELOG.md
+
+Add new section at top with all changes. Categories: features, fixes, breaking changes.
+
+## Step 3: Build JAR
+
+```bash
+cd litertlm-server && ./gradlew fatJar
+```
+
+Verify build success.
JAR output: `litertlm-server/build/libs/litertlm-server-<VERSION>-all.jar`
+
+## Step 4: Compute new SHA256
+
+```bash
+shasum -a 256 litertlm-server/build/libs/litertlm-server-*-all.jar
+```
+
+## Step 5: Update JAR checksums in all 4 scripts
+
+| File | Variable |
+|------|----------|
+| `macos/scripts/setup_desktop.sh:63` | `JAR_CHECKSUM="<SHA256>"` |
+| `macos/scripts/prepare_resources.sh:44` | `JAR_CHECKSUM="<SHA256>"` |
+| `linux/scripts/setup_desktop.sh:64` | `JAR_CHECKSUM="<SHA256>"` |
+| `windows/scripts/setup_desktop.ps1:92` | `$JarChecksum = "<SHA256>"` |
+
+JAR is cross-platform (JVM bytecode) — same checksum for all platforms.
+
+## Step 6: Verify
+
+```bash
+flutter analyze # 0 errors
+flutter test # all pass
+dart pub publish --dry-run # 0 warnings
+```
+
+**NEVER publish without dry-run first.** Publishing is IRREVERSIBLE.
+
+## Step 7: Create/update GitHub release
+
+```bash
+# Create new release
+gh release create v<VERSION> \
+  litertlm-server/build/libs/litertlm-server-<VERSION>-all.jar \
+  --title "v<VERSION>" \
+  --notes-file CHANGELOG_EXCERPT.md
+
+# OR update existing release (delete old JAR first)
+gh release delete-asset v<VERSION> litertlm-server.jar --yes 2>/dev/null
+gh release upload v<VERSION> litertlm-server/build/libs/litertlm-server-<VERSION>-all.jar
+```
+
+Verify JAR URL returns 200:
+```bash
+curl -sI "https://github.com/DenisovAV/flutter_gemma/releases/download/v<VERSION>/litertlm-server.jar" | head -1
+```
+
+## Step 8: Commit & PR
+
+- Author: `--author="Sasha Denisov <EMAIL>"` (email was stripped from this document — restore the real address)
+- No AI attribution in commit messages
+- No "Co-Authored-By" or "Generated with Claude" footers
+- Create PR via `gh pr create`
+
+## Step 9: After merge — publish
+
+```bash
+dart pub publish --dry-run # verify one more time
+dart pub publish # only after user approval!
+``` diff --git a/CHANGELOG.md b/CHANGELOG.md index aa0f7f75..c023af2d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,9 @@ +## 0.13.1 +- **LiteRT-LM 0.10.0**: Updated Android and JVM SDK from 0.9.0 to 0.10.0 +- **Gemma 4 Thinking Mode**: `isThinking: true` now works with Gemma 4 E2B/E4B models (Android, iOS, Desktop; not Web) +- **Fix cancel download**: Cancel download now works correctly (#196) +- **Fix `large_file_handler` platform support**: Conditional imports for pub.dev platform analysis compatibility + ## 0.13.0 - **Gemma 4 E2B/E4B**: Added support for next-gen multimodal models (text + image + audio) - **systemInstruction**: New parameter in `createChat()` and `createSession()` for setting system-level context diff --git a/CLAUDE.md b/CLAUDE.md index fce52938..9d74e576 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -69,7 +69,7 @@ final token = const String.fromEnvironment('HF_TOKEN'); - ๐Ÿ”ฅ **Local AI Inference** - Run Gemma models directly on device - ๐Ÿ–ผ๏ธ **Multimodal Support** - Text + Image input with Gemma 3 Nano - ๐Ÿ› ๏ธ **Function Calling** - Enable models to call external functions -- ๐Ÿง  **Thinking Mode** - View reasoning process of DeepSeek models +- ๐Ÿง  **Thinking Mode** - View reasoning process of DeepSeek and Gemma 4 models - ๐Ÿ“ฑ **Cross-Platform** - Android, iOS, Web, macOS, Windows, Linux - โšก **GPU Acceleration** - Hardware-accelerated inference - ๐Ÿ”ง **LoRA Support** - Efficient fine-tuning weights @@ -401,6 +401,8 @@ Future close() async { | Model Family | Function Calling | Thinking Mode | Multimodal | Platform Support | |--------------|------------------|---------------|------------|------------------| +| Gemma 4 E2B | โœ… | โœ… ยน | โœ… | Android, iOS, Web, Desktop | +| Gemma 4 E4B | โœ… | โœ… ยน | โœ… | Android, iOS, Web, Desktop | | Gemma 3 Nano | โœ… | โŒ | โœ… | Android, iOS, Web | | Gemma 3 270M | โŒ | โŒ | โŒ | Android, iOS, Web | | Gemma-3 1B | โœ… | โŒ | โŒ | Android, iOS, Web | @@ -411,6 +413,8 @@ Future 
close() async { | Qwen2.5 | โœ… | โŒ | โŒ | Android, iOS, Web | | Phi-4 | โŒ | โŒ | โŒ | Android, iOS, Web | +> ยน Thinking Mode for Gemma 4: Android, iOS, Desktop only. Web (MediaPipe) does not support `extraContext`. + ### Platform Limitations | Platform | Vision/Multimodal | Audio | Embeddings | Notes | @@ -457,10 +461,10 @@ dev_dependencies: ### MediaPipe GenAI Integration -- **Current Version Web**: v0.10.26 +- **Current Version Web**: v0.10.27 - **Current Version Android**: v0.10.33 - **Current Version iOS**: v0.10.33 -- **Web CDN**: `https://cdn.jsdelivr.net/npm/@mediapipe/tasks-genai@0.10.26` +- **Web CDN**: `https://cdn.jsdelivr.net/npm/@mediapipe/tasks-genai@0.10.27` - **iOS/Android**: Integrated via CocoaPods/Gradle ## Development Best Practices @@ -623,7 +627,7 @@ Log.w(TAG, "sizeInTokens: LiteRT-LM does not support token counting. " + **Dependency (build.gradle):** ```gradle -implementation 'com.google.ai.edge.litertlm:litertlm-android:0.9.0-beta' +implementation 'com.google.ai.edge.litertlm:litertlm-android:0.10.0' ``` **Usage (Dart - no changes required):** @@ -642,7 +646,7 @@ await FlutterGemma.installModel(modelType: ModelType.gemmaIt) ```html @@ -1243,7 +1247,7 @@ flutter_gemma/ - **GitHub**: https://github.com/DenisovAV/flutter_gemma - **Pub.dev**: https://pub.dev/packages/flutter_gemma -- **Current Version**: 0.13.0 +- **Current Version**: 0.13.1 - **License**: Check repository for license details - **Issues**: Report bugs via GitHub Issues - **Changelog**: See `CHANGELOG.md` for version history \ No newline at end of file diff --git a/README.md b/README.md index f9167ce5..553a8901 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ **The plugin supports not only Gemma, but also other models. 
Here's the full list of supported models:** [Gemma 4 E2B/E4B](https://huggingface.co/google/gemma-4-E2B-it-litert-lm), [Gemma3n E2B/E4B](https://huggingface.co/google/gemma-3n-E2B-it-litert-preview), [FastVLM 0.5B](https://huggingface.co/litert-community/FastVLM-0.5B), [Gemma-3 1B](https://huggingface.co/litert-community/Gemma3-1B-IT), [Gemma 3 270M](https://huggingface.co/litert-community/gemma-3-270m-it), [FunctionGemma 270M](https://huggingface.co/sasha-denisov/function-gemma-270M-it), [Qwen3 0.6B](https://huggingface.co/litert-community/Qwen3-0.6B), [Qwen 2.5](https://huggingface.co/litert-community/Qwen2.5-1.5B-Instruct), [Phi-4 Mini](https://huggingface.co/litert-community/Phi-4-mini-instruct), [DeepSeek R1](https://huggingface.co/litert-community/DeepSeek-R1-Distill-Qwen-1.5B), [SmolLM 135M](https://huggingface.co/litert-community/SmolLM-135M-Instruct). -*Note: The flutter_gemma plugin supports Gemma3n (with **multimodal vision and audio support**), FastVLM (vision), Gemma-3, FunctionGemma, Qwen3, Qwen 2.5, Phi-4, DeepSeek R1 and SmolLM. Desktop platforms (macOS, Windows, Linux) require `.litertlm` model format. +*Note: The flutter_gemma plugin supports Gemma 4 and Gemma3n (with **multimodal vision and audio support**), FastVLM (vision), Gemma-3, FunctionGemma, Qwen3, Qwen 2.5, Phi-4, DeepSeek R1 and SmolLM. Desktop platforms (macOS, Windows, Linux) require `.litertlm` model format. 
[Gemma](https://ai.google.dev/gemma) is a family of lightweight, state-of-the art open models built from the same research and technology used to create the Gemini models @@ -32,7 +32,7 @@ There is an example of using: - **๐Ÿ–ผ๏ธ Multimodal Support:** Text + Image input with Gemma3n vision models - **๐ŸŽ™๏ธ Audio Input:** Record and send audio messages with Gemma3n E2B/E4B models (Android, Desktop - LiteRT-LM engine) - **๐Ÿ› ๏ธ Function Calling:** Enable your models to call external functions and integrate with other services (supported by select models) -- **๐Ÿง  Thinking Mode:** View the reasoning process of DeepSeek models with blocks +- **๐Ÿง  Thinking Mode:** View the reasoning process of DeepSeek and Gemma 4 models with thinking blocks - **๐Ÿ›‘ Stop Generation:** Cancel text generation mid-process on Android, Web, and Desktop - **โš™๏ธ Backend Switching:** Choose between CPU and GPU backends for each model individually in the example app - **๐Ÿ” Advanced Model Filtering:** Filter models by features (Multimodal, Function Calls, Thinking) with expandable UI @@ -72,8 +72,8 @@ The example app offers a curated list of models, each suited for different tasks | Model Family | Best For | Function Calling | Thinking Mode | Vision | Languages | Size | |---|---|:---:|:---:|:---:|---|---| -| **Gemma 4 E2B** | Next-gen multimodal chat โ€” text, image, audio | โœ… | โŒ | โœ… | Multilingual | 2.4GB | -| **Gemma 4 E4B** | Next-gen multimodal chat โ€” text, image, audio | โœ… | โŒ | โœ… | Multilingual | 4.3GB | +| **Gemma 4 E2B** | Next-gen multimodal chat โ€” text, image, audio | โœ… | โœ… | โœ… | Multilingual | 2.4GB | +| **Gemma 4 E4B** | Next-gen multimodal chat โ€” text, image, audio | โœ… | โœ… | โœ… | Multilingual | 4.3GB | | **Gemma3n** | On-device multimodal chat and image analysis | โœ… | โŒ | โœ… | Multilingual | 3-6GB | | **FastVLM 0.5B** | Fast vision-language inference | โŒ | โŒ | โœ… | Multilingual | 0.5GB | | **Phi-4 Mini** | Advanced reasoning and 
instruction following | โœ… | โŒ | โŒ | Multilingual | 3.9GB | @@ -1544,11 +1544,11 @@ FunctionGemma uses a special format (different from JSON-based function calling) The `flutter_gemma` plugin handles this format automatically via `FunctionCallParser`. -9. **๐Ÿง  Thinking Mode (DeepSeek Models)** +9. **๐Ÿง  Thinking Mode (DeepSeek & Gemma 4 Models)** -DeepSeek models support "thinking mode" where you can see the model's reasoning process before it generates the final response. This provides transparency into how the model approaches problems. +DeepSeek and Gemma 4 (E2B/E4B) models support "thinking mode" where you can see the model's reasoning process before it generates the final response. This provides transparency into how the model approaches problems. -**Enable Thinking Mode:** +**Enable Thinking Mode (DeepSeek):** ```dart final chat = await inferenceModel.createChat( @@ -1559,7 +1559,6 @@ final chat = await inferenceModel.createChat( modelType: ModelType.deepSeek, // Required for DeepSeek models supportsFunctionCalls: true, // DeepSeek also supports function calls tools: _tools, // Optional: add tools for function calling - // tokenBuffer: 256, // Token buffer for context management ); ``` @@ -1586,12 +1585,25 @@ chat.generateChatResponseAsync().listen((response) { }); ``` +**Enable Thinking Mode (Gemma 4):** + +```dart +final chat = await inferenceModel.createChat( + temperature: 1.0, + topK: 64, + topP: 0.95, + isThinking: true, // Enable thinking mode + modelType: ModelType.gemmaIt, // Gemma 4 E2B/E4B +); +// <|think|> is auto-injected into systemInstruction โ€” no manual prompt needed. 
+``` + **Thinking Mode Features:** - โœ… **Transparent Reasoning**: See how the model thinks through problems - โœ… **Interactive UI**: Show/hide thinking bubbles with expandable content - โœ… **Streaming Support**: Thinking content streams in real-time - โœ… **Function Integration**: Models can think before calling functions -- โœ… **DeepSeek Optimized**: Designed specifically for DeepSeek model architecture +- โœ… **Supported Models**: DeepSeek R1 and Gemma 4 E2B/E4B **Example Thinking Flow:** 1. User asks: "Change the background to blue and explain why blue is calming" @@ -2096,7 +2108,7 @@ Function calling is currently supported by the following models: | **Image Input (Multimodal)** | โœ… Full | โœ… Full | โœ… Full | โš ๏ธ Broken (#684) | macOS: model hallucinates | | **Audio Input** | โœ… Full | โœ… Full | โŒ Not supported | โœ… Full | Gemma3n E2B/E4B | | **Function Calling** | โœ… Full | โœ… Full | โœ… Full | โŒ Not supported | LiteRT-LM limitation | -| **Thinking Mode** | โœ… Full | โœ… Full | โœ… Full | โŒ Not supported | DeepSeek models | +| **Thinking Mode** | โœ… Full | โœ… Full | โœ… Full | โœ… Full | DeepSeek & Gemma 4 | | **Stop Generation** | โœ… Full | โœ… Full | โœ… Full | โœ… Full | Cancel mid-process | | **GPU Acceleration** | โœ… Full | โœ… Full | โœ… Full | โš ๏ธ Partial | macOS GPU broken | | **NPU Acceleration** | โœ… Full | โŒ Not supported | โŒ Not supported | โŒ Not supported | Android only (.litertlm) | @@ -2264,13 +2276,14 @@ import 'package:flutter_gemma/core/extensions.dart'; // Clean response based on model type String cleanedResponse = ModelThinkingFilter.cleanResponse( - rawResponse, + rawResponse, ModelType.deepSeek ); // The filter automatically removes model-specific tokens like: // - tags (Gemma models) -// - Special DeepSeek tokens +// - ... blocks (DeepSeek) +// - <|channel>thought\n... 
blocks (Gemma 4 E2B/E4B) // - Extra whitespace and formatting ``` diff --git a/android/build.gradle b/android/build.gradle index a01e18b3..21fc9117 100644 --- a/android/build.gradle +++ b/android/build.gradle @@ -76,7 +76,7 @@ dependencies { implementation 'org.jetbrains.kotlinx:kotlinx-coroutines-guava:1.9.0' // LiteRT-LM Engine for .litertlm model files - implementation 'com.google.ai.edge.litertlm:litertlm-android:0.9.0' + implementation 'com.google.ai.edge.litertlm:litertlm-android:0.10.0' implementation 'androidx.core:core-ktx:1.12.0' implementation 'androidx.lifecycle:lifecycle-runtime-ktx:2.7.0' diff --git a/android/src/main/kotlin/dev/flutterberlin/flutter_gemma/FlutterGemmaPlugin.kt b/android/src/main/kotlin/dev/flutterberlin/flutter_gemma/FlutterGemmaPlugin.kt index 70637a10..3263579a 100644 --- a/android/src/main/kotlin/dev/flutterberlin/flutter_gemma/FlutterGemmaPlugin.kt +++ b/android/src/main/kotlin/dev/flutterberlin/flutter_gemma/FlutterGemmaPlugin.kt @@ -82,8 +82,8 @@ private class PlatformServiceImpl( private val engineLock = Any() // Lock for thread-safe engine access // NEW: Use InferenceEngine abstraction instead of InferenceModel - private var engine: InferenceEngine? = null - private var session: InferenceSession? = null + @Volatile private var engine: InferenceEngine? = null + @Volatile private var session: InferenceSession? = null // RAG components private var embeddingModel: EmbeddingModel? 
= null @@ -130,6 +130,9 @@ private class PlatformServiceImpl( // Only now clear old state and swap in new engine (thread-safe) synchronized(engineLock) { + // Cancel stale stream collector before replacing engine + streamJob?.cancel() + streamJob = null session?.cancelGeneration() try { session?.close() @@ -176,6 +179,7 @@ private class PlatformServiceImpl( enableVisionModality: Boolean?, enableAudioModality: Boolean?, systemInstruction: String?, + enableThinking: Boolean?, callback: (Result) -> Unit ) { scope.launch { @@ -193,6 +197,7 @@ private class PlatformServiceImpl( enableVisionModality = enableVisionModality, enableAudioModality = enableAudioModality, systemInstruction = systemInstruction, + enableThinking = enableThinking ?: false, ) session?.close() diff --git a/android/src/main/kotlin/dev/flutterberlin/flutter_gemma/PigeonInterface.g.kt b/android/src/main/kotlin/dev/flutterberlin/flutter_gemma/PigeonInterface.g.kt index d758bd63..7ec66918 100644 --- a/android/src/main/kotlin/dev/flutterberlin/flutter_gemma/PigeonInterface.g.kt +++ b/android/src/main/kotlin/dev/flutterberlin/flutter_gemma/PigeonInterface.g.kt @@ -203,7 +203,7 @@ private open class PigeonInterfacePigeonCodec : StandardMessageCodec() { interface PlatformService { fun createModel(maxTokens: Long, modelPath: String, loraRanks: List?, preferredBackend: PreferredBackend?, maxNumImages: Long?, supportAudio: Boolean?, callback: (Result) -> Unit) fun closeModel(callback: (Result) -> Unit) - fun createSession(temperature: Double, randomSeed: Long, topK: Long, topP: Double?, loraPath: String?, enableVisionModality: Boolean?, enableAudioModality: Boolean?, systemInstruction: String?, callback: (Result) -> Unit) + fun createSession(temperature: Double, randomSeed: Long, topK: Long, topP: Double?, loraPath: String?, enableVisionModality: Boolean?, enableAudioModality: Boolean?, systemInstruction: String?, enableThinking: Boolean?, callback: (Result) -> Unit) fun closeSession(callback: (Result) -> Unit) 
fun sizeInTokens(prompt: String, callback: (Result) -> Unit) fun addQueryChunk(prompt: String, callback: (Result) -> Unit) @@ -315,7 +315,8 @@ interface PlatformService { val enableVisionModalityArg = args[5] as Boolean? val enableAudioModalityArg = args[6] as Boolean? val systemInstructionArg = args[7] as String? - api.createSession(temperatureArg, randomSeedArg, topKArg, topPArg, loraPathArg, enableVisionModalityArg, enableAudioModalityArg, systemInstructionArg) { result: Result -> + val enableThinkingArg = args[8] as Boolean? + api.createSession(temperatureArg, randomSeedArg, topKArg, topPArg, loraPathArg, enableVisionModalityArg, enableAudioModalityArg, systemInstructionArg, enableThinkingArg) { result: Result -> val error = result.exceptionOrNull() if (error != null) { reply.reply(wrapError(error)) diff --git a/android/src/main/kotlin/dev/flutterberlin/flutter_gemma/engines/EngineConfig.kt b/android/src/main/kotlin/dev/flutterberlin/flutter_gemma/engines/EngineConfig.kt index 292ca1f9..1dc60047 100644 --- a/android/src/main/kotlin/dev/flutterberlin/flutter_gemma/engines/EngineConfig.kt +++ b/android/src/main/kotlin/dev/flutterberlin/flutter_gemma/engines/EngineConfig.kt @@ -28,6 +28,7 @@ data class SessionConfig( val enableVisionModality: Boolean? = null, val enableAudioModality: Boolean? = null, val systemInstruction: String? 
= null, + val enableThinking: Boolean = false, ) /** diff --git a/android/src/main/kotlin/dev/flutterberlin/flutter_gemma/engines/litertlm/LiteRtLmSession.kt b/android/src/main/kotlin/dev/flutterberlin/flutter_gemma/engines/litertlm/LiteRtLmSession.kt index cdece1fd..90b5b8f8 100644 --- a/android/src/main/kotlin/dev/flutterberlin/flutter_gemma/engines/litertlm/LiteRtLmSession.kt +++ b/android/src/main/kotlin/dev/flutterberlin/flutter_gemma/engines/litertlm/LiteRtLmSession.kt @@ -31,6 +31,13 @@ class LiteRtLmSession( private val conversation: Conversation + // Extra context for thinking mode (Gemma 4 via Jinja template variable) + private val extraContext: Map = if (config.enableThinking) { + mapOf("enable_thinking" to true) + } else { + emptyMap() + } + // Chunk buffering (MediaPipe compatibility) - thread-safe access private val pendingPrompt = StringBuilder() private val promptLock = Any() @@ -84,11 +91,23 @@ class LiteRtLmSession( Log.d(TAG, "Generating sync response for message: ${message.toString().length} chars") return try { - val response = conversation.sendMessage(message) - response.toString() + val response = if (extraContext.isNotEmpty()) { + conversation.sendMessage(message, extraContext) + } else { + conversation.sendMessage(message) + } + val thinking = response.channels["thought"] + val text = response.toString() + if (!thinking.isNullOrEmpty()) { + "<|channel>thought\n$thinking$text" + } else { + text + } } catch (e: Exception) { Log.e(TAG, "Error generating response", e) - errorFlow.tryEmit(e) + if (!errorFlow.tryEmit(e)) { + Log.w(TAG, "Error emission dropped (buffer full): ${e.message}") + } throw e } } @@ -97,27 +116,49 @@ class LiteRtLmSession( val message = buildAndConsumeMessage() Log.d(TAG, "Generating async response for message: ${message.toString().length} chars") - try { - // Use callback-based API - conversation.sendMessageAsync(message, object : MessageCallback { - override fun onMessage(message: Message) { - val text = 
message.toString() - resultFlow.tryEmit(text to false) + val callback = object : MessageCallback { + override fun onMessage(msg: Message) { + // Combine thinking + text into single emission to prevent DROP_OLDEST loss + // (buffer=1, two rapid tryEmit calls would drop the first) + val thinking = msg.channels["thought"] + val text = msg.toString() + val combined = buildString { + if (!thinking.isNullOrEmpty()) { + append("<|channel>thought\n$thinking") + } + if (text.isNotEmpty()) { + append(text) + } } - - override fun onDone() { - resultFlow.tryEmit("" to true) + if (combined.isNotEmpty()) { + resultFlow.tryEmit(combined to false) } + } - override fun onError(throwable: Throwable) { - Log.e(TAG, "Async generation error", throwable) - errorFlow.tryEmit(throwable) - resultFlow.tryEmit("" to true) + override fun onDone() { + resultFlow.tryEmit("" to true) + } + + override fun onError(throwable: Throwable) { + Log.e(TAG, "Async generation error", throwable) + if (!errorFlow.tryEmit(throwable)) { + Log.w(TAG, "Error emission dropped (buffer full): ${throwable.message}") } - }) + resultFlow.tryEmit("" to true) + } + } + + try { + if (extraContext.isNotEmpty()) { + conversation.sendMessageAsync(message, callback, extraContext) + } else { + conversation.sendMessageAsync(message, callback) + } } catch (e: Exception) { Log.e(TAG, "Failed to start async generation", e) - errorFlow.tryEmit(e) + if (!errorFlow.tryEmit(e)) { + Log.w(TAG, "Error emission dropped (buffer full): ${e.message}") + } resultFlow.tryEmit("" to true) } } diff --git a/android/src/main/kotlin/dev/flutterberlin/flutter_gemma/engines/mediapipe/MediaPipeEngine.kt b/android/src/main/kotlin/dev/flutterberlin/flutter_gemma/engines/mediapipe/MediaPipeEngine.kt index 07285555..91b868a3 100644 --- a/android/src/main/kotlin/dev/flutterberlin/flutter_gemma/engines/mediapipe/MediaPipeEngine.kt +++ b/android/src/main/kotlin/dev/flutterberlin/flutter_gemma/engines/mediapipe/MediaPipeEngine.kt @@ -24,7 +24,7 @@ class 
MediaPipeEngine( override var isInitialized: Boolean = false private set - override val capabilities = EngineCapabilities( + override var capabilities = EngineCapabilities( supportsVision = true, supportsAudio = false, // Audio is LiteRT-LM only (not supported by MediaPipe SDK) supportsFunctionCalls = true, // Manual via chat templates @@ -74,6 +74,10 @@ class MediaPipeEngine( val options = optionsBuilder.build() llmInference = LlmInference.createFromOptions(context, options) isInitialized = true + // Update audio capability if audio was successfully configured + if (config.supportAudio == true) { + capabilities = capabilities.copy(supportsAudio = true) + } } catch (e: Exception) { throw RuntimeException("Failed to initialize MediaPipe LlmInference: ${e.message}", e) } diff --git a/android/src/main/kotlin/dev/flutterberlin/flutter_gemma/engines/mediapipe/MediaPipeSession.kt b/android/src/main/kotlin/dev/flutterberlin/flutter_gemma/engines/mediapipe/MediaPipeSession.kt index d06af38a..3004a51b 100644 --- a/android/src/main/kotlin/dev/flutterberlin/flutter_gemma/engines/mediapipe/MediaPipeSession.kt +++ b/android/src/main/kotlin/dev/flutterberlin/flutter_gemma/engines/mediapipe/MediaPipeSession.kt @@ -50,6 +50,11 @@ class MediaPipeSession( val sessionOptions = sessionOptionsBuilder.build() session = LlmInferenceSession.createFromOptions(llmInference, sessionOptions) + + if (config.enableThinking) { + Log.w(TAG, "enableThinking=true is not supported by MediaPipe engine. 
" + + "Use LiteRT-LM (.litertlm) models for thinking mode.") + } } override fun addQueryChunk(prompt: String) { @@ -93,7 +98,11 @@ class MediaPipeSession( } override fun cancelGeneration() { - session.cancelGenerateResponseAsync() + try { + session.cancelGenerateResponseAsync() + } catch (e: Exception) { + Log.w(TAG, "cancelGeneration failed", e) + } } override fun close() { diff --git a/example/integration_test/desktop_thinking_test.dart b/example/integration_test/desktop_thinking_test.dart new file mode 100644 index 00000000..3a29ed9f --- /dev/null +++ b/example/integration_test/desktop_thinking_test.dart @@ -0,0 +1,165 @@ +// Integration test: Gemma 4 thinking mode on Desktop (macOS/Windows/Linux) +// Run with: cd example && flutter test integration_test/desktop_thinking_test.dart -d macos +// +// Prerequisites: +// Copy gemma-4-E2B-it.litertlm to the app sandbox container: +// cp ~/Downloads/gemma-4-E2B-it.litertlm \ +// ~/Library/Containers/dev.flutterberlin.flutterGemmaExample55/Data/Documents/ + +import 'dart:io'; + +import 'package:flutter_test/flutter_test.dart'; +import 'package:integration_test/integration_test.dart'; +import 'package:flutter_gemma/flutter_gemma.dart'; + +const _modelFileName = 'gemma-4-E2B-it.litertlm'; + +String _resolveModelPath() { + // Inside macOS sandbox, HOME already points to the container: + // ~/Library/Containers//Data + final home = Platform.environment['HOME'] ?? 
''; + return '$home/Documents/$_modelFileName'; +} + +void main() { + IntegrationTestWidgetsFlutterBinding.ensureInitialized(); + + late String modelPath; + + group('Desktop Gemma 4 Thinking Mode', () { + setUpAll(() { + if (!Platform.isMacOS && !Platform.isWindows && !Platform.isLinux) { + fail('Test requires desktop platform'); + } + modelPath = _resolveModelPath(); + if (!File(modelPath).existsSync()) { + fail('Model not found: $modelPath'); + } + }); + + testWidgets('thinking_stream', (tester) async { + print('=== Initializing ==='); + await FlutterGemma.initialize(); + + print('=== Installing model from file ==='); + await FlutterGemma.installModel( + modelType: ModelType.gemmaIt, + fileType: ModelFileType.litertlm, + ).fromFile(modelPath).install(); + + expect(FlutterGemma.hasActiveModel(), isTrue); + print('Model installed'); + + final model = await FlutterGemma.getActiveModel( + maxTokens: 2048, + preferredBackend: PreferredBackend.gpu, + ); + + try { + final chat = await model.createChat( + temperature: 1.0, + topK: 64, + topP: 0.95, + isThinking: true, + modelType: ModelType.gemmaIt, + ); + + await chat.addQuery( + const Message(text: 'Explain why the sky is blue. 
Think step by step.', isUser: true), + ); + + final responses = []; + await tester.runAsync(() async { + await for (final response in chat.generateChatResponseAsync()) { + responses.add(response); + } + }); + + final thinkingTokens = responses + .whereType() + .map((r) => r.content) + .join(); + final textTokens = responses + .whereType() + .map((r) => r.token) + .join(); + + print('[Gemma 4 E2B Desktop] Thinking tokens: ${thinkingTokens.length} chars'); + print('[Gemma 4 E2B Desktop] Text tokens: ${textTokens.length} chars'); + + // Should have thinking content + expect(thinkingTokens.isNotEmpty, isTrue, + reason: 'Expected non-empty thinking content'); + + // Should have text content + expect(textTokens.isNotEmpty, isTrue, + reason: 'Expected non-empty text response'); + + // Thinking should come before text in stream order + final firstThinkingIdx = responses.indexWhere((r) => r is ThinkingResponse); + final firstTextIdx = responses.indexWhere((r) => r is TextResponse); + + if (firstThinkingIdx >= 0 && firstTextIdx >= 0) { + expect(firstThinkingIdx, lessThan(firstTextIdx), + reason: 'First thinking should appear before first text'); + } + + print('[Gemma 4 E2B Desktop] thinking_stream PASSED'); + } finally { + await model.close(); + } + }, timeout: const Timeout(Duration(minutes: 10))); + + testWidgets('no_thinking', (tester) async { + await FlutterGemma.initialize(); + + await FlutterGemma.installModel( + modelType: ModelType.gemmaIt, + fileType: ModelFileType.litertlm, + ).fromFile(modelPath).install(); + + final model = await FlutterGemma.getActiveModel( + maxTokens: 2048, + preferredBackend: PreferredBackend.gpu, + ); + + try { + final chat = await model.createChat( + temperature: 1.0, + topK: 64, + topP: 0.95, + isThinking: false, + modelType: ModelType.gemmaIt, + ); + + await chat.addQuery( + const Message(text: 'What is 2+2?', isUser: true), + ); + + final responses = []; + await tester.runAsync(() async { + await for (final response in 
chat.generateChatResponseAsync()) { + responses.add(response); + } + }); + + // Without thinking enabled, no ThinkingResponse should appear + final thinkingResponses = responses.whereType().toList(); + expect(thinkingResponses, isEmpty, + reason: 'No ThinkingResponse expected with isThinking=false'); + + // Should still have text content + final textTokens = responses + .whereType() + .map((r) => r.token) + .join(); + expect(textTokens.isNotEmpty, isTrue, + reason: 'Expected non-empty text response'); + + print('[Gemma 4 E2B Desktop] no_thinking PASSED'); + } finally { + await model.close(); + } + }, timeout: const Timeout(Duration(minutes: 10))); + }); +} diff --git a/example/integration_test/sequential_gemma4_test.dart b/example/integration_test/sequential_gemma4_test.dart new file mode 100644 index 00000000..7d0f9b1f --- /dev/null +++ b/example/integration_test/sequential_gemma4_test.dart @@ -0,0 +1,67 @@ +// Integration test: Sequential inference with Gemma 4 E2B .litertlm +// Reproduces issue #209 โ€” SIGSEGV crash on second sendMessage +// +// Run: +// cd example +// flutter test integration_test/sequential_gemma4_test.dart -d + +import 'dart:io'; + +import 'package:flutter_test/flutter_test.dart'; +import 'package:integration_test/integration_test.dart'; +import 'package:flutter_gemma/flutter_gemma.dart'; + +const _modelPath = '/data/local/tmp/flutter_gemma_test/gemma-4-E2B-it.litertlm'; + +void main() { + IntegrationTestWidgetsFlutterBinding.ensureInitialized(); + + setUpAll(() { + if (!Platform.isAndroid) { + fail('Test requires Android with .litertlm models'); + } + if (!File(_modelPath).existsSync()) { + fail('Model not found: $_modelPath\nPush it first: adb push $_modelPath'); + } + }); + + testWidgets('Gemma 4 E2B: two sequential queries on same chat', (tester) async { + await FlutterGemma.initialize(); + + await FlutterGemma.installModel( + modelType: ModelType.gemmaIt, + fileType: ModelFileType.litertlm, + ).fromFile(_modelPath).install(); + + // No 
preferredBackend = CPU (default), matching issue #209 reporter's code + final model = await FlutterGemma.getActiveModel( + maxTokens: 2048, + ); + + try { + final chat = await model.createChat(modelType: ModelType.gemmaIt); + + // First query + await chat.addQueryChunk( + const Message(text: 'What is 2+2? Answer with just the number.', isUser: true), + ); + final r1 = await chat.generateChatResponse(); + expect(r1, isA()); + final text1 = (r1 as TextResponse).token; + print('[Gemma4] First response: "$text1"'); + expect(text1, isNotEmpty); + + // Second query โ€” crash point in issue #209 + await chat.addQueryChunk( + const Message(text: 'What is 3+3? Answer with just the number.', isUser: true), + ); + final r2 = await chat.generateChatResponse(); + expect(r2, isA()); + final text2 = (r2 as TextResponse).token; + print('[Gemma4] Second response: "$text2"'); + expect(text2, isNotEmpty); + } finally { + await model.close(); + } + }, timeout: const Timeout(Duration(minutes: 20))); +} diff --git a/example/integration_test/sequential_litertlm_test.dart b/example/integration_test/sequential_litertlm_test.dart new file mode 100644 index 00000000..c2ddc9d1 --- /dev/null +++ b/example/integration_test/sequential_litertlm_test.dart @@ -0,0 +1,171 @@ +// Integration test: Sequential inference with .litertlm models +// Reproduces issue #209 โ€” SIGSEGV crash on second sendMessage +// +// Prerequisites: +// adb push /path/to/gemma-4-E2B-it.litertlm /data/local/tmp/flutter_gemma_test/ +// adb push /path/to/Qwen3-0.6B.litertlm /data/local/tmp/flutter_gemma_test/ +// +// Run: +// cd example +// flutter test integration_test/sequential_litertlm_test.dart -d + +import 'dart:io'; + +import 'package:flutter_test/flutter_test.dart'; +import 'package:integration_test/integration_test.dart'; +import 'package:flutter_gemma/flutter_gemma.dart'; + +const _deviceDir = '/data/local/tmp/flutter_gemma_test'; + +const _models = <({String path, String name, ModelType modelType})>[ + ( + path: 
'$_deviceDir/gemma-3n-E2B-it-int4.litertlm', + name: 'Gemma 3n E2B', + modelType: ModelType.gemmaIt, + ), + ( + path: '$_deviceDir/gemma-4-E2B-it.litertlm', + name: 'Gemma 4 E2B', + modelType: ModelType.gemmaIt, + ), +]; + +Future _installAndLoad(String path, ModelType modelType) async { + await FlutterGemma.initialize(); + + await FlutterGemma.installModel( + modelType: modelType, + fileType: ModelFileType.litertlm, + ).fromFile(path).install(); + + return FlutterGemma.getActiveModel( + maxTokens: 2048, + preferredBackend: PreferredBackend.gpu, + ); +} + +void main() { + IntegrationTestWidgetsFlutterBinding.ensureInitialized(); + + for (final (:path, :name, :modelType) in _models) { + group('Sequential inference [$name]', () { + setUpAll(() { + if (!Platform.isAndroid) { + fail('Test requires Android with .litertlm models'); + } + if (!File(path).existsSync()) { + fail('Model not found: $path\nPush it first: adb push $path'); + } + }); + + // --- Test 1: Two sequential queries on same chat (issue #209 core repro) --- + testWidgets('two sequential queries on same chat', (tester) async { + final model = await _installAndLoad(path, modelType); + try { + final chat = await model.createChat(modelType: modelType); + + // First query โ€” should work + await chat.addQueryChunk( + const Message(text: 'What is 2+2? Answer with just the number.', isUser: true), + ); + final r1 = await chat.generateChatResponse(); + expect(r1, isA()); + final text1 = (r1 as TextResponse).token; + print('[$name] First response: "$text1"'); + expect(text1, isNotEmpty); + + // Second query โ€” crashes with SIGSEGV in issue #209 + await chat.addQueryChunk( + const Message(text: 'What is 3+3? 
Answer with just the number.', isUser: true), + ); + final r2 = await chat.generateChatResponse(); + expect(r2, isA()); + final text2 = (r2 as TextResponse).token; + print('[$name] Second response: "$text2"'); + expect(text2, isNotEmpty); + } finally { + await model.close(); + } + }, timeout: const Timeout(Duration(minutes: 10))); + + // --- Test 2: Three sequential queries (longer conversation) --- + testWidgets('three sequential queries on same chat', (tester) async { + final model = await _installAndLoad(path, modelType); + try { + final chat = await model.createChat(modelType: modelType); + + for (var i = 1; i <= 3; i++) { + await chat.addQueryChunk( + Message(text: 'What is ${i}+${i}? Answer briefly.', isUser: true), + ); + final r = await chat.generateChatResponse(); + expect(r, isA()); + final text = (r as TextResponse).token; + print('[$name] Query $i response: "$text"'); + expect(text, isNotEmpty); + } + } finally { + await model.close(); + } + }, timeout: const Timeout(Duration(minutes: 15))); + + // --- Test 3: Streaming sequential queries --- + testWidgets('two sequential streaming queries on same chat', (tester) async { + final model = await _installAndLoad(path, modelType); + try { + final chat = await model.createChat(modelType: modelType); + + // First streaming query + await chat.addQueryChunk( + const Message(text: 'Say hello in one word.', isUser: true), + ); + final chunks1 = []; + await tester.runAsync(() async { + await for (final r in chat.generateChatResponseAsync()) { + if (r is TextResponse) chunks1.add(r.token); + } + }); + final text1 = chunks1.join(); + print('[$name] First streaming response: "$text1"'); + expect(text1, isNotEmpty); + + // Second streaming query โ€” issue #209 crash point + await chat.addQueryChunk( + const Message(text: 'Say goodbye in one word.', isUser: true), + ); + final chunks2 = []; + await tester.runAsync(() async { + await for (final r in chat.generateChatResponseAsync()) { + if (r is TextResponse) 
chunks2.add(r.token); + } + }); + final text2 = chunks2.join(); + print('[$name] Second streaming response: "$text2"'); + expect(text2, isNotEmpty); + } finally { + await model.close(); + } + }, timeout: const Timeout(Duration(minutes: 10))); + + // --- Test 4: New chat per query (workaround test) --- + testWidgets('new chat per query works', (tester) async { + final model = await _installAndLoad(path, modelType); + try { + for (var i = 1; i <= 2; i++) { + final chat = await model.createChat(modelType: modelType); + await chat.addQueryChunk( + Message(text: 'What is ${i * 10}? Answer briefly.', isUser: true), + ); + final r = await chat.generateChatResponse(); + expect(r, isA()); + final text = (r as TextResponse).token; + print('[$name] New chat #$i response: "$text"'); + expect(text, isNotEmpty); + } + } finally { + await model.close(); + } + }, timeout: const Timeout(Duration(minutes: 10))); + }); + } +} diff --git a/example/integration_test/thinking_mode_test.dart b/example/integration_test/thinking_mode_test.dart new file mode 100644 index 00000000..d815a499 --- /dev/null +++ b/example/integration_test/thinking_mode_test.dart @@ -0,0 +1,208 @@ +// Integration test: thinking mode across DeepSeek and Gemma 4 models. 
+// Run on Android: flutter test integration_test/thinking_mode_test.dart -d +// +// Prerequisites: +// Push models to device: +// adb push deepseek_q8_ekv1280.task /data/local/tmp/flutter_gemma_test/ +// adb push gemma-4-E2B-it.litertlm /data/local/tmp/flutter_gemma_test/ +// +// Tests per model: +// - install: model loads from device file +// - thinking_stream: async stream verifies ThinkingResponse + TextResponse ordering +// - no_thinking: isThinking: false produces only TextResponse + +import 'dart:io'; + +import 'package:flutter_test/flutter_test.dart'; +import 'package:integration_test/integration_test.dart'; +import 'package:flutter_gemma/flutter_gemma.dart'; + +const _deviceModelDir = '/data/local/tmp/flutter_gemma_test'; + +/// Test model configuration for thinking mode tests. +class ThinkingTestModel { + final String name; + final String filePath; + final ModelType modelType; + final ModelFileType fileType; + final double temperature; + final int topK; + final double topP; + final int maxTokens; + + const ThinkingTestModel({ + required this.name, + required this.filePath, + required this.modelType, + this.fileType = ModelFileType.task, + this.temperature = 1.0, + this.topK = 64, + this.topP = 0.95, + this.maxTokens = 1024, + }); +} + +const _testModels = [ + ThinkingTestModel( + name: 'DeepSeek R1 1.5B', + filePath: '$_deviceModelDir/deepseek_q8_ekv1280.task', + modelType: ModelType.deepSeek, + temperature: 0.6, + topK: 40, + topP: 0.7, + ), + ThinkingTestModel( + name: 'Gemma 4 E2B', + filePath: '$_deviceModelDir/gemma-4-E2B-it.litertlm', + modelType: ModelType.gemmaIt, + fileType: ModelFileType.litertlm, + maxTokens: 2048, + ), +]; + +Future _ensureModelInstalled(ThinkingTestModel model) async { + await FlutterGemma.installModel( + modelType: model.modelType, + fileType: model.fileType, + ).fromFile(model.filePath).install(); +} + +void main() { + IntegrationTestWidgetsFlutterBinding.ensureInitialized(); + + for (final model in _testModels) { + 
group(model.name, () { + setUpAll(() { + if (!Platform.isAndroid) { + fail('Test requires Android with .litertlm/.task models'); + } + if (!File(model.filePath).existsSync()) { + fail('Model not found: ${model.filePath}\n' + 'Push it first: adb push ${model.filePath}'); + } + }); + + testWidgets('install', (tester) async { + await FlutterGemma.initialize(); + + print('[${model.name}] Installing from file: ${model.filePath}'); + await _ensureModelInstalled(model); + + expect(FlutterGemma.hasActiveModel(), isTrue); + print('[${model.name}] Installed successfully'); + }, timeout: const Timeout(Duration(minutes: 5))); + + testWidgets('thinking_stream', (tester) async { + await FlutterGemma.initialize(); + await _ensureModelInstalled(model); + + final inferenceModel = await FlutterGemma.getActiveModel( + maxTokens: model.maxTokens, + preferredBackend: PreferredBackend.cpu, + ); + + try { + final chat = await inferenceModel.createChat( + temperature: model.temperature, + topK: model.topK, + topP: model.topP, + isThinking: true, + modelType: model.modelType, + ); + + await chat.addQuery( + const Message(text: 'Explain why the sky is blue. 
Think step by step.', isUser: true), + ); + + final responses = []; + await tester.runAsync(() async { + await for (final response in chat.generateChatResponseAsync()) { + responses.add(response); + } + }); + + final thinkingTokens = responses + .whereType() + .map((r) => r.content) + .join(); + final textTokens = responses + .whereType() + .map((r) => r.token) + .join(); + + print('[${model.name}] Thinking tokens: ${thinkingTokens.length} chars'); + print('[${model.name}] Text tokens: ${textTokens.length} chars'); + + // Should have thinking content + expect(thinkingTokens.isNotEmpty, isTrue, + reason: '${model.name}: Expected non-empty thinking content'); + + // Should have text content + expect(textTokens.isNotEmpty, isTrue, + reason: '${model.name}: Expected non-empty text response'); + + // Thinking should come before text in stream order + final firstThinkingIdx = responses.indexWhere((r) => r is ThinkingResponse); + final firstTextIdx = responses.indexWhere((r) => r is TextResponse); + + if (firstThinkingIdx >= 0 && firstTextIdx >= 0) { + expect(firstThinkingIdx, lessThan(firstTextIdx), + reason: '${model.name}: First thinking should appear before first text'); + } + + print('[${model.name}] thinking_stream PASSED'); + } finally { + await inferenceModel.close(); + } + }, timeout: const Timeout(Duration(minutes: 5))); + + testWidgets('no_thinking', (tester) async { + await FlutterGemma.initialize(); + await _ensureModelInstalled(model); + + final inferenceModel = await FlutterGemma.getActiveModel( + maxTokens: model.maxTokens, + preferredBackend: PreferredBackend.cpu, + ); + + try { + final chat = await inferenceModel.createChat( + temperature: model.temperature, + topK: model.topK, + topP: model.topP, + isThinking: false, + modelType: model.modelType, + ); + + await chat.addQuery( + const Message(text: 'What is 2+2?', isUser: true), + ); + + final responses = []; + await tester.runAsync(() async { + await for (final response in 
chat.generateChatResponseAsync()) { + responses.add(response); + } + }); + + // Without thinking enabled, no ThinkingResponse should appear + final thinkingResponses = responses.whereType().toList(); + expect(thinkingResponses, isEmpty, + reason: '${model.name}: No ThinkingResponse expected with isThinking=false'); + + // Should still have text content + final textTokens = responses + .whereType() + .map((r) => r.token) + .join(); + expect(textTokens.isNotEmpty, isTrue, + reason: '${model.name}: Expected non-empty text response'); + + print('[${model.name}] no_thinking PASSED'); + } finally { + await inferenceModel.close(); + } + }, timeout: const Timeout(Duration(minutes: 5))); + }); + } +} diff --git a/example/lib/models/model.dart b/example/lib/models/model.dart index e4a8378a..536b15e7 100644 --- a/example/lib/models/model.dart +++ b/example/lib/models/model.dart @@ -39,6 +39,7 @@ enum Model implements InferenceModelInterface { supportAudio: true, maxTokens: 4096, maxNumImages: 1, + isThinking: true, ), gemma4_E4B( baseUrl: @@ -62,6 +63,7 @@ enum Model implements InferenceModelInterface { supportAudio: true, maxTokens: 4096, maxNumImages: 1, + isThinking: true, ), // Gemma 3 Nano models (Multimodal + Function Calls) diff --git a/example/pubspec.lock b/example/pubspec.lock index ade2a6de..cb1f33a9 100644 --- a/example/pubspec.lock +++ b/example/pubspec.lock @@ -209,7 +209,7 @@ packages: path: ".." 
relative: true source: path - version: "0.13.0" + version: "0.13.1" flutter_lints: dependency: "direct dev" description: diff --git a/ios/Classes/FlutterGemmaPlugin.swift b/ios/Classes/FlutterGemmaPlugin.swift index a1adcae6..691b4fba 100644 --- a/ios/Classes/FlutterGemmaPlugin.swift +++ b/ios/Classes/FlutterGemmaPlugin.swift @@ -97,6 +97,7 @@ class PlatformServiceImpl : NSObject, PlatformService, FlutterStreamHandler { enableVisionModality: Bool?, enableAudioModality: Bool?, systemInstruction: String?, + enableThinking: Bool?, completion: @escaping (Result) -> Void ) { guard let inference = model?.inference else { @@ -104,6 +105,11 @@ class PlatformServiceImpl : NSObject, PlatformService, FlutterStreamHandler { return } + if enableThinking == true { + print("[FlutterGemma] Warning: enableThinking=true is not supported on iOS (MediaPipe). " + + "Use Android or Desktop with .litertlm models for Gemma 4 thinking mode.") + } + DispatchQueue.global(qos: .userInitiated).async { do { let newSession = try InferenceSession( diff --git a/ios/Classes/PigeonInterface.g.swift b/ios/Classes/PigeonInterface.g.swift index ac8fda75..a70156ee 100644 --- a/ios/Classes/PigeonInterface.g.swift +++ b/ios/Classes/PigeonInterface.g.swift @@ -231,7 +231,7 @@ class PigeonInterfacePigeonCodec: FlutterStandardMessageCodec, @unchecked Sendab protocol PlatformService { func createModel(maxTokens: Int64, modelPath: String, loraRanks: [Int64]?, preferredBackend: PreferredBackend?, maxNumImages: Int64?, supportAudio: Bool?, completion: @escaping (Result) -> Void) func closeModel(completion: @escaping (Result) -> Void) - func createSession(temperature: Double, randomSeed: Int64, topK: Int64, topP: Double?, loraPath: String?, enableVisionModality: Bool?, enableAudioModality: Bool?, systemInstruction: String?, completion: @escaping (Result) -> Void) + func createSession(temperature: Double, randomSeed: Int64, topK: Int64, topP: Double?, loraPath: String?, enableVisionModality: Bool?, 
enableAudioModality: Bool?, systemInstruction: String?, enableThinking: Bool?, completion: @escaping (Result) -> Void) func closeSession(completion: @escaping (Result) -> Void) func sizeInTokens(prompt: String, completion: @escaping (Result) -> Void) func addQueryChunk(prompt: String, completion: @escaping (Result) -> Void) @@ -332,7 +332,8 @@ class PlatformServiceSetup { let enableVisionModalityArg: Bool? = nilOrValue(args[5]) let enableAudioModalityArg: Bool? = nilOrValue(args[6]) let systemInstructionArg: String? = nilOrValue(args[7]) - api.createSession(temperature: temperatureArg, randomSeed: randomSeedArg, topK: topKArg, topP: topPArg, loraPath: loraPathArg, enableVisionModality: enableVisionModalityArg, enableAudioModality: enableAudioModalityArg, systemInstruction: systemInstructionArg) { result in + let enableThinkingArg: Bool? = nilOrValue(args[8]) + api.createSession(temperature: temperatureArg, randomSeed: randomSeedArg, topK: topKArg, topP: topPArg, loraPath: loraPathArg, enableVisionModality: enableVisionModalityArg, enableAudioModality: enableAudioModalityArg, systemInstruction: systemInstructionArg, enableThinking: enableThinkingArg) { result in switch result { case .success: reply(wrapResult(nil)) diff --git a/ios/flutter_gemma.podspec b/ios/flutter_gemma.podspec index 7695f83b..671a3e2a 100644 --- a/ios/flutter_gemma.podspec +++ b/ios/flutter_gemma.podspec @@ -4,7 +4,7 @@ # Pod::Spec.new do |s| s.name = 'flutter_gemma' - s.version = '0.13.0' + s.version = '0.13.1' s.summary = 'Flutter plugin for running Gemma AI models locally with Gemma 3 Nano support.' s.description = <<-DESC The plugin allows running the Gemma AI model locally on a device from a Flutter application. 
diff --git a/lib/core/di/service_registry.dart b/lib/core/di/service_registry.dart index 8947c912..f5005f72 100644 --- a/lib/core/di/service_registry.dart +++ b/lib/core/di/service_registry.dart @@ -22,7 +22,8 @@ import 'package:flutter_gemma/core/handlers/web_file_source_handler_stub.dart' import 'package:flutter_gemma/core/handlers/source_handler_registry.dart'; import 'package:flutter_gemma/core/infrastructure/platform_file_system_service.dart'; import 'package:flutter_gemma/core/infrastructure/web_file_system_service.dart'; -import 'package:flutter_gemma/core/infrastructure/flutter_asset_loader.dart'; +import 'package:flutter_gemma/core/infrastructure/flutter_asset_loader_stub.dart' + if (dart.library.io) 'package:flutter_gemma/core/infrastructure/flutter_asset_loader.dart'; import 'package:flutter_gemma/core/infrastructure/shared_preferences_model_repository.dart'; import 'package:flutter_gemma/core/infrastructure/in_memory_model_repository.dart'; import 'package:flutter_gemma/core/services/vector_store_repository.dart'; diff --git a/lib/core/extensions.dart b/lib/core/extensions.dart index 8bac2744..5f5db09a 100644 --- a/lib/core/extensions.dart +++ b/lib/core/extensions.dart @@ -190,8 +190,8 @@ extension MessageExtension on Message { // Filter class for thinking models class ModelThinkingFilter { - /// Filters ModelResponse stream for models with thinking support - /// Only supports DeepSeek models with ... blocks + /// Filters ModelResponse stream for models with thinking support. + /// Supports DeepSeek (`...`) and Gemma 4 (`<|channel>thought\n...`) models. static Stream filterThinkingStream(Stream originalStream, {required ModelType modelType}) async* { switch (modelType) { @@ -244,8 +244,69 @@ class ModelThinkingFilter { } break; - case ModelType.general: case ModelType.gemmaIt: + // Gemma 4 E2B/E4B: <|channel>thought\n... 
+ const startMarker = '<|channel>thought\n'; + const endMarker = ''; + bool gemmaInsideThinking = false; + String gemmaBuffer = ''; + + await for (final response in originalStream) { + if (response is TextResponse) { + gemmaBuffer += response.token; + + while (gemmaBuffer.isNotEmpty) { + if (gemmaInsideThinking) { + final endIdx = gemmaBuffer.indexOf(endMarker); + if (endIdx >= 0) { + final thinkingContent = gemmaBuffer.substring(0, endIdx); + if (thinkingContent.isNotEmpty) { + yield ThinkingResponse(thinkingContent); + } + gemmaBuffer = gemmaBuffer.substring(endIdx + endMarker.length); + gemmaInsideThinking = false; + } else { + // Check for partial end marker at tail + final partial = _findPartialSuffix(gemmaBuffer, endMarker); + final safe = gemmaBuffer.substring(0, gemmaBuffer.length - partial); + if (safe.isNotEmpty) { + yield ThinkingResponse(safe); + } + gemmaBuffer = gemmaBuffer.substring(gemmaBuffer.length - partial); + break; + } + } else { + final startIdx = gemmaBuffer.indexOf(startMarker); + if (startIdx >= 0) { + final textBefore = gemmaBuffer.substring(0, startIdx); + if (textBefore.isNotEmpty) { + yield TextResponse(textBefore); + } + gemmaBuffer = gemmaBuffer.substring(startIdx + startMarker.length); + gemmaInsideThinking = true; + } else { + // Check for partial start marker at tail + final partial = _findPartialSuffix(gemmaBuffer, startMarker); + final safe = gemmaBuffer.substring(0, gemmaBuffer.length - partial); + if (safe.isNotEmpty) { + yield TextResponse(safe); + } + gemmaBuffer = gemmaBuffer.substring(gemmaBuffer.length - partial); + break; + } + } + } + } else { + yield response; + } + } + // Flush remaining buffer + if (gemmaBuffer.isNotEmpty) { + yield gemmaInsideThinking ? 
ThinkingResponse(gemmaBuffer) : TextResponse(gemmaBuffer); + } + break; + + case ModelType.general: case ModelType.qwen: case ModelType.llama: case ModelType.hammer: @@ -258,8 +319,9 @@ class ModelThinkingFilter { } } - /// Removes thinking blocks from final text - /// Only supports DeepSeek (...) models + /// Removes thinking blocks from final text. + /// Supports DeepSeek (`...`) and Gemma 4 (`<|channel>thought\n...`) models. + /// Note: For streaming thinking output, use [filterThinkingStream] with generateChatResponseAsync() instead. static String removeThinkingFromText(String text, {required ModelType modelType}) { switch (modelType) { case ModelType.deepSeek: @@ -267,8 +329,12 @@ class ModelThinkingFilter { RegExp thinkingRegex = RegExp(r'.*?', dotAll: true); return text.replaceAll(thinkingRegex, '').trim(); - case ModelType.general: case ModelType.gemmaIt: + // Remove all <|channel>thought\n... blocks (Gemma 4 E2B/E4B) + return text.replaceAll( + RegExp(r'<\|channel>thought\n.*?', dotAll: true), '').trim(); + + case ModelType.general: case ModelType.qwen: case ModelType.llama: case ModelType.hammer: @@ -326,4 +392,14 @@ class ModelThinkingFilter { return cleaned.trim(); } } + + /// Returns length of the longest suffix of [text] that is a prefix of [marker]. 
+ static int _findPartialSuffix(String text, String marker) { + for (int i = marker.length.clamp(0, text.length); i >= 1; i--) { + if (text.endsWith(marker.substring(0, i))) { + return i; + } + } + return 0; + } } diff --git a/lib/core/handlers/asset_source_handler.dart b/lib/core/handlers/asset_source_handler.dart index aaf02a09..d499769b 100644 --- a/lib/core/handlers/asset_source_handler.dart +++ b/lib/core/handlers/asset_source_handler.dart @@ -4,7 +4,8 @@ import 'package:flutter_gemma/core/model_management/cancel_token.dart'; import 'package:flutter_gemma/core/services/asset_loader.dart'; import 'package:flutter_gemma/core/services/file_system_service.dart'; import 'package:flutter_gemma/core/services/model_repository.dart'; -import 'package:flutter_gemma/core/infrastructure/flutter_asset_loader.dart'; +import 'package:flutter_gemma/core/infrastructure/flutter_asset_loader_stub.dart' + if (dart.library.io) 'package:flutter_gemma/core/infrastructure/flutter_asset_loader.dart'; import 'package:path/path.dart' as path; /// Handles installation of models from Flutter assets diff --git a/lib/core/infrastructure/flutter_asset_loader_stub.dart b/lib/core/infrastructure/flutter_asset_loader_stub.dart new file mode 100644 index 00000000..a2e655ab --- /dev/null +++ b/lib/core/infrastructure/flutter_asset_loader_stub.dart @@ -0,0 +1,19 @@ +/// Stub implementation for platforms where dart:io is not available (web) +/// This file is used when large_file_handler cannot be imported +library; + +import 'dart:typed_data'; +import 'package:flutter_gemma/core/services/asset_loader.dart'; + +/// Stub class - should never be instantiated on web platform +class FlutterAssetLoader implements AssetLoader { + @override + Future loadAsset(String path) => + throw UnsupportedError('FlutterAssetLoader is not available on this platform'); + + Future copyAssetToFile(String assetPath, String targetPath) => + throw UnsupportedError('FlutterAssetLoader is not available on this platform'); + + 
Stream copyAssetToFileWithProgress(String assetPath, String targetPath) => + throw UnsupportedError('FlutterAssetLoader is not available on this platform'); +} diff --git a/lib/core/infrastructure/hnsw_vector_index.dart b/lib/core/infrastructure/hnsw_vector_index.dart index ef6cd761..868d5980 100644 --- a/lib/core/infrastructure/hnsw_vector_index.dart +++ b/lib/core/infrastructure/hnsw_vector_index.dart @@ -16,7 +16,7 @@ import 'package:local_hnsw/local_hnsw.item.dart'; /// /// **Key Design Decisions:** /// - In-memory index: Rebuilt on initialize() from SQLite data -/// - Generic string IDs: LocalHNSW stores document IDs directly +/// - Generic string IDs: `LocalHNSW` stores document IDs directly /// - Cosine metric: Matches SQLite brute-force implementation /// - Exact similarity: Recalculated from cached embeddings during search /// diff --git a/lib/core/infrastructure/web_opfs_interop.dart b/lib/core/infrastructure/web_opfs_interop.dart index 53c9b70d..ea28ebc0 100644 --- a/lib/core/infrastructure/web_opfs_interop.dart +++ b/lib/core/infrastructure/web_opfs_interop.dart @@ -14,13 +14,13 @@ extension type OPFSInterop._(JSObject _) implements JSObject { /// Check if a model is cached in OPFS /// /// @param filename Model filename (cache key) - /// @returns Promise + /// @returns `Promise` external JSPromise isModelCached(JSString filename); /// Get the size of a cached model file /// /// @param filename Model filename - /// @returns Promise Size in bytes, or null if not found + /// @returns `Promise` Size in bytes, or null if not found external JSPromise getCachedModelSize(JSString filename); /// Download a model to OPFS with progress tracking and cancellation support @@ -30,7 +30,7 @@ extension type OPFSInterop._(JSObject _) implements JSObject { /// @param authToken Optional authentication token (HuggingFace, etc.) 
/// @param onProgress Progress callback (receives 0-100) /// @param abortSignal Optional AbortSignal for cancellation - /// @returns Promise True on success + /// @returns `Promise` True on success /// @throws Error on download failure, quota exceeded, or cancellation external JSPromise downloadToOPFS( JSString url, @@ -45,14 +45,14 @@ extension type OPFSInterop._(JSObject _) implements JSObject { /// This is passed to MediaPipe's modelAssetBuffer parameter. /// /// @param filename Model filename in OPFS - /// @returns Promise + /// @returns `Promise` /// @throws Error if file not found external JSPromise getStreamReader(JSString filename); /// Delete a model from OPFS /// /// @param filename Model filename to delete - /// @returns Promise + /// @returns `Promise` external JSPromise deleteModel(JSString filename); /// Get current storage statistics @@ -62,7 +62,7 @@ extension type OPFSInterop._(JSObject _) implements JSObject { /// Clear all files from OPFS (for testing/development) /// - /// @returns Promise Number of files deleted + /// @returns `Promise` Number of files deleted external JSPromise clearAll(); } diff --git a/lib/core/model_management/constants/preferences_keys.dart b/lib/core/model_management/constants/preferences_keys.dart index 888ccf63..e6515367 100644 --- a/lib/core/model_management/constants/preferences_keys.dart +++ b/lib/core/model_management/constants/preferences_keys.dart @@ -9,16 +9,16 @@ class PreferencesKeys { // Multi-model lists (NEW system - supports multiple models) // ============================================================================ - /// List of installed inference model files + /// `List` of installed inference model files static const String installedModels = 'installed_models'; - /// List of installed LoRA files + /// `List` of installed LoRA files static const String installedLoras = 'installed_loras'; - /// List of installed embedding model files + /// `List` of installed embedding model files static const String 
installedEmbeddingModels = 'installed_embedding_models'; - /// List of installed tokenizer files + /// `List` of installed tokenizer files static const String installedTokenizers = 'installed_tokenizers'; // ============================================================================ diff --git a/lib/desktop/desktop_inference_model.dart b/lib/desktop/desktop_inference_model.dart index 41841c60..d7a4be5a 100644 --- a/lib/desktop/desktop_inference_model.dart +++ b/lib/desktop/desktop_inference_model.dart @@ -39,6 +39,7 @@ class DesktopInferenceModel extends InferenceModel { bool? enableVisionModality, bool? enableAudioModality, String? systemInstruction, + bool enableThinking = false, }) async { if (_isClosed) { throw StateError('Model is closed. Create a new instance to use it again'); @@ -65,6 +66,7 @@ class DesktopInferenceModel extends InferenceModel { fileType: fileType, supportImage: enableVisionModality ?? supportImage, supportAudio: enableAudioModality ?? supportAudio, + enableThinking: enableThinking, onClose: () { _session = null; _createCompleter = null; @@ -107,6 +109,7 @@ class DesktopInferenceModel extends InferenceModel { enableVisionModality: supportImage ?? this.supportImage, enableAudioModality: supportAudio ?? 
this.supportAudio, systemInstruction: systemInstruction, + enableThinking: isThinking, ), maxTokens: maxTokens, tokenBuffer: tokenBuffer, @@ -158,6 +161,7 @@ class DesktopInferenceModelSession extends InferenceModelSession { required this.fileType, required this.supportImage, required this.supportAudio, + this.enableThinking = false, required this.onClose, }); @@ -166,6 +170,7 @@ class DesktopInferenceModelSession extends InferenceModelSession { final ModelFileType fileType; final bool supportImage; final bool supportAudio; + final bool enableThinking; final VoidCallback onClose; final StringBuffer _queryBuffer = StringBuffer(); @@ -213,15 +218,15 @@ class DesktopInferenceModelSession extends InferenceModelSession { final buffer = StringBuffer(); if (audio != null) { - await for (final token in grpcClient.chatWithAudio(text, audio)) { + await for (final token in grpcClient.chatWithAudio(text, audio, enableThinking: enableThinking)) { buffer.write(token); } } else if (image != null) { - await for (final token in grpcClient.chatWithImage(text, image)) { + await for (final token in grpcClient.chatWithImage(text, image, enableThinking: enableThinking)) { buffer.write(token); } } else { - await for (final token in grpcClient.chat(text)) { + await for (final token in grpcClient.chat(text, enableThinking: enableThinking)) { buffer.write(token); } } @@ -247,13 +252,13 @@ class DesktopInferenceModelSession extends InferenceModelSession { if (audio != null) { debugPrint('[DesktopSession] Calling chatWithAudio: audio=${audio.length} bytes'); - yield* grpcClient.chatWithAudio(text, audio); + yield* grpcClient.chatWithAudio(text, audio, enableThinking: enableThinking); } else if (image != null) { debugPrint('[DesktopSession] Calling chatWithImage: image=${image.length} bytes'); - yield* grpcClient.chatWithImage(text, image); + yield* grpcClient.chatWithImage(text, image, enableThinking: enableThinking); } else { debugPrint('[DesktopSession] Calling chat (no image/audio)'); - 
yield* grpcClient.chat(text); + yield* grpcClient.chat(text, enableThinking: enableThinking); } } diff --git a/lib/desktop/generated/litertlm.pb.dart b/lib/desktop/generated/litertlm.pb.dart index a2bd35e2..4c503b3a 100644 --- a/lib/desktop/generated/litertlm.pb.dart +++ b/lib/desktop/generated/litertlm.pb.dart @@ -431,10 +431,12 @@ class ChatRequest extends $pb.GeneratedMessage { factory ChatRequest({ $core.String? conversationId, $core.String? text, + $core.bool? enableThinking, }) { final result = create(); if (conversationId != null) result.conversationId = conversationId; if (text != null) result.text = text; + if (enableThinking != null) result.enableThinking = enableThinking; return result; } @@ -453,6 +455,7 @@ class ChatRequest extends $pb.GeneratedMessage { createEmptyInstance: create) ..aOS(1, _omitFieldNames ? '' : 'conversationId') ..aOS(2, _omitFieldNames ? '' : 'text') + ..aOB(3, _omitFieldNames ? '' : 'enableThinking') ..hasRequiredFields = false; @$core.Deprecated('See https://github.com/google/protobuf.dart/issues/998.') @@ -491,6 +494,15 @@ class ChatRequest extends $pb.GeneratedMessage { $core.bool hasText() => $_has(1); @$pb.TagNumber(2) void clearText() => $_clearField(2); + + @$pb.TagNumber(3) + $core.bool get enableThinking => $_getBF(2); + @$pb.TagNumber(3) + set enableThinking($core.bool value) => $_setBool(2, value); + @$pb.TagNumber(3) + $core.bool hasEnableThinking() => $_has(2); + @$pb.TagNumber(3) + void clearEnableThinking() => $_clearField(3); } class ChatWithImageRequest extends $pb.GeneratedMessage { @@ -498,11 +510,13 @@ class ChatWithImageRequest extends $pb.GeneratedMessage { $core.String? conversationId, $core.String? text, $core.List<$core.int>? image, + $core.bool? 
enableThinking, }) { final result = create(); if (conversationId != null) result.conversationId = conversationId; if (text != null) result.text = text; if (image != null) result.image = image; + if (enableThinking != null) result.enableThinking = enableThinking; return result; } @@ -523,6 +537,7 @@ class ChatWithImageRequest extends $pb.GeneratedMessage { ..aOS(2, _omitFieldNames ? '' : 'text') ..a<$core.List<$core.int>>( 3, _omitFieldNames ? '' : 'image', $pb.PbFieldType.OY) + ..aOB(4, _omitFieldNames ? '' : 'enableThinking') ..hasRequiredFields = false; @$core.Deprecated('See https://github.com/google/protobuf.dart/issues/998.') @@ -570,6 +585,15 @@ class ChatWithImageRequest extends $pb.GeneratedMessage { $core.bool hasImage() => $_has(2); @$pb.TagNumber(3) void clearImage() => $_clearField(3); + + @$pb.TagNumber(4) + $core.bool get enableThinking => $_getBF(3); + @$pb.TagNumber(4) + set enableThinking($core.bool value) => $_setBool(3, value); + @$pb.TagNumber(4) + $core.bool hasEnableThinking() => $_has(3); + @$pb.TagNumber(4) + void clearEnableThinking() => $_clearField(4); } class ChatWithAudioRequest extends $pb.GeneratedMessage { @@ -577,11 +601,13 @@ class ChatWithAudioRequest extends $pb.GeneratedMessage { $core.String? conversationId, $core.String? text, $core.List<$core.int>? audio, + $core.bool? enableThinking, }) { final result = create(); if (conversationId != null) result.conversationId = conversationId; if (text != null) result.text = text; if (audio != null) result.audio = audio; + if (enableThinking != null) result.enableThinking = enableThinking; return result; } @@ -602,6 +628,7 @@ class ChatWithAudioRequest extends $pb.GeneratedMessage { ..aOS(2, _omitFieldNames ? '' : 'text') ..a<$core.List<$core.int>>( 3, _omitFieldNames ? '' : 'audio', $pb.PbFieldType.OY) + ..aOB(4, _omitFieldNames ? 
'' : 'enableThinking') ..hasRequiredFields = false; @$core.Deprecated('See https://github.com/google/protobuf.dart/issues/998.') @@ -649,6 +676,15 @@ class ChatWithAudioRequest extends $pb.GeneratedMessage { $core.bool hasAudio() => $_has(2); @$pb.TagNumber(3) void clearAudio() => $_clearField(3); + + @$pb.TagNumber(4) + $core.bool get enableThinking => $_getBF(3); + @$pb.TagNumber(4) + set enableThinking($core.bool value) => $_setBool(3, value); + @$pb.TagNumber(4) + $core.bool hasEnableThinking() => $_has(3); + @$pb.TagNumber(4) + void clearEnableThinking() => $_clearField(4); } class ChatResponse extends $pb.GeneratedMessage { @@ -656,11 +692,13 @@ class ChatResponse extends $pb.GeneratedMessage { $core.String? text, $core.bool? done, $core.String? error, + $core.String? thinking, }) { final result = create(); if (text != null) result.text = text; if (done != null) result.done = done; if (error != null) result.error = error; + if (thinking != null) result.thinking = thinking; return result; } @@ -680,6 +718,7 @@ class ChatResponse extends $pb.GeneratedMessage { ..aOS(1, _omitFieldNames ? '' : 'text') ..aOB(2, _omitFieldNames ? '' : 'done') ..aOS(3, _omitFieldNames ? '' : 'error') + ..aOS(4, _omitFieldNames ? 
'' : 'thinking') ..hasRequiredFields = false; @$core.Deprecated('See https://github.com/google/protobuf.dart/issues/998.') @@ -727,6 +766,15 @@ class ChatResponse extends $pb.GeneratedMessage { $core.bool hasError() => $_has(2); @$pb.TagNumber(3) void clearError() => $_clearField(3); + + @$pb.TagNumber(4) + $core.String get thinking => $_getSZ(3); + @$pb.TagNumber(4) + set thinking($core.String value) => $_setString(3, value); + @$pb.TagNumber(4) + $core.bool hasThinking() => $_has(3); + @$pb.TagNumber(4) + void clearThinking() => $_clearField(4); } class CancelGenerationRequest extends $pb.GeneratedMessage { diff --git a/lib/desktop/generated/litertlm.pbjson.dart b/lib/desktop/generated/litertlm.pbjson.dart index 16879590..125b74f1 100644 --- a/lib/desktop/generated/litertlm.pbjson.dart +++ b/lib/desktop/generated/litertlm.pbjson.dart @@ -108,13 +108,15 @@ const ChatRequest$json = { '2': [ {'1': 'conversation_id', '3': 1, '4': 1, '5': 9, '10': 'conversationId'}, {'1': 'text', '3': 2, '4': 1, '5': 9, '10': 'text'}, + {'1': 'enable_thinking', '3': 3, '4': 1, '5': 8, '10': 'enableThinking'}, ], }; /// Descriptor for `ChatRequest`. Decode as a `google.protobuf.DescriptorProto`. final $typed_data.Uint8List chatRequestDescriptor = $convert.base64Decode( 'CgtDaGF0UmVxdWVzdBInCg9jb252ZXJzYXRpb25faWQYASABKAlSDmNvbnZlcnNhdGlvbklkEh' - 'IKBHRleHQYAiABKAlSBHRleHQ='); + 'IKBHRleHQYAiABKAlSBHRleHQSJwoPZW5hYmxlX3RoaW5raW5nGAMgASgIUg5lbmFibGVUaGlu' + 'a2luZw=='); @$core.Deprecated('Use chatWithImageRequestDescriptor instead') const ChatWithImageRequest$json = { @@ -123,13 +125,15 @@ const ChatWithImageRequest$json = { {'1': 'conversation_id', '3': 1, '4': 1, '5': 9, '10': 'conversationId'}, {'1': 'text', '3': 2, '4': 1, '5': 9, '10': 'text'}, {'1': 'image', '3': 3, '4': 1, '5': 12, '10': 'image'}, + {'1': 'enable_thinking', '3': 4, '4': 1, '5': 8, '10': 'enableThinking'}, ], }; /// Descriptor for `ChatWithImageRequest`. Decode as a `google.protobuf.DescriptorProto`. 
final $typed_data.Uint8List chatWithImageRequestDescriptor = $convert.base64Decode( 'ChRDaGF0V2l0aEltYWdlUmVxdWVzdBInCg9jb252ZXJzYXRpb25faWQYASABKAlSDmNvbnZlcn' - 'NhdGlvbklkEhIKBHRleHQYAiABKAlSBHRleHQSFAoFaW1hZ2UYAyABKAxSBWltYWdl'); + 'NhdGlvbklkEhIKBHRleHQYAiABKAlSBHRleHQSFAoFaW1hZ2UYAyABKAxSBWltYWdlEicKD2Vu' + 'YWJsZV90aGlua2luZxgEIAEoCFIOZW5hYmxlVGhpbmtpbmc='); @$core.Deprecated('Use chatWithAudioRequestDescriptor instead') const ChatWithAudioRequest$json = { @@ -138,13 +142,15 @@ const ChatWithAudioRequest$json = { {'1': 'conversation_id', '3': 1, '4': 1, '5': 9, '10': 'conversationId'}, {'1': 'text', '3': 2, '4': 1, '5': 9, '10': 'text'}, {'1': 'audio', '3': 3, '4': 1, '5': 12, '10': 'audio'}, + {'1': 'enable_thinking', '3': 4, '4': 1, '5': 8, '10': 'enableThinking'}, ], }; /// Descriptor for `ChatWithAudioRequest`. Decode as a `google.protobuf.DescriptorProto`. final $typed_data.Uint8List chatWithAudioRequestDescriptor = $convert.base64Decode( 'ChRDaGF0V2l0aEF1ZGlvUmVxdWVzdBInCg9jb252ZXJzYXRpb25faWQYASABKAlSDmNvbnZlcn' - 'NhdGlvbklkEhIKBHRleHQYAiABKAlSBHRleHQSFAoFYXVkaW8YAyABKAxSBWF1ZGlv'); + 'NhdGlvbklkEhIKBHRleHQYAiABKAlSBHRleHQSFAoFYXVkaW8YAyABKAxSBWF1ZGlvEicKD2Vu' + 'YWJsZV90aGlua2luZxgEIAEoCFIOZW5hYmxlVGhpbmtpbmc='); @$core.Deprecated('Use chatResponseDescriptor instead') const ChatResponse$json = { @@ -153,13 +159,14 @@ const ChatResponse$json = { {'1': 'text', '3': 1, '4': 1, '5': 9, '10': 'text'}, {'1': 'done', '3': 2, '4': 1, '5': 8, '10': 'done'}, {'1': 'error', '3': 3, '4': 1, '5': 9, '10': 'error'}, + {'1': 'thinking', '3': 4, '4': 1, '5': 9, '10': 'thinking'}, ], }; /// Descriptor for `ChatResponse`. Decode as a `google.protobuf.DescriptorProto`. 
final $typed_data.Uint8List chatResponseDescriptor = $convert.base64Decode( 'CgxDaGF0UmVzcG9uc2USEgoEdGV4dBgBIAEoCVIEdGV4dBISCgRkb25lGAIgASgIUgRkb25lEh' - 'QKBWVycm9yGAMgASgJUgVlcnJvcg=='); + 'QKBWVycm9yGAMgASgJUgVlcnJvchIaCgh0aGlua2luZxgEIAEoCVIIdGhpbmtpbmc='); @$core.Deprecated('Use cancelGenerationRequestDescriptor instead') const CancelGenerationRequest$json = { diff --git a/lib/desktop/grpc_client.dart b/lib/desktop/grpc_client.dart index ac171909..6f1f7739 100644 --- a/lib/desktop/grpc_client.dart +++ b/lib/desktop/grpc_client.dart @@ -111,7 +111,7 @@ class LiteRtLmClient { static const _streamTimeout = Duration(minutes: 5); /// Send a chat message and get streaming response - Stream chat(String text, {String? conversationId}) async* { + Stream chat(String text, {String? conversationId, bool enableThinking = false}) async* { _assertInitialized(); final convId = conversationId ?? _currentConversationId; @@ -121,7 +121,8 @@ class LiteRtLmClient { final request = ChatRequest() ..conversationId = convId - ..text = text; + ..text = text + ..enableThinking = enableThinking; // Add timeout to prevent infinite hanging await for (final response in _client!.chat(request).timeout( @@ -137,7 +138,11 @@ class LiteRtLmClient { throw Exception('Chat error: ${response.error}'); } - if (response.hasText()) { + if (response.hasThinking() && response.thinking.isNotEmpty) { + yield '<|channel>thought\n${response.thinking}'; + } + + if (response.hasText() && response.text.isNotEmpty) { yield response.text; } } @@ -148,6 +153,7 @@ class LiteRtLmClient { String text, Uint8List imageBytes, { String? 
conversationId, + bool enableThinking = false, }) async* { _assertInitialized(); debugPrint('[LiteRtLmClient] chatWithImage: text=${text.length} chars, image=${imageBytes.length} bytes'); @@ -160,7 +166,8 @@ class LiteRtLmClient { final request = ChatWithImageRequest() ..conversationId = convId ..text = text - ..image = imageBytes; + ..image = imageBytes + ..enableThinking = enableThinking; // Add timeout to prevent infinite hanging await for (final response in _client!.chatWithImage(request).timeout( @@ -176,7 +183,11 @@ class LiteRtLmClient { throw Exception('Chat error: ${response.error}'); } - if (response.hasText()) { + if (response.hasThinking() && response.thinking.isNotEmpty) { + yield '<|channel>thought\n${response.thinking}'; + } + + if (response.hasText() && response.text.isNotEmpty) { yield response.text; } } @@ -187,6 +198,7 @@ class LiteRtLmClient { String text, Uint8List imageBytes, { String? conversationId, + bool enableThinking = false, }) async { _assertInitialized(); @@ -198,7 +210,8 @@ class LiteRtLmClient { final request = ChatWithImageRequest() ..conversationId = convId ..text = text - ..image = imageBytes; + ..image = imageBytes + ..enableThinking = enableThinking; final response = await _client!.chatWithImageSync(request); @@ -214,6 +227,7 @@ class LiteRtLmClient { String text, Uint8List audioBytes, { String? 
conversationId, + bool enableThinking = false, }) async* { _assertInitialized(); @@ -225,7 +239,8 @@ class LiteRtLmClient { final request = ChatWithAudioRequest() ..conversationId = convId ..text = text - ..audio = audioBytes; + ..audio = audioBytes + ..enableThinking = enableThinking; // Add timeout to prevent infinite hanging await for (final response in _client!.chatWithAudio(request).timeout( @@ -241,7 +256,11 @@ class LiteRtLmClient { throw Exception('Chat error: ${response.error}'); } - if (response.hasText()) { + if (response.hasThinking() && response.thinking.isNotEmpty) { + yield '<|channel>thought\n${response.thinking}'; + } + + if (response.hasText() && response.text.isNotEmpty) { yield response.text; } } diff --git a/lib/flutter_gemma_interface.dart b/lib/flutter_gemma_interface.dart index 59d48f1f..059bb1eb 100644 --- a/lib/flutter_gemma_interface.dart +++ b/lib/flutter_gemma_interface.dart @@ -136,6 +136,7 @@ abstract class InferenceModel { bool? enableVisionModality, // Add vision modality support bool? enableAudioModality, // Add audio modality support (Gemma 3n E4B) String? systemInstruction, + bool enableThinking = false, // Enable thinking mode (Gemma 4 via extraContext) }); Future createChat({ @@ -164,6 +165,7 @@ abstract class InferenceModel { enableVisionModality: supportImage ?? false, enableAudioModality: supportAudio ?? false, systemInstruction: systemInstruction, + enableThinking: isThinking, ), maxTokens: maxTokens, tokenBuffer: tokenBuffer, @@ -171,10 +173,10 @@ abstract class InferenceModel { supportAudio: supportAudio ?? false, supportsFunctionCalls: supportsFunctionCalls ?? false, tools: tools, - isThinking: isThinking, // Pass isThinking parameter - modelType: modelType ?? ModelType.gemmaIt, // Use provided modelType or default - fileType: fileType, // Pass fileType from model - toolChoice: toolChoice, // Pass tool calling mode + isThinking: isThinking, + modelType: modelType ?? 
ModelType.gemmaIt, + fileType: fileType, + toolChoice: toolChoice, systemInstruction: systemInstruction, ); await chat!.initSession(); diff --git a/lib/mobile/flutter_gemma_mobile_inference_model.dart b/lib/mobile/flutter_gemma_mobile_inference_model.dart index a460164d..596262d0 100644 --- a/lib/mobile/flutter_gemma_mobile_inference_model.dart +++ b/lib/mobile/flutter_gemma_mobile_inference_model.dart @@ -43,6 +43,7 @@ class MobileInferenceModel extends InferenceModel { enableVisionModality: supportImage ?? false, enableAudioModality: supportAudio ?? this.supportAudio, systemInstruction: systemInstruction, + enableThinking: isThinking, ), maxTokens: maxTokens, tokenBuffer: tokenBuffer, @@ -86,6 +87,7 @@ class MobileInferenceModel extends InferenceModel { bool? enableVisionModality, bool? enableAudioModality, String? systemInstruction, + bool enableThinking = false, }) async { if (_isClosed) { throw StateError('Model is closed. Create a new instance to use it again'); @@ -109,6 +111,7 @@ class MobileInferenceModel extends InferenceModel { // Enable audio modality if the model supports it (Gemma 3n E4B) enableAudioModality: enableAudioModality ?? supportAudio, systemInstruction: systemInstruction, + enableThinking: enableThinking, ); final session = _session = MobileInferenceModelSession( diff --git a/lib/pigeon.g.dart b/lib/pigeon.g.dart index 03682ef4..ea24405d 100644 --- a/lib/pigeon.g.dart +++ b/lib/pigeon.g.dart @@ -231,7 +231,7 @@ class PlatformService { } } - Future createSession({required double temperature, required int randomSeed, required int topK, double? topP, String? loraPath, bool? enableVisionModality, bool? enableAudioModality, String? systemInstruction, }) async { + Future createSession({required double temperature, required int randomSeed, required int topK, double? topP, String? loraPath, bool? enableVisionModality, bool? enableAudioModality, String? systemInstruction, bool? 
enableThinking, }) async { final String pigeonVar_channelName = 'dev.flutter.pigeon.flutter_gemma.PlatformService.createSession$pigeonVar_messageChannelSuffix'; final BasicMessageChannel pigeonVar_channel = BasicMessageChannel( pigeonVar_channelName, @@ -239,7 +239,7 @@ class PlatformService { binaryMessenger: pigeonVar_binaryMessenger, ); final List? pigeonVar_replyList = - await pigeonVar_channel.send([temperature, randomSeed, topK, topP, loraPath, enableVisionModality, enableAudioModality, systemInstruction]) as List?; + await pigeonVar_channel.send([temperature, randomSeed, topK, topP, loraPath, enableVisionModality, enableAudioModality, systemInstruction, enableThinking]) as List?; if (pigeonVar_replyList == null) { throw _createConnectionError(pigeonVar_channelName); } else if (pigeonVar_replyList.length > 1) { @@ -741,7 +741,7 @@ class PlatformService { /// /// **Performance:** /// - Returns all documents in single call - /// - Embeddings as List (decoded from BLOB) + /// - Embeddings as `List` (decoded from BLOB) /// /// Returns empty list if no documents stored. Future> getAllDocumentsWithEmbeddings() async { diff --git a/lib/web/flutter_gemma_web.dart b/lib/web/flutter_gemma_web.dart index 5d61ba95..2bf71e66 100644 --- a/lib/web/flutter_gemma_web.dart +++ b/lib/web/flutter_gemma_web.dart @@ -391,7 +391,16 @@ class WebInferenceModel extends InferenceModel { bool? enableVisionModality, // Enabling vision modality support bool? enableAudioModality, // Enabling audio modality support (Gemma 3n E4B) String? systemInstruction, + bool enableThinking = false, // Not supported on Web (MediaPipe) }) async { + // Thinking mode not supported on Web (MediaPipe has no extraContext/channels API) + if (enableThinking) { + if (kDebugMode) { + debugPrint('Warning: enableThinking is not supported on Web (MediaPipe). 
' + 'Use Android or Desktop with .litertlm models for Gemma 4 thinking mode.'); + } + } + // TODO: Implement vision modality for web if (enableVisionModality == true) { if (kDebugMode) { @@ -504,6 +513,7 @@ class WebInferenceModel extends InferenceModel { Future close() async { await session?.close(); session = null; + _initCompleter = null; onClose(); } } @@ -740,6 +750,7 @@ class WebModelSession extends InferenceModelSession { debugPrint('โŒ getResponse: Exception caught: $e'); debugPrint('โŒ getResponse: Stack trace: $stackTrace'); } + _promptParts.clear(); rethrow; } } @@ -750,6 +761,8 @@ class WebModelSession extends InferenceModelSession { debugPrint('๐ŸŒŠ getResponseAsync: Starting async response generation'); } + // Close previous controller to prevent leak if called again before completion + _controller?.close(); _controller = StreamController(); try { @@ -834,10 +847,17 @@ class WebModelSession extends InferenceModelSession { @override Future stopGeneration() async { - llmInference.cancelProcessing(); - _controller?.close(); - _controller = null; - _promptParts.clear(); + try { + llmInference.cancelProcessing(); + } catch (e) { + if (kDebugMode) { + debugPrint('[WebModelSession] cancelProcessing error: $e'); + } + } finally { + _controller?.close(); + _controller = null; + _promptParts.clear(); + } } @override diff --git a/lib/web/litert_web_embeddings.dart b/lib/web/litert_web_embeddings.dart index 85f6142e..82bbbded 100644 --- a/lib/web/litert_web_embeddings.dart +++ b/lib/web/litert_web_embeddings.dart @@ -87,7 +87,7 @@ class LiteRTWebEmbeddings { /// /// [text] - Text to embed /// - /// Returns [List] - Embedding vector (768 dimensions) + /// Returns `List` - Embedding vector (768 dimensions) /// /// Throws [Exception] if not initialized or generation fails static Future> generateEmbedding(String text) async { @@ -122,7 +122,7 @@ class LiteRTWebEmbeddings { /// /// [text] - Text to embed /// - /// Returns [List] - Embedding vector (768 dimensions) + 
/// Returns `List` - Embedding vector (768 dimensions) /// /// Throws [Exception] if not initialized or generation fails static Future> generateDocumentEmbedding(String text) async { @@ -154,7 +154,7 @@ class LiteRTWebEmbeddings { /// /// [texts] - List of texts to embed /// - /// Returns [List>] - List of embedding vectors + /// Returns `List>` - List of embedding vectors /// /// Throws [Exception] if not initialized or generation fails static Future>> generateEmbeddings(List texts) async { diff --git a/lib/web/vector_store_web.dart b/lib/web/vector_store_web.dart index cd93f5be..1c0feb6a 100644 --- a/lib/web/vector_store_web.dart +++ b/lib/web/vector_store_web.dart @@ -67,8 +67,8 @@ extension type SQLiteVectorStore._(JSObject _) implements JSObject { /// Add document with embedding (Dart-friendly API) /// /// Type conversions: - /// - Dart List โ†’ JS Array - /// - Dart String? โ†’ JS String | null + /// - Dart `List` โ†’ JS `Array` + /// - Dart `String?` โ†’ JS `String | null` /// /// Throws: /// - Dimension mismatch @@ -86,10 +86,10 @@ extension type SQLiteVectorStore._(JSObject _) implements JSObject { /// Search for similar documents (Dart-friendly API) /// /// Type conversions: - /// - Dart List โ†’ JS Array + /// - Dart `List` โ†’ JS `Array` /// - Dart int โ†’ JS Number /// - Dart double โ†’ JS Number - /// - JS Array โ†’ Dart List + /// - JS `Array` โ†’ Dart `List` /// /// Returns: /// - List sorted by similarity (descending) diff --git a/linux/scripts/setup_desktop.sh b/linux/scripts/setup_desktop.sh index 6e3732b7..4ec05c86 100755 --- a/linux/scripts/setup_desktop.sh +++ b/linux/scripts/setup_desktop.sh @@ -59,9 +59,9 @@ JRE_URL="https://cdn.azul.com/zulu/bin/${JRE_ARCHIVE}" # JAR settings JAR_NAME="litertlm-server.jar" -JAR_VERSION="0.13.0" +JAR_VERSION="0.13.1" JAR_URL="https://github.com/DenisovAV/flutter_gemma/releases/download/v${JAR_VERSION}/${JAR_NAME}" -JAR_CHECKSUM="61191862ae56f130366f5539e0a2d36adc9cb4ea99fe6568fb9a7b7cd2e88f02" 
+JAR_CHECKSUM="97e01020f921c098f7cfc0a9509e4b207b8bc326703ae2f26bbce3c11b957430" # Plugin root (parent of linux/) PLUGIN_ROOT=$(dirname "$PLUGIN_DIR") @@ -81,8 +81,8 @@ verify_checksum() { elif command -v shasum &> /dev/null; then actual=$(shasum -a 256 "$file" | awk '{print $1}') else - echo "WARNING: No sha256sum or shasum available, skipping checksum verification" - return 0 + echo "ERROR: No sha256sum or shasum available, cannot verify checksum" + return 1 fi if [ "$actual" != "$expected" ]; then @@ -177,6 +177,15 @@ setup_jar() { if [ -z "$JAR_SOURCE" ]; then local CACHED_JAR="$CACHE_DIR/jar/$JAR_NAME" + if [ -f "$CACHED_JAR" ] && [ -n "$JAR_CHECKSUM" ]; then + # Verify cached JAR checksum + echo "Verifying cached JAR checksum..." + if ! verify_checksum "$CACHED_JAR" "$JAR_CHECKSUM"; then + echo "Cached JAR checksum mismatch, re-downloading..." + rm -f "$CACHED_JAR" + fi + fi + if [ ! -f "$CACHED_JAR" ]; then echo "Downloading JAR from $JAR_URL..." curl -L --progress-bar -o "$CACHED_JAR" "$JAR_URL" || { @@ -185,7 +194,7 @@ setup_jar() { exit 1 } - # Verify checksum (skip if not yet available for this version) + # Verify checksum if [ -n "$JAR_CHECKSUM" ]; then echo "Verifying JAR checksum..." if ! 
verify_checksum "$CACHED_JAR" "$JAR_CHECKSUM"; then @@ -196,7 +205,7 @@ setup_jar() { echo "WARNING: JAR checksum not set, skipping verification" fi else - echo "Using cached JAR" + echo "Using cached JAR (checksum verified)" fi JAR_SOURCE="$CACHED_JAR" diff --git a/litertlm-server/build.gradle.kts b/litertlm-server/build.gradle.kts index 5729f814..61c77315 100644 --- a/litertlm-server/build.gradle.kts +++ b/litertlm-server/build.gradle.kts @@ -7,7 +7,7 @@ plugins { } group = "dev.flutterberlin" -version = "0.13.0" +version = "0.13.1" repositories { mavenCentral() @@ -16,7 +16,7 @@ repositories { dependencies { // LiteRT-LM JVM (only version with Contents API for multimodal) - implementation("com.google.ai.edge.litertlm:litertlm-jvm:0.9.0") + implementation("com.google.ai.edge.litertlm:litertlm-jvm:0.10.0") // gRPC + Protobuf implementation("io.grpc:grpc-kotlin-stub:1.4.1") diff --git a/litertlm-server/src/main/kotlin/dev/flutterberlin/litertlm/LiteRtLmServiceImpl.kt b/litertlm-server/src/main/kotlin/dev/flutterberlin/litertlm/LiteRtLmServiceImpl.kt index 32a9248f..c5c56137 100644 --- a/litertlm-server/src/main/kotlin/dev/flutterberlin/litertlm/LiteRtLmServiceImpl.kt +++ b/litertlm-server/src/main/kotlin/dev/flutterberlin/litertlm/LiteRtLmServiceImpl.kt @@ -203,21 +203,23 @@ class LiteRtLmServiceImpl : LiteRtLmServiceGrpcKt.LiteRtLmServiceCoroutineImplBa logger.info("=== CHAT REQUEST ===") logger.info("conversationId: '${request.conversationId}'") logger.info("text: '${request.text}' (length=${request.text.length})") - logger.info("text bytes: ${request.text.toByteArray().take(20).map { it.toInt() and 0xFF }}") + logger.info("enableThinking: ${request.enableThinking}") // Use Contents format (like Android does) val message = Contents.of(listOf(Content.Text(request.text))) - logger.info("Created Contents: $message") - // Use callback-based API (like Android does) - conversation.sendMessageAsync(message, object : MessageCallback { + val extraContext = if 
(request.enableThinking) mapOf("enable_thinking" to true) else emptyMap() + + val messageCallback = object : MessageCallback { override fun onMessage(msg: Message) { - trySend( - ChatResponse.newBuilder() - .setText(msg.toString()) - .setDone(false) - .build() - ) + val builder = ChatResponse.newBuilder() + .setText(msg.toString()) + .setDone(false) + val thinking = msg.channels["thought"] + if (!thinking.isNullOrEmpty()) { + builder.setThinking(thinking) + } + trySend(builder.build()) } override fun onDone() { @@ -240,7 +242,14 @@ class LiteRtLmServiceImpl : LiteRtLmServiceGrpcKt.LiteRtLmServiceCoroutineImplBa ) close(throwable) } - }) + } + + // Use callback-based API (like Android does) + if (extraContext.isNotEmpty()) { + conversation.sendMessageAsync(message, messageCallback, extraContext) + } else { + conversation.sendMessageAsync(message, messageCallback) + } } catch (e: Exception) { logger.error("Error starting chat", e) trySend( @@ -267,16 +276,25 @@ class LiteRtLmServiceImpl : LiteRtLmServiceGrpcKt.LiteRtLmServiceCoroutineImplBa logger.info("ChatWithImageSync: text='${request.text.take(50)}', imageBytes=${imageBytes.size}") val message = buildContents(request.text, imageBytes = imageBytes) + val extraContext = if (request.enableThinking) mapOf("enable_thinking" to true) else emptyMap() logger.info("Calling SYNC sendMessage...") - val response = conversation.sendMessage(message) + val response = if (extraContext.isNotEmpty()) { + conversation.sendMessage(message, extraContext) + } else { + conversation.sendMessage(message) + } val responseText = response.toString() + val thinking = response.channels["thought"] logger.info("Sync response (${responseText.length} chars): ${responseText.take(200)}") - ChatResponse.newBuilder() + val builder = ChatResponse.newBuilder() .setText(responseText) .setDone(true) - .build() + if (!thinking.isNullOrEmpty()) { + builder.setThinking(thinking) + } + builder.build() } catch (e: Exception) { logger.error("Error during sync 
chat with image", e) ChatResponse.newBuilder() @@ -323,23 +341,25 @@ class LiteRtLmServiceImpl : LiteRtLmServiceGrpcKt.LiteRtLmServiceCoroutineImplBa logger.info("Image header: $header (JPEG=FFD8, PNG=89504E47)") } val message = buildContents(request.text, imageBytes = imageBytes) + val extraContext = if (request.enableThinking) mapOf("enable_thinking" to true) else emptyMap() logger.info("Sending message to conversation...") var responseCount = 0 - // Use callback-based API (like Android does) - conversation.sendMessageAsync(message, object : MessageCallback { + val messageCallback = object : MessageCallback { override fun onMessage(msg: Message) { responseCount++ if (responseCount <= 3) { logger.info("Response chunk $responseCount: '${msg.toString().take(100)}'") } - trySend( - ChatResponse.newBuilder() - .setText(msg.toString()) - .setDone(false) - .build() - ) + val builder = ChatResponse.newBuilder() + .setText(msg.toString()) + .setDone(false) + val thinking = msg.channels["thought"] + if (!thinking.isNullOrEmpty()) { + builder.setThinking(thinking) + } + trySend(builder.build()) } override fun onDone() { @@ -362,7 +382,14 @@ class LiteRtLmServiceImpl : LiteRtLmServiceGrpcKt.LiteRtLmServiceCoroutineImplBa ) close(throwable) } - }) + } + + // Use callback-based API (like Android does) + if (extraContext.isNotEmpty()) { + conversation.sendMessageAsync(message, messageCallback, extraContext) + } else { + conversation.sendMessageAsync(message, messageCallback) + } } catch (e: Exception) { logger.error("Error starting chat with image", e) trySend( @@ -413,24 +440,26 @@ class LiteRtLmServiceImpl : LiteRtLmServiceGrpcKt.LiteRtLmServiceCoroutineImplBa } val message = buildContents(request.text, audioBytes = audioBytes) + val extraContext = if (request.enableThinking) mapOf("enable_thinking" to true) else emptyMap() logger.info("Sending message to conversation...") var responseCount = 0 - // Use callback-based API (like Android does) - 
conversation.sendMessageAsync(message, object : MessageCallback { + val messageCallback = object : MessageCallback { override fun onMessage(msg: Message) { responseCount++ val responseText = msg.toString() if (responseCount <= 3) { logger.info("Response chunk $responseCount: '${responseText.take(100)}'") } - trySend( - ChatResponse.newBuilder() - .setText(responseText) - .setDone(false) - .build() - ) + val builder = ChatResponse.newBuilder() + .setText(responseText) + .setDone(false) + val thinking = msg.channels["thought"] + if (!thinking.isNullOrEmpty()) { + builder.setThinking(thinking) + } + trySend(builder.build()) } override fun onDone() { @@ -453,7 +482,14 @@ class LiteRtLmServiceImpl : LiteRtLmServiceGrpcKt.LiteRtLmServiceCoroutineImplBa ) close(throwable) } - }) + } + + // Use callback-based API (like Android does) + if (extraContext.isNotEmpty()) { + conversation.sendMessageAsync(message, messageCallback, extraContext) + } else { + conversation.sendMessageAsync(message, messageCallback) + } } catch (e: Exception) { logger.error("Error starting chat with audio", e) trySend( @@ -569,10 +605,7 @@ class LiteRtLmServiceImpl : LiteRtLmServiceGrpcKt.LiteRtLmServiceCoroutineImplBa // Read image (JPEG, PNG, BMP, etc.) 
val inputStream = ByteArrayInputStream(imageBytes) val bufferedImage = ImageIO.read(inputStream) - if (bufferedImage == null) { - logger.warn("Failed to read image, returning original bytes") - return imageBytes - } + ?: throw IllegalArgumentException("Failed to read image: unsupported format or corrupt data") logger.info("Read image: ${bufferedImage.width}x${bufferedImage.height}, type=${bufferedImage.type}") @@ -589,8 +622,7 @@ class LiteRtLmServiceImpl : LiteRtLmServiceGrpcKt.LiteRtLmServiceCoroutineImplBa pngBytes } catch (e: Exception) { - logger.error("Failed to convert image to PNG: ${e.message}", e) - imageBytes // Return original on error + throw IllegalArgumentException("Image conversion to PNG failed: ${e.message}", e) } } } diff --git a/litertlm-server/src/main/proto/litertlm.proto b/litertlm-server/src/main/proto/litertlm.proto index 5108161b..fe0cf2c2 100644 --- a/litertlm-server/src/main/proto/litertlm.proto +++ b/litertlm-server/src/main/proto/litertlm.proto @@ -74,24 +74,28 @@ message CreateConversationResponse { message ChatRequest { string conversation_id = 1; string text = 2; + bool enable_thinking = 3; } message ChatWithImageRequest { string conversation_id = 1; string text = 2; bytes image = 3; // Image bytes (JPEG/PNG) + bool enable_thinking = 4; } message ChatWithAudioRequest { string conversation_id = 1; string text = 2; bytes audio = 3; // Audio bytes (PCM 16kHz, 16-bit, mono) + bool enable_thinking = 4; } message ChatResponse { string text = 1; // Partial or complete text bool done = 2; // Is generation complete string error = 3; + string thinking = 4; // Thinking channel content (Gemma 4) } message CancelGenerationRequest { diff --git a/macos/scripts/prepare_resources.sh b/macos/scripts/prepare_resources.sh index 8b593ba1..8002d53d 100755 --- a/macos/scripts/prepare_resources.sh +++ b/macos/scripts/prepare_resources.sh @@ -39,9 +39,9 @@ TFLITE_CACHE_DIR="$HOME/Library/Caches/flutter_gemma/tflite" # JAR settings 
JAR_NAME="litertlm-server.jar" -JAR_VERSION="0.13.0" +JAR_VERSION="0.13.1" JAR_URL="https://github.com/DenisovAV/flutter_gemma/releases/download/v${JAR_VERSION}/${JAR_NAME}" -JAR_CHECKSUM="61191862ae56f130366f5539e0a2d36adc9cb4ea99fe6568fb9a7b7cd2e88f02" +JAR_CHECKSUM="97e01020f921c098f7cfc0a9509e4b207b8bc326703ae2f26bbce3c11b957430" JAR_CACHE_DIR="$HOME/Library/Caches/flutter_gemma/jar" # Create Resources directory @@ -147,9 +147,23 @@ download_jar() { local cached_jar="$JAR_CACHE_DIR/$JAR_NAME" if [[ -f "$cached_jar" ]]; then - echo "Using cached JAR" >&2 - echo "$cached_jar" - return 0 + # Verify cached JAR checksum before reuse + if [[ -n "$JAR_CHECKSUM" ]]; then + local actual_checksum + actual_checksum=$(shasum -a 256 "$cached_jar" | awk '{print $1}') + if [[ "$actual_checksum" != "$JAR_CHECKSUM" ]]; then + echo "Cached JAR checksum mismatch, re-downloading..." >&2 + rm -f "$cached_jar" + else + echo "Using cached JAR (checksum verified)" >&2 + echo "$cached_jar" + return 0 + fi + else + echo "Using cached JAR" >&2 + echo "$cached_jar" + return 0 + fi fi if ! curl -L -o "$cached_jar" "$JAR_URL" --fail --retry 3 --progress-bar; then diff --git a/macos/scripts/setup_desktop.sh b/macos/scripts/setup_desktop.sh index 16bd2262..77f16ecc 100755 --- a/macos/scripts/setup_desktop.sh +++ b/macos/scripts/setup_desktop.sh @@ -58,9 +58,9 @@ JRE_CHECKSUM_X64="4a36280b411db58952bc97a26f96b184222b23d36ea5008a6ee34744989ff9 # JAR settings JAR_NAME="litertlm-server.jar" -JAR_VERSION="0.12.6" +JAR_VERSION="0.13.1" JAR_URL="https://github.com/DenisovAV/flutter_gemma/releases/download/v${JAR_VERSION}/${JAR_NAME}" -JAR_CHECKSUM="fefc53d076533de164b5ce07c65f9aedc4739f83efc93e67625f0d90029ae5b7" +JAR_CHECKSUM="97e01020f921c098f7cfc0a9509e4b207b8bc326703ae2f26bbce3c11b957430" JAR_CACHE_DIR="$HOME/Library/Caches/flutter_gemma/jar" echo "Plugin root: $PLUGIN_ROOT" @@ -313,10 +313,20 @@ setup_jar() { # 3. 
Download as fallback if [[ -z "$jar_source" ]]; then - # Check cache first + # Check cache first (with checksum verification) local cached_jar="$JAR_CACHE_DIR/$JAR_NAME" if [[ -f "$cached_jar" ]]; then - echo "Using cached JAR" + if [[ -n "$JAR_CHECKSUM" ]]; then + local actual_checksum + actual_checksum=$(shasum -a 256 "$cached_jar" | awk '{print $1}') + if [[ "$actual_checksum" != "$JAR_CHECKSUM" ]]; then + echo "Cached JAR checksum mismatch, re-downloading..." >&2 + rm -f "$cached_jar" + fi + fi + fi + if [[ -f "$cached_jar" ]]; then + echo "Using cached JAR (checksum verified)" jar_source="$cached_jar" else if jar_source=$(download_jar); then diff --git a/pigeon.dart b/pigeon.dart index d9a76507..8d5f0316 100644 --- a/pigeon.dart +++ b/pigeon.dart @@ -53,6 +53,8 @@ abstract class PlatformService { bool? enableAudioModality, // System instruction for LiteRT-LM native support String? systemInstruction, + // Enable thinking mode (Gemma 4 via extraContext) + bool? enableThinking, }); @async diff --git a/pubspec.yaml b/pubspec.yaml index db1aa1c3..dcc6fcae 100644 --- a/pubspec.yaml +++ b/pubspec.yaml @@ -1,6 +1,6 @@ name: flutter_gemma description: "The plugin allows running the Gemma AI model locally on a device from a Flutter application. Includes support for Gemma 3 Nano models with optimized MediaPipe GenAI v0.10.33." 
-version: 0.13.0 +version: 0.13.1 homepage: https://github.com/DenisovAV/flutter_gemma repository: https://github.com/DenisovAV/flutter_gemma diff --git a/test/core/gemma4_thinking_test.dart b/test/core/gemma4_thinking_test.dart new file mode 100644 index 00000000..6d004bc0 --- /dev/null +++ b/test/core/gemma4_thinking_test.dart @@ -0,0 +1,200 @@ +import 'package:flutter_test/flutter_test.dart'; +import 'package:flutter_gemma/core/extensions.dart'; +import 'package:flutter_gemma/core/model.dart'; +import 'package:flutter_gemma/core/model_response.dart'; + +void main() { + group('Gemma 4 thinking - filterThinkingStream', () { + Stream makeStream(List chunks) { + return Stream.fromIterable(chunks.map((c) => TextResponse(c))); + } + + test('complete block in single chunk yields ThinkingResponse + TextResponse', () async { + final stream = makeStream([ + '<|channel>thought\nI need to think about this.The answer is 42.', + ]); + + final results = await ModelThinkingFilter.filterThinkingStream( + stream, + modelType: ModelType.gemmaIt, + ).toList(); + + expect(results, [ + const ThinkingResponse('I need to think about this.'), + const TextResponse('The answer is 42.'), + ]); + }); + + test('thinking split across multiple chunks buffers correctly', () async { + final stream = makeStream([ + '<|channel>thought\nI am ', + 'thinking hard', + 'Final answer.', + ]); + + final results = await ModelThinkingFilter.filterThinkingStream( + stream, + modelType: ModelType.gemmaIt, + ).toList(); + + // Intermediate thinking chunks are yielded as they arrive + final thinkingParts = results.whereType().map((r) => r.content).join(); + final textParts = results.whereType().map((r) => r.token).join(); + + expect(thinkingParts, 'I am thinking hard'); + expect(textParts, 'Final answer.'); + }); + + test('no thinking block passes through as TextResponse', () async { + final stream = makeStream([ + 'Hello, ', + 'world!', + ]); + + final results = await ModelThinkingFilter.filterThinkingStream( 
+ stream, + modelType: ModelType.gemmaIt, + ).toList(); + + final text = results.whereType<TextResponse>().map((r) => r.token).join(); + expect(text, 'Hello, world!'); + expect(results.whereType<ThinkingResponse>(), isEmpty); + }); + + test('multiple thinking blocks in one response', () async { + final stream = makeStream([ + '<|channel>thought\nFirst thought.Text between.<|channel>thought\nSecond thought.Final text.', + ]); + + final results = await ModelThinkingFilter.filterThinkingStream( + stream, + modelType: ModelType.gemmaIt, + ).toList(); + + final thinking = results.whereType<ThinkingResponse>().map((r) => r.content).toList(); + final text = results.whereType<TextResponse>().map((r) => r.token).toList(); + + expect(thinking, ['First thought.', 'Second thought.']); + expect(text, ['Text between.', 'Final text.']); + }); + + test('partial start marker at stream end is flushed as text', () async { + final stream = makeStream([ + 'Some text<|chan', + ]); + + final results = await ModelThinkingFilter.filterThinkingStream( + stream, + modelType: ModelType.gemmaIt, + ).toList(); + + final text = results.whereType<TextResponse>().map((r) => r.token).join(); + expect(text, 'Some text<|chan'); + }); + + test('partial end marker at stream end is flushed as thinking', () async { + final stream = makeStream([ + '<|channel>thought\nThinking content().map((r) => r.content).join(); + expect(thinking, 'Thinking contentthought\nThe answer.', + ]); + + final results = await ModelThinkingFilter.filterThinkingStream( + stream, + modelType: ModelType.gemmaIt, + ).toList(); + + expect(results.whereType<ThinkingResponse>(), isEmpty); + expect(results.whereType<TextResponse>().map((r) => r.token).join(), 'The answer.'); + }); + + test('start marker split across chunks', () async { + final stream = makeStream([ + 'Hello <|channel>', + 'thought\nThinking.Done.', + ]); + + final results = await ModelThinkingFilter.filterThinkingStream( + stream, + modelType: ModelType.gemmaIt, + ).toList(); + + final thinking = results.whereType<ThinkingResponse>().map((r) => r.content).join(); + final text =
results.whereType<TextResponse>().map((r) => r.token).join(); + + expect(thinking, 'Thinking.'); + expect(text, 'Hello Done.'); + }); + }); + + group('Gemma 4 thinking - removeThinkingFromText', () { + test('strips thinking blocks from text', () { + const input = 'Prefix <|channel>thought\nSome reasoning. Suffix'; + final result = ModelThinkingFilter.removeThinkingFromText( + input, + modelType: ModelType.gemmaIt, + ); + expect(result, 'Prefix Suffix'); + }); + + test('strips multiple thinking blocks', () { + const input = '<|channel>thought\nAText<|channel>thought\nBEnd'; + final result = ModelThinkingFilter.removeThinkingFromText( + input, + modelType: ModelType.gemmaIt, + ); + expect(result, 'TextEnd'); + }); + + test('no thinking blocks returns text unchanged', () { + const input = 'Just regular text'; + final result = ModelThinkingFilter.removeThinkingFromText( + input, + modelType: ModelType.gemmaIt, + ); + expect(result, 'Just regular text'); + }); + + test('multiline thinking content is stripped', () { + const input = '<|channel>thought\nLine 1\nLine 2\nLine 3Answer.'; + final result = ModelThinkingFilter.removeThinkingFromText( + input, + modelType: ModelType.gemmaIt, + ); + expect(result, 'Answer.'); + }); + }); + + group('DeepSeek thinking still works', () { + test('filterThinkingStream handles DeepSeek format', () async { + final stream = Stream.fromIterable([ + const TextResponse('<think>I think '), + const TextResponse('about this</think>'), + const TextResponse('The answer.'), + ]); + + final results = await ModelThinkingFilter.filterThinkingStream( + stream, + modelType: ModelType.deepSeek, + ).toList(); + + final thinking = results.whereType<ThinkingResponse>().map((r) => r.content).join(); + final text = results.whereType<TextResponse>().map((r) => r.token).join(); + + expect(thinking.contains('I think '), isTrue); + expect(text, 'The answer.'); + }); + }); +} diff --git a/windows/scripts/setup_desktop.ps1 b/windows/scripts/setup_desktop.ps1 index 10102a06..d1b78817 100644 ---
a/windows/scripts/setup_desktop.ps1 +++ b/windows/scripts/setup_desktop.ps1 @@ -87,9 +87,9 @@ $JreChecksums = @{ # JAR settings $JarName = "litertlm-server.jar" -$JarVersion = "0.13.0" +$JarVersion = "0.13.1" $JarUrl = "https://github.com/DenisovAV/flutter_gemma/releases/download/v$JarVersion/$JarName" -$JarChecksum = "61191862ae56f130366f5539e0a2d36adc9cb4ea99fe6568fb9a7b7cd2e88f02" +$JarChecksum = "97e01020f921c098f7cfc0a9509e4b207b8bc326703ae2f26bbce3c11b957430" $JarCacheDir = "$env:LOCALAPPDATA\flutter_gemma\jar" $PluginRoot = Split-Path -Parent $PluginDir @@ -386,9 +386,23 @@ function Setup-Jar { # Check cache first $cachedJar = "$JarCacheDir\$JarName" if (Test-Path $cachedJar) { - Write-Host "Using cached JAR" -ForegroundColor Green - $jarSource = $cachedJar - } else { + # Verify cached JAR checksum before reuse + if ($JarChecksum) { + $actualChecksum = (Get-FileHash -Path $cachedJar -Algorithm SHA256).Hash.ToLower() + if ($actualChecksum -ne $JarChecksum.ToLower()) { + Write-Host "Cached JAR checksum mismatch, re-downloading..." -ForegroundColor Yellow + Remove-Item -Path $cachedJar -Force + } else { + Write-Host "Using cached JAR (checksum verified)" -ForegroundColor Green + $jarSource = $cachedJar + } + } else { + Write-Host "Using cached JAR" -ForegroundColor Green + $jarSource = $cachedJar + } + } + # Download if no valid cached JAR (missing or checksum mismatch) + if (-not $jarSource) { $jarSource = Download-Jar if (-not $jarSource) { Write-Error "Could not obtain JAR (build failed, download failed)"