diff --git a/CHANGELOG.md b/CHANGELOG.md index f8901db9634..6684283b866 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,73 @@ ## [Version 1.117.0](https://github.com/lobehub/lobe-chat/compare/v1.116.4...v1.117.0) +Released on **2025-08-29** + +#### ✨ Features + +- **misc**: Ai image support Gemini 2.5 Flash Image, Support Gemini 2.5 Flash Image Preview in OpenRouter. + +#### 💄 Styles + +- **misc**: Update i18n. + +
+ +
+Improvements and Fixes + +#### What's improved + +- **misc**: Ai image support Gemini 2.5 Flash Image, closes [#8966](https://github.com/lobehub/lobe-chat/issues/8966) ([64b969e](https://github.com/lobehub/lobe-chat/commit/64b969e)) +- **misc**: Support Gemini 2.5 Flash Image Preview in OpenRouter, closes [#8944](https://github.com/lobehub/lobe-chat/issues/8944) ([23dcf4c](https://github.com/lobehub/lobe-chat/commit/23dcf4c)) + +#### Styles + +- **misc**: Update i18n, closes [#8975](https://github.com/lobehub/lobe-chat/issues/8975) ([6872798](https://github.com/lobehub/lobe-chat/commit/6872798)) + +
+ +
+ +[![](https://img.shields.io/badge/-BACK_TO_TOP-151515?style=flat-square)](#readme-top) + +
+ +## [Version 1.117.0](https://github.com/lobehub/lobe-chat/compare/v1.116.4...v1.117.0) + +Released on **2025-08-29** + +#### ✨ Features + +- **misc**: Ai image support Gemini 2.5 Flash Image. + +#### 💄 Styles + +- **misc**: Update i18n. + +
+ +
+Improvements and Fixes + +#### What's improved + +- **misc**: Ai image support Gemini 2.5 Flash Image, closes [#8966](https://github.com/lobehub/lobe-chat/issues/8966) ([64b969e](https://github.com/lobehub/lobe-chat/commit/64b969e)) + +#### Styles + +- **misc**: Update i18n, closes [#8975](https://github.com/lobehub/lobe-chat/issues/8975) ([6872798](https://github.com/lobehub/lobe-chat/commit/6872798)) + +
+ +
+ +[![](https://img.shields.io/badge/-BACK_TO_TOP-151515?style=flat-square)](#readme-top) + +
+ +## [Version 1.117.0](https://github.com/lobehub/lobe-chat/compare/v1.116.4...v1.117.0) + Released on **2025-08-28** #### ✨ Features diff --git a/changelog/v1.json b/changelog/v1.json index 066bab0c621..1c65f59cac9 100644 --- a/changelog/v1.json +++ b/changelog/v1.json @@ -1,4 +1,14 @@ [ + { + "children": { + "features": [ + "Ai image support Gemini 2.5 Flash Image, Support Gemini 2.5 Flash Image Preview in OpenRouter." + ], + "improvements": ["Update i18n."] + }, + "date": "2025-08-29", + "version": "1.117.0" + }, { "children": { "improvements": ["Support html preview."] diff --git a/locales/ar/models.json b/locales/ar/models.json index 0706ff883b3..d73b1270524 100644 --- a/locales/ar/models.json +++ b/locales/ar/models.json @@ -1220,6 +1220,9 @@ "gemini-2.5-flash-image-preview": { "description": "Gemini 2.5 Flash Image Preview هو أحدث وأسرع وأكثر كفاءة نموذج متعدد الوسائط أصلي من Google، ويتيح لك إنشاء الصور وتحريرها من خلال المحادثة." }, + "gemini-2.5-flash-image-preview:image": { + "description": "Gemini 2.5 Flash Image Preview هو أحدث وأسرع وأكثر نموذج متعدد الوسائط كفاءة من Google، يتيح لك إنشاء وتحرير الصور من خلال المحادثة." + }, "gemini-2.5-flash-lite": { "description": "Gemini 2.5 Flash-Lite هو أصغر وأفضل نموذج من حيث التكلفة من Google، مصمم للاستخدام على نطاق واسع." }, diff --git a/locales/bg-BG/models.json b/locales/bg-BG/models.json index 6c59da298d7..80202e059d7 100644 --- a/locales/bg-BG/models.json +++ b/locales/bg-BG/models.json @@ -1220,6 +1220,9 @@ "gemini-2.5-flash-image-preview": { "description": "Gemini 2.5 Flash Image Preview е най-новият, най-бързият и най-ефективният роден мултимодален модел на Google; той ви позволява чрез диалог да създавате и редактирате изображения." }, + "gemini-2.5-flash-image-preview:image": { + "description": "Gemini 2.5 Flash Image Preview е най-новият, най-бързият и най-ефективният мултимодален модел на Google, който ви позволява да генерирате и редактирате изображения чрез разговор." 
+ }, "gemini-2.5-flash-lite": { "description": "Gemini 2.5 Flash-Lite е най-малкият и най-ефективен модел на Google, създаден специално за масово използване." }, diff --git a/locales/de-DE/models.json b/locales/de-DE/models.json index 78f4ab02098..37dadceca0b 100644 --- a/locales/de-DE/models.json +++ b/locales/de-DE/models.json @@ -1220,6 +1220,9 @@ "gemini-2.5-flash-image-preview": { "description": "Gemini 2.5 Flash Image Preview ist Googles neuestes, schnellstes und effizientestes natives multimodales Modell. Es ermöglicht Ihnen, Bilder im Dialog zu erstellen und zu bearbeiten." }, + "gemini-2.5-flash-image-preview:image": { + "description": "Gemini 2.5 Flash Image Preview ist Googles neuestes, schnellstes und effizientestes natives multimodales Modell, das es Ihnen ermöglicht, Bilder durch Konversation zu erstellen und zu bearbeiten." + }, "gemini-2.5-flash-lite": { "description": "Gemini 2.5 Flash-Lite ist Googles kleinstes und kosteneffizientestes Modell, das speziell für den großflächigen Einsatz entwickelt wurde." }, diff --git a/locales/en-US/models.json b/locales/en-US/models.json index e5d4609d251..4fca41e1f44 100644 --- a/locales/en-US/models.json +++ b/locales/en-US/models.json @@ -1220,6 +1220,9 @@ "gemini-2.5-flash-image-preview": { "description": "Gemini 2.5 Flash Image Preview is Google's newest, fastest, and most efficient native multimodal model, enabling you to generate and edit images through conversation." }, + "gemini-2.5-flash-image-preview:image": { + "description": "Gemini 2.5 Flash Image Preview is Google's latest, fastest, and most efficient native multimodal model, enabling you to generate and edit images through conversation." + }, "gemini-2.5-flash-lite": { "description": "Gemini 2.5 Flash-Lite is Google's smallest and most cost-effective model, designed for large-scale use." 
}, diff --git a/locales/es-ES/models.json b/locales/es-ES/models.json index 329c1b6afdc..3fd5de1dd2d 100644 --- a/locales/es-ES/models.json +++ b/locales/es-ES/models.json @@ -1220,6 +1220,9 @@ "gemini-2.5-flash-image-preview": { "description": "Gemini 2.5 Flash Image Preview es el modelo multimodal nativo más reciente, rápido y eficiente de Google; le permite generar y editar imágenes a través de conversaciones." }, + "gemini-2.5-flash-image-preview:image": { + "description": "Gemini 2.5 Flash Image Preview es el modelo multimodal nativo más reciente, rápido y eficiente de Google, que le permite generar y editar imágenes mediante conversaciones." + }, "gemini-2.5-flash-lite": { "description": "Gemini 2.5 Flash-Lite es el modelo más pequeño y rentable de Google, diseñado para un uso a gran escala." }, diff --git a/locales/fa-IR/models.json b/locales/fa-IR/models.json index 7d05672be69..4947cf05113 100644 --- a/locales/fa-IR/models.json +++ b/locales/fa-IR/models.json @@ -1220,6 +1220,9 @@ "gemini-2.5-flash-image-preview": { "description": "Gemini 2.5 Flash Image Preview آخرین، سریع‌ترین و کارآمدترین مدل چندمودالی بومی گوگل است که به شما امکان می‌دهد از طریق گفتگو تصاویر را تولید و ویرایش کنید." }, + "gemini-2.5-flash-image-preview:image": { + "description": "Gemini 2.5 Flash Image Preview جدیدترین، سریع‌ترین و کارآمدترین مدل چندرسانه‌ای بومی گوگل است که به شما امکان می‌دهد از طریق گفتگو تصاویر را تولید و ویرایش کنید." + }, "gemini-2.5-flash-lite": { "description": "Gemini 2.5 Flash-Lite کوچک‌ترین و مقرون‌به‌صرفه‌ترین مدل گوگل است که برای استفاده در مقیاس وسیع طراحی شده است." }, diff --git a/locales/fr-FR/models.json b/locales/fr-FR/models.json index aafc502246a..987b406ed6a 100644 --- a/locales/fr-FR/models.json +++ b/locales/fr-FR/models.json @@ -1220,6 +1220,9 @@ "gemini-2.5-flash-image-preview": { "description": "Gemini 2.5 Flash Image Preview est le modèle multimodal natif le plus récent, le plus rapide et le plus performant de Google. 
Il vous permet de générer et d’éditer des images via des échanges conversationnels." }, + "gemini-2.5-flash-image-preview:image": { + "description": "Gemini 2.5 Flash Image Preview est le dernier modèle multimodal natif de Google, le plus rapide et le plus efficace, qui vous permet de générer et d’éditer des images par le biais de conversations." + }, "gemini-2.5-flash-lite": { "description": "Gemini 2.5 Flash-Lite est le modèle le plus petit et le plus rentable de Google, conçu pour une utilisation à grande échelle." }, diff --git a/locales/it-IT/models.json b/locales/it-IT/models.json index 8fe93114565..9679c6fb7e1 100644 --- a/locales/it-IT/models.json +++ b/locales/it-IT/models.json @@ -1220,6 +1220,9 @@ "gemini-2.5-flash-image-preview": { "description": "Gemini 2.5 Flash Image Preview è il modello multimodale nativo più recente, veloce ed efficiente di Google, che consente di generare e modificare immagini tramite conversazioni." }, + "gemini-2.5-flash-image-preview:image": { + "description": "Gemini 2.5 Flash Image Preview è il modello multimodale nativo più recente, veloce ed efficiente di Google, che consente di generare e modificare immagini tramite conversazioni." + }, "gemini-2.5-flash-lite": { "description": "Gemini 2.5 Flash-Lite è il modello più piccolo e conveniente di Google, progettato per un utilizzo su larga scala." 
}, diff --git a/locales/ja-JP/models.json b/locales/ja-JP/models.json index 52c5cbf9e0d..992a5709e20 100644 --- a/locales/ja-JP/models.json +++ b/locales/ja-JP/models.json @@ -1220,6 +1220,9 @@ "gemini-2.5-flash-image-preview": { "description": "Gemini 2.5 Flash Image Preview は Google の最新かつ最速で最も効率的なネイティブなマルチモーダルモデルであり、対話を通じて画像を生成・編集することを可能にします。" }, + "gemini-2.5-flash-image-preview:image": { + "description": "Gemini 2.5 Flash Image Preview は、Google の最新かつ最速で最も効率的なネイティブマルチモーダルモデルであり、対話を通じて画像の生成と編集を可能にします。" + }, "gemini-2.5-flash-lite": { "description": "Gemini 2.5 Flash-Lite は、Google の中で最も小さく、コストパフォーマンスに優れたモデルであり、大規模な利用を目的に設計されています。" }, diff --git a/locales/ko-KR/models.json b/locales/ko-KR/models.json index efdcc05bd96..119338bc5e0 100644 --- a/locales/ko-KR/models.json +++ b/locales/ko-KR/models.json @@ -1220,6 +1220,9 @@ "gemini-2.5-flash-image-preview": { "description": "Gemini 2.5 Flash Image Preview는 Google의 최신이자 가장 빠르고 효율적인 네이티브 멀티모달 모델로, 대화를 통해 이미지를 생성하고 편집할 수 있게 해줍니다." }, + "gemini-2.5-flash-image-preview:image": { + "description": "Gemini 2.5 Flash Image Preview는 Google의 최신이자 가장 빠르고 효율적인 네이티브 멀티모달 모델로, 대화를 통해 이미지를 생성하고 편집할 수 있습니다." + }, "gemini-2.5-flash-lite": { "description": "Gemini 2.5 Flash-Lite는 Google의 가장 작고 가성비가 뛰어난 모델로, 대규모 사용을 위해 설계되었습니다." }, diff --git a/locales/nl-NL/models.json b/locales/nl-NL/models.json index ca5dca8c154..30b67f101de 100644 --- a/locales/nl-NL/models.json +++ b/locales/nl-NL/models.json @@ -1220,6 +1220,9 @@ "gemini-2.5-flash-image-preview": { "description": "Gemini 2.5 Flash Image Preview is Google's nieuwste, snelste en meest efficiënte native multimodale model. Het stelt u in staat om via gesprekken afbeeldingen te genereren en te bewerken." }, + "gemini-2.5-flash-image-preview:image": { + "description": "Gemini 2.5 Flash Image Preview is Google's nieuwste, snelste en meest efficiënte native multimodale model, waarmee u afbeeldingen kunt genereren en bewerken via gesprekken." 
+ }, "gemini-2.5-flash-lite": { "description": "Gemini 2.5 Flash-Lite is het kleinste en meest kosteneffectieve model van Google, speciaal ontworpen voor grootschalig gebruik." }, diff --git a/locales/pl-PL/models.json b/locales/pl-PL/models.json index b43fca8963f..6cfc1f495f4 100644 --- a/locales/pl-PL/models.json +++ b/locales/pl-PL/models.json @@ -1220,6 +1220,9 @@ "gemini-2.5-flash-image-preview": { "description": "Gemini 2.5 Flash Image Preview to najnowszy, najszybszy i najbardziej wydajny natywny model multimodalny firmy Google. Umożliwia tworzenie i edycję obrazów podczas konwersacji." }, + "gemini-2.5-flash-image-preview:image": { + "description": "Gemini 2.5 Flash Image Preview to najnowszy, najszybszy i najbardziej wydajny natywny model multimodalny Google, który pozwala generować i edytować obrazy za pomocą rozmowy." + }, "gemini-2.5-flash-lite": { "description": "Gemini 2.5 Flash-Lite to najmniejszy i najbardziej opłacalny model Google, zaprojektowany z myślą o szerokim zastosowaniu." }, diff --git a/locales/pt-BR/models.json b/locales/pt-BR/models.json index f5e54defa6f..e3f65539248 100644 --- a/locales/pt-BR/models.json +++ b/locales/pt-BR/models.json @@ -1220,6 +1220,9 @@ "gemini-2.5-flash-image-preview": { "description": "Gemini 2.5 Flash Image Preview é o modelo multimodal nativo mais recente, mais rápido e mais eficiente do Google, que permite gerar e editar imagens por meio de conversas." }, + "gemini-2.5-flash-image-preview:image": { + "description": "Gemini 2.5 Flash Image Preview é o mais recente, rápido e eficiente modelo multimodal nativo do Google, que permite gerar e editar imagens por meio de conversas." + }, "gemini-2.5-flash-lite": { "description": "Gemini 2.5 Flash-Lite é o modelo mais compacto e com melhor custo-benefício do Google, projetado para uso em larga escala." 
}, diff --git a/locales/ru-RU/models.json b/locales/ru-RU/models.json index afec0b4360d..91d8168057b 100644 --- a/locales/ru-RU/models.json +++ b/locales/ru-RU/models.json @@ -1220,6 +1220,9 @@ "gemini-2.5-flash-image-preview": { "description": "Gemini 2.5 Flash Image Preview — новейшая, самая быстрая и наиболее эффективная нативная мультимодальная модель Google, которая позволяет генерировать и редактировать изображения в диалоге." }, + "gemini-2.5-flash-image-preview:image": { + "description": "Gemini 2.5 Flash Image Preview — это новейшая, самая быстрая и эффективная нативная мультимодальная модель от Google, которая позволяет создавать и редактировать изображения через диалог." + }, "gemini-2.5-flash-lite": { "description": "Gemini 2.5 Flash-Lite — это самая компактная и экономичная модель от Google, разработанная для масштабного использования." }, diff --git a/locales/tr-TR/models.json b/locales/tr-TR/models.json index 2acae3e21c4..8078e8f7e60 100644 --- a/locales/tr-TR/models.json +++ b/locales/tr-TR/models.json @@ -1220,6 +1220,9 @@ "gemini-2.5-flash-image-preview": { "description": "Gemini 2.5 Flash Image Preview, Google'ın en yeni, en hızlı ve en verimli yerel çok modlu modelidir; sohbet yoluyla görüntü oluşturmanıza ve düzenlemenize olanak tanır." }, + "gemini-2.5-flash-image-preview:image": { + "description": "Gemini 2.5 Flash Image Preview, Google'ın en yeni, en hızlı ve en verimli yerel çok modlu modelidir; sohbet yoluyla görüntü oluşturmanıza ve düzenlemenize olanak tanır." + }, "gemini-2.5-flash-lite": { "description": "Gemini 2.5 Flash-Lite, Google'ın en küçük ve en uygun maliyetli modeli olup, geniş çaplı kullanım için tasarlanmıştır." 
}, diff --git a/locales/vi-VN/models.json b/locales/vi-VN/models.json index a0618de821b..2522f1a70fb 100644 --- a/locales/vi-VN/models.json +++ b/locales/vi-VN/models.json @@ -1220,6 +1220,9 @@ "gemini-2.5-flash-image-preview": { "description": "Gemini 2.5 Flash Image Preview là mô hình đa phương thức nguyên bản mới nhất, nhanh nhất và hiệu quả nhất của Google; nó cho phép bạn tạo và chỉnh sửa hình ảnh thông qua hội thoại." }, + "gemini-2.5-flash-image-preview:image": { + "description": "Gemini 2.5 Flash Image Preview là mô hình đa phương thức gốc mới nhất, nhanh nhất và hiệu quả nhất của Google, cho phép bạn tạo và chỉnh sửa hình ảnh thông qua đối thoại." + }, "gemini-2.5-flash-lite": { "description": "Gemini 2.5 Flash-Lite là mô hình nhỏ nhất và có hiệu suất chi phí tốt nhất của Google, được thiết kế dành cho việc sử dụng quy mô lớn." }, diff --git a/locales/zh-CN/models.json b/locales/zh-CN/models.json index 8c31abd9a69..2f624dc8b9f 100644 --- a/locales/zh-CN/models.json +++ b/locales/zh-CN/models.json @@ -1220,6 +1220,9 @@ "gemini-2.5-flash-image-preview": { "description": "Gemini 2.5 Flash Image Preview 是 Google 最新、最快、最高效的原生多模态模型,它允许您通过对话生成和编辑图像。" }, + "gemini-2.5-flash-image-preview:image": { + "description": "Gemini 2.5 Flash Image Preview 是 Google 最新、最快、最高效的原生多模态模型,它允许您通过对话生成和编辑图像。" + }, "gemini-2.5-flash-lite": { "description": "Gemini 2.5 Flash-Lite 是 Google 最小、性价比最高的模型,专为大规模使用而设计。" }, diff --git a/locales/zh-TW/models.json b/locales/zh-TW/models.json index f0bc0e22322..77a6df86c83 100644 --- a/locales/zh-TW/models.json +++ b/locales/zh-TW/models.json @@ -1220,6 +1220,9 @@ "gemini-2.5-flash-image-preview": { "description": "Gemini 2.5 Flash Image Preview 是 Google 最新、速度最快且效率最高的原生多模態模型,允許您透過對話生成與編輯圖像。" }, + "gemini-2.5-flash-image-preview:image": { + "description": "Gemini 2.5 Flash Image Preview 是 Google 最新、最快、最高效的原生多模態模型,它允許您透過對話生成和編輯圖像。" + }, "gemini-2.5-flash-lite": { "description": "Gemini 2.5 Flash-Lite 是 Google 最小、性價比最高的模型,專為大規模使用而設計。" }, diff --git 
a/packages/model-runtime/src/utils/streams/google-ai.test.ts b/packages/model-runtime/src/utils/streams/google-ai.test.ts index ffad2bac3ab..e845a0932de 100644 --- a/packages/model-runtime/src/utils/streams/google-ai.test.ts +++ b/packages/model-runtime/src/utils/streams/google-ai.test.ts @@ -181,7 +181,7 @@ describe('GoogleGenerativeAIStream', () => { // usage 'id: chat_1\n', 'event: usage\n', - `data: {"inputImageTokens":258,"inputTextTokens":8,"outputTextTokens":0,"totalInputTokens":266,"totalOutputTokens":0,"totalTokens":266}\n\n`, + `data: {"inputImageTokens":258,"inputTextTokens":8,"outputImageTokens":0,"outputTextTokens":0,"totalInputTokens":266,"totalOutputTokens":0,"totalTokens":266}\n\n`, ]); }); @@ -227,7 +227,7 @@ describe('GoogleGenerativeAIStream', () => { // usage 'id: chat_1\n', 'event: usage\n', - `data: {"inputCachedTokens":14286,"inputTextTokens":15725,"outputTextTokens":1053,"totalInputTokens":15725,"totalOutputTokens":1053,"totalTokens":16778}\n\n`, + `data: {"inputCachedTokens":14286,"inputTextTokens":15725,"outputImageTokens":0,"outputTextTokens":1053,"totalInputTokens":15725,"totalOutputTokens":1053,"totalTokens":16778}\n\n`, ]); }); @@ -316,7 +316,7 @@ describe('GoogleGenerativeAIStream', () => { // usage 'id: chat_1', 'event: usage', - `data: {"inputTextTokens":19,"outputTextTokens":11,"totalInputTokens":19,"totalOutputTokens":11,"totalTokens":30}\n`, + `data: {"inputTextTokens":19,"outputImageTokens":0,"outputTextTokens":11,"totalInputTokens":19,"totalOutputTokens":11,"totalTokens":30}\n`, ].map((i) => i + '\n'), ); }); @@ -409,7 +409,7 @@ describe('GoogleGenerativeAIStream', () => { // usage 'id: chat_1', 'event: usage', - `data: {"inputTextTokens":19,"outputReasoningTokens":100,"outputTextTokens":11,"totalInputTokens":19,"totalOutputTokens":111,"totalTokens":131}\n`, + `data: {"inputTextTokens":19,"outputImageTokens":0,"outputReasoningTokens":100,"outputTextTokens":11,"totalInputTokens":19,"totalOutputTokens":111,"totalTokens":131}\n`, 
].map((i) => i + '\n'), ); }); @@ -542,7 +542,7 @@ describe('GoogleGenerativeAIStream', () => { // usage 'id: chat_1', 'event: usage', - `data: {"inputTextTokens":38,"outputReasoningTokens":304,"outputTextTokens":19,"totalInputTokens":38,"totalOutputTokens":323,"totalTokens":361}\n`, + `data: {"inputTextTokens":38,"outputImageTokens":0,"outputReasoningTokens":304,"outputTextTokens":19,"totalInputTokens":38,"totalOutputTokens":323,"totalTokens":361}\n`, ].map((i) => i + '\n'), ); }); @@ -662,7 +662,7 @@ describe('GoogleGenerativeAIStream', () => { // usage 'id: chat_1', 'event: usage', - `data: {"inputTextTokens":19,"outputReasoningTokens":100,"outputTextTokens":11,"totalInputTokens":19,"totalOutputTokens":111,"totalTokens":131}\n`, + `data: {"inputTextTokens":19,"outputImageTokens":0,"outputReasoningTokens":100,"outputTextTokens":11,"totalInputTokens":19,"totalOutputTokens":111,"totalTokens":131}\n`, ].map((i) => i + '\n'), ); }); @@ -811,7 +811,7 @@ describe('GoogleGenerativeAIStream', () => { // usage 'id: chat_1', 'event: usage', - `data: {"inputTextTokens":9,"outputTextTokens":122,"totalInputTokens":9,"totalOutputTokens":122,"totalTokens":131}\n`, + `data: {"inputTextTokens":9,"outputImageTokens":0,"outputTextTokens":122,"totalInputTokens":9,"totalOutputTokens":122,"totalTokens":131}\n`, ].map((i) => i + '\n'), ); }); diff --git a/packages/model-runtime/src/utils/streams/google-ai.ts b/packages/model-runtime/src/utils/streams/google-ai.ts index cc5e7a910a2..3e431c2c16b 100644 --- a/packages/model-runtime/src/utils/streams/google-ai.ts +++ b/packages/model-runtime/src/utils/streams/google-ai.ts @@ -57,8 +57,20 @@ const transformGoogleGenerativeAIStream = ( if (candidate?.finishReason && usage) { // totalTokenCount = promptTokenCount + candidatesTokenCount + thoughtsTokenCount const reasoningTokens = usage.thoughtsTokenCount; - const outputTextTokens = usage.candidatesTokenCount ?? 0; - const totalOutputTokens = outputTextTokens + (reasoningTokens ?? 
0); + + const candidatesDetails = usage.candidatesTokensDetails; + const candidatesTotal = + usage.candidatesTokenCount ?? + candidatesDetails?.reduce((s: number, i: any) => s + (i?.tokenCount ?? 0), 0) ?? + 0; + + const outputImageTokens = + candidatesDetails?.find((i: any) => i.modality === 'IMAGE')?.tokenCount ?? 0; + const outputTextTokens = + candidatesDetails?.find((i: any) => i.modality === 'TEXT')?.tokenCount ?? + Math.max(0, candidatesTotal - outputImageTokens); + + const totalOutputTokens = candidatesTotal + (reasoningTokens ?? 0); usageChunks.push( { data: candidate.finishReason, id: context?.id, type: 'stop' }, @@ -69,6 +81,7 @@ const transformGoogleGenerativeAIStream = ( ?.tokenCount, inputTextTokens: usage.promptTokensDetails?.find((i) => i.modality === 'TEXT') ?.tokenCount, + outputImageTokens, outputReasoningTokens: reasoningTokens, outputTextTokens, totalInputTokens: usage.promptTokenCount, diff --git a/packages/model-runtime/src/utils/streams/openai/openai.test.ts b/packages/model-runtime/src/utils/streams/openai/openai.test.ts index 1a677692649..d17859c13d8 100644 --- a/packages/model-runtime/src/utils/streams/openai/openai.test.ts +++ b/packages/model-runtime/src/utils/streams/openai/openai.test.ts @@ -2271,4 +2271,45 @@ describe('OpenAIStream', () => { ); }); }); + + it('should handle base64_image in delta.images (image_url shape)', async () => { + const base64 = + 'data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChwGA60e6kgAAAABJRU5ErkJggg=='; + + const mockOpenAIStream = new ReadableStream({ + start(controller) { + controller.enqueue({ + choices: [ + { + delta: { + images: [ + { + type: 'image_url', + image_url: { url: base64 }, + index: 0, + }, + ], + }, + index: 0, + }, + ], + id: '6', + }); + + controller.close(); + }, + }); + + const protocolStream = OpenAIStream(mockOpenAIStream); + + const decoder = new TextDecoder(); + const chunks = []; + + // @ts-ignore + for await (const chunk of 
protocolStream) { + chunks.push(decoder.decode(chunk, { stream: true })); + } + + expect(chunks).toEqual(['id: 6\n', 'event: base64_image\n', `data: "${base64}"\n\n`]); + }); }); diff --git a/packages/model-runtime/src/utils/streams/openai/openai.ts b/packages/model-runtime/src/utils/streams/openai/openai.ts index e3925eca148..875e58df31b 100644 --- a/packages/model-runtime/src/utils/streams/openai/openai.ts +++ b/packages/model-runtime/src/utils/streams/openai/openai.ts @@ -96,6 +96,36 @@ const transformOpenAIStream = ( } } + // Handle image preview chunks (e.g. Gemini 2.5 flash image preview) + // Example shape: + // choices[0].delta.images = [{ type: 'image_url', image_url: { url: 'data:image/png;base64,...' }, index: 0 }] + if ( + (item as any).delta && + Array.isArray((item as any).delta.images) && + (item as any).delta.images.length > 0 + ) { + const images = (item as any).delta.images as any[]; + + return images + .map((img) => { + // support multiple possible shapes for the url + const url = + img?.image_url?.url || + img?.image_url?.image_url?.url || + img?.url || + (typeof img === 'string' ? 
img : undefined); + + if (!url) return null; + + return { + data: url, + id: chunk.id, + type: 'base64_image', + } as StreamProtocolChunk; + }) + .filter(Boolean) as StreamProtocolChunk[]; + } + // 给定结束原因 if (item.finish_reason) { // one-api 的流式接口,会出现既有 finish_reason ,也有 content 的情况 @@ -192,11 +222,11 @@ const transformOpenAIStream = ( if ('content' in item.delta && Array.isArray(item.delta.content)) { return item.delta.content .filter((block: any) => block.type === 'thinking' && Array.isArray(block.thinking)) - .map((block: any) => + .map((block: any) => block.thinking .filter((thinkItem: any) => thinkItem.type === 'text' && thinkItem.text) .map((thinkItem: any) => thinkItem.text) - .join('') + .join(''), ) .join(''); } @@ -233,6 +263,12 @@ const transformOpenAIStream = ( streamContext.thinkingInContent = false; } + // 如果 content 是空字符串但 chunk 带有 usage,则优先返回 usage(例如 Gemini image-preview 最终会在单独的 chunk 中返回 usage) + if (content === '' && chunk.usage) { + const usage = chunk.usage; + return { data: convertUsage(usage, provider), id: chunk.id, type: 'usage' }; + } + // 判断是否有 citations 内容,更新 returnedCitation 状态 if (!streamContext?.returnedCitation) { const citations = diff --git a/packages/model-runtime/src/utils/streams/protocol.test.ts b/packages/model-runtime/src/utils/streams/protocol.test.ts index 1f8d56b9309..517a6babfa0 100644 --- a/packages/model-runtime/src/utils/streams/protocol.test.ts +++ b/packages/model-runtime/src/utils/streams/protocol.test.ts @@ -200,4 +200,36 @@ describe('createTokenSpeedCalculator', async () => { const results = await processChunk(transformer, chunks); expect(results).toHaveLength(chunks.length); }); + + it('should calculate token speed considering outputImageTokens when totalOutputTokens is missing', async () => { + const chunks = [ + { data: '', id: 'chatcmpl-image-1', type: 'text' }, + { data: 'hi', id: 'chatcmpl-image-1', type: 'text' }, + { data: 'stop', id: 'chatcmpl-image-1', type: 'stop' }, + { + data: { + inputTextTokens: 9, 
+ outputTextTokens: 1, + outputImageTokens: 4, + totalInputTokens: 9, + // totalOutputTokens intentionally omitted to force summation path + totalTokens: 13, + }, + id: 'chatcmpl-image-1', + type: 'usage', + }, + ]; + + const transformer = createTokenSpeedCalculator((v) => v, { inputStartAt }); + const results = await processChunk(transformer, chunks); + + // should push an extra speed chunk + expect(results).toHaveLength(chunks.length + 1); + const speedChunk = results.slice(-1)[0]; + expect(speedChunk.id).toBe('output_speed'); + expect(speedChunk.type).toBe('speed'); + // tps and ttft should be numeric (avoid flakiness if interval is 0ms) + expect(speedChunk.data.tps).not.toBeNaN(); + expect(speedChunk.data.ttft).not.toBeNaN(); + }); }); diff --git a/packages/model-runtime/src/utils/streams/protocol.ts b/packages/model-runtime/src/utils/streams/protocol.ts index 762173c9709..c477393a3ee 100644 --- a/packages/model-runtime/src/utils/streams/protocol.ts +++ b/packages/model-runtime/src/utils/streams/protocol.ts @@ -364,10 +364,14 @@ export const createTokenSpeedCalculator = ( } // if the chunk is the stop chunk, set as output finish if (inputStartAt && outputStartAt && chunk.type === 'usage') { - const totalOutputTokens = chunk.data?.totalOutputTokens || chunk.data?.outputTextTokens; - const reasoningTokens = chunk.data?.outputReasoningTokens || 0; + const totalOutputTokens = + chunk.data?.totalOutputTokens ?? + (chunk.data?.outputTextTokens ?? 0) + (chunk.data?.outputImageTokens ?? 0); + const reasoningTokens = chunk.data?.outputReasoningTokens ?? 0; const outputTokens = - (outputThinking ?? false) ? totalOutputTokens : totalOutputTokens - reasoningTokens; + (outputThinking ?? false) + ? 
totalOutputTokens + : Math.max(0, totalOutputTokens - reasoningTokens); result.push({ data: { tps: (outputTokens / (Date.now() - outputStartAt)) * 1000, diff --git a/packages/model-runtime/src/utils/usageConverter.test.ts b/packages/model-runtime/src/utils/usageConverter.test.ts index d1aa6116733..a11131ac36b 100644 --- a/packages/model-runtime/src/utils/usageConverter.test.ts +++ b/packages/model-runtime/src/utils/usageConverter.test.ts @@ -290,4 +290,62 @@ describe('convertUsage', () => { totalTokens: 6550, }); }); + + it('should handle output image tokens correctly', () => { + // Arrange + const usageWithImage = { + prompt_tokens: 100, + completion_tokens: 200, + completion_tokens_details: { + image_tokens: 60, + reasoning_tokens: 30, + }, + total_tokens: 300, + } as OpenAI.Completions.CompletionUsage; + + // Act + const result = convertUsage(usageWithImage); + + // Assert + expect(result).toEqual({ + inputTextTokens: 100, + totalInputTokens: 100, + totalOutputTokens: 200, + outputImageTokens: 60, + outputReasoningTokens: 30, + outputTextTokens: 110, // 200 - 60 - 30 + totalTokens: 300, + }); + }); + + it('should handle response output image tokens correctly for ResponseUsage', () => { + // Arrange + const responseUsage = { + input_tokens: 100, + input_tokens_details: { + cached_tokens: 0, + }, + output_tokens: 200, + output_tokens_details: { + image_tokens: 60, + reasoning_tokens: 30, + }, + total_tokens: 300, + } as OpenAI.Responses.ResponseUsage; + + // Act + const result = convertResponseUsage(responseUsage); + + // Assert + expect(result).toEqual({ + inputTextTokens: 100, + inputCacheMissTokens: 100, // 100 - 0 + totalInputTokens: 100, + totalOutputTokens: 200, + outputImageTokens: 60, + outputReasoningTokens: 30, + outputTextTokens: 170, // 200 - 30 + totalTokens: 300, + }); + }); }); diff --git a/packages/model-runtime/src/utils/usageConverter.ts b/packages/model-runtime/src/utils/usageConverter.ts index b49b4f1883e..407dc1644bb 100644 --- 
a/packages/model-runtime/src/utils/usageConverter.ts +++ b/packages/model-runtime/src/utils/usageConverter.ts @@ -20,12 +20,13 @@ export const convertUsage = ( const totalOutputTokens = usage.completion_tokens; const outputReasoning = usage.completion_tokens_details?.reasoning_tokens || 0; const outputAudioTokens = usage.completion_tokens_details?.audio_tokens || 0; + const outputImageTokens = (usage.completion_tokens_details as any)?.image_tokens || 0; // XAI 的 completion_tokens 不包含 reasoning_tokens,需要特殊处理 const outputTextTokens = provider === 'xai' ? totalOutputTokens - outputAudioTokens - : totalOutputTokens - outputReasoning - outputAudioTokens; + : totalOutputTokens - outputReasoning - outputAudioTokens - outputImageTokens; const totalTokens = inputCitationTokens + usage.total_tokens; @@ -37,6 +38,7 @@ export const convertUsage = ( inputCitationTokens: inputCitationTokens, inputTextTokens: inputTextTokens, outputAudioTokens: outputAudioTokens, + outputImageTokens: outputImageTokens, outputReasoningTokens: outputReasoning, outputTextTokens: outputTextTokens, rejectedPredictionTokens: usage.completion_tokens_details?.rejected_prediction_tokens, @@ -75,6 +77,7 @@ export const convertResponseUsage = (usage: OpenAI.Responses.ResponseUsage): Mod // For ResponseUsage, outputTextTokens is totalOutputTokens minus reasoning, as no audio output tokens are specified. const outputTextTokens = totalOutputTokens - outputReasoningTokens; + const outputImageTokens = (usage.output_tokens_details as any)?.image_tokens || 0; // 3. 
Construct the comprehensive data object (matching ModelTokensUsage structure) const data = { @@ -87,6 +90,7 @@ export const convertResponseUsage = (usage: OpenAI.Responses.ResponseUsage): Mod inputCitationTokens: undefined, // Not in ResponseUsage inputTextTokens: inputTextTokens, outputAudioTokens: undefined, // Not in ResponseUsage + outputImageTokens: outputImageTokens, outputReasoningTokens: outputReasoningTokens, outputTextTokens: outputTextTokens, rejectedPredictionTokens: undefined, // Not in ResponseUsage diff --git a/src/config/aiModels/google.ts b/src/config/aiModels/google.ts index 36f7a4676d1..8f9e373556f 100644 --- a/src/config/aiModels/google.ts +++ b/src/config/aiModels/google.ts @@ -195,21 +195,21 @@ const googleChatModels: AIChatModelCard[] = [ imageOutput: true, vision: true, }, - contextWindowTokens: 32_768 + 32_768, + contextWindowTokens: 32_768 + 8192, description: 'Gemini 2.5 Flash Image Preview 是 Google 最新、最快、最高效的原生多模态模型,它允许您通过对话生成和编辑图像。', displayName: 'Gemini 2.5 Flash Image Preview', enabled: true, id: 'gemini-2.5-flash-image-preview', - maxOutput: 32_768, + maxOutput: 8192, pricing: { units: [ { name: 'textInput', rate: 0.3, strategy: 'fixed', unit: 'millionTokens' }, { name: 'textOutput', rate: 2.5, strategy: 'fixed', unit: 'millionTokens' }, - { name: 'imageOutput', rate: 3, strategy: 'fixed', unit: 'millionTokens' }, + { name: 'imageOutput', rate: 30, strategy: 'fixed', unit: 'millionTokens' }, ], }, - releasedAt: '2025-08-27', + releasedAt: '2025-08-26', type: 'chat', }, { diff --git a/src/config/aiModels/openrouter.ts b/src/config/aiModels/openrouter.ts index dafe3d1f0bd..cc05ebf24c5 100644 --- a/src/config/aiModels/openrouter.ts +++ b/src/config/aiModels/openrouter.ts @@ -11,6 +11,39 @@ const openrouterChatModels: AIChatModelCard[] = [ id: 'openrouter/auto', type: 'chat', }, + { + abilities: { + imageOutput: true, + vision: true, + }, + contextWindowTokens: 32_768 + 8192, + description: 'Gemini 2.5 Flash 实验模型,支持图像生成', + displayName: 
'Gemini 2.5 Flash Image Preview', + id: 'google/gemini-2.5-flash-image-preview', + maxOutput: 8192, + pricing: { + units: [ + { name: 'imageOutput', rate: 30, strategy: 'fixed', unit: 'millionTokens' }, + { name: 'textInput', rate: 0.3, strategy: 'fixed', unit: 'millionTokens' }, + { name: 'textOutput', rate: 2.5, strategy: 'fixed', unit: 'millionTokens' }, + ], + }, + releasedAt: '2025-08-26', + type: 'chat', + }, + { + abilities: { + imageOutput: true, + vision: true, + }, + contextWindowTokens: 32_768 + 8192, + description: 'Gemini 2.5 Flash 实验模型,支持图像生成', + displayName: 'Gemini 2.5 Flash Image Preview (free)', + id: 'google/gemini-2.5-flash-image-preview:free', + maxOutput: 8192, + releasedAt: '2025-08-26', + type: 'chat', + }, { abilities: { reasoning: true, diff --git a/src/config/aiModels/vertexai.ts b/src/config/aiModels/vertexai.ts index 64be7cce09f..aa4db0e3ffa 100644 --- a/src/config/aiModels/vertexai.ts +++ b/src/config/aiModels/vertexai.ts @@ -126,21 +126,21 @@ const vertexaiChatModels: AIChatModelCard[] = [ imageOutput: true, vision: true, }, - contextWindowTokens: 32_768 + 32_768, + contextWindowTokens: 32_768 + 8192, description: 'Gemini 2.5 Flash Image Preview 是 Google 最新、最快、最高效的原生多模态模型,它允许您通过对话生成和编辑图像。', displayName: 'Gemini 2.5 Flash Image Preview', enabled: true, id: 'gemini-2.5-flash-image-preview', - maxOutput: 32_768, + maxOutput: 8192, pricing: { units: [ { name: 'textInput', rate: 0.3, strategy: 'fixed', unit: 'millionTokens' }, { name: 'textOutput', rate: 2.5, strategy: 'fixed', unit: 'millionTokens' }, - { name: 'imageOutput', rate: 3, strategy: 'fixed', unit: 'millionTokens' }, + { name: 'imageOutput', rate: 30, strategy: 'fixed', unit: 'millionTokens' }, ], }, - releasedAt: '2025-08-27', + releasedAt: '2025-08-26', type: 'chat', }, { diff --git a/src/features/Conversation/Extras/Usage/UsageDetail/index.tsx b/src/features/Conversation/Extras/Usage/UsageDetail/index.tsx index 22fab7c196b..47afd4c3eb2 100644 --- 
a/src/features/Conversation/Extras/Usage/UsageDetail/index.tsx +++ b/src/features/Conversation/Extras/Usage/UsageDetail/index.tsx @@ -61,6 +61,12 @@ const TokenDetail = memo(({ meta, model, provider }) => { ? detailTokens.outputReasoning.credit : detailTokens.outputReasoning.token, }, + !!detailTokens.outputImage && { + color: theme.purple, + id: 'outputImage', + title: t('messages.tokenDetails.outputImage'), + value: isShowCredit ? detailTokens.outputImage.credit : detailTokens.outputImage.token, + }, !!detailTokens.outputAudio && { color: theme.cyan9, id: 'outputAudio', diff --git a/src/features/Conversation/Extras/Usage/UsageDetail/tokens.test.ts b/src/features/Conversation/Extras/Usage/UsageDetail/tokens.test.ts index ed84a964474..f686cb3b935 100644 --- a/src/features/Conversation/Extras/Usage/UsageDetail/tokens.test.ts +++ b/src/features/Conversation/Extras/Usage/UsageDetail/tokens.test.ts @@ -143,6 +143,44 @@ describe('getDetailsToken', () => { }); }); + it('should handle outputImageTokens correctly', () => { + const usage = { + inputTextTokens: 100, + outputImageTokens: 60, + outputReasoningTokens: 30, + totalOutputTokens: 200, + totalTokens: 300, + } as ModelTokensUsage; + + const result = getDetailsToken(usage, mockModelCard); + + expect(result.outputImage).toEqual({ + credit: 1, // 60 * 0.02 = 1.2 -> 1 + id: 'outputImage', + token: 60, + }); + + expect(result.outputReasoning).toEqual({ + credit: 1, // 30 * 0.02 = 0.6 -> 1 + token: 30, + }); + + expect(result.outputText).toEqual({ + credit: 2, // (200 - 30 - 60) * 0.02 = 2.2 -> 2 + token: 110, + }); + + expect(result.totalOutput).toEqual({ + credit: 4, // 200 * 0.02 = 4 + token: 200, + }); + + expect(result.totalTokens).toEqual({ + credit: 4, // total credit equals totalOutputCredit here + token: 300, + }); + }); + it('should handle inputCitationTokens correctly', () => { const usage: ModelTokensUsage = { inputCitationTokens: 75, diff --git a/src/features/Conversation/Extras/Usage/UsageDetail/tokens.ts 
b/src/features/Conversation/Extras/Usage/UsageDetail/tokens.ts index f6748b137d6..ef368f9d26a 100644 --- a/src/features/Conversation/Extras/Usage/UsageDetail/tokens.ts +++ b/src/features/Conversation/Extras/Usage/UsageDetail/tokens.ts @@ -21,9 +21,14 @@ export const getDetailsToken = ( const outputReasoningTokens = usage.outputReasoningTokens || (usage as any).reasoningTokens || 0; + const outputImageTokens = usage.outputImageTokens || (usage as any).imageTokens || 0; + const outputTextTokens = usage.outputTextTokens ? usage.outputTextTokens - : totalOutputTokens - outputReasoningTokens - (usage.outputAudioTokens || 0); + : totalOutputTokens - + outputReasoningTokens - + (usage.outputAudioTokens || 0) - + outputImageTokens; const inputWriteCacheTokens = usage.inputWriteCacheTokens || 0; const inputCacheTokens = usage.inputCachedTokens || (usage as any).cachedTokens || 0; @@ -93,6 +98,13 @@ export const getDetailsToken = ( token: usage.outputAudioTokens, } : undefined, + outputImage: !!outputImageTokens + ? { + credit: calcCredit(outputImageTokens, formatPrice.output), + id: 'outputImage', + token: outputImageTokens, + } + : undefined, outputReasoning: !!outputReasoningTokens ? { credit: calcCredit(outputReasoningTokens, formatPrice.output), diff --git a/src/locales/default/chat.ts b/src/locales/default/chat.ts index 6624fb2170d..557c5ec6a9f 100644 --- a/src/locales/default/chat.ts +++ b/src/locales/default/chat.ts @@ -128,6 +128,7 @@ export default { inputWriteCached: '输入缓存写入', output: '输出', outputAudio: '音频输出', + outputImage: '图像输出', outputText: '文本输出', outputTitle: '输出明细', reasoning: '深度思考',