diff --git a/modules/chat.py b/modules/chat.py index 42c0d46d5a..459f099cbd 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -7,10 +7,12 @@ import re import shutil import time +import mimetypes from datetime import datetime from functools import partial from pathlib import Path +import filetype import gradio as gr import yaml from jinja2.ext import loopcontrols @@ -25,7 +27,7 @@ convert_to_markdown, make_thumbnail ) -from modules.image_utils import open_image_safely +from modules.image_utils import is_mime_type_vision_supported, open_image_safely from modules.logging_colors import logger from modules.text_generation import ( generate_reply, @@ -239,7 +241,7 @@ def generate_chat_prompt(user_input, state, **kwargs): image_refs = "" for attachment in metadata[user_key]["attachments"]: - if attachment.get("type") == "image": + if is_mime_type_vision_supported(attachment.get("type")): # Add image reference for multimodal models image_refs += "<__media__>" elif state.get('include_past_attachments', True): @@ -280,7 +282,7 @@ def generate_chat_prompt(user_input, state, **kwargs): image_refs = "" for attachment in metadata[user_key]["attachments"]: - if attachment.get("type") == "image": + if is_mime_type_vision_supported(attachment.get("type")): image_refs += "<__media__>" else: filename = attachment.get("name", "file") @@ -590,51 +592,48 @@ def add_message_attachment(history, row_idx, file_path, is_user=True): # Get file info using pathlib path = Path(file_path) filename = path.name - file_extension = path.suffix.lower() + + # Get MIME type from path + mime_type: str | None + mime_type, _ = mimetypes.guess_file_type(path) + + # Get MIME type from file + if mime_type is None: + mime_type = filetype.guess_mime(path) try: - # Handle image files - if file_extension in ['.jpg', '.jpeg', '.png', '.webp', '.bmp', '.gif']: + if is_mime_type_vision_supported(mime_type): + # Handle image files # Convert image to base64 with open(path, 'rb') as f: image_data = 
base64.b64encode(f.read()).decode('utf-8') - # Determine MIME type from extension - mime_type_map = { - '.jpg': 'image/jpeg', - '.jpeg': 'image/jpeg', - '.png': 'image/png', - '.webp': 'image/webp', - '.bmp': 'image/bmp', - '.gif': 'image/gif' - } - mime_type = mime_type_map.get(file_extension, 'image/jpeg') - # Format as data URL data_url = f"data:{mime_type};base64,{image_data}" # Generate unique image ID - image_id = len([att for att in history['metadata'][key]["attachments"] if att.get("type") == "image"]) + 1 + image_id = len([att for att in history['metadata'][key]["attachments"] if is_mime_type_vision_supported(att.get("type"))]) + 1 attachment = { "name": filename, - "type": "image", + "type": mime_type, "image_data": data_url, "image_id": image_id, } - elif file_extension == '.pdf': + elif mime_type == 'application/pdf': # Process PDF file content = extract_pdf_text(path) attachment = { "name": filename, - "type": "application/pdf", + "type": mime_type, "content": content, } - elif file_extension == '.docx': + elif mime_type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': + # Process .docx file content = extract_docx_text(path) attachment = { "name": filename, - "type": "application/docx", + "type": mime_type, "content": content, } else: @@ -644,7 +643,7 @@ def add_message_attachment(history, row_idx, file_path, is_user=True): attachment = { "name": filename, - "type": "text/plain", + "type": mime_type or "text/plain", "content": content, } @@ -858,7 +857,7 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess user_key = f"user_{i}" if user_key in output['metadata'] and "attachments" in output['metadata'][user_key]: for attachment in output['metadata'][user_key]["attachments"]: - if attachment.get("type") == "image": + if is_mime_type_vision_supported(attachment.get("type")): all_image_attachments.append(attachment) # Add all collected image attachments to state for the generation diff --git 
a/modules/html_generator.py b/modules/html_generator.py index 312b66ad07..8125147582 100644 --- a/modules/html_generator.py +++ b/modules/html_generator.py @@ -10,6 +10,7 @@ from PIL import Image, ImageOps from modules import shared +from modules.image_utils import is_mime_type_vision_supported from modules.sane_markdown_lists import SaneListExtension from modules.utils import get_available_chat_styles @@ -462,12 +463,13 @@ def format_message_attachments(history, role, index): attachments_html = '
' for attachment in attachments: name = html.escape(attachment["name"]) + mime_type = attachment.get("type") - if attachment.get("type") == "image": + if is_mime_type_vision_supported(mime_type): image_data = attachment.get("image_data", "") attachments_html += ( f'
' - f'{name}' + f'{name}' f'
{name}
' f'
# MIME types of attachments that are handled as images (base64 `image_data`
# payloads) rather than as extracted text. Built once at import time instead
# of on every call.
_VISION_SUPPORTED_MIME_TYPES = frozenset({
    'image/jpeg',
    'image/png',
    'image/webp',
    'image/bmp',
    'image/gif',
    'image/tiff',
    'image/avif',
    # Uncommon pillow readable mime types
    'image/x-dds',
    'image/dds',
    'image/x-eps',
    'image/x-icns',
    'image/x-icon',
    'image/vnd.microsoft.icon',
    'image/jp2',
    'image/x-jp2-codestream',
    'image/jpx',
    'image/vnd.zbrush.pcx',
    'image/x-portable-pixmap',
    'image/qoi',
    'image/x-qoi',
    'image/x-sgi',
    'image/x-tga',
    'image/x-xbitmap',
    'image/x-win-bitmap',
    'image/fits',
    'image/vnd.fpx',
    'image/x-fpx',
    'image/x-photo-cd',
    'image/vnd.adobe.photoshop',
    'image/x-sun-raster',
    'image/emf',
    'image/wmf',
    'image/x-xpixmap',
    # Legacy marker: image attachments stored before real MIME types were
    # recorded carry the literal type "image" and must keep working.
    'image',
})


def is_mime_type_vision_supported(mime_type: 'str | None') -> bool:
    """Return whether an attachment type denotes a vision-capable image.

    Args:
        mime_type: The attachment's ``type`` value. Usually a MIME type such
            as ``'image/png'``; may be ``None`` when the attachment has no
            type, or the legacy literal ``'image'``.

    Returns:
        ``True`` if the type is one of the supported image types, ``False``
        for anything else (including ``None`` and non-string values).
    """
    if not isinstance(mime_type, str):
        return False

    # MIME types are case-insensitive; normalise before the lookup.
    return mime_type.strip().lower() in _VISION_SUPPORTED_MIME_TYPES
b/requirements/full/requirements_amd.txt index 3475c161ac..1375acfd0a 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -5,6 +5,7 @@ datasets diffusers==0.36.* einops fastapi==0.112.4 +filetype==1.2.0 html2text==2025.4.15 huggingface-hub==0.36.0 jinja2==3.1.6 diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index 05077a32a6..67b4c5ce10 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -5,6 +5,7 @@ datasets diffusers==0.36.* einops fastapi==0.112.4 +filetype==1.2.0 html2text==2025.4.15 huggingface-hub==0.36.0 jinja2==3.1.6 diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index f591fa251e..cf86336ad1 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -5,6 +5,7 @@ datasets diffusers==0.36.* einops fastapi==0.112.4 +filetype==1.2.0 html2text==2025.4.15 huggingface-hub==0.36.0 jinja2==3.1.6 diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 6ff4753d4b..85dda581c2 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -5,6 +5,7 @@ datasets diffusers==0.36.* einops fastapi==0.112.4 +filetype==1.2.0 html2text==2025.4.15 huggingface-hub==0.36.0 jinja2==3.1.6 diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index bd0776daea..9375e4cf7d 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -5,6 +5,7 @@ datasets diffusers==0.36.* einops fastapi==0.112.4 +filetype==1.2.0 html2text==2025.4.15 huggingface-hub==0.36.0 jinja2==3.1.6 diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt 
index 39a06cdbe0..291ec8ca7d 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -5,6 +5,7 @@ datasets diffusers==0.36.* einops fastapi==0.112.4 +filetype==1.2.0 html2text==2025.4.15 huggingface-hub==0.36.0 jinja2==3.1.6 diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index 3ff17704be..e664fabff0 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -6,6 +6,7 @@ datasets diffusers==0.36.* einops fastapi==0.112.4 +filetype==1.2.0 flash-linear-attention==0.4.* html2text==2025.4.15 huggingface-hub==0.36.0 diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt index dc740dc86f..fe1fc16bdf 100644 --- a/requirements/full/requirements_nowheels.txt +++ b/requirements/full/requirements_nowheels.txt @@ -5,6 +5,7 @@ datasets diffusers==0.36.* einops fastapi==0.112.4 +filetype==1.2.0 html2text==2025.4.15 huggingface-hub==0.36.0 jinja2==3.1.6 diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index 2f6bf51142..f6b5d08c16 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -1,5 +1,6 @@ audioop-lts<1.0; python_version >= "3.13" fastapi==0.112.4 +filetype==1.2.0 html2text==2025.4.15 huggingface-hub==0.36.0 jinja2==3.1.6 diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt index 209610ceb9..c657f61e57 100644 --- a/requirements/portable/requirements_amd.txt +++ b/requirements/portable/requirements_amd.txt @@ -1,5 +1,6 @@ audioop-lts<1.0; python_version >= "3.13" fastapi==0.112.4 +filetype==1.2.0 html2text==2025.4.15 huggingface-hub==0.36.0 jinja2==3.1.6 diff --git a/requirements/portable/requirements_amd_noavx2.txt b/requirements/portable/requirements_amd_noavx2.txt index 29a1f72b04..abf9ef73f0 100644 --- 
a/requirements/portable/requirements_amd_noavx2.txt +++ b/requirements/portable/requirements_amd_noavx2.txt @@ -1,5 +1,6 @@ audioop-lts<1.0; python_version >= "3.13" fastapi==0.112.4 +filetype==1.2.0 html2text==2025.4.15 huggingface-hub==0.36.0 jinja2==3.1.6 diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index 8812ac39f5..cf3c653710 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ b/requirements/portable/requirements_apple_intel.txt @@ -1,5 +1,6 @@ audioop-lts<1.0; python_version >= "3.13" fastapi==0.112.4 +filetype==1.2.0 html2text==2025.4.15 huggingface-hub==0.36.0 jinja2==3.1.6 diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index c627cfd456..1d8d5a6936 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -1,5 +1,6 @@ audioop-lts<1.0; python_version >= "3.13" fastapi==0.112.4 +filetype==1.2.0 html2text==2025.4.15 huggingface-hub==0.36.0 jinja2==3.1.6 diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt index 5ca660adfc..3f6859971b 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -1,5 +1,6 @@ audioop-lts<1.0; python_version >= "3.13" fastapi==0.112.4 +filetype==1.2.0 html2text==2025.4.15 huggingface-hub==0.36.0 jinja2==3.1.6 diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt index 5bb282eb18..8c9848e2bd 100644 --- a/requirements/portable/requirements_cpu_only_noavx2.txt +++ b/requirements/portable/requirements_cpu_only_noavx2.txt @@ -1,5 +1,6 @@ audioop-lts<1.0; python_version >= "3.13" fastapi==0.112.4 +filetype==1.2.0 html2text==2025.4.15 huggingface-hub==0.36.0 jinja2==3.1.6 diff --git 
a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt index ecce4a2f7d..b07e2c885e 100644 --- a/requirements/portable/requirements_noavx2.txt +++ b/requirements/portable/requirements_noavx2.txt @@ -1,5 +1,6 @@ audioop-lts<1.0; python_version >= "3.13" fastapi==0.112.4 +filetype==1.2.0 html2text==2025.4.15 huggingface-hub==0.36.0 jinja2==3.1.6 diff --git a/requirements/portable/requirements_nowheels.txt b/requirements/portable/requirements_nowheels.txt index c9831985e5..6a05bef68f 100644 --- a/requirements/portable/requirements_nowheels.txt +++ b/requirements/portable/requirements_nowheels.txt @@ -1,5 +1,6 @@ audioop-lts<1.0; python_version >= "3.13" fastapi==0.112.4 +filetype==1.2.0 html2text==2025.4.15 huggingface-hub==0.36.0 jinja2==3.1.6 diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index 4b3adddc3d..21c57e3c46 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -1,5 +1,6 @@ audioop-lts<1.0; python_version >= "3.13" fastapi==0.112.4 +filetype==1.2.0 html2text==2025.4.15 huggingface-hub==0.36.0 jinja2==3.1.6 diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt index 8d6acdd78e..f800adbf16 100644 --- a/requirements/portable/requirements_vulkan_noavx2.txt +++ b/requirements/portable/requirements_vulkan_noavx2.txt @@ -1,5 +1,6 @@ audioop-lts<1.0; python_version >= "3.13" fastapi==0.112.4 +filetype==1.2.0 html2text==2025.4.15 huggingface-hub==0.36.0 jinja2==3.1.6