
Commit 9096012

support interleave text and image in messages (#4141)
* support interleave text and image in messages
* change qwen2-vl and qwen3-vl chat template
* update ut ci flow
* update ut ci flow
* update
* fix qwen vl series backward compatibility
* add ut for internvl series chat template
* simplify Response's __str__ and __repr__
* update
* use superclass's method
* fix
1 parent fe0bce0 commit 9096012

File tree

11 files changed: +498 additions, -214 deletions

.github/workflows/unit-test.yml

Lines changed: 3 additions & 22 deletions
@@ -35,43 +35,24 @@ jobs:
     runs-on: [self-hosted, linux-a100-s2]
     timeout-minutes: 4320 # 72hours
     container:
-      image: nvidia/cuda:11.8.0-devel-ubuntu22.04
+      image: openmmlab/lmdeploy:dev-cu12.8
       options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e CUDA_VISIBLE_DEVICES=2,3 --pull never"
       volumes:
         - /nvme/share_data/github-actions/pip-cache:/root/.cache/pip
         - /nvme/share_data/github-actions/hf_home:/root/.cache/huggingface
         - /nvme/share_data/github-actions/packages:/root/packages
         - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro
     steps:
-      - name: Setup systems
-        run: |
-          apt-get update -y && apt-get install -y software-properties-common wget git curl &&\
-          add-apt-repository ppa:deadsnakes/ppa -y && apt-get update -y && apt-get install -y --no-install-recommends \
-          ninja-build rapidjson-dev libgoogle-glog-dev gdb python3.10 python3.10-dev python3.10-venv \
-          && apt-get clean -y && rm -rf /var/lib/apt/lists/* && cd /opt && python3 -m venv py3
-          echo "PATH=/opt/py3/bin:$PATH" >> "$GITHUB_ENV"
       - name: Clone repository
-        uses: actions/checkout@v2
-      - name: Install pytorch
-        run: |
-          python3 -V
-          python3 -m pip cache dir
-          python3 -m pip install torch==2.4.0 torchvision==0.19.0 --index-url https://download.pytorch.org/whl/cu118
+        uses: actions/checkout@v5
       - name: Install lmdeploy
         run: |
-          python3 -m pip install packaging protobuf transformers_stream_generator matplotlib
-          # manually install flash attn
-          python3 -m pip install /root/packages/cu118/flash_attn-*.whl
-          python3 -m pip install -r requirements_cuda.txt -r requirements/test.txt
+          python3 -m pip install -r requirements/test.txt
           python3 -m pip install -e .
       - name: Check env
         run: |
           python3 -m pip list
           lmdeploy check_env
-      - name: Test lmdeploy csrc
-        run: |
-          #./build/bin/build/bin/unittest
-          echo "TODO"
       - name: Test lmdeploy python UT
         run: |
           coverage run --branch --source lmdeploy -m pytest -rsE tests

docker/Dockerfile_dev

Lines changed: 4 additions & 0 deletions
@@ -34,4 +34,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     uv pip install -r requirements_cuda.txt --extra-index-url https://download.pytorch.org/whl/cu128 && \
     uv pip install -e .
 
+# install flash_attn
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu12torch2.8cxx11abiTRUE-cp312-cp312-linux_x86_64.whl
+
 RUN uv cache clean

lmdeploy/messages.py

Lines changed: 28 additions & 10 deletions
@@ -473,17 +473,35 @@ class Response:
     index: int = 0
     routed_experts: Any = None
 
+    def __str__(self):
+        return f'text={self.text}\n{self._format_none_text_fields()}'
+
     def __repr__(self):
-        logits = 'logits=None' if self.logits is None else f'logits.shape={self.logits.shape}\nlogits={self.logits}'
-        hidden_state = (
-            'last_hidden_state=None' if self.last_hidden_state is None else
-            f'last_hidden_state.shape={self.last_hidden_state.shape}\nlast_hidden_state={self.last_hidden_state}')
-        routed_experts = 'routed_experts=None' if self.routed_experts is None else \
-            f'routed_experts.shape={self.routed_experts.shape}'
-
-        s = (f'text={self.text!r}\ngenerate_token_len={self.generate_token_len}\nfinish_reason="{self.finish_reason}"\n'
-             f'token_ids={self.token_ids}\nlog_probs={self.logprobs}\n{logits}\n{hidden_state}\n{routed_experts}')
-        return s
+        return f'text={self.text!r}\n{self._format_none_text_fields()}'
+
+    def _format_none_text_fields(self):
+        fields = []
+        fields.append(f'input_token_len={self.input_token_len}')
+        fields.append(f'generate_token_len={self.generate_token_len}')
+        fields.append(f'finish_reason="{self.finish_reason}"')
+        fields.append(f'token_ids={self.token_ids}')
+        fields.append(f'logprobs={self.logprobs}')
+
+        # Helper function to format tensor information
+        def _format_tensor(name: str, tensor: Optional[torch.Tensor]) -> List[str]:
+            if tensor is None:
+                return [f'{name}=None']
+            try:
+                return [f'{name}.shape={tensor.shape}', f'{name}={tensor}']
+            except: # noqa
+                # in case tensor is not torch.Tensor or has no shape
+                return [f'{name}={tensor}']
+
+        # Format tensor fields
+        fields.extend(_format_tensor('logits', self.logits))
+        fields.extend(_format_tensor('last_hidden_state', self.last_hidden_state))
+        fields.extend(_format_tensor('routed_experts', self.routed_experts))
+        return '\n'.join(fields)
 
 
 # modified from https://github.com/vllm-project/vllm/blob/main/vllm/v1/engine/__init__.py
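
The net effect of this change: str(response) prints the generated text verbatim, repr(response) quotes it, and both delegate the remaining fields to _format_none_text_fields(). A minimal, self-contained sketch of the same pattern follows; it uses a stand-in dataclass, not the actual lmdeploy.messages.Response.

    from dataclasses import dataclass, field
    from typing import Any, List, Optional


    @dataclass
    class ResponseSketch:
        """Stand-in with only the fields the formatting helpers touch."""
        text: str = ''
        input_token_len: int = 0
        generate_token_len: int = 0
        finish_reason: Optional[str] = None
        token_ids: List[int] = field(default_factory=list)
        logprobs: Any = None
        logits: Any = None
        last_hidden_state: Any = None
        routed_experts: Any = None

        def __str__(self):
            # raw text first, then the shared non-text fields
            return f'text={self.text}\n{self._format_none_text_fields()}'

        def __repr__(self):
            # identical layout, but the text is repr-quoted so newlines stay visible
            return f'text={self.text!r}\n{self._format_none_text_fields()}'

        def _format_none_text_fields(self):
            fields = [
                f'input_token_len={self.input_token_len}',
                f'generate_token_len={self.generate_token_len}',
                f'finish_reason="{self.finish_reason}"',
                f'token_ids={self.token_ids}',
                f'logprobs={self.logprobs}',
            ]
            for name in ('logits', 'last_hidden_state', 'routed_experts'):
                value = getattr(self, name)
                if value is None:
                    fields.append(f'{name}=None')
                elif hasattr(value, 'shape'):
                    fields.extend([f'{name}.shape={value.shape}', f'{name}={value}'])
                else:
                    fields.append(f'{name}={value}')
            return '\n'.join(fields)


    r = ResponseSketch(text='a cat\nsits', generate_token_len=3, finish_reason='stop', token_ids=[64, 9, 17])
    print(str(r))   # 'text=a cat', then 'sits' on its own line, then the other fields
    print(repr(r))  # "text='a cat\nsits'" on one line, then the other fields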

lmdeploy/vl/model/base.py

Lines changed: 21 additions & 0 deletions
@@ -181,6 +181,27 @@ def collect_images(messages):
                 }) for x in content if x['type'] == 'image'])
         return images
 
+    @staticmethod
+    def IMAGE_TOKEN_included(messages):
+        """Check whether the IMAGE_TOKEN is included in the messages.
+
+        Args:
+            messages (List[Dict]): a list of message
+        Returns:
+            bool: whether the IMAGE_TOKEN is included in the messages
+        """
+        for message in messages:
+            role, content = message['role'], message['content']
+            if role != 'user':
+                continue
+            if isinstance(content, str) and '<IMAGE_TOKEN>' in content:
+                return True
+            elif isinstance(content, List):
+                content = [x['text'] for x in content if x['type'] == 'text']
+                if any('<IMAGE_TOKEN>' in x for x in content):
+                    return True
+        return False
+
     def to_pytorch_with_input_ids(self, messages):
         """Pack the preprocessing results in a format compatible with what is
         required by pytorch engine when input_ids are provided directly.
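
To show what the new helper distinguishes, here are the two user-message shapes it has to tell apart. The payloads below are made up for illustration; only the import path comes from the file above.

    from lmdeploy.vl.model.base import VisonModel

    # legacy style: the caller already embeds <IMAGE_TOKEN> placeholders in a plain string
    legacy = [dict(role='user', content='<IMAGE_TOKEN>\nDescribe the image.')]

    # interleaved style added by this PR: text and image items mixed in one content list
    interleaved = [
        dict(role='user',
             content=[
                 dict(type='text', text='Compare this '),
                 dict(type='image_url', image_url=dict(url='https://example.com/a.jpg')),
                 dict(type='text', text=' with this '),
                 dict(type='image_url', image_url=dict(url='https://example.com/b.jpg')),
             ])
    ]

    assert VisonModel.IMAGE_TOKEN_included(legacy)          # True  -> backward-compatible path
    assert not VisonModel.IMAGE_TOKEN_included(interleaved)  # False -> interleave path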

lmdeploy/vl/model/internvl.py

Lines changed: 32 additions & 25 deletions
@@ -76,9 +76,9 @@ def __init__(self,
                  hf_config: AutoConfig = None,
                  backend: str = ''):
         super().__init__(model_path, with_llm, max_memory, hf_config, backend)
-        IMG_CONTEXT_TOKEN = '<IMG_CONTEXT>'
+        self.image_token = '<IMG_CONTEXT>'
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=False)
-        self.image_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
+        self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token)
 
     def build_preprocessor(self):
         self.config = self.hf_config
@@ -224,8 +224,8 @@ def forward(self, messages: List[Dict], max_batch_size: int = 1) -> List[Dict]:
         messages.append(dict(role='forward', content=outputs))
         return messages
 
-    @staticmethod
     def proc_messages(
+        self,
         messages,
         chat_template,
         sequence_start,
@@ -235,32 +235,39 @@ def proc_messages(
         """Apply chat template to get the prompt."""
         prompt_messages = []
         IMAGE_TOKEN = '<IMAGE_TOKEN>'
-        for message in messages:
-            if isinstance(message['content'], str):
-                prompt_messages.append(message)
-                continue
-            elif message['role'] in ['preprocess', 'forward']:
-                continue
-            n_images = len([1 for x in message['content'] if x['type'] == 'image'])
-            content = [x.get('text', '') for x in message['content'] if x['type'] == 'text']
-            if len(content) == 0:
-                content.append('')
-            prompt = content[0]
-            if IMAGE_TOKEN in prompt and f'<img>{IMAGE_TOKEN}' not in prompt:
-                prompt = prompt.replace(f'{IMAGE_TOKEN}', f'<img>{IMAGE_TOKEN}</img>')
-                prompt = prompt.replace('</img><img>', '')
-                prompt = prompt.replace('<img><img>', '<img>')
-                prompt = prompt.replace('</img></img>', '</img>')
-            elif IMAGE_TOKEN not in prompt:
-                prompt = f'<img>{IMAGE_TOKEN * n_images}</img>\n' + prompt
-            else:
-                pass
-            prompt_messages.append(dict(role='user', content=prompt))
+        messages = [x for x in messages if x['role'] not in ['preprocess', 'forward']]
+        if VisonModel.IMAGE_TOKEN_included(messages):
+            # backward compatibility
+            for message in messages:
+                role, content = message['role'], message['content']
+                if role != 'user' or isinstance(content, str):
+                    prompt_messages.append(message)
+                    continue
+                content = [x['text'] for x in content if x['type'] == 'text']
+                prompt = ''.join(content)
+                prompt = prompt.replace(f'{IMAGE_TOKEN}', f'<img>{self.image_token}</img>')
+                prompt_messages.append(dict(role='user', content=prompt))
+        else:
+            for message in messages:
+                role, content = message['role'], message['content']
+                if role != 'user' or isinstance(content, str):
+                    prompt_messages.append(message)
+                    continue
+                _content = []
+                for item in content:
+                    item_type = item['type']
+                    if item_type == 'text':
+                        _content.append(item['text'])
+                    elif item_type in ['image', 'image_url']:
+                        _content.append(f'<img>{self.image_token}</img>\n')
+                    else:
+                        raise ValueError(f'Unsupported message type: {item["type"]}')
+                prompt_messages.append(dict(role='user', content=''.join(_content)))
         prompt = chat_template.messages2prompt(prompt_messages,
                                                sequence_start,
                                                tools=tools,
                                                enable_thinking=enable_thinking)
-        return prompt, IMAGE_TOKEN
+        return prompt, self.image_token
 
     def to_pytorch(self,
                    messages,
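
In the new interleave branch, each image item is replaced in place by an <img><IMG_CONTEXT></img> block, so the image's position in the content list is preserved. A standalone sketch of just that decoration step follows; it does not call the real class, and '<IMG_CONTEXT>' is the value __init__ now stores in self.image_token.

    image_token = '<IMG_CONTEXT>'  # InternVL's image context token, per __init__ above

    content = [
        dict(type='text', text='First picture: '),
        dict(type='image', image='<placeholder>'),
        dict(type='text', text='Second picture: '),
        dict(type='image', image='<placeholder>'),
        dict(type='text', text='What changed between them?'),
    ]

    parts = []
    for item in content:
        if item['type'] == 'text':
            parts.append(item['text'])
        elif item['type'] in ['image', 'image_url']:
            # image markers land exactly where the image appears in the list
            parts.append(f'<img>{image_token}</img>\n')

    print(''.join(parts))
    # First picture: <img><IMG_CONTEXT></img>
    # Second picture: <img><IMG_CONTEXT></img>
    # What changed between them?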

lmdeploy/vl/model/internvl3_hf.py

Lines changed: 4 additions & 69 deletions
@@ -6,7 +6,7 @@
 from transformers.processing_utils import ImagesKwargs, ProcessingKwargs
 
 from lmdeploy.utils import get_logger
-from lmdeploy.vl.model.base import VISION_MODELS, VisonModel
+from lmdeploy.vl.model.internvl import VISION_MODELS, InternVLVisionModel
 from lmdeploy.vl.model.utils import disable_logging
 
 logger = get_logger('lmdeploy')
@@ -32,7 +32,7 @@ class InternVLProcessorKwargs(ProcessingKwargs, total=False):
 
 
 @VISION_MODELS.register_module()
-class InternVL3VisionModel(VisonModel):
+class InternVL3VisionModel(InternVLVisionModel):
     """Internvl3 vision model."""
 
     _arch = ['InternVLForConditionalGeneration', 'InternS1ForConditionalGeneration']
@@ -44,11 +44,12 @@ def __init__(self,
                  hf_config: AutoConfig = None,
                  backend: str = ''):
         super().__init__(model_path, with_llm, max_memory, hf_config, backend)
-        self.arch = hf_config.architectures[0]
+        self.arch = self.hf_config.architectures[0]
 
     def build_preprocessor(self):
         self.processor = AutoProcessor.from_pretrained(self.model_path, trust_remote_code=True)
         tokenizer = self.processor.tokenizer
+        self.image_token = self.processor.image_token
         self.image_token_id = tokenizer.context_image_token_id
         self.image_tokens_per_patch = self.processor.image_seq_length
         self.tokenizer_init_kwargs = tokenizer.init_kwargs
@@ -145,69 +146,3 @@ def forward(self, messages: List[Dict], max_batch_size: int = 1) -> List[Dict]:
             outputs.extend([x.reshape(-1, x.shape[-1]) for x in feats])
         messages.append(dict(role='forward', content=outputs))
         return messages
-
-    @staticmethod
-    def proc_messages(
-        messages,
-        chat_template,
-        sequence_start,
-        tools: Optional[List[object]] = None,
-        enable_thinking: Optional[bool] = None,
-    ):
-        """Apply chat template to get the prompt."""
-        prompt_messages = []
-        IMAGE_TOKEN = '<IMAGE_TOKEN>'
-        for message in messages:
-            if isinstance(message['content'], str):
-                prompt_messages.append(message)
-                continue
-            elif message['role'] in ['preprocess', 'forward']:
-                continue
-            n_images = len([1 for x in message['content'] if x['type'] == 'image'])
-            content = [x.get('text', '') for x in message['content'] if x['type'] == 'text']
-            prompt = content[0]
-            if IMAGE_TOKEN in prompt and f'<img>{IMAGE_TOKEN}' not in prompt:
-                prompt = prompt.replace(f'{IMAGE_TOKEN}', f'<img>{IMAGE_TOKEN}</img>')
-                prompt = prompt.replace('</img><img>', '')
-                prompt = prompt.replace('<img><img>', '<img>')
-                prompt = prompt.replace('</img></img>', '</img>')
-            elif IMAGE_TOKEN not in prompt:
-                prompt = f'<img>{IMAGE_TOKEN * n_images}</img>\n' + prompt
-            else:
-                pass
-            prompt_messages.append(dict(role='user', content=prompt))
-        prompt = chat_template.messages2prompt(prompt_messages,
-                                               sequence_start,
-                                               tools=tools,
-                                               enable_thinking=enable_thinking)
-        return prompt, IMAGE_TOKEN
-
-    def to_pytorch(self,
-                   messages,
-                   chat_template,
-                   tokenizer,
-                   sequence_start,
-                   tools: Optional[List[object]] = None,
-                   enable_thinking: Optional[bool] = None,
-                   **kwargs):
-        prompt, IMAGE_TOKEN = self.proc_messages(messages,
-                                                 chat_template,
-                                                 sequence_start,
-                                                 tools=tools,
-                                                 enable_thinking=enable_thinking)
-        return self.to_pytorch_aux(messages, prompt, IMAGE_TOKEN, tokenizer, sequence_start)
-
-    def to_turbomind(self,
-                     messages,
-                     chat_template,
-                     tokenizer,
-                     sequence_start,
-                     tools: Optional[List[object]] = None,
-                     enable_thinking: Optional[bool] = None,
-                     **kwargs):
-        prompt, IMAGE_TOKEN = self.proc_messages(messages,
-                                                 chat_template,
-                                                 sequence_start,
-                                                 tools=tools,
-                                                 enable_thinking=enable_thinking)
-        return self.to_turbomind_aux(messages, prompt, IMAGE_TOKEN, tokenizer, sequence_start)

lmdeploy/vl/model/qwen2.py

Lines changed: 31 additions & 24 deletions
@@ -32,8 +32,8 @@ def build_preprocessor(self):
         from transformers import AutoProcessor
         self.processor = AutoProcessor.from_pretrained(self.model_path)
         tokenizer = self.processor.tokenizer
-        image_token = self.processor.image_token
-        self.image_token_id = tokenizer.encode(image_token)[-1]
+        self.image_token = self.processor.image_token
+        self.image_token_id = tokenizer.encode(self.image_token)[-1]
 
     def preprocess(self, messages: List[Dict]) -> List[Dict]:
         """Refer to `super().preprocess()` for spec."""
@@ -124,33 +124,40 @@ def forward(self, messages: List[Dict], max_batch_size: int = 1) -> List[Dict]:
         messages.append(dict(role='forward', content=outputs))
         return messages
 
-    @staticmethod
-    def proc_messages(messages, chat_template, sequence_start):
+    def proc_messages(self, messages, chat_template, sequence_start):
         """Apply chat template to get the prompt."""
         prompt_messages = []
         IMAGE_TOKEN = '<IMAGE_TOKEN>'
-        for message in messages:
-            if isinstance(message['content'], str):
+        messages = [x for x in messages if x['role'] not in ['preprocess', 'forward']]
+        if VisonModel.IMAGE_TOKEN_included(messages):
+            # backward compatibility
+            for message in messages:
+                role, content = message['role'], message['content']
+                if role != 'user' or isinstance(content, str):
+                    prompt_messages.append(message)
+                    continue
+                content = [x['text'] for x in content if x['type'] == 'text']
+                prompt = ''.join(content)
+                prompt = prompt.replace(IMAGE_TOKEN, f'<|vision_start|>{self.image_token}<|vision_end|>')
+                prompt_messages.append(dict(role='user', content=prompt))
+        else:
+            for message in messages:
+                role, content = message['role'], message['content']
+                if role != 'user' or isinstance(content, str):
+                    prompt_messages.append(message)
+                    continue
+                _content = []
+                for item in content:
+                    if item['type'] == 'text':
+                        _content.append(item['text'])
+                    elif item['type'] in ['image', 'image_url']:
+                        _content.append(f'<|vision_start|>{self.image_token}<|vision_end|>')
+                    else:
+                        raise ValueError(f'Unsupported message type: {item["type"]}')
+                message = dict(role=role, content=''.join(_content))
                 prompt_messages.append(message)
-                continue
-            elif message['role'] in ['images', 'preprocess', 'forward']:
-                continue
-            n_images = len([1 for x in message['content'] if x['type'] == 'image'])
-            content = [item['text'] for item in message['content'] if item['type'] == 'text']
-            prompt = content[0]
-            if IMAGE_TOKEN in prompt and '<|vision_start|>' not in prompt:
-                prompt = prompt.replace(IMAGE_TOKEN, f'<|vision_start|>{IMAGE_TOKEN}<|vision_end|>')
-            else:
-                # Qwen2-VL-2B-Instruct will concat image and user prompt
-                # according to their order in the content list
-                # we insert image token before user prompt by default. The
-                # user can use custom image token position if they want the
-                # same decorated prompt as Qwen2-VL
-                prompt = f'<|vision_start|>{IMAGE_TOKEN}<|vision_end|>' * \
-                    n_images + prompt
-            prompt_messages.append(dict(role=message['role'], content=prompt))
         prompt = chat_template.messages2prompt(prompt_messages, sequence_start)
-        return prompt, IMAGE_TOKEN
+        return prompt, self.image_token
 
     @staticmethod
     def get_mrope_info(seq_len: int,
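
The backward-compatible branch keeps old prompts working: any <IMAGE_TOKEN> a caller already placed is rewritten with Qwen2-VL's vision markers around the processor's image token. A small sketch of that rewrite follows; the literal '<|image_pad|>' is an assumption standing in for self.image_token, which is read from the HF processor at runtime.

    IMAGE_TOKEN = '<IMAGE_TOKEN>'
    image_token = '<|image_pad|>'  # assumed value of self.image_token for Qwen2-VL

    content = [dict(type='text', text=f'{IMAGE_TOKEN}\nWhat is in the image?')]
    prompt = ''.join(x['text'] for x in content if x['type'] == 'text')
    prompt = prompt.replace(IMAGE_TOKEN, f'<|vision_start|>{image_token}<|vision_end|>')

    print(prompt)
    # <|vision_start|><|image_pad|><|vision_end|>
    # What is in the image?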
