I have fine-tuned the Qwen 2.5 VL 7B 4-bit model from Unsloth on my custom data and saved the model locally. When I run inference it takes more than 20 seconds, and more than a minute when using transformers. I am using a single T4 GPU. I am new to LLMs. Am I doing something wrong?

Below is my code for fine-tuning:
```python
from unsloth import FastVisionModel  # FastLanguageModel for LLMs
import torch
import os
import json
from datasets import Dataset, Image, Sequence, Features, Value
from utils import convert_to_conversation
from unsloth import is_bf16_supported
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTTrainer, SFTConfig

print('Loading base model ...')
model, tokenizer = FastVisionModel.from_pretrained(
    "./fine-tuned-models/qwen_lora_01_16bit",
    load_in_4bit = True,                     # Use 4bit to reduce memory use. False for 16bit LoRA.
    use_gradient_checkpointing = "unsloth",  # True or "unsloth" for long context
)

model = FastVisionModel.get_peft_model(
    model,
    finetune_vision_layers = True,      # False if not finetuning vision layers
    finetune_language_layers = True,    # False if not finetuning language layers
    finetune_attention_modules = True,  # False if not finetuning attention layers
    finetune_mlp_modules = True,        # False if not finetuning MLP layers
)

print('Preparing dataset...')
image_dir = './training_data/batch_01'
image_files = [f for f in os.listdir(image_dir) if f.lower().endswith(('.jpg', '.jpeg'))]

def sort_key(filename):
    # Files are named like "<index>_<suffix>.jpg"; sort numerically by index.
    return int(filename.split('_')[0])

image_files.sort(key=sort_key)

with open("./datasets/train_01.json", "r") as f:
    json_data = json.load(f)

extracted_data = [item['extracted_data'] for item in json_data]

# Pair each image with its ground-truth response.
data = []
for file_name, text in zip(image_files, extracted_data):
    data.append({
        "response": text,
        "image": os.path.join(image_dir, file_name),
    })

ds = Dataset.from_list(data)
ds = ds.cast_column("image", Image())
dataset = [convert_to_conversation(sample) for sample in ds]
print(f'Dataset ready with {len(dataset)} samples ...')

print('Starting training...')
FastVisionModel.for_training(model)

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    data_collator = UnslothVisionDataCollator(model, tokenizer),
    train_dataset = dataset,
    args = SFTConfig(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        num_train_epochs = 3,
        learning_rate = 2e-4,
        fp16 = not is_bf16_supported(),
        bf16 = is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "./models",
        report_to = "none",
    ),
)

trainer_stats = trainer.train()
print('Training completed...')

model.save_pretrained("./lora/qwen_lora_01_16")
tokenizer.save_pretrained("./lora/qwen_lora_01_16")
print('Saved LoRA adapters...')

model.save_pretrained_merged("./fine-tuned-models/qwen_lora_01_16bit", tokenizer, save_method = "merged_16bit")
```
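`convert_to_conversation` comes from my local `utils.py`, which is not shown above. For context, here is a minimal sketch of what such a helper looks like for Unsloth vision fine-tuning, assuming the standard user/assistant chat format expected by `UnslothVisionDataCollator` (the instruction text is the same prompt I use at inference time; field names are my own):

```python
# Hypothetical sketch of utils.convert_to_conversation (the real one lives in utils.py).
# It turns a {"image": PIL.Image, "response": str} sample into the chat format
# expected by UnslothVisionDataCollator.
def convert_to_conversation(sample):
    instruction = "Extract data from this indian cheque"
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": instruction},
                {"type": "image", "image": sample["image"]},
            ],
        },
        {
            "role": "assistant",
            "content": [
                {"type": "text", "text": sample["response"]},
            ],
        },
    ]
    return {"messages": conversation}
```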
This is how I run inference using Unsloth:
```python
from unsloth import FastVisionModel
from transformers import TextStreamer
from PIL import Image
import time
import torch

model, tokenizer = FastVisionModel.from_pretrained(
    model_name = "./fine-tuned-models/qwen_lora_01_16bit",
    load_in_4bit = True,
)
model = torch.compile(model, mode="reduce-overhead")
FastVisionModel.for_inference(model)

start_time = time.perf_counter()

image = Image.open('./test_data/27_Front.jpg')
instruction = "Extract data from this indian cheque"
messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": instruction},
    ]}
]

input_text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)
inputs = tokenizer(
    image,
    input_text,
    add_special_tokens = False,
    return_tensors = "pt",
).to("cuda")

text_streamer = TextStreamer(tokenizer, skip_prompt = True)  # created but not passed to generate
outputs = model.generate(
    **inputs,
    return_dict_in_generate=True,
    output_scores=True,
    use_cache=True,
    temperature=0.15,
    max_new_tokens=512,
    min_p=0.1,
)
decoded_output = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)

end_time = time.perf_counter()
print(decoded_output)

elapsed_time = end_time - start_time
generated_tokens = outputs.sequences.shape[-1] - inputs["input_ids"].shape[-1]
tokens_per_sec = generated_tokens / elapsed_time
print(f"Elapsed time: {elapsed_time:.2f} seconds")
print(f"Throughput: {tokens_per_sec:.2f} tokens/sec")
```