I have fine-tuned the Qwen 2.5 VL 7B 4-bit model from Unsloth on my custom data and saved the model locally. When I run inference it takes more than 20 seconds, and more than a minute when using transformers. I am using a single T4 GPU. I am new to LLMs. Am I doing something wrong?

Below is my code for fine-tuning:
```python
from unsloth import FastVisionModel  # FastLanguageModel for LLMs
import torch
import os
import json
from datasets import Dataset, Image, Sequence, Features, Value
from utils import convert_to_conversation
from unsloth import is_bf16_supported
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTTrainer, SFTConfig

print('Loading base model ...')
model, tokenizer = FastVisionModel.from_pretrained(
    "./fine-tuned-models/qwen_lora_01_16bit",
    load_in_4bit = True,                     # Use 4bit to reduce memory use. False for 16bit LoRA.
    use_gradient_checkpointing = "unsloth",  # True or "unsloth" for long context
)

model = FastVisionModel.get_peft_model(
    model,
    finetune_vision_layers = True,      # False if not finetuning vision layers
    finetune_language_layers = True,    # False if not finetuning language layers
    finetune_attention_modules = True,  # False if not finetuning attention layers
    finetune_mlp_modules = True,        # False if not finetuning MLP layers
)

print('Preparing dataset...')
image_dir = './training_data/batch_01'
image_files = [f for f in os.listdir(image_dir) if f.lower().endswith(('.jpg', '.jpeg'))]

def sort_key(filename):
    # Files are named like "<index>_<suffix>.jpg"; sort numerically by index.
    return int(filename.split('_')[0])

image_files.sort(key=sort_key)

with open("./datasets/train_01.json", "r") as f:
    json_data = json.load(f)

extracted_data = [item['extracted_data'] for item in json_data]

# Pair each image with its ground-truth response.
data = []
for file_name, text in zip(image_files, extracted_data):
    data.append({
        "response": text,
        "image": os.path.join(image_dir, file_name),
    })

ds = Dataset.from_list(data)
ds = ds.cast_column("image", Image())
dataset = [convert_to_conversation(sample) for sample in ds]
print(f'Dataset ready with {len(dataset)} samples ...')

print('Starting training...')
FastVisionModel.for_training(model)

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    data_collator = UnslothVisionDataCollator(model, tokenizer),
    train_dataset = dataset,
    args = SFTConfig(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        num_train_epochs = 3,
        learning_rate = 2e-4,
        fp16 = not is_bf16_supported(),
        bf16 = is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "./models",
        report_to = "none",
    ),
)

trainer_stats = trainer.train()
print('Training completed...')

model.save_pretrained("./lora/qwen_lora_01_16")
tokenizer.save_pretrained("./lora/qwen_lora_01_16")
print('Saved LoRA adapters...')

model.save_pretrained_merged("./fine-tuned-models/qwen_lora_01_16bit", tokenizer, save_method = "merged_16bit")
```
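`convert_to_conversation` comes from my local `utils.py`, which is not shown above. For context, here is a minimal sketch of what such a helper looks like for Unsloth vision fine-tuning, assuming the standard user/assistant chat format expected by `UnslothVisionDataCollator` (the instruction text is the same prompt I use at inference time; field names are my own):

```python
# Hypothetical sketch of utils.convert_to_conversation (the real one lives in utils.py).
# It turns a {"image": PIL.Image, "response": str} sample into the chat format
# expected by UnslothVisionDataCollator.
def convert_to_conversation(sample):
    instruction = "Extract data from this indian cheque"
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": instruction},
                {"type": "image", "image": sample["image"]},
            ],
        },
        {
            "role": "assistant",
            "content": [
                {"type": "text", "text": sample["response"]},
            ],
        },
    ]
    return {"messages": conversation}
```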
This is how I run inference using Unsloth:
```python
from unsloth import FastVisionModel
from transformers import TextStreamer
from PIL import Image
import time
import torch

model, tokenizer = FastVisionModel.from_pretrained(
    model_name = "./fine-tuned-models/qwen_lora_01_16bit",
    load_in_4bit = True,
)
model = torch.compile(model, mode="reduce-overhead")
FastVisionModel.for_inference(model)

start_time = time.perf_counter()

image = Image.open('./test_data/27_Front.jpg')
instruction = "Extract data from this indian cheque"
messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": instruction},
    ]}
]

input_text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)
inputs = tokenizer(
    image,
    input_text,
    add_special_tokens = False,
    return_tensors = "pt",
).to("cuda")

text_streamer = TextStreamer(tokenizer, skip_prompt = True)  # created but not passed to generate
outputs = model.generate(
    **inputs,
    return_dict_in_generate=True,
    output_scores=True,
    use_cache=True,
    temperature=0.15,
    max_new_tokens=512,
    min_p=0.1,
)
decoded_output = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)

end_time = time.perf_counter()
print(decoded_output)

elapsed_time = end_time - start_time
generated_tokens = outputs.sequences.shape[-1] - inputs["input_ids"].shape[-1]
tokens_per_sec = generated_tokens / elapsed_time
print(f"Elapsed time: {elapsed_time:.2f} seconds")
print(f"Throughput: {tokens_per_sec:.2f} tokens/sec")
```