diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py
index 1c620ad7d..a8563accc 100644
--- a/QEfficient/cloud/infer.py
+++ b/QEfficient/cloud/infer.py
@@ -12,25 +12,20 @@
 
 import requests
 from PIL import Image
-from transformers import PreTrainedModel, TextStreamer
+from transformers import PreTrainedModel
 from transformers.models.auto.modeling_auto import MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES
 
 from QEfficient.base.common import QEFFCommonLoader
-from QEfficient.utils import check_and_assign_cache_dir, load_hf_processor, load_hf_tokenizer
+from QEfficient.utils import check_and_assign_cache_dir, load_hf_processor, load_hf_tokenizer, load_streamer
 from QEfficient.utils.logging_utils import logger
 
 
 # TODO: Remove after adding support for VLM's compile and execute
 def execute_vlm_model(
+    processor: PreTrainedModel,
     qeff_model: PreTrainedModel,
-    model_name: str,
-    image_url: str,
-    image_path: str,
-    prompt: Optional[str] = None,  # type: ignore
+    inputs: Optional[dict] = None,
     device_group: Optional[List[int]] = None,
-    local_model_dir: Optional[str] = None,
-    cache_dir: Optional[str] = None,
-    hf_token: Optional[str] = None,
     generation_len: Optional[int] = None,
 ):
     """
@@ -50,16 +45,43 @@ def execute_vlm_model(
     Returns:
         :dict: Output from the ``AI_100`` runtime.
     """
+    streamer = load_streamer(processor.tokenizer)
+    output = qeff_model.generate(
+        inputs=inputs,
+        streamer=streamer,
+        device_ids=device_group,
+        generation_len=generation_len,
+    )
+    return output
+
+
+def count_vlm_tokens(
+    processor: PreTrainedModel,
+    prompt_len: int = 32,
+    ctx_len: int = 128,
+    image_url: Optional[str] = None,
+    image_path: Optional[str] = None,
+    prompt: Optional[str] = None,  # type: ignore
+):
+    """
+    This method counts the number of image tokens in the tokenized input and updates the prompt length and context length accordingly.
+    ``Mandatory`` Args:
+        :processor (PreTrainedModel): Hugging Face Processor object.
+        :image_url (str): Image URL to be used for inference. ``Defaults to None.``
+        :image_path (str): Image path to be used for inference. ``Defaults to None.``
+    ``Optional`` Args:
+        :prompt_len (int): Prompt length for the model to compile. ``Defaults to 32.``
+        :ctx_len (int): Maximum context length to compile the model. ``Defaults to 128.``
+        :prompt (str): Sample prompt for the model text generation. ``Defaults to None.``
+    Returns:
+        :prompt_len: Updated prompt length for the VLM model to compile.
+        :ctx_len: Updated context length for the VLM model to compile.
+        :split_inputs: Tokenized inputs for the VLM model.
+    """
     if not (image_url or image_path):
         raise ValueError('Neither Image URL nor Image Path is found, either provide "image_url" or "image_path"')
     raw_image = Image.open(requests.get(image_url, stream=True).raw) if image_url else Image.open(image_path)
 
-    processor = load_hf_processor(
-        pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name),
-        cache_dir=cache_dir,
-        hf_token=hf_token,
-    )
-
     # Added for QEff version 1.20 supported VLM models (mllama and llava)
     conversation = [
         {
@@ -73,21 +95,31 @@
     # Converts a list of dictionaries with `"role"` and `"content"` keys to a list of token ids.
     input_text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
-
     split_inputs = processor(
         text=input_text,
         images=raw_image,
         return_tensors="pt",
         add_special_tokens=False,
     )
 
-    streamer = TextStreamer(processor.tokenizer)
-    output = qeff_model.generate(
-        inputs=split_inputs,
-        streamer=streamer,
-        device_ids=device_group,
-        generation_len=generation_len,
-    )
-    return output
+
+    # Get the total number of decoded tokens in the input
+    decoded_tokens = processor.tokenizer.decode(split_inputs["input_ids"][0])
+
+    total_image_tokens = decoded_tokens.count("<image>") + decoded_tokens.count("<|image|>")  # image placeholder tokens for llava and mllama
+
+    # Check if the number of tokens in the image is greater than the prompt length
+    if total_image_tokens > prompt_len:
+        logger.warning(
+            f"Prompt length {prompt_len} is less than the number of tokens in the image. "
+            f"Increasing the prompt length to at least {total_image_tokens + prompt_len}."
+        )
+        prompt_len = total_image_tokens + prompt_len
+
+    # Update the context length only if it is less than the prompt length
+    if ctx_len < prompt_len:
+        ctx_len = prompt_len + ctx_len
+
+    return prompt_len, ctx_len, split_inputs
 
 
 def main(
@@ -175,10 +207,25 @@ def main(
     config = qeff_model.model.config
     architecture = config.architectures[0] if config.architectures else None
 
-    if architecture not in MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values() and (
-        kwargs.pop("img_size", None) or image_path or image_url
-    ):
-        logger.warning(f"Skipping image arguments as they are not valid for {architecture}")
+    if architecture not in MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values():
+        if kwargs.pop("img_size", None) or image_path or image_url:
+            logger.warning(f"Skipping image arguments as they are not valid for {architecture}")
+    else:
+        processor = load_hf_processor(
+            pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name),
+            cache_dir=cache_dir,
+            hf_token=hf_token,
+        )
+
+        # Count the number of tokens required in the input and update the prompt length and context length accordingly
+        prompt_len, ctx_len, inputs = count_vlm_tokens(
+            processor=processor,
+            prompt_len=prompt_len,
+            ctx_len=ctx_len,
+            image_url=image_url,
+            image_path=image_path,
+            prompt=prompt,
+        )
 
     #########
     # Compile
@@ -209,15 +256,10 @@ def main(
     #########
     if architecture in MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values():
         exec_info = execute_vlm_model(
+            processor=processor,
             qeff_model=qeff_model,
-            model_name=model_name,
-            prompt=prompt,
-            image_url=image_url,
-            image_path=image_path,
+            inputs=inputs,
             device_group=device_group,
-            local_model_dir=local_model_dir,
-            cache_dir=cache_dir,
-            hf_token=hf_token,
             generation_len=generation_len,
         )
         print(exec_info)
diff --git a/QEfficient/utils/__init__.py b/QEfficient/utils/__init__.py
index 7fc132b17..3a701d1ea 100755
--- a/QEfficient/utils/__init__.py
+++ b/QEfficient/utils/__init__.py
@@ -24,6 +24,7 @@
     hf_download,
     load_hf_processor,
     load_hf_tokenizer,
+    load_streamer,
     login_and_download_hf_lm,
     onnx_exists,
     padding_check_and_fix,
diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py
index 106647bc0..b2b1856af 100644
--- a/QEfficient/utils/_utils.py
+++ b/QEfficient/utils/_utils.py
@@ -23,6 +23,7 @@
     AutoTokenizer,
     PreTrainedTokenizer,
     PreTrainedTokenizerFast,
+    TextStreamer,
 )
 
 from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants, QnnConstants
@@ -220,6 +221,22 @@ def load_hf_processor(
     return processor
 
 
+def load_streamer(
+    tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
+):
""" + Loads the streamer for the given tokenizer. + -------- + + tokenizer: `Union[PreTrainedTokenizer, PreTrainedTokenizerFast]` - Pass model tokenizer to load streamer. + + Return: + TextStreamer object for the given tokenizer. + """ + logger.info("Loading Streamer") + return TextStreamer(tokenizer) + + def get_qpc_dir_path( model_card_name, num_cores, diff --git a/docs/source/quick_start.md b/docs/source/quick_start.md index 3896a616d..5c962837e 100644 --- a/docs/source/quick_start.md +++ b/docs/source/quick_start.md @@ -201,6 +201,23 @@ qeff_model.generate(prompts=["My name is"]) **Users can also take advantage of features like multi-Qranium inference and continuous batching with QNN SDK Compilation.** + +### VLM Inference + +Users can compile a VLM model by using the below commands. + +**CLI Inference Command** + +For Llava +```bash +python -m QEfficient.cloud.infer --model_name llava-hf/llava-1.5-7b-hf --batch_size 1 --prompt_len 784 --ctx_len 1024 --mxfp6 --num_cores 16 --device_group [0] --prompt "Describe the image" --mos 1 --aic_enable_depth_first --image_url https://i.etsystatic.com/8155076/r/il/0825c2/1594869823/il_fullxfull.1594869823_5x0w.jpg --generation_len 128 +``` + +For Mllama +```bash +python -m QEfficient.cloud.infer --model_name meta-llama/Llama-3.2-11B-Vision-Instruct --batch_size 1 --prompt_len 32 --ctx_len 512 --num_cores 16 --device_group [0] --prompt "Describe the image?" --mos 1 --allocator_dealloc_delay 1 --image_url https://i.etsystatic.com/8155076/r/il/0825c2/1594869823/il_fullxfull.1594869823_5x0w.jpg +``` + ## Python API ### 1. Model download and Optimize for Cloud AI 100