@@ -287,233 +287,6 @@ def test_quantized_model_no_k_proj_quantized(self):
         output = quantized_model.generate(dummy_input, max_new_tokens=10)
         self.assertTrue((EXPECTED_OUTPUT == output).all())
 
-
-@slow
-@require_torch_accelerator
-@require_auto_awq
-@require_accelerate
-class AwqFusedTest(unittest.TestCase):
-    model_name = "TheBloke/Mistral-7B-OpenOrca-AWQ"
-    model_revision = "7048b2af77d0dd1c81b000b19d73f9cc8950b510"
-
-    custom_mapping_model_id = "TheBloke/Mistral-7B-v0.1-AWQ"
-    custom_model_revision = "f186bcfa9edbe2a4334262ec1e67f23e53ed1ae7"
-
-    mixtral_model_name = "casperhansen/mixtral-instruct-awq"
-    mixtral_model_revision = "87dd4ec502dde74fb3a624835c776b000d190c3b"
-
-    multi_modal_model_name = "ybelkada/llava-1.5-7b-hf-awq"
-    multi_modal_model_code_revision = "ad108a50f5b9e681bdd7378409f57b7fa59a7442"
-
-    prompt = (
-        "You're standing on the surface of the Earth. "
-        "You walk one mile south, one mile west and one mile north. "
-        "You end up exactly where you started. Where are you?"
-    )
-
-    EXPECTED_GENERATION = prompt + "\n\nYou're at the center of a square."
-    EXPECTED_GENERATION_CUSTOM_MODEL = "Hello,\n\nI have a problem with my 20"
-    EXPECTED_GENERATION_MIXTRAL = prompt + " You're on the North Pole.\n\nThe"
-
-    def tearDown(self):
-        gc.collect()
-        backend_empty_cache(torch_device)
-        gc.collect()
-
-    def _check_fused_modules(self, model):
-        has_fused_modules = False
-        fused_modules_name = ["QuantAttentionFused", "QuantFusedMLP", "FasterTransformerRMSNorm"]
-
-        for _, module in model.named_modules():
-            if module.__class__.__name__ in fused_modules_name:
-                has_fused_modules = True
-                break
-
-        self.assertTrue(has_fused_modules, "Modules fusing not performed correctly!")
-
-    def test_raise_save_pretrained(self):
-        """
-        Test that `save_pretrained` is effectively blocked for fused models
-        """
-        quantization_config = AwqConfig(bits=4, fuse_max_seq_len=128, do_fuse=True)
-
-        model = AutoModelForCausalLM.from_pretrained(
-            self.model_name,
-            quantization_config=quantization_config,
-            revision=self.model_revision,
-        ).to(torch_device)
-
-        self._check_fused_modules(model)
-
-        with self.assertRaises(ValueError), tempfile.TemporaryDirectory() as tmpdirname:
-            model.save_pretrained(tmpdirname)
-
-    def test_fused_modules_to_not_convert(self):
-        """
-        Test if fused + modules to_not_convert work as expected
-        """
-        model_id = "hf-internal-testing/Mixtral-tiny-AWQ"
-
-        quantization_config = AwqConfig(bits=4, fuse_max_seq_len=128, do_fuse=True)
-        model = AutoModelForCausalLM.from_pretrained(
-            model_id,
-            quantization_config=quantization_config,
-        ).to(torch_device)
-
-        # Check if model has been correctly fused
-        self._check_fused_modules(model)
-        # Checks if the modules_to_not_convert (here gate layer) is a Linear
-        self.assertTrue(isinstance(model.model.layers[0].block_sparse_moe.gate, torch.nn.Linear))
-
-    @unittest.skipIf(
-        get_device_properties()[0] == "cuda" and get_device_properties()[1] < 8,
-        "Skipping because RuntimeError: FlashAttention only supports Ampere GPUs or newer, so not supported on GPU with capability < 8.0",
-    )
-    @require_flash_attn
-    @require_torch_gpu
-    @pytest.mark.flash_attn_test
-    def test_generation_fused(self):
-        """
-        Test generation quality for fused models - single batch case
-        """
-        quantization_config = AwqConfig(bits=4, fuse_max_seq_len=128, do_fuse=True)
-
-        model = AutoModelForCausalLM.from_pretrained(
-            self.model_name,
-            quantization_config=quantization_config,
-            revision=self.model_revision,
-        ).to(torch_device)
-
-        self._check_fused_modules(model)
-
-        tokenizer = AutoTokenizer.from_pretrained(self.model_name, revision=self.model_revision)
-
-        inputs = tokenizer(self.prompt, return_tensors="pt").to(torch_device)
-
-        outputs = model.generate(**inputs, max_new_tokens=12)
-
-        self.assertEqual(tokenizer.decode(outputs[0], skip_special_tokens=True), self.EXPECTED_GENERATION)
-
-    @pytest.mark.flash_attn_test
-    @require_flash_attn
-    @require_torch_gpu
-    @unittest.skipIf(
-        get_device_properties()[0] == "cuda" and get_device_properties()[1] < 8,
-        "Skipping because RuntimeError: FlashAttention only supports Ampere GPUs or newer, so not supported on GPU with capability < 8.0",
-    )
-    def test_generation_fused_batched(self):
-        """
-        Test generation quality for fused models - multi batch case
-        """
-        quantization_config = AwqConfig(bits=4, fuse_max_seq_len=128, do_fuse=True)
-
-        model = AutoModelForCausalLM.from_pretrained(
-            self.model_name,
-            quantization_config=quantization_config,
-            revision=self.model_revision,
-        ).to(torch_device)
-
-        self._check_fused_modules(model)
-
-        tokenizer = AutoTokenizer.from_pretrained(self.model_name, revision=self.model_revision)
-
-        tokenizer.pad_token_id = tokenizer.eos_token_id
-        inputs = tokenizer([self.prompt, self.prompt], return_tensors="pt", padding=True).to(torch_device)
-
-        outputs = model.generate(**inputs, max_new_tokens=12)
-
-        self.assertEqual(tokenizer.decode(outputs[0], skip_special_tokens=True), self.EXPECTED_GENERATION)
-
-    def test_generation_llava_fused(self):
-        from transformers import pipeline
-
-        quantization_config = AwqConfig(do_fuse=True, fuse_max_seq_len=2048)
-
-        pipe = pipeline(
-            "image-to-text",
-            model=self.multi_modal_model_name,
-            device=0,
-            model_kwargs={
-                "quantization_config": quantization_config,
-            },
-            revision=self.multi_modal_model_code_revision,
-        )
-        url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-neg.png"
-
-        prompt = "USER: <image>\nCan you please describe this image?\nASSISTANT:"
-
-        outputs = pipe(url, prompt=prompt, generate_kwargs={"max_new_tokens": 100})
-        EXPECTED_OUTPUT = "USER: \nCan you please describe this image?\nASSISTANT: The image features a brown and white cat sitting on a green surface, possibly a carpet or a grassy area. The cat is holding a red ball in its paws, seemingly playing with it. The cat appears to be focused on the ball, possibly preparing to play or just enjoying the toy."
-
-        self.assertEqual(outputs[0]["generated_text"], EXPECTED_OUTPUT)
-
-    @pytest.mark.flash_attn_test
-    @require_flash_attn
-    @require_torch_multi_gpu
-    @unittest.skipIf(
-        get_device_properties()[0] == "cuda" and get_device_properties()[1] < 8,
-        "Skipping because RuntimeError: FlashAttention only supports Ampere GPUs or newer, so not supported on GPU with capability < 8.0",
-    )
-    def test_generation_custom_model(self):
-        """
-        Test generation quality for fused models using custom fused map.
-        """
-        quantization_config = AwqConfig(
-            bits=4,
-            fuse_max_seq_len=512,
-            modules_to_fuse={
-                "attention": ["q_proj", "k_proj", "v_proj", "o_proj"],
-                "mlp": ["gate_proj", "up_proj", "down_proj"],
-                "layernorm": ["input_layernorm", "post_attention_layernorm", "norm"],
-                "use_alibi": False,
-                "hidden_size": 4096,
-                "num_attention_heads": 32,
-                "num_key_value_heads": 8,
-            },
-        )
-
-        model = AutoModelForCausalLM.from_pretrained(
-            self.custom_mapping_model_id,
-            quantization_config=quantization_config,
-            device_map="balanced",
-            revision=self.custom_model_revision,
-        )
-
-        self._check_fused_modules(model)
-
-        tokenizer = AutoTokenizer.from_pretrained(self.custom_mapping_model_id, revision=self.custom_model_revision)
-
-        prompt = "Hello"
-        inputs = tokenizer(prompt, return_tensors="pt").to(torch_device)
-
-        outputs = model.generate(**inputs, max_new_tokens=12)
-        self.assertEqual(tokenizer.decode(outputs[0], skip_special_tokens=True), self.EXPECTED_GENERATION_CUSTOM_MODEL)
-
-    @pytest.mark.flash_attn_test
-    @require_flash_attn
-    @require_torch_multi_gpu
-    @unittest.skip(reason="Not enough GPU memory on CI runners")
-    def test_generation_mixtral_fused(self):
-        """
-        Text generation test for Mixtral + AWQ + fused
-        """
-        quantization_config = AwqConfig(bits=4, fuse_max_seq_len=1024, do_fuse=True)
-        model = AutoModelForCausalLM.from_pretrained(
-            self.mixtral_model_name,
-            quantization_config=quantization_config,
-            device_map="auto",
-            revision=self.mixtral_model_revision,
-        )
-
-        tokenizer = AutoTokenizer.from_pretrained(self.mixtral_model_name)
-        tokenizer.pad_token = tokenizer.eos_token
-
-        inputs = tokenizer([self.prompt, self.prompt], return_tensors="pt", padding=True).to(torch_device)
-
-        outputs = model.generate(**inputs, max_new_tokens=12)
-        self.assertEqual(tokenizer.decode(outputs[0], skip_special_tokens=True), self.EXPECTED_GENERATION_MIXTRAL)
-
-
 @slow
 @require_torch_accelerator
 @require_auto_awq
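
Note: the fused-inference path covered by the deleted AwqFusedTest class can still be exercised by hand. The sketch below mirrors the body of the removed test_generation_fused without the unittest scaffolding; the checkpoint name, revision, prompt, and AwqConfig arguments are copied from the deleted lines, while the "cuda" device string and the final print are illustrative choices. It assumes autoawq and accelerate are installed and a flash-attention-capable GPU is available, matching the removed @require_* decorators.

from transformers import AutoModelForCausalLM, AutoTokenizer, AwqConfig

model_name = "TheBloke/Mistral-7B-OpenOrca-AWQ"
model_revision = "7048b2af77d0dd1c81b000b19d73f9cc8950b510"

# Same fusing setup the deleted tests used: 4-bit AWQ weights with module fusing enabled.
quantization_config = AwqConfig(bits=4, fuse_max_seq_len=128, do_fuse=True)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    revision=model_revision,
).to("cuda")

tokenizer = AutoTokenizer.from_pretrained(model_name, revision=model_revision)

prompt = (
    "You're standing on the surface of the Earth. "
    "You walk one mile south, one mile west and one mile north. "
    "You end up exactly where you started. Where are you?"
)
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

# The removed test generated 12 new tokens and compared against EXPECTED_GENERATION.
outputs = model.generate(**inputs, max_new_tokens=12)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))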