
Commit d839d2b

remove awq fuse test
1 parent c9f9c02 commit d839d2b

File tree

1 file changed (+0, -227)

tests/quantization/autoawq/test_awq.py

Lines changed: 0 additions & 227 deletions
@@ -287,233 +287,6 @@ def test_quantized_model_no_k_proj_quantized(self):
         output = quantized_model.generate(dummy_input, max_new_tokens=10)
         self.assertTrue((EXPECTED_OUTPUT == output).all())
 
-
-@slow
-@require_torch_accelerator
-@require_auto_awq
-@require_accelerate
-class AwqFusedTest(unittest.TestCase):
-    model_name = "TheBloke/Mistral-7B-OpenOrca-AWQ"
-    model_revision = "7048b2af77d0dd1c81b000b19d73f9cc8950b510"
-
-    custom_mapping_model_id = "TheBloke/Mistral-7B-v0.1-AWQ"
-    custom_model_revision = "f186bcfa9edbe2a4334262ec1e67f23e53ed1ae7"
-
-    mixtral_model_name = "casperhansen/mixtral-instruct-awq"
-    mixtral_model_revision = "87dd4ec502dde74fb3a624835c776b000d190c3b"
-
-    multi_modal_model_name = "ybelkada/llava-1.5-7b-hf-awq"
-    multi_modal_model_code_revision = "ad108a50f5b9e681bdd7378409f57b7fa59a7442"
-
-    prompt = (
-        "You're standing on the surface of the Earth. "
-        "You walk one mile south, one mile west and one mile north. "
-        "You end up exactly where you started. Where are you?"
-    )
-
-    EXPECTED_GENERATION = prompt + "\n\nYou're at the center of a square."
-    EXPECTED_GENERATION_CUSTOM_MODEL = "Hello,\n\nI have a problem with my 20"
-    EXPECTED_GENERATION_MIXTRAL = prompt + " You're on the North Pole.\n\nThe"
-
-    def tearDown(self):
-        gc.collect()
-        backend_empty_cache(torch_device)
-        gc.collect()
-
-    def _check_fused_modules(self, model):
-        has_fused_modules = False
-        fused_modules_name = ["QuantAttentionFused", "QuantFusedMLP", "FasterTransformerRMSNorm"]
-
-        for _, module in model.named_modules():
-            if module.__class__.__name__ in fused_modules_name:
-                has_fused_modules = True
-                break
-
-        self.assertTrue(has_fused_modules, "Modules fusing not performed correctly!")
-
-    def test_raise_save_pretrained(self):
-        """
-        Test that `save_pretrained` is effectively blocked for fused models
-        """
-        quantization_config = AwqConfig(bits=4, fuse_max_seq_len=128, do_fuse=True)
-
-        model = AutoModelForCausalLM.from_pretrained(
-            self.model_name,
-            quantization_config=quantization_config,
-            revision=self.model_revision,
-        ).to(torch_device)
-
-        self._check_fused_modules(model)
-
-        with self.assertRaises(ValueError), tempfile.TemporaryDirectory() as tmpdirname:
-            model.save_pretrained(tmpdirname)
-
-    def test_fused_modules_to_not_convert(self):
-        """
-        Test if fused + modules to_not_convert work as expected
-        """
-        model_id = "hf-internal-testing/Mixtral-tiny-AWQ"
-
-        quantization_config = AwqConfig(bits=4, fuse_max_seq_len=128, do_fuse=True)
-        model = AutoModelForCausalLM.from_pretrained(
-            model_id,
-            quantization_config=quantization_config,
-        ).to(torch_device)
-
-        # Check if model has been correctly fused
-        self._check_fused_modules(model)
-        # Checks if the modules_to_not_convert (here gate layer) is a Linear
-        self.assertTrue(isinstance(model.model.layers[0].block_sparse_moe.gate, torch.nn.Linear))
-
-    @unittest.skipIf(
-        get_device_properties()[0] == "cuda" and get_device_properties()[1] < 8,
-        "Skipping because RuntimeError: FlashAttention only supports Ampere GPUs or newer, so not supported on GPU with capability < 8.0",
-    )
-    @require_flash_attn
-    @require_torch_gpu
-    @pytest.mark.flash_attn_test
-    def test_generation_fused(self):
-        """
-        Test generation quality for fused models - single batch case
-        """
-        quantization_config = AwqConfig(bits=4, fuse_max_seq_len=128, do_fuse=True)
-
-        model = AutoModelForCausalLM.from_pretrained(
-            self.model_name,
-            quantization_config=quantization_config,
-            revision=self.model_revision,
-        ).to(torch_device)
-
-        self._check_fused_modules(model)
-
-        tokenizer = AutoTokenizer.from_pretrained(self.model_name, revision=self.model_revision)
-
-        inputs = tokenizer(self.prompt, return_tensors="pt").to(torch_device)
-
-        outputs = model.generate(**inputs, max_new_tokens=12)
-
-        self.assertEqual(tokenizer.decode(outputs[0], skip_special_tokens=True), self.EXPECTED_GENERATION)
-
-    @pytest.mark.flash_attn_test
-    @require_flash_attn
-    @require_torch_gpu
-    @unittest.skipIf(
-        get_device_properties()[0] == "cuda" and get_device_properties()[1] < 8,
-        "Skipping because RuntimeError: FlashAttention only supports Ampere GPUs or newer, so not supported on GPU with capability < 8.0",
-    )
-    def test_generation_fused_batched(self):
-        """
-        Test generation quality for fused models - multi batch case
-        """
-        quantization_config = AwqConfig(bits=4, fuse_max_seq_len=128, do_fuse=True)
-
-        model = AutoModelForCausalLM.from_pretrained(
-            self.model_name,
-            quantization_config=quantization_config,
-            revision=self.model_revision,
-        ).to(torch_device)
-
-        self._check_fused_modules(model)
-
-        tokenizer = AutoTokenizer.from_pretrained(self.model_name, revision=self.model_revision)
-
-        tokenizer.pad_token_id = tokenizer.eos_token_id
-        inputs = tokenizer([self.prompt, self.prompt], return_tensors="pt", padding=True).to(torch_device)
-
-        outputs = model.generate(**inputs, max_new_tokens=12)
-
-        self.assertEqual(tokenizer.decode(outputs[0], skip_special_tokens=True), self.EXPECTED_GENERATION)
-
-    def test_generation_llava_fused(self):
-        from transformers import pipeline
-
-        quantization_config = AwqConfig(do_fuse=True, fuse_max_seq_len=2048)
-
-        pipe = pipeline(
-            "image-to-text",
-            model=self.multi_modal_model_name,
-            device=0,
-            model_kwargs={
-                "quantization_config": quantization_config,
-            },
-            revision=self.multi_modal_model_code_revision,
-        )
-        url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-neg.png"
-
-        prompt = "USER: <image>\nCan you please describe this image?\nASSISTANT:"
-
-        outputs = pipe(url, prompt=prompt, generate_kwargs={"max_new_tokens": 100})
-        EXPECTED_OUTPUT = "USER: \nCan you please describe this image?\nASSISTANT: The image features a brown and white cat sitting on a green surface, possibly a carpet or a grassy area. The cat is holding a red ball in its paws, seemingly playing with it. The cat appears to be focused on the ball, possibly preparing to play or just enjoying the toy."
-
-        self.assertEqual(outputs[0]["generated_text"], EXPECTED_OUTPUT)
-
-    @pytest.mark.flash_attn_test
-    @require_flash_attn
-    @require_torch_multi_gpu
-    @unittest.skipIf(
-        get_device_properties()[0] == "cuda" and get_device_properties()[1] < 8,
-        "Skipping because RuntimeError: FlashAttention only supports Ampere GPUs or newer, so not supported on GPU with capability < 8.0",
-    )
-    def test_generation_custom_model(self):
-        """
-        Test generation quality for fused models using custom fused map.
-        """
-        quantization_config = AwqConfig(
-            bits=4,
-            fuse_max_seq_len=512,
-            modules_to_fuse={
-                "attention": ["q_proj", "k_proj", "v_proj", "o_proj"],
-                "mlp": ["gate_proj", "up_proj", "down_proj"],
-                "layernorm": ["input_layernorm", "post_attention_layernorm", "norm"],
-                "use_alibi": False,
-                "hidden_size": 4096,
-                "num_attention_heads": 32,
-                "num_key_value_heads": 8,
-            },
-        )
-
-        model = AutoModelForCausalLM.from_pretrained(
-            self.custom_mapping_model_id,
-            quantization_config=quantization_config,
-            device_map="balanced",
-            revision=self.custom_model_revision,
-        )
-
-        self._check_fused_modules(model)
-
-        tokenizer = AutoTokenizer.from_pretrained(self.custom_mapping_model_id, revision=self.custom_model_revision)
-
-        prompt = "Hello"
-        inputs = tokenizer(prompt, return_tensors="pt").to(torch_device)
-
-        outputs = model.generate(**inputs, max_new_tokens=12)
-        self.assertEqual(tokenizer.decode(outputs[0], skip_special_tokens=True), self.EXPECTED_GENERATION_CUSTOM_MODEL)
-
-    @pytest.mark.flash_attn_test
-    @require_flash_attn
-    @require_torch_multi_gpu
-    @unittest.skip(reason="Not enough GPU memory on CI runners")
-    def test_generation_mixtral_fused(self):
-        """
-        Text generation test for Mixtral + AWQ + fused
-        """
-        quantization_config = AwqConfig(bits=4, fuse_max_seq_len=1024, do_fuse=True)
-        model = AutoModelForCausalLM.from_pretrained(
-            self.mixtral_model_name,
-            quantization_config=quantization_config,
-            device_map="auto",
-            revision=self.mixtral_model_revision,
-        )
-
-        tokenizer = AutoTokenizer.from_pretrained(self.mixtral_model_name)
-        tokenizer.pad_token = tokenizer.eos_token
-
-        inputs = tokenizer([self.prompt, self.prompt], return_tensors="pt", padding=True).to(torch_device)
-
-        outputs = model.generate(**inputs, max_new_tokens=12)
-        self.assertEqual(tokenizer.decode(outputs[0], skip_special_tokens=True), self.EXPECTED_GENERATION_MIXTRAL)
-
-
 @slow
 @require_torch_accelerator
 @require_auto_awq
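
For reference, the deleted AwqFusedTest boiled down to the flow below. This is a minimal standalone sketch, not part of the commit: it assumes autoawq and accelerate are installed and a CUDA device is available, and it reuses the model id, revision, and AwqConfig arguments from the removed test; the prompt and device string are illustrative.

# Minimal sketch of the fused-AWQ generation path the removed test exercised.
# Assumes autoawq and accelerate are installed and a CUDA device is present;
# model id, revision, and AwqConfig arguments are copied from the deleted test,
# the prompt and device string are illustrative.
from transformers import AutoModelForCausalLM, AutoTokenizer, AwqConfig

model_name = "TheBloke/Mistral-7B-OpenOrca-AWQ"
revision = "7048b2af77d0dd1c81b000b19d73f9cc8950b510"

# do_fuse=True swaps attention/MLP/layernorm modules for fused AWQ kernels
quantization_config = AwqConfig(bits=4, fuse_max_seq_len=128, do_fuse=True)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    revision=revision,
).to("cuda")

tokenizer = AutoTokenizer.from_pretrained(model_name, revision=revision)
inputs = tokenizer("Hello", return_tensors="pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens=12)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))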
