I'm using Colab (A100, 80 GB VRAM) to fine-tune a Llama-4 model.
When I load the "unsloth/Llama-4-Scout-17B-16E-Instruct-unsloth-dynamic-bnb-4bit" model, it raises "AttributeError: SequentialLlama4TextExperts has no attribute down_proj".
I hit the same problem with the other unsloth/Llama-4 models I tried:
unsloth/Llama-4-Scout-17B-16E-Instruct-unsloth-bnb-4bit
unsloth/Llama-4-Scout-17B-16E-Instruct-unsloth-bnb-8bit
Error detail (see below):
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = "unsloth/Llama-4-Scout-17B-16E-Instruct-unsloth-dynamic-bnb-4bit",
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
# token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)
🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))== Unsloth 2025.11.3: Fast Llama4 patching. Transformers: 4.57.1.
\ /| NVIDIA A100-SXM4-80GB. Num GPUs = 1. Max memory: 79.318 GB. Platform: Linux.
O^O/ _/ \ Torch: 2.9.0+cu128. CUDA: 8.0. CUDA Toolkit: 12.8. Triton: 3.5.0
\ / Bfloat16 = TRUE. FA [Xformers = 0.0.33.post1. FA2 = False]
"-____-" Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
model.safetensors.index.json: 382k/? [00:00<00:00, 39.5MB/s]
model-00001-of-00013.safetensors: 100% 5.00G/5.00G [00:26<00:00, 105MB/s]
model-00002-of-00013.safetensors: 100% 4.81G/4.81G [00:44<00:00, 80.6MB/s]
model-00003-of-00013.safetensors: 100% 4.82G/4.82G [00:26<00:00, 426MB/s]
model-00004-of-00013.safetensors: 100% 4.98G/4.98G [00:44<00:00, 98.3MB/s]
model-00005-of-00013.safetensors: 100% 4.73G/4.73G [00:09<00:00, 171MB/s]
model-00006-of-00013.safetensors: 100% 4.73G/4.73G [00:11<00:00, 321MB/s]
model-00007-of-00013.safetensors: 100% 4.89G/4.89G [00:12<00:00, 600MB/s]
model-00008-of-00013.safetensors: 100% 4.98G/4.98G [00:13<00:00, 388MB/s]
model-00009-of-00013.safetensors: 100% 4.74G/4.74G [00:12<00:00, 673MB/s]
model-00010-of-00013.safetensors: 100% 4.98G/4.98G [00:23<00:00, 60.1MB/s]
model-00011-of-00013.safetensors: 100% 4.89G/4.89G [00:42<00:00, 93.2MB/s]
model-00012-of-00013.safetensors: 100% 4.99G/4.99G [00:15<00:00, 180MB/s]
model-00013-of-00013.safetensors: 100% 3.17G/3.17G [00:13<00:00, 148MB/s]
Loading checkpoint shards: 0% 0/13 [00:00<?, ?it/s]
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
/tmp/ipython-input-1968070144.py in <cell line: 0>()
5 load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
6
----> 7 model, tokenizer = FastLanguageModel.from_pretrained(
8 model_name = "unsloth/Llama-4-Scout-17B-16E-Instruct-unsloth-dynamic-bnb-4bit",
9 max_seq_length = max_seq_length,
12 frames
/usr/local/lib/python3.12/dist-packages/unsloth/models/loader.py in from_pretrained(model_name, max_seq_length, dtype, load_in_4bit, load_in_8bit, load_in_16bit, full_finetuning, token, device_map, rope_scaling, fix_tokenizer, trust_remote_code, use_gradient_checkpointing, resize_model_vocab, revision, use_exact_model_name, offload_embedding, float32_mixed_precision, fast_inference, gpu_memory_utilization, float8_kv_cache, random_state, max_lora_rank, disable_log_stats, qat_scheme, *args, **kwargs)
449 # dispatch_model = FastGraniteModel
450 else:
--> 451 return FastModel.from_pretrained(
452 model_name = old_model_name,
453 max_seq_length = max_seq_length,
/usr/local/lib/python3.12/dist-packages/unsloth/models/loader.py in from_pretrained(model_name, max_seq_length, dtype, load_in_4bit, load_in_8bit, load_in_16bit, full_finetuning, token, device_map, rope_scaling, fix_tokenizer, trust_remote_code, use_gradient_checkpointing, resize_model_vocab, revision, return_logits, fullgraph, use_exact_model_name, auto_model, whisper_language, whisper_task, unsloth_force_compile, offload_embedding, float32_mixed_precision, fast_inference, gpu_memory_utilization, float8_kv_cache, random_state, max_lora_rank, disable_log_stats, qat_scheme, *args, **kwargs)
1063 auto_model = AutoModelForVision2Seq if is_vlm else AutoModelForCausalLM
1064
-> 1065 model, tokenizer = FastBaseModel.from_pretrained(
1066 model_name = model_name,
1067 max_seq_length = max_seq_length,
/usr/local/lib/python3.12/dist-packages/unsloth/models/vision.py in from_pretrained(model_name, max_seq_length, dtype, load_in_4bit, load_in_8bit, load_in_16bit, full_finetuning, token, device_map, trust_remote_code, model_types, tokenizer_name, auto_model, use_gradient_checkpointing, supports_sdpa, whisper_language, whisper_task, auto_config, offload_embedding, float32_mixed_precision, fast_inference, gpu_memory_utilization, float8_kv_cache, random_state, max_lora_rank, disable_log_stats, unsloth_vllm_standby, **kwargs)
647 raise_handler = RaiseUninitialized()
648 if not fast_inference:
--> 649 model = auto_model.from_pretrained(
650 model_name,
651 device_map = device_map,
/usr/local/lib/python3.12/dist-packages/transformers/models/auto/auto_factory.py in from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs)
602 if model_class.config_class == config.sub_configs.get("text_config", None):
603 config = config.get_text_config()
--> 604 return model_class.from_pretrained(
605 pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs
606 )
/usr/local/lib/python3.12/dist-packages/transformers/modeling_utils.py in _wrapper(*args, **kwargs)
275 old_dtype = torch.get_default_dtype()
276 try:
--> 277 return func(*args, **kwargs)
278 finally:
279 torch.set_default_dtype(old_dtype)
/usr/local/lib/python3.12/dist-packages/transformers/modeling_utils.py in from_pretrained(cls, pretrained_model_name_or_path, config, cache_dir, ignore_mismatched_sizes, force_download, local_files_only, token, revision, use_safetensors, weights_only, *model_args, **kwargs)
5046 offload_index,
5047 error_msgs,
-> 5048 ) = cls._load_pretrained_model(
5049 model,
5050 state_dict,
/usr/local/lib/python3.12/dist-packages/transformers/modeling_utils.py in _load_pretrained_model(cls, model, state_dict, checkpoint_files, pretrained_model_name_or_path, ignore_mismatched_sizes, sharded_metadata, device_map, disk_offload_folder, dtype, hf_quantizer, keep_in_fp32_regex, device_mesh, key_mapping, weights_only)
5466
5467 for args in args_list:
-> 5468 _error_msgs, disk_offload_index = load_shard_file(args)
5469 error_msgs += _error_msgs
5470
/usr/local/lib/python3.12/dist-packages/transformers/modeling_utils.py in load_shard_file(args)
841 # Skip it with fsdp on ranks other than 0
842 elif not (is_fsdp_enabled() and not is_local_dist_rank_0() and not is_quantized):
--> 843 disk_offload_index = _load_state_dict_into_meta_model(
844 model,
845 state_dict,
/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py in decorate_context(*args, **kwargs)
118 def decorate_context(*args, **kwargs):
119 with ctx_factory():
--> 120 return func(*args, **kwargs)
121
122 return decorate_context
/usr/local/lib/python3.12/dist-packages/transformers/modeling_utils.py in _load_state_dict_into_meta_model(model, state_dict, shard_file, reverse_renaming_mapping, device_map, disk_offload_folder, disk_offload_index, hf_quantizer, keep_in_fp32_regex, device_mesh)
772 else:
773 # TODO naming is stupid it loads it as well
--> 774 hf_quantizer.create_quantized_param(model, param, param_name, param_device)
775
776 # For quantized modules with FSDP/DeepSpeed Stage 3, we need to quantize the parameter on the GPU
/usr/local/lib/python3.12/dist-packages/transformers/quantizers/quantizer_bnb_4bit.py in create_quantized_param(self, model, param_value, param_name, target_device, **kwargs)
188 # update param name to get the weights instead of the quantized stats
189 param_name = self.get_param_name(param_name)
--> 190 module, tensor_name = get_module_from_name(model, param_name)
191
192 # `torch.Tensor.to(<int num>)` is not supported by `torch_npu` (see this issue).
/usr/local/lib/python3.12/dist-packages/transformers/quantizers/quantizers_utils.py in get_module_from_name(module, tensor_name)
18 if "." in tensor_name:
19 module_name, tensor_name = tensor_name.rsplit(".", 1)
---> 20 module = module.get_submodule(module_name)
21 return module, tensor_name
/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py in get_submodule(self, target)
723 for item in atoms:
724 if not hasattr(mod, item):
--> 725 raise AttributeError(
726 mod._get_name() + " has no attribute `" + item + "`"
727 )
AttributeError: SequentialLlama4TextExperts has no attribute `down_proj`
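In case it helps with triage, here is my guess at the failure mode, with a tiny stand-alone sketch (plain PyTorch, nothing Unsloth-specific; the Block/experts names below are made up for illustration). The checkpoint appears to store the experts' weights under a fused name like "...experts.down_proj", but once the experts module is the per-expert variant (a ModuleList-style container of individual MLPs, which is what SequentialLlama4TextExperts seems to be), get_submodule() can no longer resolve that name, which is exactly where create_quantized_param() fails above. I have not verified this against the transformers source, so please treat it as an assumption rather than a diagnosis.

import torch.nn as nn

# Hypothetical stand-in for a per-expert container like SequentialLlama4TextExperts:
# each expert is its own sub-module, so the container has no flat "down_proj" attribute.
class Block(nn.Module):
    def __init__(self, num_experts: int = 4):
        super().__init__()
        self.experts = nn.ModuleList([nn.Linear(8, 8) for _ in range(num_experts)])

block = Block()

# "experts.0" resolves, but a fused "experts.down_proj" name does not --
# get_submodule() raises the same kind of AttributeError as in the traceback above.
try:
    block.get_submodule("experts.down_proj")
except AttributeError as e:
    print(e)  # e.g. "ModuleList has no attribute `down_proj`"

If that reading is right, the mismatch is between the checkpoint's fused parameter names and the per-expert module layout, not something in my notebook code, but I may well be misreading the traceback.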