Skip to content

[Bug] Qwen3-4b-Instruct-2507-bnb-4bit : AttributeError: module 'transformers.models.bit.modeling_bit' has no attribute 'Linear' #3376

@escon1004

Description

@escon1004
  1. Did you update? pip install --upgrade unsloth unsloth_zoo
  • YES
  2. Colab or Kaggle or local / cloud
  • Runpod Pod with A5000
  3. Number of GPUs used, use nvidia-smi
Image
  4. Which notebook? Please link!
  • runpod/pytorch:2.8.0-py3.11-cuda12.8.1-cudnn-devel-ubuntu22.04
  5. Which Unsloth version, TRL version, transformers version, PyTorch version?
    Unsloth version: 2025.9.7
    TRL version: 0.22.2
    Transformers version: 4.55.4
    PyTorch version: 2.8.0+cu128

  6. Which trainer? SFTTrainer, GRPOTrainer etc

  • Before training, the issue occurred when I tried to load the LoRA adapter together with the base model.
from unsloth import FastLanguageModel
import torch
from transformers import pipeline, TextIteratorStreamer
from langchain_huggingface import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from peft import PeftModel, PeftConfig

# 1) 모델 불러오기
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "Youseff1987/qwen-3-4b-instruct-bnb-4bit-lora-2",
)
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[4], line 2
      1 # 1) 모델 불러오기
----> 2 model, tokenizer = FastLanguageModel.from_pretrained(
      3     model_name = "Youseff1987/qwen-3-4b-instruct-bnb-4bit-lora-2",
      4 )

File /usr/local/lib/python3.11/dist-packages/unsloth/models/loader.py:365, in FastLanguageModel.from_pretrained(model_name, max_seq_length, dtype, load_in_4bit, load_in_8bit, full_finetuning, token, device_map, rope_scaling, fix_tokenizer, trust_remote_code, use_gradient_checkpointing, resize_model_vocab, revision, use_exact_model_name, fast_inference, gpu_memory_utilization, float8_kv_cache, random_state, max_lora_rank, disable_log_stats, qat_scheme, *args, **kwargs)
    348     dispatch_model = FastQwen3Model if model_type == "qwen3" else FastQwen3MoeModel
    349 # elif model_type == "falcon_h1":
    350 #     dispatch_model = FastFalconH1Model
    351 #     if not SUPPORTS_FALCON_H1:
   (...)    363 #     dispatch_model = FastGraniteModel
    364 else:
--> 365     return FastModel.from_pretrained(
    366         model_name                 = old_model_name,
    367         max_seq_length             = max_seq_length,
    368         dtype                      = dtype,
    369         load_in_4bit               = load_in_4bit,
    370         load_in_8bit               = load_in_8bit,
    371         full_finetuning            = full_finetuning,
    372         token                      = token,
    373         device_map                 = device_map,
    374         rope_scaling               = rope_scaling, # [TODO] No effect
    375         fix_tokenizer              = fix_tokenizer, # [TODO] No effect
    376         trust_remote_code          = trust_remote_code,
    377         use_gradient_checkpointing = use_gradient_checkpointing,
    378         resize_model_vocab         = resize_model_vocab, # [TODO] No effect
    379         revision                   = revision,
    380         return_logits              = False, # Return logits
    381         fullgraph                  = True, # No graph breaks
    382         use_exact_model_name       = use_exact_model_name,
    383 
    384         # Pass vLLM/inference parameters
    385         fast_inference             = fast_inference,
    386         gpu_memory_utilization     = gpu_memory_utilization,
    387         float8_kv_cache            = float8_kv_cache,
    388         random_state               = random_state,
    389         max_lora_rank              = max_lora_rank,
    390         disable_log_stats          = disable_log_stats,
    391 
    392         *args, **kwargs,
    393     )
    394 pass
    396 if use_gradient_checkpointing == "unsloth":

File /usr/local/lib/python3.11/dist-packages/unsloth/models/loader.py:825, in FastModel.from_pretrained(model_name, max_seq_length, dtype, load_in_4bit, load_in_8bit, full_finetuning, token, device_map, rope_scaling, fix_tokenizer, trust_remote_code, use_gradient_checkpointing, resize_model_vocab, revision, return_logits, fullgraph, use_exact_model_name, auto_model, whisper_language, whisper_task, unsloth_force_compile, fast_inference, gpu_memory_utilization, float8_kv_cache, random_state, max_lora_rank, disable_log_stats, qat_scheme, *args, **kwargs)
    823 with redirector:
    824     patch_loss_functions(torch_compile = False)
--> 825     model_types, supports_sdpa = unsloth_compile_transformers(
    826         dtype                   = dtype,
    827         model_name              = model_name,
    828         model_types             = model_types,
    829         token                   = token,
    830         sdpa_dynamic_mask       = True,
    831         sdpa_bool_masks         = True,
    832         sdpa_gqa_replace        = True,
    833         sdpa_dynamic_compile    = True,
    834         compile_attention       = True,
    835         disable_causal_masks    = True,
    836         compile_torch_modules   = True,
    837         compile_custom_modules  = True,
    838         compile_function_calls  = True,
    839         fuse_lm_head            = True,
    840         gradient_checkpointing  = True,
    841         manual_replacements     = True,
    842         fast_lora_forwards      = True,
    843         fast_residual_stream    = False,
    844         accurate_accumulation   = True,
    845         epilogue_fusion         = True,
    846         max_autotune            = False,
    847         shape_padding           = True,
    848         cudagraphs              = False,
    849         debug                   = False,
    850         fullgraph               = fullgraph,
    851         import_from_cache       = False,
    852         disable                 = False,
    853         return_logits           = return_logits,
    854         trust_remote_code       = trust_remote_code,
    855         unsloth_force_compile   = unsloth_force_compile,
    856     )
    857 pass
    858 # Fix SDPA

File /usr/local/lib/python3.11/dist-packages/unsloth/models/_utils.py:1470, in unsloth_compile_transformers(dtype, model_name, model_types, token, revision, trust_remote_code, sdpa_dynamic_mask, sdpa_bool_masks, sdpa_gqa_replace, sdpa_dynamic_compile, compile_attention, disable_causal_masks, compile_torch_modules, compile_custom_modules, compile_function_calls, fuse_lm_head, gradient_checkpointing, manual_replacements, fast_lora_forwards, fast_residual_stream, accurate_accumulation, epilogue_fusion, max_autotune, shape_padding, cudagraphs, debug, fullgraph, import_from_cache, disable, return_logits, unsloth_force_compile)
   1468 supports_sdpa = [True]
   1469 for model_type in model_types:
-> 1470     _unsloth_compile_transformers(
   1471         model_type,
   1472         sdpa_dynamic_mask      = sdpa_dynamic_mask,
   1473         sdpa_bool_masks        = sdpa_bool_masks,
   1474         sdpa_gqa_replace       = sdpa_gqa_replace,
   1475         sdpa_dynamic_compile   = sdpa_dynamic_compile,
   1476         compile_attention      = compile_attention,
   1477         disable_causal_masks   = disable_causal_masks,
   1478         compile_torch_modules  = compile_torch_modules,
   1479         compile_custom_modules = compile_custom_modules,
   1480         compile_function_calls = compile_function_calls,
   1481         fuse_lm_head           = fuse_lm_head,
   1482         gradient_checkpointing = gradient_checkpointing,
   1483         manual_replacements    = manual_replacements,
   1484         fast_lora_forwards     = fast_lora_forwards,
   1485         fast_residual_stream   = fast_residual_stream,
   1486         accurate_accumulation  = accurate_accumulation,
   1487         epilogue_fusion        = epilogue_fusion,
   1488         max_autotune           = max_autotune,
   1489         shape_padding          = shape_padding,
   1490         cudagraphs             = cudagraphs,
   1491         debug                  = debug,
   1492         fullgraph              = fullgraph,
   1493         import_from_cache      = import_from_cache,
   1494         disable                = disable,
   1495         return_logits          = return_logits,
   1496         supports_sdpa          = supports_sdpa,
   1497     )
   1498 pass
   1499 # Redo patches which override compiler

File /usr/local/lib/python3.11/dist-packages/unsloth_zoo/compiler.py:2215, in unsloth_compile_transformers(model_type, sdpa_dynamic_mask, sdpa_bool_masks, sdpa_gqa_replace, sdpa_dynamic_compile, compile_attention, disable_causal_masks, compile_torch_modules, compile_custom_modules, compile_function_calls, fuse_lm_head, gradient_checkpointing, manual_replacements, fast_lora_forwards, fast_residual_stream, accurate_accumulation, epilogue_fusion, max_autotune, shape_padding, cudagraphs, debug, fullgraph, import_from_cache, disable, return_logits, supports_sdpa)
   2213 if disable_causal_masks:
   2214     for module in other_classes:
-> 2215         source = eval(f"{model_location}.{module}")
   2216         if not hasattr(source, "_update_causal_mask"): continue
   2218         try: source = inspect.getsource(source.__init__)

File <string>:1

AttributeError: module 'transformers.models.bit.modeling_bit' has no attribute 'Linear'

It seems that the LoRA adapter itself is not broken. If you first load the base model and then load the LoRA adapter through PEFT, it can be loaded and used normally.

↓↓ The following approach is OK and works well for me. (But I want to be able to load the base model and the adapter together in a single call.)

from unsloth import FastLanguageModel
import torch
from transformers import pipeline, TextIteratorStreamer
from langchain_huggingface import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from peft import PeftModel, PeftConfig

# 1) 베이스 모델 불러오기
base_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen3-4B-Instruct-2507-bnb-4bit",   # ✅ Hugging Face의 원본 베이스 모델
    max_seq_length = 4096,
    load_in_4bit = True,
    full_finetuning = False,
)

# 2. LoRA 어댑터 구성 불러오기
peft_model_id = "Youseff1987/qwen-3-4b-instruct-bnb-4bit-lora"
config = PeftConfig.from_pretrained(peft_model_id)
# 3. LoRA 어댑터를 Base 모델 위에 로드
model = PeftModel.from_pretrained(base_model, peft_model_id)

When I save the LoRA adapter locally and load it back, it works fine and can be loaded in a single call, just like other models.
However, if I instead load the base model and the adapter separately (as in the PEFT snippet above), it causes issues: merging does not work, and additional SFT cannot proceed.

model.save_pretrained('./lora_adapter')
tokenizer.save_pretrained('./lora_adapter')

# session restart

from unsloth import FastLanguageModel
import torch
from transformers import pipeline, TextIteratorStreamer
from langchain_huggingface import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from peft import PeftModel, PeftConfig

# 1) 모델 불러오기
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "./lora_adapter"
)

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions