Merged
535 commits
9aad48e
Update rl.py
danielhanchen Feb 12, 2025
f121a5c
Update llama.py
danielhanchen Feb 12, 2025
5052d35
Update llama.py
danielhanchen Feb 12, 2025
a11aa96
Update llama.py
danielhanchen Feb 12, 2025
a6abe02
Update llama.py
danielhanchen Feb 12, 2025
d867faa
autocast
danielhanchen Feb 12, 2025
44c9228
Update rl_replacements.py
danielhanchen Feb 12, 2025
e83d854
Update llama.py
danielhanchen Feb 12, 2025
623eb65
Update rl_replacements.py
danielhanchen Feb 13, 2025
7e612f0
Update rl_replacements.py
danielhanchen Feb 13, 2025
a45266b
Update rl_replacements.py
danielhanchen Feb 13, 2025
c855d7e
Update rl_replacements.py
danielhanchen Feb 13, 2025
d7cefba
Update llama.py
danielhanchen Feb 13, 2025
52d996a
Update rl_replacements.py
danielhanchen Feb 13, 2025
56f5b31
Update llama.py
danielhanchen Feb 13, 2025
5f1e98c
Update llama.py
danielhanchen Feb 13, 2025
e713129
Update llama.py
danielhanchen Feb 13, 2025
310fc16
Update llama.py
danielhanchen Feb 13, 2025
76a122e
Update llama.py
danielhanchen Feb 13, 2025
2dd29e5
Update rl_replacements.py
danielhanchen Feb 13, 2025
3c5be91
Update llama.py
danielhanchen Feb 13, 2025
e548b15
Update llama.py
danielhanchen Feb 13, 2025
296b3b3
Update llama.py
danielhanchen Feb 13, 2025
8de588b
Update llama.py
danielhanchen Feb 13, 2025
f87909a
Update pyproject.toml
danielhanchen Feb 13, 2025
2704440
Update llama.py
danielhanchen Feb 13, 2025
42e1967
Update llama.py
danielhanchen Feb 13, 2025
36bf805
Update llama.py
danielhanchen Feb 13, 2025
a3af8e3
Update llama.py
danielhanchen Feb 13, 2025
9d10d2f
Update llama.py
danielhanchen Feb 13, 2025
b30a81f
Update llama.py
danielhanchen Feb 13, 2025
b7e8559
Update llama.py
danielhanchen Feb 13, 2025
4b201d9
Update rl_replacements.py
danielhanchen Feb 13, 2025
dc723bc
Update rl_replacements.py
danielhanchen Feb 13, 2025
0309949
Update rl_replacements.py
danielhanchen Feb 13, 2025
c409574
Update rl_replacements.py
danielhanchen Feb 13, 2025
8e5b09a
Update llama.py
danielhanchen Feb 13, 2025
6652f1d
Update rl_replacements.py
danielhanchen Feb 13, 2025
9215bbe
Update rl_replacements.py
danielhanchen Feb 13, 2025
4bff998
Update rl_replacements.py
danielhanchen Feb 13, 2025
c859030
Update rl_replacements.py
danielhanchen Feb 13, 2025
2daa8e3
Update rl_replacements.py
danielhanchen Feb 13, 2025
527a0c4
Update rl_replacements.py
danielhanchen Feb 13, 2025
087a5dc
Update rl_replacements.py
danielhanchen Feb 13, 2025
73210b3
Update rl_replacements.py
danielhanchen Feb 13, 2025
9934ac5
Merge branch 'main' into nightly
danielhanchen Feb 13, 2025
2635f2a
Update llama.py
danielhanchen Feb 13, 2025
69ab838
Update _utils.py
danielhanchen Feb 13, 2025
d5d7a06
Merge branch 'main' into nightly
danielhanchen Feb 13, 2025
c9e450f
Merge branch 'main' into nightly
danielhanchen Feb 13, 2025
44d00e8
Merge branch 'main' into nightly
danielhanchen Feb 13, 2025
447dfc4
Merge branch 'main' into nightly
danielhanchen Feb 13, 2025
acf98dc
Update llama.py
danielhanchen Feb 14, 2025
1399110
Update _utils.py
danielhanchen Feb 14, 2025
881105b
Update rl_replacements.py
danielhanchen Feb 14, 2025
cfdd3f1
Update rl.py
danielhanchen Feb 14, 2025
95b7df5
Update rl.py
danielhanchen Feb 14, 2025
17bfcf9
Update rl.py
danielhanchen Feb 14, 2025
61c219d
Update rl.py
danielhanchen Feb 14, 2025
9794dc2
Update rl.py
danielhanchen Feb 14, 2025
3687a6f
Update llama.py
danielhanchen Feb 14, 2025
c495bfa
Update llama.py
danielhanchen Feb 14, 2025
f9055a7
Update llama.py
danielhanchen Feb 14, 2025
945e3f9
Update llama.py
danielhanchen Feb 14, 2025
3d9fe12
Update rl_replacements.py
danielhanchen Feb 14, 2025
ed90785
Update llama.py
danielhanchen Feb 14, 2025
640bc88
Update llama.py
danielhanchen Feb 14, 2025
bb3bb2d
Update llama.py
danielhanchen Feb 14, 2025
9065938
Update llama.py
danielhanchen Feb 14, 2025
07b48f5
Merge branch 'main' into nightly
danielhanchen Feb 14, 2025
48c5e0d
GRPO optimized
danielhanchen Feb 14, 2025
3a1fb63
Update rl.py
danielhanchen Feb 14, 2025
19014b0
Update rl_replacements.py
danielhanchen Feb 14, 2025
0c17e79
Update rl_replacements.py
danielhanchen Feb 14, 2025
aee44e2
Update rl.py
danielhanchen Feb 14, 2025
953d957
Update rl.py
danielhanchen Feb 14, 2025
2a2b9f7
Update rl.py
danielhanchen Feb 14, 2025
fcb0f4a
Update rl.py
danielhanchen Feb 14, 2025
eabc365
Update rl_replacements.py
danielhanchen Feb 14, 2025
7408318
Update rl_replacements.py
danielhanchen Feb 14, 2025
f35eae3
Update rl_replacements.py
danielhanchen Feb 14, 2025
2b89dae
Selective Log softmax
danielhanchen Feb 14, 2025
45c8431
Fix GRPO bsz
danielhanchen Feb 14, 2025
644cedf
Update rl.py
danielhanchen Feb 14, 2025
4b765d7
Update rl_replacements.py
danielhanchen Feb 14, 2025
0a7c56d
Update rl_replacements.py
danielhanchen Feb 15, 2025
1b43e1d
Update rl_replacements.py
danielhanchen Feb 15, 2025
d588665
Update rl_replacements.py
danielhanchen Feb 15, 2025
54bd827
Fix TRL
danielhanchen Feb 15, 2025
c6d6e6b
Merge branch 'main' into nightly
danielhanchen Feb 15, 2025
fa560ce
Metrics GRPO
danielhanchen Feb 15, 2025
46462f1
Update rl_replacements.py
danielhanchen Feb 15, 2025
12c497a
Update rl_replacements.py
danielhanchen Feb 15, 2025
b8aca94
Merge branch 'main' into nightly
danielhanchen Feb 15, 2025
c14faee
No compile
danielhanchen Feb 16, 2025
1fcad32
Update rl.py
danielhanchen Feb 16, 2025
80be827
Remove docs
danielhanchen Feb 16, 2025
9254243
Update rl.py
danielhanchen Feb 16, 2025
09cb804
Update rl.py
danielhanchen Feb 16, 2025
86dabcf
Update rl.py
danielhanchen Feb 16, 2025
ba1c93e
Update rl.py
danielhanchen Feb 16, 2025
0d75afd
Update rl_replacements.py
danielhanchen Feb 16, 2025
1803658
Update rl.py
danielhanchen Feb 16, 2025
a856085
Update rl.py
danielhanchen Feb 16, 2025
eeac4f3
Update rl_replacements.py
danielhanchen Feb 16, 2025
6f1beb0
Update rl_replacements.py
danielhanchen Feb 16, 2025
222b1e7
llama-quantize on WINDOWS WSL error fix - edit save.py (gguf saving b…
everythingisc00l Feb 16, 2025
103cff4
Update rl_replacements.py
danielhanchen Feb 17, 2025
89a1d03
Update rl_replacements.py
danielhanchen Feb 17, 2025
c46b544
Update rl_replacements.py
danielhanchen Feb 17, 2025
ed84307
Update rl_replacements.py
danielhanchen Feb 17, 2025
93d3f16
Update rl_replacements.py
danielhanchen Feb 17, 2025
429ba6d
Update rl_replacements.py
danielhanchen Feb 17, 2025
1e42bad
Update rl_replacements.py
danielhanchen Feb 17, 2025
38a1885
Update rl_replacements.py
danielhanchen Feb 17, 2025
f0ee4f5
Update rl_replacements.py
danielhanchen Feb 17, 2025
b68dce6
Update rl_replacements.py
danielhanchen Feb 17, 2025
0827067
Update llama.py
danielhanchen Feb 17, 2025
204cd7a
Update rl_replacements.py
danielhanchen Feb 17, 2025
e141075
Update rl_replacements.py
danielhanchen Feb 17, 2025
a07a9e3
Update rl_replacements.py
danielhanchen Feb 17, 2025
cf2720d
Update llama.py
danielhanchen Feb 17, 2025
5c6f586
Update llama.py
danielhanchen Feb 18, 2025
2e07623
Update rl_replacements.py
danielhanchen Feb 18, 2025
8025cfe
Update rl_replacements.py
danielhanchen Feb 18, 2025
ba48495
Update rl_replacements.py
danielhanchen Feb 18, 2025
f0078de
Update rl.py
danielhanchen Feb 18, 2025
15e0140
Update rl.py
danielhanchen Feb 18, 2025
5f5cca4
Update rl_replacements.py
danielhanchen Feb 18, 2025
d80be70
Update rl.py
danielhanchen Feb 18, 2025
47a85eb
Update rl.py
danielhanchen Feb 18, 2025
f09478d
Update rl_replacements.py
danielhanchen Feb 18, 2025
97637c5
Update rl_replacements.py
danielhanchen Feb 18, 2025
58bd27f
Update rl_replacements.py
danielhanchen Feb 18, 2025
7c0c749
Update rl_replacements.py
danielhanchen Feb 18, 2025
97b55c1
Update rl_replacements.py
danielhanchen Feb 18, 2025
24c7a2f
Update rl_replacements.py
danielhanchen Feb 18, 2025
06b2cd3
unsloth_num_chunks
danielhanchen Feb 18, 2025
cbb16e3
Update rl.py
danielhanchen Feb 18, 2025
d16299b
Update rl_replacements.py
danielhanchen Feb 18, 2025
0c1a808
Update rl_replacements.py
danielhanchen Feb 18, 2025
6796801
Update rl_replacements.py
danielhanchen Feb 18, 2025
bd046ca
Update rl.py
danielhanchen Feb 18, 2025
ac2e814
Update rl.py
danielhanchen Feb 18, 2025
a88712f
Update rl.py
danielhanchen Feb 18, 2025
0daa328
Update rl.py
danielhanchen Feb 18, 2025
1afe3f2
Update rl.py
danielhanchen Feb 18, 2025
6732822
Update rl_replacements.py
danielhanchen Feb 18, 2025
5efe9f3
Update rl_replacements.py
danielhanchen Feb 18, 2025
15442d1
Update rl_replacements.py (#1754)
SethHWeidman Feb 19, 2025
91ab43d
Optional logits
danielhanchen Feb 19, 2025
a6a5f60
Update rl.py
danielhanchen Feb 19, 2025
83ce085
Update rl.py
danielhanchen Feb 19, 2025
8ece11f
Update rl.py
danielhanchen Feb 19, 2025
bc6bfae
Update rl.py
danielhanchen Feb 20, 2025
95fb6a4
Update rl.py
danielhanchen Feb 20, 2025
ba01cf5
Update rl.py
danielhanchen Feb 20, 2025
eb48b98
Update rl.py
danielhanchen Feb 20, 2025
3c750a1
Update rl.py
danielhanchen Feb 20, 2025
515cf5a
Update rl_replacements.py
danielhanchen Feb 20, 2025
2cf4349
Update rl.py
danielhanchen Feb 20, 2025
ae8bf68
Update rl.py
danielhanchen Feb 20, 2025
e07f4bc
Update rl.py
danielhanchen Feb 20, 2025
f11e5ab
Merge branch 'main' into nightly
danielhanchen Feb 20, 2025
3fccf5d
Update rl.py
danielhanchen Feb 20, 2025
798ad95
fix an import error (#1767)
NinoRisteski Feb 20, 2025
2957d89
SamplingParams
danielhanchen Feb 20, 2025
19d57bc
Convert mask to float (#1762)
Erland366 Feb 20, 2025
07aea40
[Windows Support] Add latest `xformers` wheels to pyproject.toml (#1753)
versipellis Feb 20, 2025
77109a4
Merge branch 'nightly' of https://github.com/unslothai/unsloth into n…
danielhanchen Feb 20, 2025
f3d9efb
vLLMSamplingParams
danielhanchen Feb 20, 2025
6d5caca
Update __init__.py
danielhanchen Feb 20, 2025
3a5610e
default num_chunks == -1
danielhanchen Feb 20, 2025
0362bd2
Versioning
danielhanchen Feb 20, 2025
2969db8
Merge branch 'main' into nightly
danielhanchen Feb 20, 2025
b5eda24
Update llama.py
danielhanchen Feb 20, 2025
7de0022
Update llama.py
danielhanchen Feb 20, 2025
d4d7694
Update llama.py
danielhanchen Feb 20, 2025
0bbfbe8
Update llama.py
danielhanchen Feb 20, 2025
ae6e2bd
Update llama.py
danielhanchen Feb 20, 2025
1792deb
Update _utils.py
danielhanchen Feb 20, 2025
5dcd079
Update rl_replacements.py
danielhanchen Feb 20, 2025
ec6e0b7
Update rl_replacements.py
danielhanchen Feb 20, 2025
bc1d2ce
Update pyproject.toml
danielhanchen Feb 20, 2025
adbe38e
Update pyproject.toml
danielhanchen Feb 20, 2025
a9b542f
Export Model to ollama.com (#1648)
gjyotin305 Feb 22, 2025
f853ac0
Merge branch 'main' into nightly
danielhanchen Mar 3, 2025
9cab347
Update cross_entropy_loss.py
danielhanchen Mar 3, 2025
0ae9082
torch_cuda_device
danielhanchen Mar 3, 2025
f21314c
Update utils.py
danielhanchen Mar 3, 2025
9215212
Update utils.py
danielhanchen Mar 3, 2025
9d95aee
Update utils.py
danielhanchen Mar 3, 2025
35e9144
device
danielhanchen Mar 3, 2025
30b6f94
device
danielhanchen Mar 3, 2025
64e2b00
Update loader.py
danielhanchen Mar 3, 2025
ffa3278
Update llama.py
danielhanchen Mar 3, 2025
748c5b5
Update README.md
danielhanchen Mar 3, 2025
469ed48
Update llama.py
danielhanchen Mar 3, 2025
bc87afd
Update llama.py
danielhanchen Mar 3, 2025
ee9d6e5
Update _utils.py
danielhanchen Mar 4, 2025
91458bb
Update utils.py
danielhanchen Mar 4, 2025
a7a5d75
Update utils.py
danielhanchen Mar 4, 2025
d93cca2
Update utils.py
danielhanchen Mar 4, 2025
6e2a3a8
Update utils.py
danielhanchen Mar 4, 2025
8f9ba99
Update utils.py
danielhanchen Mar 4, 2025
ed697da
Update llama.py
danielhanchen Mar 4, 2025
d73c34b
Update llama.py
danielhanchen Mar 4, 2025
4485da7
Update llama.py
danielhanchen Mar 4, 2025
45ea48c
Update llama.py
danielhanchen Mar 4, 2025
8c4b79c
Update llama.py
danielhanchen Mar 4, 2025
c2ae510
Update utils.py
danielhanchen Mar 4, 2025
432ea24
Update utils.py
danielhanchen Mar 4, 2025
dcff03c
Update utils.py
danielhanchen Mar 4, 2025
6ef0866
Update utils.py
danielhanchen Mar 4, 2025
8c8ce96
__version__
danielhanchen Mar 4, 2025
208971b
Update rl.py
danielhanchen Mar 4, 2025
adc6977
Bug fixes
danielhanchen Mar 4, 2025
949c298
Bug fixes
danielhanchen Mar 4, 2025
ad6d962
Merge branch 'main' into nightly
danielhanchen Mar 4, 2025
59b24ad
Update llama.py
danielhanchen Mar 5, 2025
5df3936
Update _utils.py
danielhanchen Mar 5, 2025
b8b0f9c
_wrap_fast_inference
danielhanchen Mar 5, 2025
6f0857b
Update llama.py
danielhanchen Mar 5, 2025
109364b
Update llama.py
danielhanchen Mar 5, 2025
dd4bd07
Update llama.py
danielhanchen Mar 5, 2025
b356fce
Update llama.py
danielhanchen Mar 5, 2025
e022016
Update llama.py
danielhanchen Mar 5, 2025
12094a7
Update llama.py
danielhanchen Mar 5, 2025
2836128
Update llama.py
danielhanchen Mar 5, 2025
c956616
Update llama.py
danielhanchen Mar 5, 2025
e887f43
Update llama.py
danielhanchen Mar 5, 2025
95f872d
Update llama.py
danielhanchen Mar 5, 2025
647dbb4
Update llama.py
danielhanchen Mar 5, 2025
f640c8d
Update _utils.py
danielhanchen Mar 5, 2025
91a4fce
SFT dataset prepare
danielhanchen Mar 5, 2025
4495148
Update pyproject.toml
danielhanchen Mar 5, 2025
f41dff5
Update rl_replacements.py
danielhanchen Mar 5, 2025
0a3dbfa
Update rl_replacements.py
danielhanchen Mar 5, 2025
7d8f100
Update rl_replacements.py
danielhanchen Mar 5, 2025
413ea80
Update rl.py
danielhanchen Mar 5, 2025
3f5ce93
Update llama.py
danielhanchen Mar 5, 2025
185bced
Update llama.py
danielhanchen Mar 5, 2025
fd11ad7
Update utils.py
danielhanchen Mar 5, 2025
97ed0b4
bug fix
danielhanchen Mar 5, 2025
68eca88
Update llama.py
danielhanchen Mar 5, 2025
5daf9b5
Update llama.py
danielhanchen Mar 5, 2025
858bb76
Update llama.py
danielhanchen Mar 5, 2025
daedc34
Update llama.py
danielhanchen Mar 5, 2025
95e2371
Update llama.py
danielhanchen Mar 5, 2025
fccd68a
Update __init__.py
danielhanchen Mar 5, 2025
34 changes: 9 additions & 25 deletions pyproject.toml
@@ -7,7 +7,7 @@ name = "unsloth"
 dynamic = ["version"]
 description = "2-5X faster LLM finetuning"
 readme = "README.md"
-requires-python = ">=3.9"
+requires-python = ">=3.9,<=3.12"
 license = {file = "LICENSE"}
 keywords = ["ai", "llm",]
 authors = [
@@ -39,8 +39,8 @@ triton = [
     "triton @ https://github.com/woct0rdho/triton-windows/releases/download/v3.2.0-windows.post10/triton-3.2.0-cp312-cp312-win_amd64.whl ; python_version=='3.12' and platform_system == 'Windows'"
 ]

-windows=[
-    "unsloth_zoo>=2025.3.1",
+huggingface = [
+    "unsloth_zoo>=2025.3.2",
     "packaging",
     "tyro",
     "transformers>=4.46.1,!=4.47.0",
@@ -51,34 +51,18 @@ windows=[
     "wheel>=0.42.0",
     "numpy",
     "accelerate>=0.34.1",
-    "trl>=0.7.9,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3,!=0.15.0",
+    "trl>=0.7.9,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3,!=0.15.0,<=0.15.2",
     "peft>=0.7.1,!=0.11.0",
     "protobuf<4.0.0",
     "huggingface_hub",
     "hf_transfer",
     "unsloth[triton]",
+]
+windows=[
+    "unsloth[huggingface]",
     "bitsandbytes>=0.41.1 ; platform_system == 'Windows'",
     "xformers>=0.0.22.post7 ; platform_system == 'Windows'",
 ]
-huggingface = [
-    "unsloth_zoo>=2025.3.1",
-    "packaging",
-    "tyro",
-    "transformers>=4.46.1,!=4.47.0",
-    "datasets>=2.16.0",
-    "sentencepiece>=0.2.0",
-    "tqdm",
-    "psutil",
-    "wheel>=0.42.0",
-    "numpy",
-    "accelerate>=0.34.1",
-    "trl>=0.7.9,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3,!=0.15.0",
-    "peft>=0.7.1,!=0.11.0",
-    "protobuf<4.0.0",
-    "huggingface_hub",
-    "hf_transfer",
-    "unsloth[triton]",
-]
 cu118only = [
     "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.22.post7%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9' and platform_system == 'Linux'",
     "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.22.post7%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10' and platform_system == 'Linux'",
@@ -370,7 +354,7 @@ colab-ampere-torch220 = [
     "flash-attn>=2.6.3",
 ]
 colab-new = [
-    "unsloth_zoo>=2025.2.7",
+    "unsloth_zoo>=2025.3.1",
     "packaging",
     "tyro",
     "transformers>=4.46.1,!=4.47.0",
@@ -388,7 +372,7 @@
 ]
 colab-no-deps = [
     "accelerate>=0.34.1",
-    "trl>=0.7.9,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3,!=0.15.0",
+    "trl>=0.7.9,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3,!=0.15.0,<=0.15.2",
     "peft>=0.7.1",
     "xformers",
     "bitsandbytes>=0.46.1",
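The restructure above deduplicates the Windows extra: `windows` now pulls in the shared dependency list through the self-referential requirement `unsloth[huggingface]` instead of repeating it, and `trl` is capped at 0.15.2. A minimal sketch (not part of the PR) of confirming how the self-referential extra resolves, assuming an installed `unsloth` distribution built from this pyproject:

# Minimal sketch: list the requirements that the "windows" extra activates.
from importlib.metadata import requires

windows_reqs = [
    r for r in requires("unsloth")
    if 'extra == "windows"' in r
]
# Expected to include the self-reference, e.g.:
#   unsloth[huggingface]; extra == "windows"
print("\n".join(windows_reqs))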
2 changes: 1 addition & 1 deletion unsloth/__init__.py
@@ -198,7 +198,7 @@ def is_bf16_supported(): return SUPPORTS_BFLOAT16
 # Check for unsloth_zoo
 try:
     unsloth_zoo_version = importlib_version("unsloth_zoo")
-    if Version(unsloth_zoo_version) < Version("2025.3.1"):
+    if Version(unsloth_zoo_version) < Version("2025.3.2"):
         try:
             os.system("pip install --upgrade --no-cache-dir --no-deps unsloth_zoo")
         except:
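The bumped floor above follows a gate-and-upgrade pattern: compare the installed `unsloth_zoo` against a minimum version and shell out to pip if it is stale. A self-contained sketch of the same pattern, assuming `importlib_version` in the diff is `importlib.metadata.version` and `Version` comes from `packaging`:

# Self-contained sketch of the version gate shown in the diff above.
import os
from importlib.metadata import version as importlib_version
from packaging.version import Version

if Version(importlib_version("unsloth_zoo")) < Version("2025.3.2"):
    # Same upgrade command the surrounding try/except uses.
    os.system("pip install --upgrade --no-cache-dir --no-deps unsloth_zoo")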
31 changes: 16 additions & 15 deletions unsloth/kernels/utils.py
@@ -104,6 +104,11 @@ def _get_tensor_stream(tensor: torch_Tensor) -> c_void_p:
 cdequantize_blockwise_bf16_nf4 = bnb.functional.lib.cdequantize_blockwise_bf16_nf4
 cgemm_4bit_inference_naive_fp16 = bnb.functional.lib.cgemm_4bit_inference_naive_fp16
 cgemm_4bit_inference_naive_bf16 = bnb.functional.lib.cgemm_4bit_inference_naive_bf16
+torch_mm = torch.mm
+torch_mv = torch.mv
+torch_matmul = torch.matmul
+torch_addmm = torch.addmm
+torch_empty = torch.empty

 def QUANT_STATE(W): return getattr(W, "quant_state", None)

@@ -194,8 +199,8 @@ def fast_dequantize(W, quant_state = None, out = None, use_global_buffer = False
         WEIGHT_BUFFER = WEIGHT_BUFFERS[device_index]
         ABSMAX_BUFFER = ABSMAX_BUFFERS[device_index]
         if WEIGHT_BUFFER is None:
-            WEIGHT_BUFFERS[device_index] = WEIGHT_BUFFER = torch.empty(size, dtype = dtype, device = device, requires_grad = False)
-            ABSMAX_BUFFERS[device_index] = ABSMAX_BUFFER = torch.empty(n_elements_absmax, dtype = torch.float32, device = device, requires_grad = False)
+            WEIGHT_BUFFERS[device_index] = WEIGHT_BUFFER = torch_empty(size, dtype = dtype, device = device, requires_grad = False)
+            ABSMAX_BUFFERS[device_index] = ABSMAX_BUFFER = torch_empty(n_elements_absmax, dtype = torch.float32, device = device, requires_grad = False)

         if size > WEIGHT_BUFFER.numel(): WEIGHT_BUFFER.resize_(size)
         if n_elements_absmax > ABSMAX_BUFFER.numel(): ABSMAX_BUFFER.resize_(n_elements_absmax)
@@ -204,11 +209,11 @@ def fast_dequantize(W, quant_state = None, out = None, use_global_buffer = False
         out_absmax = ABSMAX_BUFFER[:n_elements_absmax]
     else:
         if out is None:
-            out = torch.empty(shape, dtype = dtype, device = device, requires_grad = False)
+            out = torch_empty(shape, dtype = dtype, device = device, requires_grad = False)
         else:
             assert(out.shape == shape)
             assert(out.dtype == dtype)
-        out_absmax = torch.empty(n_elements_absmax, dtype = torch.float32, device = device, requires_grad = False)
+        out_absmax = torch_empty(n_elements_absmax, dtype = torch.float32, device = device, requires_grad = False)
     pass

     # NF4 dequantization of statistics
@@ -258,11 +263,11 @@ def fast_dequantize(W, quant_state = None, out = None, use_global_buffer = False

     # Create weight matrix
     if out is None:
-        out = torch.empty(shape, dtype = dtype, device = device, requires_grad = False)
+        out = torch_empty(shape, dtype = dtype, device = device, requires_grad = False)
     else:
         assert(out.shape == shape)
         assert(out.dtype == dtype)
-    out_absmax = torch.empty(n_elements_absmax, dtype = torch.float32, device = device, requires_grad = False)
+    out_absmax = torch_empty(n_elements_absmax, dtype = torch.float32, device = device, requires_grad = False)

     # Do dequantization
     ptr_out_absmax = get_ptr(out_absmax)
@@ -286,7 +291,7 @@ def fast_dequantize(W, quant_state = None, out = None, use_global_buffer = False

 if HAS_CUDA_STREAM:
     def fast_gemv(X, W, quant_state, out = None):
-        if quant_state is None: return torch.matmul(X, W, out = out)
+        if quant_state is None: return torch_matmul(X, W, out = out)
         # For fast X @ W where seq_len == 1
         # From https://github.com/TimDettmers/bitsandbytes/blob/main/bitsandbytes/functional.py#L1469
         _, q_len, hd = X.shape
@@ -318,7 +323,7 @@ def fast_gemv(X, W, quant_state, out = None):
         bout = shape[0]

         if out is None:
-            out = torch.empty((1, 1, bout,), dtype = dtype, device = device)
+            out = torch_empty((1, 1, bout,), dtype = dtype, device = device)
         # else:
         #     assert(out.shape == (1, 1, bout,))
         # pass
@@ -336,7 +341,7 @@ def fast_gemv(X, W, quant_state, out = None):
         ldb = ctypes_c_int32(ldb)
         ldc = ctypes_c_int32(ldc)

-        df = torch.empty(absmax.shape, dtype = torch.float32, device = device)
+        df = torch_empty(absmax.shape, dtype = torch.float32, device = device)
         with torch_cuda_device(device):
             cdequantize_blockwise_fp32(
                 get_ptr(code2), get_ptr(absmax), get_ptr(absmax2), get_ptr(df),
@@ -385,7 +390,7 @@ def fast_gemv(X, W, quant_state, out = None):
         device = W.device

         if out is None:
-            out = torch.empty((1, 1, bout,), dtype = dtype, device = device)
+            out = torch_empty((1, 1, bout,), dtype = dtype, device = device)
         # else:
         #     assert(out.shape == (1, 1, bout,))
         # pass
@@ -403,7 +408,7 @@ def fast_gemv(X, W, quant_state, out = None):
         ldb = ctypes_c_int32(ldb)
         ldc = ctypes_c_int32(ldc)

-        df = torch.empty(absmax.shape, dtype = torch.float32, device = device)
+        df = torch_empty(absmax.shape, dtype = torch.float32, device = device)
         cdequantize_blockwise_fp32(
             get_ptr(code2), get_ptr(absmax), get_ptr(absmax2), get_ptr(df),
             ctypes_c_int(blocksize2), ctypes_c_int(df.numel()),
@@ -423,10 +428,6 @@ def fast_gemv(X, W, quant_state, out = None):
 pass


-torch_mm = torch.mm
-torch_mv = torch.mv
-torch_matmul = torch.matmul
-torch_addmm = torch.addmm
 def fast_linear_forward(proj, X, temp_lora = None, out = None):

     W, W_quant, lora_A, lora_B, lora_S, bias = get_lora_parameters_bias(proj)
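The change above hoists `torch.mm`, `torch.mv`, `torch.matmul`, `torch.addmm`, and `torch.empty` into module-level aliases bound once at import, so hot paths like `fast_dequantize` and `fast_gemv` avoid a repeated attribute lookup on the `torch` module for every call. A rough micro-benchmark sketch (not part of the PR) of the effect:

# Rough micro-benchmark sketch: a module-level alias skips the
# torch.<attr> lookup on each call inside a hot loop.
import timeit
import torch

torch_empty = torch.empty  # alias hoisted once, as in the diff

attr_lookup = timeit.timeit(lambda: torch.empty(1), number=100_000)
aliased     = timeit.timeit(lambda: torch_empty(1), number=100_000)
print(f"torch.empty each call: {attr_lookup:.3f}s, hoisted alias: {aliased:.3f}s")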
80 changes: 40 additions & 40 deletions unsloth/models/_utils.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-__version__ = "2025.3.3"
+__version__ = "2025.3.4"

 __all__ = [
     "SUPPORTS_BFLOAT16",
@@ -39,8 +39,8 @@
     "create_boolean_mask",
     "torch_amp_custom_fwd",
     "torch_amp_custom_bwd",
-    "accelerate_old_send_to_device",
-    "accelerate_new_send_to_device",
+    # "accelerate_old_send_to_device",
+    # "accelerate_new_send_to_device",
     "patch_gradient_accumulation_fix",
     "patch_compiling_bitsandbytes",
     "patch_regional_compilation",
@@ -241,24 +241,24 @@ def patch_mistral_nemo_config(config):

 # =============================================
 # Fix KeyError: 'Cache only has 0 layers, attempted to access layer with index 0'
-import transformers.cache_utils
-if hasattr(transformers.cache_utils, "DynamicCache") and \
-    transformers.cache_utils.DynamicCache.__getitem__.__name__ != "__cache_utils_getitem__":
+# import transformers.cache_utils
+# if hasattr(transformers.cache_utils, "DynamicCache") and \
+#     transformers.cache_utils.DynamicCache.__getitem__.__name__ != "__cache_utils_getitem__":

-    source = inspect.getsource(transformers.cache_utils.DynamicCache.__getitem__)
-    start = source.find("def")
-    spaces = start*" "
-    source = source.split("\n")
-    source = "\n".join(x[start:] for x in source)
-    where = source.find("raise KeyError")
-    source = source[:where] + \
-        f"if len(self) == 0:\n{spaces}{spaces}"\
-        "    raise RuntimeError('Unsloth: You must call `FastLanguageModel.for_inference(model)` before doing inference for Unsloth models.')\n" + \
-        f"{spaces}{spaces}else:\n{spaces}{spaces}{spaces}" + source[where:]
-    source = source.replace("__getitem__", "__cache_utils_getitem__", 1)
-    exec(source)
-    transformers.cache_utils.DynamicCache.__getitem__ = __cache_utils_getitem__
-pass
+#     source = inspect.getsource(transformers.cache_utils.DynamicCache.__getitem__)
+#     start = source.find("def")
+#     spaces = start*" "
+#     source = source.split("\n")
+#     source = "\n".join(x[start:] for x in source)
+#     where = source.find("raise KeyError")
+#     source = source[:where] + \
+#         f"if len(self) == 0:\n{spaces}{spaces}"\
+#         "    raise RuntimeError('Unsloth: You must call `FastLanguageModel.for_inference(model)` before doing inference for Unsloth models.')\n" + \
+#         f"{spaces}{spaces}else:\n{spaces}{spaces}{spaces}" + source[where:]
+#     source = source.replace("__getitem__", "__cache_utils_getitem__", 1)
+#     exec(source)
+#     transformers.cache_utils.DynamicCache.__getitem__ = __cache_utils_getitem__
+# pass
 # =============================================

 # =============================================
@@ -411,25 +411,25 @@ def _is_openai_available(): return False

 # =============================================
 # Fix new Xformers versions TypeError: Multiple dispatch failed for 'torch._ops.aten.to.dtype_layout'
-accelerate_old_send_to_device = None
-accelerate_new_send_to_device = None
-if xformers_version is not None and Version(xformers_version) >= Version("0.0.27"):
-    import accelerate.utils.operations
-    if hasattr(accelerate.utils.operations, "send_to_device") and \
-        accelerate.utils.operations.send_to_device.__name__ != "_fixed_send_to_device":
-        accelerate_old_send_to_device = accelerate.utils.operations.send_to_device
-        from accelerate.utils.operations import *
-        send_to_device = inspect.getsource(accelerate.utils.operations.send_to_device)
-        send_to_device = re.sub(
-            r"([ ]{4,})return tensor\.to\(device\)",
-            r"\1try: return tensor.to(device)\n\1except: return tensor",
-            send_to_device,
-        ).replace("def send_to_device", "def _fixed_send_to_device")
-        exec(send_to_device)
-        # accelerate.utils.operations.send_to_device = _fixed_send_to_device
-        accelerate_new_send_to_device = _fixed_send_to_device
-    pass
-pass
+# accelerate_old_send_to_device = None
+# accelerate_new_send_to_device = None
+# if xformers_version is not None and Version(xformers_version) >= Version("0.0.27"):
+#     import accelerate.utils.operations
+#     if hasattr(accelerate.utils.operations, "send_to_device") and \
+#         accelerate.utils.operations.send_to_device.__name__ != "_fixed_send_to_device":
+#         accelerate_old_send_to_device = accelerate.utils.operations.send_to_device
+#         from accelerate.utils.operations import *
+#         send_to_device = inspect.getsource(accelerate.utils.operations.send_to_device)
+#         send_to_device = re.sub(
+#             r"([ ]{4,})return tensor\.to\(device\)",
+#             r"\1try: return tensor.to(device)\n\1except: return tensor",
+#             send_to_device,
+#         ).replace("def send_to_device", "def _fixed_send_to_device")
+#         exec(send_to_device)
+#         # accelerate.utils.operations.send_to_device = _fixed_send_to_device
+#         accelerate_new_send_to_device = _fixed_send_to_device
+#     pass
+# pass

 # Transformers 4.46 breaks dynamic caching. This is a hack
 import transformers.generation.configuration_utils
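Both disabled blocks above share one technique: fetch a function's source with `inspect.getsource`, rewrite it textually, `exec` the result, and swap the new function in. A generic sketch of that technique, illustrative only (the helper name `patch_by_source` is hypothetical, not from the PR):

# Illustrative sketch of the source-rewriting monkey patch used by the
# two (now commented-out) blocks above.
import inspect
import re
import textwrap

def patch_by_source(func, pattern, replacement, new_name):
    source = textwrap.dedent(inspect.getsource(func))  # dedent handles methods
    source = re.sub(pattern, replacement, source)      # textual rewrite
    source = source.replace(f"def {func.__name__}", f"def {new_name}", 1)
    namespace = {}
    exec(source, func.__globals__, namespace)          # compile the edited copy
    return namespace[new_name]                         # caller swaps this in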