
Commit 362a9ac

danielhanchen, everythingisc00l, SethHWeidman, NinoRisteski, Erland366 authored
Bug fixes (#1891)
* Update rl.py * Patching * Update rl.py * Update rl.py * Update rl.py * Update rl.py * Update rl.py * NEFTune * Update rl.py * Update rl.py * Update rl.py * Update rl.py * Update rl.py * Update rl.py * Update rl.py * Extra replacements * Update rl_replacements.py * Update rl.py * extra RL replacements * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update llama.py * Update rl_replacements.py * Update _utils.py * Update loader_utils.py * Update rl.py * Update rl_replacements.py * Update rl_replacements.py * Update rl.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * autocast * Update rl_replacements.py * Update llama.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update llama.py * Update rl_replacements.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update rl_replacements.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update pyproject.toml * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update llama.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update llama.py * Update _utils.py * Update llama.py * Update _utils.py * Update rl_replacements.py * Update rl.py * Update rl.py * Update rl.py * Update rl.py * Update rl.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update rl_replacements.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * GRPO optimized * Update rl.py * Update rl_replacements.py * Update rl_replacements.py * Update rl.py * Update rl.py * Update rl.py * Update rl.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Selective Log softmax * Fix GRPO bsz * Update rl.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Fix TRL * Metrics GRPO * Update rl_replacements.py * Update rl_replacements.py * No compile * Update rl.py * Remove docs * Update rl.py * Update rl.py * Update rl.py * Update rl.py * Update rl_replacements.py * Update rl.py * Update rl.py * Update rl_replacements.py * Update rl_replacements.py * llama-quantize on WINDOWS WSL error fix - edit save.py (gguf saving breaks) (#1649) * edit save.py to fix gguf saving breaks. 
* add check for .exe or not exe file extension for linux and windows * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update llama.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update llama.py * Update llama.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl.py * Update rl.py * Update rl_replacements.py * Update rl.py * Update rl.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * unsloth_num_chunks * Update rl.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py * Update rl.py * Update rl.py * Update rl.py * Update rl.py * Update rl.py * Update rl_replacements.py * Update rl_replacements.py * Update rl_replacements.py (#1754) Fix typo in comment: know -> now. This was printed when running the Llama3.1_(8B)-GRPO.ipynb example notebook, so I'd expect others to run into it as well. * Optional logits * Update rl.py * Update rl.py * Update rl.py * Update rl.py * Update rl.py * Update rl.py * Update rl.py * Update rl.py * Update rl_replacements.py * Update rl.py * Update rl.py * Update rl.py * Update rl.py * fix an import error (#1767) * fix an import error * Delete .gitignore * Update loader.py * Update save.py --------- Co-authored-by: Daniel Han <[email protected]> * SamplingParams * Convert mask to float (#1762) * [Windows Support] Add latest `xformers` wheels to pyproject.toml (#1753) * Add latest xformers * Add a couple of lines to docs * vLLMSamplingParams * Update __init__.py * default num_chunks == -1 * Versioning * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update _utils.py * Update rl_replacements.py * Update rl_replacements.py * Update pyproject.toml * Update pyproject.toml * Export Model to ollama.com (#1648) * Ollama Export Model to ollama.com Signed-off-by: Jyotin Goel <[email protected]> * Check for model_name Signed-off-by: Jyotin Goel <[email protected]> * subprocess use instead of requests | added check for ollama server Signed-off-by: Jyotin Goel <[email protected]> * create_ollama_model Signed-off-by: Jyotin Goel <[email protected]> * create_ollama_model | fix Signed-off-by: Jyotin Goel <[email protected]> * Push to Ollama Signed-off-by: Jyotin Goel <[email protected]> --------- Signed-off-by: Jyotin Goel <[email protected]> * Update cross_entropy_loss.py * torch_cuda_device * Update utils.py * Update utils.py * Update utils.py * device * device * Update loader.py * Update llama.py * Update README.md * Update llama.py * Update llama.py * Update _utils.py * Update utils.py * Update utils.py * Update utils.py * Update utils.py * Update utils.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update llama.py * Update utils.py * Update utils.py * Update utils.py * Update utils.py * __version__ * Update rl.py * Bug fixes --------- Signed-off-by: Jyotin Goel <[email protected]> Co-authored-by: Gennadii Manzhos <[email protected]> Co-authored-by: Seth Weidman <[email protected]> Co-authored-by: Nino Risteski <[email protected]> Co-authored-by: Edd <[email protected]> Co-authored-by: Ben <[email protected]> Co-authored-by: Jyotin Goel 
<[email protected]>
1 parent be55e29 commit 362a9ac

16 files changed (+400 -246 lines)

README.md

Lines changed: 30 additions & 32 deletions
@@ -232,10 +232,8 @@ print(f'pip install --upgrade pip && pip install "unsloth[{x}] @ git+https://git
 
 ```python
 from unsloth import FastLanguageModel
-from unsloth import is_bfloat16_supported
 import torch
-from trl import SFTTrainer
-from transformers import TrainingArguments
+from trl import SFTTrainer, SFTConfig
 from datasets import load_dataset
 max_seq_length = 2048 # Supports RoPE Scaling interally, so choose any!
 # Get LAION dataset
@@ -244,21 +242,28 @@ dataset = load_dataset("json", data_files = {"train" : url}, split = "train")
 
 # 4bit pre quantized models we support for 4x faster downloading + no OOMs.
 fourbit_models = [
-    "unsloth/mistral-7b-v0.3-bnb-4bit",      # New Mistral v3 2x faster!
+    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 2x faster
+    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
+    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
+    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # 4bit for 405b!
+    "unsloth/Mistral-Small-Instruct-2409",     # Mistral 22b 2x faster!
     "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
-    "unsloth/llama-3-8b-bnb-4bit",           # Llama-3 15 trillion tokens model 2x faster!
-    "unsloth/llama-3-8b-Instruct-bnb-4bit",
-    "unsloth/llama-3-70b-bnb-4bit",
-    "unsloth/Phi-3-mini-4k-instruct",        # Phi-3 2x faster!
+    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
     "unsloth/Phi-3-medium-4k-instruct",
-    "unsloth/mistral-7b-bnb-4bit",
-    "unsloth/gemma-7b-bnb-4bit",             # Gemma 2.2x faster!
+    "unsloth/gemma-2-9b-bnb-4bit",
+    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
+
+    "unsloth/Llama-3.2-1B-bnb-4bit",           # NEW! Llama 3.2 models
+    "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
+    "unsloth/Llama-3.2-3B-bnb-4bit",
+    "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
+
+    "unsloth/Llama-3.3-70B-Instruct-bnb-4bit"  # NEW! Llama 3.3 70B!
 ] # More models at https://huggingface.co/unsloth
 
 model, tokenizer = FastLanguageModel.from_pretrained(
-    model_name = "unsloth/llama-3-8b-bnb-4bit",
+    model_name = "unsloth/Llama-3.2-1B",
     max_seq_length = max_seq_length,
-    dtype = None,
     load_in_4bit = True,
 )

@@ -282,16 +287,14 @@ model = FastLanguageModel.get_peft_model(
 trainer = SFTTrainer(
     model = model,
     train_dataset = dataset,
-    dataset_text_field = "text",
-    max_seq_length = max_seq_length,
     tokenizer = tokenizer,
-    args = TrainingArguments(
+    args = SFTConfig(
+        dataset_text_field = "text",
+        max_seq_length = max_seq_length,
         per_device_train_batch_size = 2,
         gradient_accumulation_steps = 4,
         warmup_steps = 10,
         max_steps = 60,
-        fp16 = not is_bfloat16_supported(),
-        bf16 = is_bfloat16_supported(),
         logging_steps = 1,
         output_dir = "outputs",
         optim = "adamw_8bit",
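
With this change, `dataset_text_field` and `max_seq_length` move off the `SFTTrainer` call and into `SFTConfig`, which replaces `TrainingArguments`. A minimal sketch of the resulting setup, assembled from the `+` lines above; the dataset URL is left as a placeholder because it is not shown in this diff:

```python
# Minimal sketch assembled from the "+" lines above; the dataset URL is a
# placeholder (the real one lives in the README and is not part of this diff).
from unsloth import FastLanguageModel
from trl import SFTTrainer, SFTConfig
from datasets import load_dataset

max_seq_length = 2048
url = "..."  # LAION dataset URL from the README
dataset = load_dataset("json", data_files = {"train": url}, split = "train")

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-1B",
    max_seq_length = max_seq_length,
    load_in_4bit = True,
)

trainer = SFTTrainer(
    model = model,
    train_dataset = dataset,
    tokenizer = tokenizer,
    args = SFTConfig(
        dataset_text_field = "text",      # now lives on SFTConfig
        max_seq_length = max_seq_length,  # now lives on SFTConfig
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        max_steps = 60,
        logging_steps = 1,
        output_dir = "outputs",
        optim = "adamw_8bit",
    ),
)
trainer.train()
```
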
@@ -323,17 +326,14 @@ RL including DPO, GRPO, PPO, Reward Modelling, Online DPO all work with Unsloth.
 import os
 os.environ["CUDA_VISIBLE_DEVICES"] = "0" # Optional set GPU device ID
 
-from unsloth import FastLanguageModel, PatchDPOTrainer
-from unsloth import is_bfloat16_supported
-PatchDPOTrainer()
+from unsloth import FastLanguageModel
 import torch
-from transformers import TrainingArguments
-from trl import DPOTrainer
+from trl import DPOTrainer, DPOConfig
+max_seq_length = 2048
 
 model, tokenizer = FastLanguageModel.from_pretrained(
     model_name = "unsloth/zephyr-sft-bnb-4bit",
     max_seq_length = max_seq_length,
-    dtype = None,
     load_in_4bit = True,
 )

@@ -355,24 +355,22 @@ model = FastLanguageModel.get_peft_model(
 dpo_trainer = DPOTrainer(
     model = model,
     ref_model = None,
-    args = TrainingArguments(
+    train_dataset = YOUR_DATASET_HERE,
+    # eval_dataset = YOUR_DATASET_HERE,
+    tokenizer = tokenizer,
+    args = DPOConfig(
         per_device_train_batch_size = 4,
         gradient_accumulation_steps = 8,
         warmup_ratio = 0.1,
         num_train_epochs = 3,
-        fp16 = not is_bfloat16_supported(),
-        bf16 = is_bfloat16_supported(),
         logging_steps = 1,
         optim = "adamw_8bit",
         seed = 42,
         output_dir = "outputs",
+        max_length = 1024,
+        max_prompt_length = 512,
+        beta = 0.1,
     ),
-    beta = 0.1,
-    train_dataset = YOUR_DATASET_HERE,
-    # eval_dataset = YOUR_DATASET_HERE,
-    tokenizer = tokenizer,
-    max_length = 1024,
-    max_prompt_length = 512,
 )
 dpo_trainer.train()
 ```
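
Likewise for DPO: `max_length`, `max_prompt_length` and `beta` now sit on `DPOConfig` instead of being passed to `DPOTrainer` directly. A minimal sketch of the post-change call, assuming `model` and `tokenizer` are loaded as above and `YOUR_DATASET_HERE` is a preference dataset with `prompt`/`chosen`/`rejected` columns:

```python
# Minimal sketch of the updated DPO wiring; model, tokenizer and
# YOUR_DATASET_HERE are assumed to exist as in the README snippet above.
from trl import DPOTrainer, DPOConfig

dpo_trainer = DPOTrainer(
    model = model,
    ref_model = None,
    train_dataset = YOUR_DATASET_HERE,
    tokenizer = tokenizer,
    args = DPOConfig(
        per_device_train_batch_size = 4,
        gradient_accumulation_steps = 8,
        num_train_epochs = 3,
        optim = "adamw_8bit",
        output_dir = "outputs",
        max_length = 1024,        # moved into DPOConfig
        max_prompt_length = 512,  # moved into DPOConfig
        beta = 0.1,               # moved into DPOConfig
    ),
)
dpo_trainer.train()
```
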

pyproject.toml

Lines changed: 2 additions & 2 deletions
@@ -40,7 +40,7 @@ triton = [
 ]
 
 windows=[
-    "unsloth_zoo>=2025.2.7",
+    "unsloth_zoo>=2025.3.1",
     "packaging",
     "tyro",
     "transformers>=4.46.1,!=4.47.0",
@@ -61,7 +61,7 @@ windows=[
     "xformers>=0.0.22.post7 ; platform_system == 'Windows'",
 ]
 huggingface = [
-    "unsloth_zoo>=2025.2.7",
+    "unsloth_zoo>=2025.3.1",
     "packaging",
     "tyro",
     "transformers>=4.46.1,!=4.47.0",

unsloth/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -198,7 +198,7 @@ def is_bf16_supported(): return SUPPORTS_BFLOAT16
 # Check for unsloth_zoo
 try:
     unsloth_zoo_version = importlib_version("unsloth_zoo")
-    if Version(unsloth_zoo_version) < Version("2025.2.6"):
+    if Version(unsloth_zoo_version) < Version("2025.3.1"):
         try:
             os.system("pip install --upgrade --no-cache-dir --no-deps unsloth_zoo")
         except:
@@ -212,6 +212,7 @@ def is_bf16_supported(): return SUPPORTS_BFLOAT16
 pass
 
 from .models import *
+from .models import __version__
 from .save import *
 from .chat_templates import *
 from .tokenizer_utils import *
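
The bumped gate above follows a standard pattern: read the installed version via `importlib.metadata` and compare it with `packaging`'s `Version`. A standalone sketch of that pattern; the package name and threshold below are illustrative, not taken from this diff:

```python
# Illustrative version gate; package name and minimum version are examples only.
from importlib.metadata import version as importlib_version
from packaging.version import Version

def needs_upgrade(package: str, minimum: str) -> bool:
    # True when the installed distribution is older than `minimum`.
    return Version(importlib_version(package)) < Version(minimum)

print(needs_upgrade("packaging", "21.0"))
```
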

unsloth/kernels/cross_entropy_loss.py

Lines changed: 56 additions & 46 deletions
@@ -15,7 +15,13 @@
 import triton
 import triton.language as tl
 import torch
-from .utils import calculate_settings, MAX_FUSED_SIZE, triton_tanh, triton_cast
+from .utils import (
+    calculate_settings,
+    MAX_FUSED_SIZE,
+    triton_tanh,
+    triton_cast,
+    torch_cuda_device,
+)
 from transformers.models.llama.modeling_llama import logger
 from packaging.version import Version

@@ -279,10 +285,11 @@ def forward(ctx, logits, labels, logit_softcapping : float = 0, logit_scaling :
         n_rows : int
         vocab_size : int
         n_rows, vocab_size = logits.shape
+        device = logits.device
 
         div, mod = divmod(vocab_size, MAX_FUSED_SIZE)
         n_chunks : int = div + (mod != 0)
-        losses = torch.empty(n_rows, dtype = torch.float32, device = "cuda:0")
+        losses = torch.empty(n_rows, dtype = torch.float32, device = device)
 
         DO_SOFTCAPPING : bool = bool(logit_softcapping != 0)
         DO_LOGIT_SCALING : bool = bool(logit_scaling != 0)
@@ -292,39 +299,41 @@ def forward(ctx, logits, labels, logit_softcapping : float = 0, logit_scaling :
         if n_chunks == 1:
             # For small vocabs <= 65336 like Llama, Mistral
             BLOCK_SIZE, num_warps = calculate_settings(vocab_size)
-            logsumexp = torch.empty(n_rows, dtype = torch.float32, device = "cuda:0")
-
-            _cross_entropy_forward[(n_rows,)](
-                logits, logits.stride(0),
-                losses,
-                logsumexp,
-                labels,
-                VOCAB_SIZE = vocab_size,
-                BLOCK_SIZE = BLOCK_SIZE,
-                DO_SOFTCAPPING = DO_SOFTCAPPING,
-                SOFTCAP = logit_softcapping,
-                DO_LOGIT_SCALING = DO_LOGIT_SCALING,
-                LOGIT_SCALE = logit_scaling,
-                num_warps = num_warps,
-            )
+            logsumexp = torch.empty(n_rows, dtype = torch.float32, device = device)
+
+            with torch_cuda_device(device):
+                _cross_entropy_forward[(n_rows,)](
+                    logits, logits.stride(0),
+                    losses,
+                    logsumexp,
+                    labels,
+                    VOCAB_SIZE = vocab_size,
+                    BLOCK_SIZE = BLOCK_SIZE,
+                    DO_SOFTCAPPING = DO_SOFTCAPPING,
+                    SOFTCAP = logit_softcapping,
+                    DO_LOGIT_SCALING = DO_LOGIT_SCALING,
+                    LOGIT_SCALE = logit_scaling,
+                    num_warps = num_warps,
+                )
         else:
             # For large vocabs > 65336 like Gemma 256K
-            logsumexp = torch.empty((n_rows, n_chunks,), dtype = torch.float32, device = "cuda:0")
-
-            _chunked_cross_entropy_forward[(n_rows, n_chunks,)](
-                logits, logits.stride(0),
-                losses,
-                logsumexp,
-                labels,
-                VOCAB_SIZE = vocab_size,
-                N_CHUNKS = n_chunks,
-                BLOCK_SIZE = MAX_FUSED_SIZE,
-                DO_SOFTCAPPING = DO_SOFTCAPPING,
-                SOFTCAP = logit_softcapping,
-                DO_LOGIT_SCALING = DO_LOGIT_SCALING,
-                LOGIT_SCALE = logit_scaling,
-                num_warps = 32,
-            )
+            logsumexp = torch.empty((n_rows, n_chunks,), dtype = torch.float32, device = device)
+
+            with torch_cuda_device(device):
+                _chunked_cross_entropy_forward[(n_rows, n_chunks,)](
+                    logits, logits.stride(0),
+                    losses,
+                    logsumexp,
+                    labels,
+                    VOCAB_SIZE = vocab_size,
+                    N_CHUNKS = n_chunks,
+                    BLOCK_SIZE = MAX_FUSED_SIZE,
+                    DO_SOFTCAPPING = DO_SOFTCAPPING,
+                    SOFTCAP = logit_softcapping,
+                    DO_LOGIT_SCALING = DO_LOGIT_SCALING,
+                    LOGIT_SCALE = logit_scaling,
+                    num_warps = 32,
+                )
         # logsumexp(chunked_logsumexp) - x
         # Do the -x separately
         logsumexp = torch.logsumexp(logsumexp, dim = 1) # Row sum
@@ -354,19 +363,20 @@ def backward(ctx, dlosses):
         div, mod = divmod(vocab_size, BLOCK_SIZE)
         n_blocks : int = div + (mod != 0)
 
-        _cross_entropy_backward[(n_rows, n_blocks,)](
-            logits, logits.stride(0),
-            dlosses, dlosses.stride(0),
-            logsumexp,
-            labels,
-            VOCAB_SIZE = vocab_size,
-            BLOCK_SIZE = BLOCK_SIZE,
-            DO_SOFTCAPPING = ctx.DO_SOFTCAPPING,
-            SOFTCAP = ctx.logit_softcapping,
-            DO_LOGIT_SCALING = ctx.DO_LOGIT_SCALING,
-            LOGIT_SCALE = ctx.logit_scaling,
-            num_warps = 8,
-        )
+        with torch_cuda_device(dlosses.device):
+            _cross_entropy_backward[(n_rows, n_blocks,)](
+                logits, logits.stride(0),
+                dlosses, dlosses.stride(0),
+                logsumexp,
+                labels,
+                VOCAB_SIZE = vocab_size,
+                BLOCK_SIZE = BLOCK_SIZE,
+                DO_SOFTCAPPING = ctx.DO_SOFTCAPPING,
+                SOFTCAP = ctx.logit_softcapping,
+                DO_LOGIT_SCALING = ctx.DO_LOGIT_SCALING,
+                LOGIT_SCALE = ctx.logit_scaling,
+                num_warps = 8,
+            )
         return logits, None, None, None,
     pass
 pass
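
These hunks swap the hard-coded `"cuda:0"` for the input tensor's own device and wrap each Triton launch in `torch_cuda_device`, so the kernels work when tensors live on another GPU. The helper's definition is not in this diff; a minimal sketch of the pattern, assuming it behaves like `torch.cuda.device`:

```python
# Sketch of the device-pinning pattern adopted above. torch_cuda_device is
# assumed to act like torch.cuda.device; its real definition (in
# unsloth/kernels/utils.py) is not part of this diff.
import torch

def scale_on_own_device(x: torch.Tensor) -> torch.Tensor:
    # Expects a CUDA tensor. Allocate on x.device (not "cuda:0"), then pin the
    # active device so any kernel launched inside targets the same GPU.
    out = torch.empty_like(x)  # empty_like already follows x.device
    with torch.cuda.device(x.device):
        out.copy_(x * 2.0)     # stand-in for a Triton kernel launch
    return out
```
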

unsloth/kernels/geglu.py

Lines changed: 17 additions & 7 deletions
@@ -15,7 +15,11 @@
 import triton
 import triton.language as tl
 import torch
-from .utils import calculate_settings, triton_tanh
+from .utils import (
+    calculate_settings,
+    triton_tanh,
+    torch_cuda_device,
+)
 
 
 @triton.jit
@@ -41,9 +45,11 @@ def _exact_forward_kernel(e, g, h, n_elements, BLOCK_SIZE : tl.constexpr,):
 def geglu_exact_forward_kernel(gate, up):
     batch, seq_len, hd = gate.shape
     n_elements = gate.numel()
-    out = torch.empty((batch, seq_len, hd), dtype = gate.dtype, device = "cuda:0")
+    device = gate.device
+    out = torch.empty((batch, seq_len, hd), dtype = gate.dtype, device = device)
     grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
-    _exact_forward_kernel[grid](gate, up, out, n_elements, BLOCK_SIZE = 1024,)
+    with torch_cuda_device(device):
+        _exact_forward_kernel[grid](gate, up, out, n_elements, BLOCK_SIZE = 1024,)
     return out
 pass

@@ -99,7 +105,8 @@ def geglu_exact_backward_kernel(DW, e, g):
     batch_seq_len, hd = e.shape
     n_elements = e.numel()
     grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
-    _exact_backward_kernel[grid](DW, e, g, n_elements, BLOCK_SIZE = 1024,)
+    with torch_cuda_device(e.device):
+        _exact_backward_kernel[grid](DW, e, g, n_elements, BLOCK_SIZE = 1024,)
     return DW, e, g
 pass

@@ -133,9 +140,11 @@ def _approx_forward_kernel(e, g, h, n_elements, BLOCK_SIZE : tl.constexpr,):
 def geglu_approx_forward_kernel(gate, up):
     batch, seq_len, hd = gate.shape
     n_elements = gate.numel()
-    out = torch.empty((batch, seq_len, hd), dtype = gate.dtype, device = "cuda:0")
+    device = gate.device
+    out = torch.empty((batch, seq_len, hd), dtype = gate.dtype, device = device)
     grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
-    _approx_forward_kernel[grid](gate, up, out, n_elements, BLOCK_SIZE = 1024,)
+    with torch_cuda_device(device):
+        _approx_forward_kernel[grid](gate, up, out, n_elements, BLOCK_SIZE = 1024,)
     return out
 pass

@@ -198,6 +207,7 @@ def geglu_approx_backward_kernel(DW, e, g):
     batch_seq_len, hd = e.shape
     n_elements = e.numel()
     grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
-    _approx_backward_kernel[grid](DW, e, g, n_elements, BLOCK_SIZE = 1024,)
+    with torch_cuda_device(e.device):
+        _approx_backward_kernel[grid](DW, e, g, n_elements, BLOCK_SIZE = 1024,)
     return DW, e, g
 pass
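
A hypothetical usage sketch: after these hunks the GEGLU kernels allocate their output on, and launch on, whichever GPU the inputs occupy, so callers are no longer tied to `cuda:0`. The shapes and the second GPU below are assumptions for illustration:

```python
# Hypothetical example; needs a machine with at least two CUDA devices.
import torch
from unsloth.kernels.geglu import geglu_exact_forward_kernel

gate = torch.randn(2, 128, 4096, dtype = torch.float16, device = "cuda:1")
up   = torch.randn(2, 128, 4096, dtype = torch.float16, device = "cuda:1")

out = geglu_exact_forward_kernel(gate, up)
print(out.device)  # cuda:1 -- the output now follows the inputs
```
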
