Merged

79 commits
d355954
migrating QAT API
SalmanMohammadi Aug 26, 2025
dd718b8
updating tests
SalmanMohammadi Aug 27, 2025
1003432
updating cli
SalmanMohammadi Aug 27, 2025
9f9ef8c
updating cli
SalmanMohammadi Aug 27, 2025
75ed197
Merge branch 'main' into qat_migration
SalmanMohammadi Aug 27, 2025
6166dbf
adding quant config
SalmanMohammadi Aug 29, 2025
ed9ef69
updating APIs
SalmanMohammadi Aug 29, 2025
c12d131
Merge branch 'main' into qat_migration
SalmanMohammadi Aug 29, 2025
563c200
linting
SalmanMohammadi Aug 29, 2025
de9b10f
Merge branch 'qat_migration' of github.com:axolotl-ai-cloud/axolotl i…
SalmanMohammadi Aug 29, 2025
218aa40
fixing tests
SalmanMohammadi Aug 29, 2025
ddeba5a
updating ptqconfig
SalmanMohammadi Aug 29, 2025
17b6051
updating quantization.py
SalmanMohammadi Aug 29, 2025
8b3e550
linting
SalmanMohammadi Aug 29, 2025
450b92f
bump ao
SalmanMohammadi Aug 29, 2025
d4f5f5a
bump ao
SalmanMohammadi Aug 29, 2025
1cada45
bump ao
SalmanMohammadi Aug 29, 2025
b218c7c
bump ao
SalmanMohammadi Aug 29, 2025
4345ae6
bump ao
SalmanMohammadi Aug 29, 2025
72f7820
fix language
SalmanMohammadi Aug 29, 2025
f4b7c26
comments
SalmanMohammadi Sep 1, 2025
38ac691
comments
SalmanMohammadi Sep 1, 2025
900d4a1
adding nvfp4
SalmanMohammadi Sep 1, 2025
5b1f478
updating tests
SalmanMohammadi Sep 1, 2025
80fb7da
fix dtype
SalmanMohammadi Sep 1, 2025
47eb791
fix dtype
SalmanMohammadi Sep 1, 2025
b0ccde1
updating tests
SalmanMohammadi Sep 2, 2025
84f6889
fixing accelerator
SalmanMohammadi Sep 2, 2025
c3f4048
fixing config
SalmanMohammadi Sep 2, 2025
029734b
adding support for push to hub in quantize
SalmanMohammadi Sep 2, 2025
78668b5
linting
SalmanMohammadi Sep 2, 2025
5c095c3
Merge branch 'main' into qat_migration
SalmanMohammadi Sep 2, 2025
e2f5dd5
updating nvfp4 config
SalmanMohammadi Sep 2, 2025
f6ec879
Merge branch 'qat_migration' of github.com:axolotl-ai-cloud/axolotl i…
SalmanMohammadi Sep 2, 2025
5bc768a
disable safetensors for push to hub
SalmanMohammadi Sep 3, 2025
154315f
force config on push to hub
SalmanMohammadi Sep 3, 2025
a5ecc05
log
SalmanMohammadi Sep 3, 2025
ae7d876
cli hub_model_id
SalmanMohammadi Sep 3, 2025
225e1d8
cli hub_model_id
SalmanMohammadi Sep 3, 2025
fac195d
adding quant strs
SalmanMohammadi Sep 3, 2025
5a49579
adding quant strs
SalmanMohammadi Sep 3, 2025
33dc44c
adding quant strs
SalmanMohammadi Sep 3, 2025
9932b4f
fix quant_type kwarg
SalmanMohammadi Sep 3, 2025
4a07a17
tkps
SalmanMohammadi Sep 3, 2025
ca1a0b7
updating conf
SalmanMohammadi Sep 3, 2025
94554a1
linting
SalmanMohammadi Sep 3, 2025
a1a3d14
Merge branch 'main' of github.com:axolotl-ai-cloud/axolotl into qat_m…
SalmanMohammadi Sep 3, 2025
d8a8c75
dont need to specify model config
SalmanMohammadi Sep 3, 2025
a0ff954
comments
SalmanMohammadi Sep 4, 2025
9a288c7
adding more aliases
SalmanMohammadi Sep 4, 2025
0ae60e1
fixing fbgemm import [skip-e2e]
SalmanMohammadi Sep 5, 2025
bfea773
updating gpu runner step
SalmanMohammadi Sep 5, 2025
c7bb62d
updating install command
SalmanMohammadi Sep 5, 2025
8da2c6b
disable default include_tkps
SalmanMohammadi Sep 5, 2025
766677c
linting
SalmanMohammadi Sep 5, 2025
c17ca49
trying extras
SalmanMohammadi Sep 5, 2025
3945aaa
2.8 only
SalmanMohammadi Sep 5, 2025
4392627
guard int4weightonly import
SalmanMohammadi Sep 8, 2025
8558ec9
guard int4weightonly import
SalmanMohammadi Sep 8, 2025
1ffac1f
only import on 2.8
SalmanMohammadi Sep 8, 2025
e62e637
stray comma
SalmanMohammadi Sep 8, 2025
6a8ed51
only attempt install on 2.8
SalmanMohammadi Sep 8, 2025
54bbc30
Merge branch 'main' into qat_migration
SalmanMohammadi Sep 8, 2025
43f7eb1
fix tests
SalmanMohammadi Sep 8, 2025
d223aeb
fix test case
SalmanMohammadi Sep 8, 2025
13b51f9
fixing tests for b200s
SalmanMohammadi Sep 8, 2025
2311bc5
comments
SalmanMohammadi Sep 9, 2025
ef53534
fixing tests
SalmanMohammadi Sep 9, 2025
835d030
Merge branch 'main' into qat_migration
SalmanMohammadi Sep 9, 2025
0e38530
fix test
SalmanMohammadi Sep 9, 2025
3faf7bd
Merge branch 'qat_migration' of github.com:axolotl-ai-cloud/axolotl i…
SalmanMohammadi Sep 9, 2025
f35c2f9
Merge branch 'main' into qat_migration
SalmanMohammadi Sep 10, 2025
0986ff0
fix group size defaults
SalmanMohammadi Sep 10, 2025
c284726
Merge branch 'qat_migration' of github.com:axolotl-ai-cloud/axolotl i…
SalmanMohammadi Sep 10, 2025
320a722
comments
SalmanMohammadi Sep 10, 2025
303439e
tests
SalmanMohammadi Sep 10, 2025
6e455bd
Merge branch 'main' into qat_migration
SalmanMohammadi Sep 11, 2025
23f0895
removing int4fp8 case
SalmanMohammadi Sep 11, 2025
c4f8e26
lint
SalmanMohammadi Sep 11, 2025
73 changes: 73 additions & 0 deletions examples/llama-3/3b-qat-fsdp2-nfvp4.yaml
@@ -0,0 +1,73 @@
base_model: meta-llama/Llama-3.2-3B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: false
strict: false

datasets:
  - path: yahma/alpaca-cleaned
    type: alpaca
    split: train[:95%]

output_dir: ./outputs/qat_out/
dataset_prepared_path: ./outputs/qat_out/dataset_prepared

sample_packing: true
sequence_len: 8192

flex_attention: true
flex_attn_compile_kwargs:
  dynamic: false
  mode: max-autotune-no-cudagraphs

qat:
  activation_dtype: nvfp4
  weight_dtype: nvfp4
  group_size: 16 # only group_size of 16 is supported with nvfp4

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 16
num_epochs: 1
optimizer: adamw_torch_fused

cosine_constant_lr_ratio: 0
cosine_min_lr_ratio: 1.0
learning_rate: 2e-5
save_only_model: true
bf16: true

resume_from_checkpoint:
logging_steps: 1

evals_per_epoch: 1
saves_per_epoch: 1

warmup_ratio: 0.1
weight_decay: 0.0
fsdp:
  - full_shard
  - auto_wrap

fsdp_config:
  fsdp_version: 2
  fsdp_offload_params: false
  fsdp_cpu_ram_efficient_loading: false
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_sharding_strategy: FULL_SHARD
  fsdp_reshard_after_forward: true
  fsdp_activation_checkpointing: true

special_tokens:
  pad_token: <|finetune_right_pad_id|>

# save_first_step: true # uncomment this to validate checkpoint saving works with your config
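For reviewers unfamiliar with QAT: the `qat:` block above inserts fake-quantization ops during training so the model learns to tolerate quantization rounding error before `axolotl quantize` applies real post-training quantization. A minimal pure-Python sketch of per-group symmetric fake quantization — illustrative only; torchao's implementation operates on tensors with straight-through gradient estimators, and nvfp4 uses a 4-bit floating-point grid rather than the integer grid shown here:

```python
def fake_quantize(weights, num_bits=4, group_size=16):
    """Quantize each group of weights to a small symmetric integer grid and
    immediately dequantize, so downstream computation sees the same rounding
    error the quantized model will see at inference time."""
    qmax = 2 ** (num_bits - 1) - 1  # e.g. 7 for 4-bit symmetric
    out = []
    for i in range(0, len(weights), group_size):
        group = weights[i:i + group_size]
        scale = max(abs(w) for w in group) / qmax
        if scale == 0.0:
            out.extend(group)  # all-zero group: nothing to quantize
            continue
        out.extend(round(w / scale) * scale for w in group)
    return out

weights = [0.11, -0.52, 0.98, 0.03] * 4  # one group of 16
quantized = fake_quantize(weights)
```

Every output value lands on a multiple of the group's scale, and the worst-case error per weight is half a quantization step — which is the error QAT trains the model to absorb.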
18 changes: 5 additions & 13 deletions examples/llama-3/3b-qat-fsdp2.yaml
@@ -6,24 +6,16 @@ load_in_8bit: false
load_in_4bit: false
strict: false

-plugins:
-  - axolotl.integrations.liger.LigerPlugin
-
-liger_rope: true
-liger_rms_norm: true
-liger_glu_activation: true
-liger_layer_norm: true
-liger_fused_linear_cross_entropy: true

datasets:
  - path: yahma/alpaca-cleaned
    type: alpaca
    split: train[:95%]

output_dir: ./outputs/qat_out/
dataset_prepared_path: ./outputs/qat_out/dataset_prepared

sample_packing: true

-sequence_len: 512
+sequence_len: 8192

flex_attention: true
flex_attn_compile_kwargs:
@@ -67,7 +59,7 @@ fsdp:
fsdp_config:
  fsdp_version: 2
  fsdp_offload_params: false
-  fsdp_cpu_ram_efficient_loading: true
+  fsdp_cpu_ram_efficient_loading: false
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
  fsdp_state_dict_type: FULL_STATE_DICT
@@ -76,6 +68,6 @@ fsdp_config:
  fsdp_activation_checkpointing: true

special_tokens:
-  pad_token: <|end_of_text|>
+  pad_token: <|finetune_right_pad_id|>

# save_first_step: true # uncomment this to validate checkpoint saving works with your config
2 changes: 1 addition & 1 deletion requirements.txt
@@ -64,7 +64,7 @@ langdetect==1.0.9
immutabledict==4.2.0
antlr4-python3-runtime==4.13.2

-torchao==0.12.0
+torchao @ git+https://github.com/pytorch/torchao.git@13029fb6855bc19ceb8215b6dab204146908464b
schedulefree==1.4.1

axolotl-contribs-lgpl==0.0.6
28 changes: 21 additions & 7 deletions src/axolotl/cli/quantize.py
@@ -5,12 +5,16 @@
from pathlib import Path
from typing import Union

-from transformers import AutoModelForCausalLM
+from transformers import AutoModelForCausalLM, TorchAoConfig, AutoConfig

from axolotl.cli.config import load_cfg
from axolotl.loaders import load_tokenizer
from axolotl.utils.logging import get_logger
-from axolotl.utils.quantization import TorchIntDType, quantize_model_for_ptq
+from axolotl.utils.quantization import (
+    get_quantization_config,
+    quantize_model,
+    TorchAOQuantDType,
+)

LOG = get_logger(__name__)

@@ -43,13 +47,13 @@ def do_quantize(
            "No quantization configuration found. Please specify either qat or quantization in your config file."
        )

-    model_path = cli_args.get("model_path") or cfg.output_dir
+    model_path = cli_args.get("base_model") or cfg.output_dir
    if weight_dtype := cli_args.get("weight_dtype"):
-        weight_dtype = TorchIntDType[weight_dtype]
+        weight_dtype = TorchAOQuantDType.from_string(weight_dtype)
    else:
        weight_dtype = quantize_cfg.weight_dtype
    if activation_dtype := cli_args.get("activation_dtype"):
-        activation_dtype = TorchIntDType[activation_dtype]
+        activation_dtype = TorchAOQuantDType.from_string(activation_dtype)
    else:
        activation_dtype = quantize_cfg.activation_dtype
    group_size = cli_args.get("group_size") or quantize_cfg.group_size
@@ -60,7 +64,11 @@

    LOG.info(f"Loading model from {model_path}...")
    tokenizer = load_tokenizer(cfg)
-    model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")
+    config = AutoConfig.from_pretrained(model_path)
+    torch_dtype = config.torch_dtype if hasattr(config, "torch_dtype") else None
+    model = AutoModelForCausalLM.from_pretrained(
+        model_path, device_map="auto", torch_dtype=torch_dtype
+    )

    LOG.info(
        f"Quantizing model with configuration: \n"
@@ -70,10 +78,16 @@
        f"\tquantize_embedding: {quantize_embedding}"
    )

-    quantize_model_for_ptq(
+    quantize_model(
        model, weight_dtype, group_size, activation_dtype, quantize_embedding
    )

+    quantization_config = TorchAoConfig(
+        get_quantization_config(weight_dtype, activation_dtype, group_size),
+        include_input_output_embeddings=quantize_embedding,
+    )
+    model.quantization_config = quantization_config
+
    LOG.info(f"Saving quantized model to: {str(Path(output_dir) / 'quantized')}...")
    model.save_pretrained(
        str(Path(output_dir) / "quantized"),
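The CLI change above swaps dictionary-style enum indexing (`TorchIntDType[weight_dtype]`) for a `from_string` constructor, which pairs with the "adding more aliases" commits: user input can be normalized and mapped through aliases before lookup, with a readable error on bad values. A sketch of that pattern with a hypothetical enum — the member names and aliases here are assumptions for illustration, not axolotl's actual definitions:

```python
from enum import Enum


class QuantDType(Enum):
    """Illustrative stand-in for TorchAOQuantDType."""

    INT4 = "int4"
    INT8 = "int8"
    NVFP4 = "nvfp4"

    @classmethod
    def from_string(cls, value: str) -> "QuantDType":
        # Accept a few user-facing aliases in addition to canonical values,
        # and fail with a descriptive error instead of a bare KeyError.
        aliases = {"int4wo": cls.INT4, "int8wo": cls.INT8, "fp4": cls.NVFP4}
        normalized = value.strip().lower()
        if normalized in aliases:
            return aliases[normalized]
        try:
            return cls(normalized)  # lookup by member value, e.g. "int8"
        except ValueError:
            raise ValueError(
                f"Unknown quantization dtype {value!r}; expected one of "
                f"{[m.value for m in cls] + sorted(aliases)}"
            ) from None
```

Compared with `Enum[name]` indexing, the constructor gives one place to add aliases and keeps CLI error messages actionable.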
19 changes: 6 additions & 13 deletions src/axolotl/train.py
@@ -30,11 +30,7 @@
    fix_untrained_tokens,
)
from axolotl.integrations.base import PluginManager
-from axolotl.loaders import (
-    ModelLoader,
-    load_processor,
-    load_tokenizer,
-)
+from axolotl.loaders import load_processor, load_tokenizer, ModelLoader
from axolotl.utils.ctx_managers.sequence_parallel import SequenceParallelContextManager
from axolotl.utils.dict import DictDefault
from axolotl.utils.distributed import cleanup_distributed
@@ -234,16 +230,15 @@ def save_trained_model(

    # handle QAT
    if cfg.qat:
-        from axolotl.utils.quantization import convert_qat_model_for_ptq
+        from axolotl.utils.quantization import convert_qat_model

        LOG.info("Processing QAT model for saving...")
-        convert_qat_model_for_ptq(
+        convert_qat_model(
            model,
            quantize_embedding=cfg.qat.quantize_embedding,
        )
        LOG.info(
-            "QAT modules have been converted for PTQ. Please ensure you quantize "
-            "your model weights with `axolotl quantize`."
+            "QAT usage note: please ensure you quantize your model fine-tuned using QAT by running `axolotl quantize`"
+            " with the same config which you used for training."
        )
    # Handle ReLoRA early return case
    if cfg.relora:
@@ -337,9 +332,7 @@ def save_trained_model(

    if hasattr(cfg, "llmcompressor") and cfg.llmcompressor:
        # TODO: add integration support so this can be implemented completely within the plugin
-        from axolotl.integrations.llm_compressor.utils import (
-            save_compressed_model,
-        )
+        from axolotl.integrations.llm_compressor.utils import save_compressed_model

        save_compressed_model(
            model=model,
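For context on the `convert_qat_model` call above: after training, the fake-quantization wrapper modules are swapped back to plain layers carrying the trained (quantization-aware) weights, so `axolotl quantize` can then apply real quantization. A toy sketch of that prepare-then-convert swap using stand-in classes — not torchao's actual module types or axolotl's real traversal logic:

```python
class FakeQuantLinear:
    """Toy stand-in for a QAT layer that fake-quantizes during forward."""

    def __init__(self, weight):
        self.weight = weight


class Linear:
    """Plain layer the QAT module is swapped back to after training."""

    def __init__(self, weight):
        self.weight = weight


def convert_qat_model(layers):
    """Replace every QAT layer with its plain equivalent in place,
    keeping the trained weights; non-QAT layers pass through untouched."""
    for name, layer in list(layers.items()):
        if isinstance(layer, FakeQuantLinear):
            layers[name] = Linear(layer.weight)
    return layers


# A "model" as a flat name->layer mapping, for illustration only.
model = {"q_proj": FakeQuantLinear([0.1, 0.2]), "norm": object()}
model = convert_qat_model(model)
```

After conversion no QAT wrappers remain, which is why the log message above directs users to run `axolotl quantize` as the separate, final quantization step.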