allura-org · aronson · Jan 24, 2026 · Jan 23, 2026 · Jan 23, 2026 · Jan 23, 2026
diff --git a/.gitignore b/.gitignore
@@ -58,6 +58,7 @@ checkpoints/
 train_data.jsonl
 *.ckpt
 *.pth
+*.jsonl
 
 # Logs
 *.log

diff --git a/config.example.yml → examples/lexifreak.yml b/config.example.yml → examples/lexifreak.yml
@@ -3,7 +3,7 @@ model_type: AutoModelForCausalLM
 tokenizer_type: AutoTokenizer
 
 datasets:
-  - path: brainrot.jsonl
+  - path: discord.parquet
     type: chat_template
     chat_template: tokenizer_default
     field_messages: messages
@@ -32,7 +32,7 @@ lora_target_modules:
   - up_proj
   - down_proj
 
-wandb_project: LexiFreak
+wandb_project: LexiFreakExample
 wandb_name: advance
 
 val_set_size: 0.01

diff --git a/examples/smollm.yml b/examples/smollm.yml
@@ -0,0 +1,82 @@
+base_model: mlx-community/SmolLM2-135M-Instruct  # org/name identifier of the model to start from
+model_type: AutoModelForCausalLM
+tokenizer_type: AutoTokenizer
+
+datasets:  # a single dataset entry
+  - path: train_data.jsonl  # matched by the repo's .gitignore patterns, so this file must be supplied locally
+    type: chat_template
+    chat_template: tokenizer_default
+    field_messages: messages  # presumably the record key that holds the message list — confirm against the loader
+    message_property_mappings:  # identity mapping: role -> role, content -> content
+      role: role
+      content: content
+    train_on_eos: "turn"
+    roles:  # each role maps to one source label with the same name
+      assistant:
+        - assistant
+      user:
+        - user
+
+load_in_4bit: false
+# adapter: qlora  (disabled; presumably only relevant together with load_in_4bit: true — confirm)
+lora_r: 16
+lora_alpha: 128
+lora_dropout: 0.1
+lora_target_modules:  # module names the LoRA settings above are applied to
+  - q_proj
+  - k_proj
+  - v_proj
+  - o_proj
+  - gate_proj
+  - up_proj
+  - down_proj
+
+wandb_project: SmolLMExample  # named after this example file, like LexiFreakExample in lexifreak.yml
+wandb_name: advance
+
+val_set_size: 0.01  # presumably the fraction (1%) of the data held out for validation — confirm
+evals_per_epoch: 10
+eval_sample_packing: true  # same setting as sample_packing elsewhere in this file
+eval_max_new_tokens: 128
+
+lora_modules_to_save:  # listed separately from lora_target_modules; presumably saved in full — confirm
+  - embed_tokens
+  - lm_head
+
+bf16: auto
+gradient_checkpointing_kwargs:  # options paired with gradient_checkpointing: true elsewhere in this file
+  use_reentrant: true
+resume_from_checkpoint: null  # explicit null instead of a bare key: no checkpoint path is set
+flash_attention: true
+
+gradient_accumulation_steps: 6
+gradient_checkpointing: true
+activation_offloading: true
+micro_batch_size: 1
+num_epochs: 5
+optimizer: paged_adamw_8bit
+lr_scheduler: cosine
+learning_rate: 1.0e-5  # decimal point required: YAML 1.1 loaders read a bare 1e-5 as the string "1e-5"
+warmup_ratio: 0.03
+dataset_prepared_path: ./last_run_prepared
+
+sequence_len: 2048
+pad_to_sequence_len: true
+sample_packing: true  # eval_sample_packing is enabled as well in this file
+
+output_dir: ./output
+save_steps: 10000
+logging_steps: 10
+save_safetensors: true
+plugins:  # two integration plugins, referenced by dotted class path
+  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
+  - axolotl.integrations.liger.LigerPlugin
+liger_rope: true  # every liger_* flag in this stanza is enabled; presumably consumed by LigerPlugin — confirm
+liger_rms_norm: true
+liger_glu_activation: true
+liger_layer_norm: true
+liger_fused_linear_cross_entropy: true  # NOTE(review): enabled next to CutCrossEntropyPlugin — confirm both are intended together
+
+special_tokens:  # ChatML end-of-turn marker used by SmolLM2-Instruct; the previous values were Llama-3 tokens
+  pad_token: "<|im_end|>"
+  eos_token: "<|im_end|>"
diff --git a/pyproject.toml b/pyproject.toml
@@ -7,9 +7,10 @@ name = "train-mlx"
 version = "0.1.0"
 description = "LLM training with MLX on Apple Silicon"
 authors = [
-    {name = "Sarah Aronson", email = "vagabond@pingas.org"}
+    {name = "Sarah Aronson", email = "vagabond@pingas.org"},
+    {name = "fizz~", email = "fizzarolli@riseup.net"}
 ]
-requires-python = ">=3.10"
+requires-python = ">=3.12"
 dependencies = [
     "mlx>=0.20.0",
     "mlx-lm>=0.29.1",
@@ -27,6 +28,12 @@ dependencies = [
 ]
 
 [project.optional-dependencies]
+cuda12 = [
+    "mlx[cuda12]>=0.20.0"
+]
+cuda13 = [
+    "mlx[cuda13]>=0.20.0"
+]
 dev = [
     "black>=24.0.0",
     "ruff>=0.3.0",
@@ -37,11 +44,11 @@ dev = [
 
 [tool.black]
 line-length = 100
-target-version = ['py310']
+target-version = ['py312']
 
 [tool.ruff]
 line-length = 100
-target-version = "py310"
+target-version = "py312"
 
 [tool.ruff.lint]
 select = [
-Original file line number
+Diff line change
@@ Expand Up / @@ -58,6 +58,7 @@ checkpoints/ @@
     train_data.jsonl
     *.ckpt
     *.pth
+    *.jsonl
     # Logs
     *.log
@@ Expand Down @@