Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ checkpoints/
train_data.jsonl
*.ckpt
*.pth
*.jsonl

# Logs
*.log
Expand Down
4 changes: 2 additions & 2 deletions config.example.yml → examples/lexifreak.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer

datasets:
- path: brainrot.jsonl
- path: discord.parquet
type: chat_template
chat_template: tokenizer_default
field_messages: messages
Expand Down Expand Up @@ -32,7 +32,7 @@ lora_target_modules:
- up_proj
- down_proj

wandb_project: LexiFreak
wandb_project: LexiFreakExample
wandb_name: advance

val_set_size: 0.01
Expand Down
82 changes: 82 additions & 0 deletions examples/smollm.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# Example fine-tuning configuration for the SmolLM2-135M-Instruct base model.
# Nesting below is reconstructed from key order; confirm against the
# consuming trainer's schema if any key is rejected.

# --- Model -------------------------------------------------------------
base_model: mlx-community/SmolLM2-135M-Instruct
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer

# --- Data --------------------------------------------------------------
datasets:
  - path: train_data.jsonl
    type: chat_template
    chat_template: tokenizer_default
    # Name of the record field that holds the list of chat messages.
    field_messages: messages
    # Maps the per-message keys in the dataset onto role/content.
    message_property_mappings:
      role: role
      content: content
    train_on_eos: "turn"
    # Dataset role names accepted for each canonical role.
    roles:
      assistant:
        - assistant
      user:
        - user

# --- Adapter / LoRA ----------------------------------------------------
load_in_4bit: false
# adapter: qlora
lora_r: 16
lora_alpha: 128
lora_dropout: 0.1
lora_target_modules:
  - q_proj
  - k_proj
  - v_proj
  - o_proj
  - gate_proj
  - up_proj
  - down_proj

# --- Experiment tracking -----------------------------------------------
wandb_project: SmolLLMExample
wandb_name: advance

# --- Evaluation --------------------------------------------------------
val_set_size: 0.01
evals_per_epoch: 10
eval_sample_packing: true
eval_max_new_tokens: 128

lora_modules_to_save:
  - embed_tokens
  - lm_head

# --- Precision / memory ------------------------------------------------
bf16: auto
gradient_checkpointing_kwargs:
  use_reentrant: true
# Explicit null: no checkpoint to resume from.
resume_from_checkpoint: null
flash_attention: true

# --- Optimisation ------------------------------------------------------
gradient_accumulation_steps: 6
gradient_checkpointing: true
activation_offloading: true
micro_batch_size: 1
num_epochs: 5
optimizer: paged_adamw_8bit
lr_scheduler: cosine
# Written with a decimal point so YAML 1.1 loaders (e.g. PyYAML) resolve it
# as a float; a bare `1e-5` is read as a string by those loaders.
learning_rate: 1.0e-5
warmup_ratio: 0.03
dataset_prepared_path: ./last_run_prepared

# --- Sequence handling -------------------------------------------------
sequence_len: 2048
pad_to_sequence_len: true
sample_packing: true

# --- Output ------------------------------------------------------------
output_dir: ./output
save_steps: 10000
logging_steps: 10
save_safetensors: true

# --- Plugins -----------------------------------------------------------
plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
  - axolotl.integrations.liger.LigerPlugin
liger_rope: true
liger_rms_norm: true
liger_glu_activation: true
liger_layer_norm: true
liger_fused_linear_cross_entropy: true

# --- Tokens ------------------------------------------------------------
# NOTE(review): these token strings look like Llama-3-style special tokens;
# confirm they exist in the base model's tokenizer before training.
special_tokens:
  pad_token: "<|finetune_right_pad_id|>"
  eos_token: "<|eot_id|>"
15 changes: 11 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,10 @@ name = "train-mlx"
version = "0.1.0"
description = "LLM training with MLX on Apple Silicon"
authors = [
{name = "Sarah Aronson", email = "vagabond@pingas.org"}
{name = "Sarah Aronson", email = "vagabond@pingas.org"},
{name = "fizz~", email = "fizzarolli@riseup.net"}
]
requires-python = ">=3.10"
requires-python = ">=3.12"
dependencies = [
"mlx>=0.20.0",
"mlx-lm>=0.29.1",
Expand All @@ -27,6 +28,12 @@ dependencies = [
]

[project.optional-dependencies]
cuda12 = [
"mlx[cuda12]>=0.20.0"
]
cuda13 = [
"mlx[cuda13]>=0.20.0"
]
dev = [
"black>=24.0.0",
"ruff>=0.3.0",
Expand All @@ -37,11 +44,11 @@ dev = [

[tool.black]
line-length = 100
target-version = ['py310']
target-version = ['py312']

[tool.ruff]
line-length = 100
target-version = "py310"
target-version = "py312"

[tool.ruff.lint]
select = [
Expand Down
Loading
Loading