# Workflow file for run: Sync Shampoo and GPA with facebookresearch/optimizers (#1196)

---
# CI workflow: exercises the distributed_shampoo example scripts on CPU and GPU
# (single-process and DDP) using uv-managed Python 3.12.
name: examples

on: [push, pull_request]

jobs:
  examples:
    name: "Python 3.12"
    # Larger runner with an NVIDIA T4 so the GPU steps below can actually
    # see a CUDA device; CPU-only steps mask it with CUDA_VISIBLE_DEVICES="".
    runs-on: 4-core-ubuntu-gpu-t4
    steps:
      - uses: actions/checkout@v4

      - name: Set up and update uv.
        run: |
          curl -LsSf https://astral.sh/uv/install.sh | sh
          source $HOME/.local/bin/env
          uv self update

      - name: Install Python.
        run: uv python install 3.12

      - name: Create venv and install the package.
        run: |
          uv venv && source .venv/bin/activate
          uv pip install ".[examples]"

      # NOTE(review): the Adam baseline does not pass epochs=1 like the other
      # steps — confirm whether it is meant to run the default epoch count.
      - name: Run single GPU example with Adam to serve as a baseline.
        run: |
          source .venv/bin/activate
          CUDA_VISIBLE_DEVICES="" python -m distributed_shampoo.examples.cifar10_example optimizer=adam batch_size=1024

      # One run per supported grafting config (AdaGrad, Adam, RMSprop, SGD),
      # all forced onto CPU.
      - name: Run single GPU examples with Distributed Shampoo and different graftings on CPU.
        run: |
          source .venv/bin/activate
          CUDA_VISIBLE_DEVICES="" python -m distributed_shampoo.examples.cifar10_example optimizer=shampoo optimizer.precondition_frequency=30 'optimizer.grafting_config={_target_:distributed_shampoo.AdaGradPreconditionerConfig,epsilon:1e-8}' epochs=1 batch_size=1024
          CUDA_VISIBLE_DEVICES="" python -m distributed_shampoo.examples.cifar10_example optimizer=shampoo optimizer.precondition_frequency=30 'optimizer.grafting_config={_target_:distributed_shampoo.AdamPreconditionerConfig,beta2:0.999,epsilon:1e-8}' epochs=1 batch_size=1024
          CUDA_VISIBLE_DEVICES="" python -m distributed_shampoo.examples.cifar10_example optimizer=shampoo optimizer.precondition_frequency=30 'optimizer.grafting_config={_target_:distributed_shampoo.RMSpropPreconditionerConfig,beta2:0.999,epsilon:1e-8}' epochs=1 batch_size=1024
          CUDA_VISIBLE_DEVICES="" python -m distributed_shampoo.examples.cifar10_example optimizer=shampoo optimizer.precondition_frequency=30 'optimizer.grafting_config={_target_:distributed_shampoo.SGDPreconditionerConfig}' epochs=1 batch_size=1024

      - name: Run single GPU example on GPU.
        run: |
          source .venv/bin/activate
          python -m distributed_shampoo.examples.cifar10_example optimizer=shampoo optimizer.precondition_frequency=30 'optimizer.grafting_config={_target_:distributed_shampoo.AdamPreconditionerConfig,beta2:0.999,epsilon:1e-8}' epochs=1 batch_size=1024

      # Two-process DDP on CPU requires the gloo backend.
      - name: Run DDP example on CPU.
        run: |
          source .venv/bin/activate
          CUDA_VISIBLE_DEVICES="" torchrun --standalone --nnodes=1 --nproc_per_node=2 -m distributed_shampoo.examples.cifar10_example parallelism=ddp optimizer=shampoo optimizer.precondition_frequency=15 'optimizer.grafting_config={_target_:distributed_shampoo.AdamPreconditionerConfig,beta2:0.999,epsilon:1e-8}' epochs=1 local_batch_size=1024 backend=gloo

      # Single process on GPU — the runner has one T4, so nproc_per_node=1.
      - name: Run DDP example on GPU.
        run: |
          source .venv/bin/activate
          torchrun --standalone --nnodes=1 --nproc_per_node=1 -m distributed_shampoo.examples.cifar10_example parallelism=ddp optimizer=shampoo optimizer.precondition_frequency=30 'optimizer.grafting_config={_target_:distributed_shampoo.AdamPreconditionerConfig,beta2:0.999,epsilon:1e-8}' epochs=1 local_batch_size=1024