96 changes: 96 additions & 0 deletions .github/workflows/install-test.yml
@@ -130,6 +130,54 @@ jobs:
python tests/core_ptl/check_imports.py --domain "$collection"
done

test-asr-install-linux-amd:
name: ubuntu-22.04-amd-py${{ matrix.python }}-asr
runs-on: ubuntu-22.04
strategy:
fail-fast: false
matrix:
python: ["3.10", "3.11", "3.12"]
steps:
- name: Checkout repo
uses: actions/checkout@v2

- name: Check disk space before cleanup
run: df -h

- name: Free up disk space
run: |
# Remove unnecessary packages and files on Ubuntu
sudo apt-get clean
sudo rm -rf /usr/local/lib/android || true
sudo rm -rf /opt/ghc || true
sudo rm -rf /usr/local/.ghcup || true
sudo rm -rf /usr/share/dotnet || true
sudo rm -rf /opt/az || true
# Clear pip and npm caches
pip cache purge || true
sudo npm cache clean --force || true

- name: Check disk space after cleanup
run: df -h

- name: Install Python
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python }}

- name: Install NeMo
run: |
pip install --no-cache-dir --upgrade pip
pip install --no-cache-dir ".[asr]"

- name: Check disk space after installation
run: df -h

- name: Run import checks
run: |
# Run import checks
python tests/core_ptl/check_imports.py --domain asr

test-installs-linux-arm:
name: ubuntu-22.04-arm-py${{ matrix.python }}-${{ matrix.installer }}
runs-on: ubuntu-22.04-arm
@@ -188,3 +236,51 @@ jobs:
for collection in "asr" "tts" "lightning" "core"; do
python tests/core_ptl/check_imports.py --domain "$collection"
done

test-asr-installs-linux-arm:
name: ubuntu-22.04-arm-py${{ matrix.python }}-asr
runs-on: ubuntu-22.04-arm
strategy:
fail-fast: false
matrix:
python: ["3.10", "3.11", "3.12"]
steps:
- name: Checkout repo
uses: actions/checkout@v2

- name: Check disk space before cleanup
run: df -h

- name: Free up disk space
run: |
# Remove unnecessary packages and files on Ubuntu ARM
sudo apt-get clean
sudo rm -rf /usr/local/lib/android || true
sudo rm -rf /opt/ghc || true
sudo rm -rf /usr/local/.ghcup || true
sudo rm -rf /usr/share/dotnet || true
sudo rm -rf /opt/az || true
# Clear pip and npm caches
pip cache purge || true
sudo npm cache clean --force || true

- name: Check disk space after cleanup
run: df -h

- name: Install Python
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python }}

- name: Install NeMo
run: |
pip install --no-cache-dir --upgrade pip
pip install --no-cache-dir ".[asr]"

- name: Check disk space after installation
run: df -h

- name: Run import checks
run: |
# Run import checks
python tests/core_ptl/check_imports.py --domain asr
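The new ASR-only jobs call the repository's `tests/core_ptl/check_imports.py` with a single `--domain asr` argument instead of looping over every collection. For orientation only, a minimal per-domain import check could look like the sketch below; the script name and `--domain` flag come from the workflow above, but the body is a hypothetical illustration, not the file's actual contents.

```python
# Hypothetical sketch of a per-domain import check; the real
# tests/core_ptl/check_imports.py in the repository may differ.
import argparse
import importlib
import sys

# Assumed mapping from workflow domain names to top-level packages.
DOMAIN_MODULES = {
    "asr": "nemo.collections.asr",
    "tts": "nemo.collections.tts",
    "lightning": "nemo.lightning",
    "core": "nemo.core",
}


def main() -> int:
    parser = argparse.ArgumentParser()
    parser.add_argument("--domain", required=True, choices=sorted(DOMAIN_MODULES))
    args = parser.parse_args()
    try:
        importlib.import_module(DOMAIN_MODULES[args.domain])
    except Exception as err:  # any failure here indicates a broken install
        print(f"Import check failed for '{args.domain}': {err}")
        return 1
    print(f"Import check passed for '{args.domain}'")
    return 0


if __name__ == "__main__":
    sys.exit(main())
```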
15 changes: 13 additions & 2 deletions nemo/lightning/fabric/strategies.py
@@ -42,8 +42,19 @@
from lightning.pytorch.loops.fetchers import _DataFetcher
from lightning.pytorch.plugins.io.wrapper import _WrappingCheckpointIO
from lightning.pytorch.utilities.combined_loader import CombinedLoader
from megatron.core.distributed import DistributedDataParallelConfig
from megatron.core.optimizer import OptimizerConfig

try:
from megatron.core.distributed import DistributedDataParallelConfig
from megatron.core.optimizer import OptimizerConfig

HAVE_MEGATRON_CORE = True

except (ImportError, ModuleNotFoundError):

DistributedDataParallelConfig = object
OptimizerConfig = object
HAVE_MEGATRON_CORE = False

from torch import Tensor, nn
from torch.distributed.algorithms.ddp_comm_hooks.debugging_hooks import noop_hook
from torch.nn import Module
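With `megatron.core` now optional in this module, downstream code is expected to consult `HAVE_MEGATRON_CORE` (or rely on the `object` placeholders) before touching Megatron-specific configuration. A minimal sketch of that call-site pattern, using a hypothetical `build_ddp_config` helper that is not part of this PR:

```python
# Sketch of guarding a Megatron-only code path behind the optional import.
# `build_ddp_config` is a hypothetical helper used only for illustration.
try:
    from megatron.core.distributed import DistributedDataParallelConfig

    HAVE_MEGATRON_CORE = True
except (ImportError, ModuleNotFoundError):
    DistributedDataParallelConfig = object  # placeholder keeps annotations importable
    HAVE_MEGATRON_CORE = False


def build_ddp_config(**kwargs) -> DistributedDataParallelConfig:
    if not HAVE_MEGATRON_CORE:
        raise ImportError(
            "megatron.core is required for DDP configuration; install it to use this path."
        )
    return DistributedDataParallelConfig(**kwargs)
```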
31 changes: 20 additions & 11 deletions nemo/lightning/io/pl.py
@@ -22,17 +22,26 @@
from lightning.fabric.plugins import CheckpointIO
from lightning.fabric.utilities.cloud_io import get_filesystem
from lightning.fabric.utilities.types import _PATH
from megatron.core.dist_checkpointing.serialization import (
get_default_load_sharded_strategy,
get_default_save_sharded_strategy,
)
from megatron.core.dist_checkpointing.strategies.base import SaveShardedStrategy
from megatron.core.dist_checkpointing.strategies.fully_parallel import (
FullyParallelLoadStrategyWrapper,
FullyParallelSaveStrategyWrapper,
)
from megatron.core.dist_checkpointing.strategies.torch import TorchDistSaveShardedStrategy
from megatron.core.parallel_state import get_data_parallel_group

try:
from megatron.core.dist_checkpointing.serialization import (
get_default_load_sharded_strategy,
get_default_save_sharded_strategy,
)
from megatron.core.dist_checkpointing.strategies.base import SaveShardedStrategy
from megatron.core.dist_checkpointing.strategies.fully_parallel import (
FullyParallelLoadStrategyWrapper,
FullyParallelSaveStrategyWrapper,
)
from megatron.core.dist_checkpointing.strategies.torch import TorchDistSaveShardedStrategy
from megatron.core.parallel_state import get_data_parallel_group

HAVE_MEGATRON_CORE = True

except (ImportError, ModuleNotFoundError):

HAVE_MEGATRON_CORE = False

from torch import nn
from typing_extensions import Self, override

20 changes: 11 additions & 9 deletions nemo/lightning/megatron_init.py
@@ -60,15 +60,17 @@

except (ImportError, ModuleNotFoundError):
logging.warning("Megatron num_microbatches_calculator not found, using Apex version.")
from apex.transformer.microbatches import ConstantNumMicroBatches as ConstantNumMicroBatchesCalculator
from apex.transformer.pipeline_parallel.utils import (
get_current_global_batch_size,
get_micro_batch_size,
get_num_microbatches,
)
from apex.transformer.pipeline_parallel.utils import (
setup_microbatch_calculator as init_num_microbatches_calculator,
)

if HAVE_APEX:
from apex.transformer.microbatches import ConstantNumMicroBatches as ConstantNumMicroBatchesCalculator
from apex.transformer.pipeline_parallel.utils import (
get_current_global_batch_size,
get_micro_batch_size,
get_num_microbatches,
)
from apex.transformer.pipeline_parallel.utils import (
setup_microbatch_calculator as init_num_microbatches_calculator,
)

MCORE_MB_CALCULATOR = False

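Guarding the Apex fallback behind `HAVE_APEX` lets this module import cleanly even when neither Megatron-Core's microbatch calculator nor Apex is installed. The sketch below compresses that layered fallback into a self-contained form; it follows the import paths shown in the diff but abbreviates the rest of `megatron_init.py`.

```python
# Condensed, self-contained sketch of the layered fallback; the real module
# imports several more symbols and logs a warning when falling back to Apex.
try:
    from apex.transformer.pipeline_parallel.utils import (
        get_num_microbatches as _apex_get_num_microbatches,
    )

    HAVE_APEX = True
except (ImportError, ModuleNotFoundError):
    HAVE_APEX = False

try:
    from megatron.core.num_microbatches_calculator import get_num_microbatches

    MCORE_MB_CALCULATOR = True
except (ImportError, ModuleNotFoundError):
    if HAVE_APEX:
        get_num_microbatches = _apex_get_num_microbatches
    MCORE_MB_CALCULATOR = False
```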
23 changes: 17 additions & 6 deletions nemo/lightning/megatron_parallel.py
@@ -50,12 +50,23 @@
import torch.distributed
from lightning.pytorch.trainer.states import TrainerFn
from lightning.pytorch.utilities import move_data_to_device
from megatron.core import parallel_state
from megatron.core.distributed import DistributedDataParallel as McoreDDP
from megatron.core.distributed import DistributedDataParallelConfig
from megatron.core.optimizer import OptimizerConfig
from megatron.core.transformer.moe.moe_utils import get_moe_layer_wise_logging_tracker
from megatron.core.transformer.transformer_config import TransformerConfig

try:
from megatron.core import parallel_state
from megatron.core.distributed import DistributedDataParallel as McoreDDP
from megatron.core.distributed import DistributedDataParallelConfig
from megatron.core.optimizer import OptimizerConfig
from megatron.core.transformer.moe.moe_utils import get_moe_layer_wise_logging_tracker
from megatron.core.transformer.transformer_config import TransformerConfig

HAVE_MEGATRON_CORE = True
except (ImportError, ModuleNotFoundError):

McoreDDP = object
DistributedDataParallelConfig = object
TransformerConfig = object
HAVE_MEGATRON_CORE = False

from torch import Tensor, nn
from typing_extensions import override

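Binding the missing classes to `object` keeps type annotations and module-level references resolvable at import time; only constructing or subclassing the real Megatron types still requires the dependency. A small sketch of why the placeholder matters, using a hypothetical `MegatronWrapper` class that is not part of this PR:

```python
# Illustration of the `X = object` fallback pattern; `MegatronWrapper`
# is hypothetical and only shows why the placeholder alias is needed.
from typing import Optional

try:
    from megatron.core.transformer.transformer_config import TransformerConfig

    HAVE_MEGATRON_CORE = True
except (ImportError, ModuleNotFoundError):
    TransformerConfig = object  # without this alias, the annotation below would raise NameError
    HAVE_MEGATRON_CORE = False


class MegatronWrapper:
    def __init__(self, config: Optional[TransformerConfig] = None) -> None:
        # The annotation resolves either way; actually passing a config
        # still requires megatron.core to be installed.
        if config is not None and not HAVE_MEGATRON_CORE:
            raise ImportError("megatron.core is required to pass a TransformerConfig")
        self.config = config
```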
10 changes: 9 additions & 1 deletion nemo/lightning/pytorch/callbacks/ddp_parity_checker.py
@@ -16,7 +16,15 @@

import torch
from lightning.pytorch.callbacks.callback import Callback
from megatron.core.utils import check_param_hashes_across_dp_replicas

try:
from megatron.core.utils import check_param_hashes_across_dp_replicas

HAVE_MEGATRON_CORE = True

except (ImportError, ModuleNotFoundError):

HAVE_MEGATRON_CORE = False

from nemo.lightning import io
from nemo.utils import logging
11 changes: 10 additions & 1 deletion nemo/lightning/pytorch/callbacks/progress_printer.py
@@ -17,7 +17,16 @@

from lightning.pytorch.callbacks.progress import ProgressBar
from lightning.pytorch.utilities.types import STEP_OUTPUT
from megatron.core.num_microbatches_calculator import get_num_microbatches

try:
from megatron.core.num_microbatches_calculator import get_num_microbatches

HAVE_MEGATRON_CORE = True

except (ImportError, ModuleNotFoundError):

HAVE_MEGATRON_CORE = False

from typing_extensions import override


16 changes: 13 additions & 3 deletions nemo/lightning/pytorch/optim/megatron.py
@@ -15,9 +15,19 @@
from typing import Callable, List, Optional

import lightning.pytorch as pl
from megatron.core.distributed import finalize_model_grads
from megatron.core.optimizer import OptimizerConfig
from megatron.core.utils import get_model_config

try:
from megatron.core.distributed import finalize_model_grads
from megatron.core.optimizer import OptimizerConfig
from megatron.core.utils import get_model_config

HAVE_MEGATRON_CORE = True

except (ImportError, ModuleNotFoundError):

OptimizerConfig = object
HAVE_MEGATRON_CORE = False

from torch.optim import Optimizer

from nemo.lightning._strategy_lib import setup_megatron_optimizer
12 changes: 11 additions & 1 deletion nemo/lightning/pytorch/strategies/fsdp_strategy.py
@@ -29,7 +29,17 @@
from lightning.pytorch.strategies.fsdp import FSDPStrategy as PLFSDPStrategy
from lightning.pytorch.trainer.states import TrainerFn
from lightning.pytorch.utilities.types import STEP_OUTPUT
from megatron.core.transformer.transformer_layer import TransformerLayer

try:
from megatron.core.transformer.transformer_layer import TransformerLayer

HAVE_MEGATRON_CORE = True

except (ImportError, ModuleNotFoundError):

TransformerLayer = object
HAVE_MEGATRON_CORE = False

from torch.distributed.checkpoint.state_dict import ( # get_state_dict,
StateDictOptions,
get_optimizer_state_dict,
19 changes: 14 additions & 5 deletions nemo/lightning/pytorch/strategies/megatron_strategy.py
@@ -52,11 +52,20 @@
from lightning.pytorch.strategies.ddp import DDPStrategy
from lightning.pytorch.trainer.states import RunningStage, TrainerFn
from lightning.pytorch.utilities.types import STEP_OUTPUT
from megatron.core import Timers
from megatron.core.dist_checkpointing.validation import StrictHandling
from megatron.core.distributed import DistributedDataParallelConfig
from megatron.core.optimizer import OptimizerConfig
from megatron.core.utils import get_torch_version, is_torch_min_version

try:
from megatron.core import Timers
from megatron.core.dist_checkpointing.validation import StrictHandling
from megatron.core.distributed import DistributedDataParallelConfig
from megatron.core.optimizer import OptimizerConfig
from megatron.core.utils import get_torch_version, is_torch_min_version

HAVE_MEGATRON_CORE = True
except (ImportError, ModuleNotFoundError):

DistributedDataParallelConfig = object
HAVE_MEGATRON_CORE = False

from torch import nn
from torch.distributed.algorithms.ddp_comm_hooks.debugging_hooks import noop_hook
from torch.distributed.checkpoint.utils import CheckpointException
21 changes: 17 additions & 4 deletions nemo/lightning/pytorch/strategies/utils.py
@@ -24,10 +24,23 @@
import torch
from lightning.fabric.plugins import ClusterEnvironment
from lightning.pytorch.callbacks import TQDMProgressBar
from megatron.core import parallel_state
from megatron.core.dist_checkpointing.mapping import ShardedBase, ShardedObject, ShardedTensor
from megatron.core.dist_checkpointing.strategies.torch import sharded_tensor_to_torch_sharded_tensor
from megatron.core.transformer.utils import _get_extra_state_offsets

try:
from megatron.core import parallel_state
from megatron.core.dist_checkpointing.mapping import ShardedBase, ShardedObject, ShardedTensor
from megatron.core.dist_checkpointing.strategies.torch import sharded_tensor_to_torch_sharded_tensor
from megatron.core.transformer.utils import _get_extra_state_offsets

HAVE_MEGATRON_CORE = True

except (ImportError, ModuleNotFoundError):

ShardedObject = object
ShardedBase = object
ShardedTensor = object
HAVE_MEGATRON_CORE = False


from torch import Tensor, nn
from torch.distributed._sharded_tensor import ShardedTensor as TorchShardedTensor
from torch.distributed._tensor import DTensor, Replicate, Shard