Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/integration_test_8gpu_features.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ jobs:
python3 scripts/loss_compare.py . . --baseline-options="${baseline_options}" --test-options="${test_options}" --job-dump-folder="${RUNNER_TEMP}/artifacts-to-be-uploaded/accuracy_comparison_outputs" --assert-equal --steps=10 --import-result tests/assets/losses/llama3.txt
rm -rf $RUNNER_TEMP/artifacts-to-be-uploaded/*

sleep 3600
python -m tests.integration_tests.run_tests --gpu_arch_type ${{ matrix.gpu-arch-type }} --test_suite features $RUNNER_TEMP/artifacts-to-be-uploaded --ngpu 8

# Clean up the checkpoints so that we don't waste network bandwidth and time.
Expand Down
4 changes: 4 additions & 0 deletions run_train.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@ TRAIN_FILE=${TRAIN_FILE:-"torchtitan.train"}
COMM_MODE=${COMM_MODE:-""}

TORCHFT_LIGHTHOUSE=${TORCHFT_LIGHTHOUSE:-"http://localhost:29510"}
export NCCL_NVLS_ENABLE=0
export NVSHMEM_DISABLE_NVLS=0
export TORCH_SHOW_CPP_STACKTRACES=1
export TORCH_CPP_LOG_LEVEL=INFO

if [ -n "$COMM_MODE" ]; then
# Communication mode specified: validate configuration or run in debug mode
Expand Down
2 changes: 0 additions & 2 deletions tests/integration_tests/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,6 @@ def build_features_test_list() -> list[OverrideDefinitions]:
"2D compile",
"2d_compile",
),
# TODO: re-enable this test once the async TP CI issue is fixed
OverrideDefinitions(
[
[
Expand All @@ -89,7 +88,6 @@ def build_features_test_list() -> list[OverrideDefinitions]:
],
"2D async TP compile",
"2d_asynctp_compile",
disabled=True,
),
OverrideDefinitions(
[
Expand Down
4 changes: 0 additions & 4 deletions tests/integration_tests/h100.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ def build_h100_tests_list() -> list[OverrideDefinitions]:
same root config file.
"""
integration_tests_flavors = [
# TODO: re-enable this test once the async TP issue is fixed
OverrideDefinitions(
[
[
Expand All @@ -30,7 +29,6 @@ def build_h100_tests_list() -> list[OverrideDefinitions]:
],
"2D async TP compile",
"2d_asynctp_compile",
disabled=True,
),
OverrideDefinitions(
[
Expand All @@ -43,7 +41,6 @@ def build_h100_tests_list() -> list[OverrideDefinitions]:
"Float8 test",
"float8",
),
# TODO: re-enable this test once the async TP issue is fixed
OverrideDefinitions(
[
[
Expand All @@ -60,7 +57,6 @@ def build_h100_tests_list() -> list[OverrideDefinitions]:
"FSDP+async TP+PP+torch.compile+Float8",
"fsdp+tp+cp+compile+float8",
ngpu=8,
disabled=True,
),
OverrideDefinitions(
[
Expand Down
2 changes: 2 additions & 0 deletions torchtitan/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -533,6 +533,7 @@ def forward_backward_step(
with self.train_context(optional_context_parallel_ctx):
assert len(model_parts) == 1
with self.maybe_enable_amp:
logger.warning(f"Current Device {torch.cuda.current_device()}")
pred = model_parts[0](inputs, **extra_inputs, **extra_kwargs)
loss = self.loss_fn(pred, labels)
# need to free pred before bwd to avoid peaking memory
Expand All @@ -557,6 +558,7 @@ def train_step(
# entire step will not be executed.
for _microbatch in range(self.gradient_accumulation_steps):
input_dict, labels = next(data_iterator)
logger.warning(f"Current Device {torch.cuda.current_device()}")
loss = self.forward_backward_step(input_dict, labels)
accumulated_losses.append(loss.detach())

Expand Down
Loading