diff --git a/run_train.sh b/run_train.sh
index 87558a782d..4a63085fad 100755
--- a/run_train.sh
+++ b/run_train.sh
@@ -20,6 +20,8 @@
 TRAIN_FILE=${TRAIN_FILE:-"torchtitan.train"}
 COMM_MODE=${COMM_MODE:-""}
 TORCHFT_LIGHTHOUSE=${TORCHFT_LIGHTHOUSE:-"http://localhost:29510"}
+export NCCL_NVLS_ENABLE=0
+export NVSHMEM_DISABLE_NVLS=1
 
 if [ -n "$COMM_MODE" ]; then
     # Communication mode specified: validate configuration or run in debug mode
diff --git a/tests/integration_tests/features.py b/tests/integration_tests/features.py
index fe51ab7cf7..2db7953077 100755
--- a/tests/integration_tests/features.py
+++ b/tests/integration_tests/features.py
@@ -78,7 +78,6 @@ def build_features_test_list() -> list[OverrideDefinitions]:
             "2D compile",
             "2d_compile",
         ),
-        # TODO: re-enable this test once the async TP CI issue is fixed
         OverrideDefinitions(
             [
                 [
@@ -89,7 +88,6 @@ def build_features_test_list() -> list[OverrideDefinitions]:
             ],
             "2D async TP compile",
             "2d_asynctp_compile",
-            disabled=True,
         ),
         OverrideDefinitions(
             [
diff --git a/tests/integration_tests/h100.py b/tests/integration_tests/h100.py
index cd75548e56..05ea1078ca 100755
--- a/tests/integration_tests/h100.py
+++ b/tests/integration_tests/h100.py
@@ -19,7 +19,6 @@ def build_h100_tests_list() -> list[OverrideDefinitions]:
     same root config file.
     """
     integration_tests_flavors = [
-        # TODO: re-enable this test once the async TP issue is fixed
         OverrideDefinitions(
             [
                 [
@@ -30,7 +29,6 @@ def build_h100_tests_list() -> list[OverrideDefinitions]:
             ],
             "2D async TP compile",
             "2d_asynctp_compile",
-            disabled=True,
         ),
         OverrideDefinitions(
             [
@@ -43,7 +41,6 @@ def build_h100_tests_list() -> list[OverrideDefinitions]:
             "Float8 test",
             "float8",
         ),
-        # TODO: re-enable this test once the async TP issue is fixed
        OverrideDefinitions(
             [
                 [
@@ -60,7 +57,6 @@ def build_h100_tests_list() -> list[OverrideDefinitions]:
             "FSDP+async TP+PP+torch.compile+Float8",
             "fsdp+tp+cp+compile+float8",
             ngpu=8,
-            disabled=True,
         ),
         OverrideDefinitions(
             [