pytorch · fegin · Dec 8, 2025 · Dec 9, 2025 · Dec 9, 2025 · Dec 9, 2025
@@ -97,6 +97,7 @@ jobs:
         python3 scripts/loss_compare.py . . --baseline-options="${baseline_options}" --test-options="${test_options}" --job-dump-folder="${RUNNER_TEMP}/artifacts-to-be-uploaded/accuracy_comparison_outputs" --assert-equal --steps=10  --import-result tests/assets/losses/llama3.txt
         rm -rf $RUNNER_TEMP/artifacts-to-be-uploaded/*
 
+        sleep 3600
         python -m tests.integration_tests.run_tests --gpu_arch_type ${{ matrix.gpu-arch-type }} --test_suite features $RUNNER_TEMP/artifacts-to-be-uploaded --ngpu 8
 
         # Cleanup the checkpoints so that we don't waste network bandwidth and time.

@@ -20,6 +20,10 @@ TRAIN_FILE=${TRAIN_FILE:-"torchtitan.train"}
 COMM_MODE=${COMM_MODE:-""}
 
 TORCHFT_LIGHTHOUSE=${TORCHFT_LIGHTHOUSE:-"http://localhost:29510"}
+export NCCL_NVLS_ENABLE=0  # debug: turn off NVLink SHARP (NVLS) in NCCL
+export NVSHMEM_DISABLE_NVLS=1  # debug: turn off NVLS in NVSHMEM as well; "=0" is the default and left it enabled
+export TORCH_SHOW_CPP_STACKTRACES=1  # include C++ frames in PyTorch error traces
+export TORCH_CPP_LOG_LEVEL=INFO  # raise PyTorch C++ (c10) log verbosity to INFO
 
 if [ -n "$COMM_MODE" ]; then
     # Communication mode specified: validate configuration or run in debug mode

@@ -78,7 +78,6 @@ def build_features_test_list() -> list[OverrideDefinitions]:
             "2D compile",
             "2d_compile",
         ),
-        # TODO: re-enable this test once the async TP CI issue is fixed
         OverrideDefinitions(
             [
                 [
@@ -89,7 +88,6 @@ def build_features_test_list() -> list[OverrideDefinitions]:
             ],
             "2D async TP compile",
             "2d_asynctp_compile",
-            disabled=True,
         ),
         OverrideDefinitions(
             [

@@ -19,7 +19,6 @@ def build_h100_tests_list() -> list[OverrideDefinitions]:
     same root config file.
     """
     integration_tests_flavors = [
-        # TODO: re-enable this test once the async TP issue is fixed
         OverrideDefinitions(
             [
                 [
@@ -30,7 +29,6 @@ def build_h100_tests_list() -> list[OverrideDefinitions]:
             ],
             "2D async TP compile",
             "2d_asynctp_compile",
-            disabled=True,
         ),
         OverrideDefinitions(
             [
@@ -43,7 +41,6 @@ def build_h100_tests_list() -> list[OverrideDefinitions]:
             "Float8 test",
             "float8",
         ),
-        # TODO: re-enable this test once the async TP issue is fixed
         OverrideDefinitions(
             [
                 [
@@ -60,7 +57,6 @@ def build_h100_tests_list() -> list[OverrideDefinitions]:
             "FSDP+async TP+PP+torch.compile+Float8",
             "fsdp+tp+cp+compile+float8",
             ngpu=8,
-            disabled=True,
         ),
         OverrideDefinitions(
             [

@@ -533,6 +533,7 @@ def forward_backward_step(
             with self.train_context(optional_context_parallel_ctx):
                 assert len(model_parts) == 1
                 with self.maybe_enable_amp:
+                    logger.warning(f"Current Device {torch.cuda.current_device()}")
                     pred = model_parts[0](inputs, **extra_inputs, **extra_kwargs)
                     loss = self.loss_fn(pred, labels)
                 # need to free pred before bwd to avoid peaking memory
@@ -557,6 +558,7 @@ def train_step(
         # entire step will not be executed.
         for _microbatch in range(self.gradient_accumulation_steps):
             input_dict, labels = next(data_iterator)
+            logger.warning(f"Current Device {torch.cuda.current_device()}")
             loss = self.forward_backward_step(input_dict, labels)
             accumulated_losses.append(loss.detach())