diff --git a/run_train.sh b/run_train.sh
index 87558a782d..4a63085fad 100755
--- a/run_train.sh
+++ b/run_train.sh
@@ -20,6 +20,8 @@
 TRAIN_FILE=${TRAIN_FILE:-"torchtitan.train"}
 COMM_MODE=${COMM_MODE:-""}
 TORCHFT_LIGHTHOUSE=${TORCHFT_LIGHTHOUSE:-"http://localhost:29510"}
+export NCCL_NVLS_ENABLE=0
+export NVSHMEM_DISABLE_NVLS=1
 
 if [ -n "$COMM_MODE" ]; then
     # Communication mode specified: validate configuration or run in debug mode
diff --git a/tests/integration_tests/features.py b/tests/integration_tests/features.py
index fe51ab7cf7..2db7953077 100755
--- a/tests/integration_tests/features.py
+++ b/tests/integration_tests/features.py
@@ -78,7 +78,6 @@ def build_features_test_list() -> list[OverrideDefinitions]:
             "2D compile",
             "2d_compile",
         ),
-        # TODO: re-enable this test once the async TP CI issue is fixed
         OverrideDefinitions(
             [
                 [
@@ -89,7 +88,6 @@ def build_features_test_list() -> list[OverrideDefinitions]:
             ],
             "2D async TP compile",
             "2d_asynctp_compile",
-            disabled=True,
         ),
         OverrideDefinitions(
             [
diff --git a/tests/integration_tests/h100.py b/tests/integration_tests/h100.py
index cd75548e56..05ea1078ca 100755
--- a/tests/integration_tests/h100.py
+++ b/tests/integration_tests/h100.py
@@ -19,7 +19,6 @@ def build_h100_tests_list() -> list[OverrideDefinitions]:
     same root config file.
     """
     integration_tests_flavors = [
-        # TODO: re-enable this test once the async TP issue is fixed
         OverrideDefinitions(
             [
                 [
@@ -30,7 +29,6 @@ def build_h100_tests_list() -> list[OverrideDefinitions]:
             ],
             "2D async TP compile",
             "2d_asynctp_compile",
-            disabled=True,
         ),
         OverrideDefinitions(
             [
@@ -43,7 +41,6 @@ def build_h100_tests_list() -> list[OverrideDefinitions]:
             "Float8 test",
             "float8",
         ),
-        # TODO: re-enable this test once the async TP issue is fixed
        OverrideDefinitions(
             [
                 [
@@ -60,7 +57,6 @@ def build_h100_tests_list() -> list[OverrideDefinitions]:
             "FSDP+async TP+PP+torch.compile+Float8",
             "fsdp+tp+cp+compile+float8",
             ngpu=8,
-            disabled=True,
         ),
         OverrideDefinitions(
             [