pytorch · vfdev-5 · Jan 6, 2023 · Jan 6, 2023 · Jan 6, 2023 · Jan 6, 2023
diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
@@ -36,19 +36,21 @@ jobs:
       fail-fast: false
       matrix:
         os: [ubuntu-latest]
-        python-version: [3.7, 3.8, 3.9, "3.10"]
-        pytorch-channel: [pytorch, pytorch-nightly]
-        include:
-          # includes a single build on windows
-          - os: windows-latest
-            pytorch-channel: pytorch
-            python-version: 3.8
-            skip-distrib-tests: 1
-          # includes a single build on macosx
-          - os: macos-latest
-            pytorch-channel: pytorch
-            python-version: 3.8
-            skip-distrib-tests: 1
+        # python-version: [3.7, 3.8, 3.9, "3.10"]
+        # pytorch-channel: [pytorch, pytorch-nightly]
+        python-version: ["3.10"]
+        pytorch-channel: [pytorch-nightly]
+        # include:
+        #   # includes a single build on windows
+        #   - os: windows-latest
+        #     pytorch-channel: pytorch
+        #     python-version: 3.8
+        #     skip-distrib-tests: 1
+        #   # includes a single build on macosx
+        #   - os: macos-latest
+        #     pytorch-channel: pytorch
+        #     python-version: 3.8
+        #     skip-distrib-tests: 1
 
     steps:
       - uses: actions/checkout@v3
@@ -90,93 +92,104 @@ jobs:
           pip install -r requirements-dev.txt
           python setup.py install
 
-      - name: Check code formatting
-        run: |
-          bash ./tests/run_code_style.sh install
-          bash ./tests/run_code_style.sh lint
+      # - name: Check code formatting
+      #   run: |
+      #     bash ./tests/run_code_style.sh install
+      #     bash ./tests/run_code_style.sh lint
 
-      - name: Run Mypy
-        # https://github.com/pytorch/ignite/pull/2780
-        # 
-        if: ${{ matrix.os == 'ubuntu-latest' && matrix.pytorch-channel == 'pytorch-nightly'}}
-        run: |
-          bash ./tests/run_code_style.sh mypy
+      # - name: Run Mypy
+      #   # https://github.com/pytorch/ignite/pull/2780
+      #   #
+      #   if: ${{ matrix.os == 'ubuntu-latest' && matrix.pytorch-channel == 'pytorch-nightly'}}
+      #   run: |
+      #     bash ./tests/run_code_style.sh mypy
 
       # Download MNIST: https://github.com/pytorch/ignite/issues/1737
       # to "/tmp" for unit tests
-      - name: Download MNIST
-        uses: pytorch-ignite/download-mnist-github-action@master
+      # - name: Download MNIST
+      #   uses: pytorch-ignite/download-mnist-github-action@master
+      #   with:
+      #     target_dir: /tmp
+
+      # # Copy MNIST to "." for the examples
+      # - name: Copy MNIST
+      #   run: |
+      #     cp -R /tmp/MNIST .
+
+      # -- TEMPORARY debug step (interactive tmate session): remove before merging
+      - name: Setup tmate session
+        uses: mxschmitt/action-tmate@v3
         with:
-          target_dir: /tmp
-
-      # Copy MNIST to "." for the examples
-      - name: Copy MNIST
-        run: |
-          cp -R /tmp/MNIST .
+          limit-access-to-actor: true
+        timeout-minutes: 15  # step-level key: action-tmate has no `timeout-minutes` input under `with`
+      # -- END of temporary debug step
 
       - name: Run Tests
         run: |
-          SKIP_DISTRIB_TESTS=${{ matrix.skip-distrib-tests }} bash tests/run_cpu_tests.sh
-
-      - name: Upload coverage to Codecov
-        uses: codecov/codecov-action@v3
-        with:
-          file: ./coverage.xml
-          flags: cpu
-          fail_ci_if_error: false
-
-      - name: Run MNIST Examples
-        run: |
-          # MNIST
-          # 1) mnist.py
-          python examples/mnist/mnist.py --epochs=1
-
-      - name: Run MNIST with loggers Examples
-        if: ${{ matrix.os == 'ubuntu-latest' }}
-        run: |
-          # 2) mnist_with_visdom.py
-          python -c "from visdom.server.build import download_scripts; download_scripts()" # download scripts : https://github.com/facebookresearch/visdom/blob/master/py/server.py#L929
-          python -m visdom.server &
-          sleep 10
-          python examples/mnist/mnist_with_visdom.py --epochs=1
-          kill %1
-          # 3.1) mnist_with_tensorboard.py with tbX
-          python examples/mnist/mnist_with_tensorboard.py --epochs=1
-          # 3.2) mnist_with_tensorboard.py with native torch tb
-          pip uninstall -y tensorboardX
-          python examples/mnist/mnist_with_tensorboard.py --epochs=1
-
-      - name: Run MNIST Example With Crash
-        if: ${{ matrix.os == 'ubuntu-latest' }}
-        continue-on-error: true
-        run: |
-          # 4) mnist_save_resume_engine.py
-          python examples/mnist/mnist_save_resume_engine.py --epochs=2 --crash_iteration 1100
-
-      - name: Resume MNIST from previous crash
-        if: ${{ matrix.os == 'ubuntu-latest' }}
-        run: |
-          python examples/mnist/mnist_save_resume_engine.py --epochs=2 --resume_from=/tmp/mnist_save_resume/checkpoint_1.pt
-
-      - name: Run GAN example
-        if: ${{ matrix.os == 'ubuntu-latest' }}
-        run: |
-          # DCGAN
-          python examples/gan/dcgan.py --dataset fake --dataroot /tmp/fakedata --output-dir /tmp/outputs-dcgan --batch-size 2 --epochs 2  --workers 0
-
-      - name: Run RL Examples
-        if: ${{ matrix.os == 'ubuntu-latest' }}
-        run: |
-          # RL
-          # 1) Actor-Critic
-          python examples/reinforcement_learning/actor_critic.py --max-episodes=2
-          # 2) Reinforce
-          python examples/reinforcement_learning/reinforce.py --max-episodes=2
-
-      - name: Run Neural Style Example
-        if: ${{ matrix.os == 'ubuntu-latest' }}
-        run: |
-          #fast-neural-style
-          #train
-          mkdir -p ~/.cache/torch/checkpoints/ && wget "https://download.pytorch.org/models/vgg16-397923af.pth" -O ~/.cache/torch/checkpoints/vgg16-397923af.pth
-          python examples/fast_neural_style/neural_style.py train --epochs 1 --cuda 0 --dataset test --dataroot . --image_size 32 --style_image examples/fast_neural_style/images/style_images/mosaic.jpg --style_size 32
+          # SKIP_DISTRIB_TESTS=${{ matrix.skip-distrib-tests }} bash tests/run_cpu_tests.sh
+          # SKIP_DISTRIB_TESTS=${{ matrix.skip-distrib-tests }} bash tests/run_cpu_tests.sh "distributed"
+          # SKIP_DISTRIB_TESTS=${{ matrix.skip-distrib-tests }} bash tests/run_cpu_tests.sh "test_idist_parallel_spawn_n_procs_native"
+          CUDA_VISIBLE_DEVICES= pytest -vvv tests -s -k test_idist_parallel_spawn_n_procs_native
+
+      # - name: Upload coverage to Codecov
+      #   uses: codecov/codecov-action@v3
+      #   with:
+      #     file: ./coverage.xml
+      #     flags: cpu
+      #     fail_ci_if_error: false
+
+      # - name: Run MNIST Examples
+      #   run: |
+      #     # MNIST
+      #     # 1) mnist.py
+      #     python examples/mnist/mnist.py --epochs=1
+
+      # - name: Run MNIST with loggers Examples
+      #   if: ${{ matrix.os == 'ubuntu-latest' }}
+      #   run: |
+      #     # 2) mnist_with_visdom.py
+      #     python -c "from visdom.server.build import download_scripts; download_scripts()" # download scripts : https://github.com/facebookresearch/visdom/blob/master/py/server.py#L929
+      #     python -m visdom.server &
+      #     sleep 10
+      #     python examples/mnist/mnist_with_visdom.py --epochs=1
+      #     kill %1
+      #     # 3.1) mnist_with_tensorboard.py with tbX
+      #     python examples/mnist/mnist_with_tensorboard.py --epochs=1
+      #     # 3.2) mnist_with_tensorboard.py with native torch tb
+      #     pip uninstall -y tensorboardX
+      #     python examples/mnist/mnist_with_tensorboard.py --epochs=1
+
+      # - name: Run MNIST Example With Crash
+      #   if: ${{ matrix.os == 'ubuntu-latest' }}
+      #   continue-on-error: true
+      #   run: |
+      #     # 4) mnist_save_resume_engine.py
+      #     python examples/mnist/mnist_save_resume_engine.py --epochs=2 --crash_iteration 1100
+
+      # - name: Resume MNIST from previous crash
+      #   if: ${{ matrix.os == 'ubuntu-latest' }}
+      #   run: |
+      #     python examples/mnist/mnist_save_resume_engine.py --epochs=2 --resume_from=/tmp/mnist_save_resume/checkpoint_1.pt
+
+      # - name: Run GAN example
+      #   if: ${{ matrix.os == 'ubuntu-latest' }}
+      #   run: |
+      #     # DCGAN
+      #     python examples/gan/dcgan.py --dataset fake --dataroot /tmp/fakedata --output-dir /tmp/outputs-dcgan --batch-size 2 --epochs 2  --workers 0
+
+      # - name: Run RL Examples
+      #   if: ${{ matrix.os == 'ubuntu-latest' }}
+      #   run: |
+      #     # RL
+      #     # 1) Actor-Critic
+      #     python examples/reinforcement_learning/actor_critic.py --max-episodes=2
+      #     # 2) Reinforce
+      #     python examples/reinforcement_learning/reinforce.py --max-episodes=2
+
+      # - name: Run Neural Style Example
+      #   if: ${{ matrix.os == 'ubuntu-latest' }}
+      #   run: |
+      #     #fast-neural-style
+      #     #train
+      #     mkdir -p ~/.cache/torch/checkpoints/ && wget "https://download.pytorch.org/models/vgg16-397923af.pth" -O ~/.cache/torch/checkpoints/vgg16-397923af.pth
+      #     python examples/fast_neural_style/neural_style.py train --epochs 1 --cuda 0 --dataset test --dataroot . --image_size 32 --style_image examples/fast_neural_style/images/style_images/mosaic.jpg --style_size 32
diff --git a/tests/ignite/distributed/test_launcher.py b/tests/ignite/distributed/test_launcher.py
@@ -96,26 +96,16 @@ def _test_check_idist_parallel_torch_launch(init_method, fp, backend, nprocs):
 @pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip because test uses torch launch")
 @pytest.mark.parametrize("init_method", [None, "tcp://0.0.0.0:29500", "FILE"])
-def test_check_idist_parallel_torch_launch_n_procs_gloo(init_method, dirname, exec_filepath):
+@pytest.mark.parametrize(
+    "backend",
+    ["gloo", pytest.param("nccl", marks=pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU"))],
+)
+def test_check_idist_parallel_torch_launch_n_procs_native(init_method, dirname, exec_filepath, backend):
     if init_method == "FILE":
         init_method = f"file://{dirname}/shared"
 
     np = torch.cuda.device_count() if torch.cuda.is_available() else 4
-    # temporarily disable this while running on torch nightly
-    if "dev" not in torch.__version__:
-        _test_check_idist_parallel_torch_launch(init_method, exec_filepath, "gloo", np)
-
-
-@pytest.mark.distributed
-@pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support")
-@pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip because test uses torch launch")
-@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
-@pytest.mark.parametrize("init_method", [None, "tcp://0.0.0.0:29500", "FILE"])
-def test_check_idist_parallel_torch_launch_n_procs_nccl(init_method, dirname, exec_filepath):
-    if init_method == "FILE":
-        init_method = f"file://{dirname}/shared"
-
-    _test_check_idist_parallel_torch_launch(init_method, exec_filepath, "nccl", torch.cuda.device_count())
+    _test_check_idist_parallel_torch_launch(init_method, exec_filepath, backend, np)
 
 
 def _test_check_idist_parallel_hvdrun(fp, backend, nprocs):
@@ -160,9 +150,13 @@ def _test_check_idist_parallel_spawn(fp, backend, nprocs):
 @pytest.mark.distributed
 @pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
-def test_check_idist_parallel_spawn_n_procs_gloo(exec_filepath):
+@pytest.mark.parametrize(
+    "backend",
+    ["gloo", pytest.param("nccl", marks=pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU"))],
+)
+def test_check_idist_parallel_spawn_n_procs_native(exec_filepath, backend):
     np = 4 if not torch.cuda.is_available() else torch.cuda.device_count()
-    _test_check_idist_parallel_spawn(exec_filepath, "gloo", np)
+    _test_check_idist_parallel_spawn(exec_filepath, backend, np)
 
 
 @pytest.mark.distributed
@@ -171,7 +165,7 @@ def test_check_idist_parallel_spawn_n_procs_gloo(exec_filepath):
 def test_smoke_test_check_idist_parallel_spawn_multinode_n_procs_gloo(exec_filepath):
     # Just a smoke test from check_idist_parallel.py for an emulated multi-node configuration
     cmd1 = "export CUDA_VISIBLE_DEVICES= && "
-    cmd1 += 'bash -c "python tests/ignite/distributed/check_idist_parallel.py --backend=gloo --nproc_per_node=2 '
+    cmd1 += f'bash -c "{sys.executable} {exec_filepath} --backend=gloo --nproc_per_node=2 '
     cmd1 += '--nnodes=2 --node_rank=0 --master_addr=localhost --master_port=3344 &"'
     os.system(cmd1)
 
@@ -197,14 +191,6 @@ def test_smoke_test_check_idist_parallel_spawn_multinode_n_procs_gloo(exec_filep
     assert "End of run" in out
 
 
-@pytest.mark.distributed
-@pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support")
-@pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
-@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
-def test_check_idist_parallel_spawn_n_procs_nccl(exec_filepath):
-    _test_check_idist_parallel_spawn(exec_filepath, "nccl", torch.cuda.device_count())
-
-
 @pytest.mark.tpu
 @pytest.mark.skipif("NUM_TPU_WORKERS" not in os.environ, reason="Skip if no NUM_TPU_WORKERS in env vars")
 @pytest.mark.skipif(not has_xla_support, reason="Skip if no PyTorch XLA package")
@@ -238,7 +224,7 @@ def _test_func(index, ws, device, backend, true_init_method):
 @pytest.mark.distributed
 @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
 @pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support")
-@pytest.mark.parametrize("init_method", ["env://", "tcp://0.0.0.0:29500", "FILE"])
+@pytest.mark.parametrize("init_method", ["env://", "tcp://0.0.0.0:29501", "FILE"])
 @pytest.mark.parametrize(
     "backend",
     ["gloo", pytest.param("nccl", marks=pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU"))],