Merged
Changes from 24 of 29 commits:
b6cb444  Modify to all_gather takes `group` (puhuk, Sep 16, 2022)
d92c5fb  Update xla.py (puhuk, Sep 23, 2022)
be8ec87  resolve confilct (puhuk, Sep 23, 2022)
b6d743e  Update (puhuk, Sep 23, 2022)
aaaf0f5  Update utils.py (puhuk, Sep 23, 2022)
c075931  Merge remote-tracking branch 'origin/master' into all_gather_group (puhuk, Sep 24, 2022)
b6351d2  Update xla.py (puhuk, Sep 24, 2022)
e510ea2  autopep8 fix (puhuk, Sep 24, 2022)
ce8f6fe  Update xla.py (puhuk, Sep 25, 2022)
8be6266  Merge branch 'all_gather_group' of github.com:puhuk/ignite into all_g… (puhuk, Sep 25, 2022)
4dbb387  Update xla.py (puhuk, Sep 25, 2022)
75d7878  Update (puhuk, Sep 26, 2022)
dea8fbf  autopep8 fix (puhuk, Sep 26, 2022)
eeefec0  Update __init__.py (puhuk, Sep 26, 2022)
a7f8311  Merge branch 'all_gather_group' of github.com:puhuk/ignite into all_g… (puhuk, Sep 26, 2022)
14c2e84  Update base.py (puhuk, Sep 26, 2022)
3c23fd5  Update __init__.py (puhuk, Sep 26, 2022)
01c2b05  Update xla.py (puhuk, Sep 26, 2022)
d457272  Update xla.py (puhuk, Sep 26, 2022)
d2dd466  Update (puhuk, Sep 26, 2022)
c9cadeb  Update (puhuk, Sep 28, 2022)
aee2764  autopep8 fix (puhuk, Sep 28, 2022)
07a7786  Update __init__.py (puhuk, Sep 28, 2022)
4c58b98  Merge branch 'all_gather_group' of github.com:puhuk/ignite into all_g… (puhuk, Sep 28, 2022)
bc92abe  Update utils.py (puhuk, Sep 28, 2022)
2e173d6  Update utils.py (puhuk, Sep 29, 2022)
93e22bf  Update horovod.py (puhuk, Oct 1, 2022)
a3838ba  Update __init__.py (puhuk, Oct 1, 2022)
f917f32  Merge branch 'master' into all_gather_group (vfdev-5, Oct 1, 2022)
ignite/distributed/comp_models/base.py (9 additions, 5 deletions)

@@ -211,11 +211,13 @@ def all_reduce(self, tensor: Union[torch.Tensor, float], op: str = "sum") -> Uni

         return cast(Union[torch.Tensor, float], self._collective_op(tensor, self._do_all_reduce, op))

-    def all_gather(self, tensor: Union[torch.Tensor, float, str]) -> Union[torch.Tensor, float, List[float], List[str]]:
+    def all_gather(
+        self, tensor: Union[torch.Tensor, float, str], group: Optional[Any] = None
+    ) -> Union[torch.Tensor, float, List[float], List[str]]:
         if not isinstance(tensor, (torch.Tensor, Number, str)):
             raise TypeError(f"Unhandled input type {type(tensor)}")

-        return self._collective_op(tensor, self._do_all_gather)
+        return self._collective_op(tensor, self._do_all_gather, group=group)

     def new_group(self, ranks: List[int], **kwargs: Any) -> Any:
         if isinstance(ranks, list) and all(isinstance(item, int) for item in ranks):
@@ -273,7 +275,7 @@ def _do_all_reduce(self, tensor: torch.Tensor, op: str = "SUM") -> torch.Tensor:
         pass

     @abstractmethod
-    def _do_all_gather(self, tensor: torch.Tensor) -> torch.Tensor:
+    def _do_all_gather(self, tensor: torch.Tensor, group: Optional[Any] = None) -> torch.Tensor:
         pass

     @abstractmethod
@@ -345,7 +347,9 @@ def spawn(*args: Any, **kwargs: Any) -> None:
     def all_reduce(self, tensor: Union[torch.Tensor, float], op: str = "SUM") -> Union[torch.Tensor, float]:
         return tensor

-    def all_gather(self, tensor: Union[torch.Tensor, float, str]) -> Union[torch.Tensor, float, List[float], List[str]]:
+    def all_gather(
+        self, tensor: Union[torch.Tensor, float, str], group: Optional[Any] = None
+    ) -> Union[torch.Tensor, float, List[float], List[str]]:
         if isinstance(tensor, torch.Tensor):
             return tensor
         return cast(Union[List[float], List[str]], [tensor])
@@ -360,7 +364,7 @@ def broadcast(
     def _do_all_reduce(self, tensor: torch.Tensor, op: str = "SUM") -> torch.Tensor:
         return tensor

-    def _do_all_gather(self, tensor: torch.Tensor) -> torch.Tensor:
+    def _do_all_gather(self, tensor: torch.Tensor, group: Optional[Any] = None) -> torch.Tensor:
         return tensor

     def _do_new_group(self, ranks: List[int], **kwargs: Any) -> Any:
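For readers unfamiliar with the serial fallback touched above, here is a standalone sketch (not the ignite classes themselves; `serial_all_gather` is an illustrative name) of what the `_SerialModel` path does once `group` is accepted: in a non-distributed run the argument is simply ignored and the input comes back unchanged for tensors, or wrapped in a one-element list for numbers and strings.

```python
# Standalone illustration of the serial (single-process) fallback shown above;
# `serial_all_gather` is a hypothetical helper, not part of ignite's API.
from typing import Any, List, Optional, Union

import torch


def serial_all_gather(
    tensor: Union[torch.Tensor, float, str], group: Optional[Any] = None
) -> Union[torch.Tensor, float, List[float], List[str]]:
    # `group` is accepted for interface compatibility but has no effect here.
    if isinstance(tensor, torch.Tensor):
        return tensor
    return [tensor]


assert torch.equal(serial_all_gather(torch.tensor([1.0]), group=[0, 1]), torch.tensor([1.0]))
assert serial_all_gather(3.14) == [3.14]
```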
ignite/distributed/comp_models/horovod.py (3 additions, 1 deletion)

@@ -185,7 +185,9 @@ def _do_manual_all_reduce(self, tensor: torch.Tensor, op: Any) -> torch.Tensor:
         # output can also torch min/max_return_type: (min/max_vals, indices)
         return reduced_res[0]

-    def _do_all_gather(self, tensor: torch.Tensor) -> torch.Tensor:
+    def _do_all_gather(self, tensor: torch.Tensor, group: Optional[Any] = None) -> torch.Tensor:
+        if group is not None:
+            raise NotImplementedError("all_reduce with group for horovod is not implemented")
         if tensor.ndimension() == 0:
             tensor = tensor.unsqueeze(0)
         return hvd.allgather(tensor)
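Since the Horovod backend rejects `group`, the only supported path remains a gather over all ranks. Below is a hedged sketch of that ungrouped path, assuming Horovod is installed; the launch command is an assumption, and `hvd.allgather` concatenates tensors along dim 0 across all workers.

```python
# Minimal sketch of the ungrouped Horovod path used above; run with e.g.
# `horovodrun -np 2 python this_script.py` (launch command is an assumption).
import torch
import horovod.torch as hvd

hvd.init()
t = torch.tensor([hvd.rank()])
gathered = hvd.allgather(t)  # concatenates along dim 0 -> tensor([0, 1, ...])
print(hvd.rank(), gathered)
```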
ignite/distributed/comp_models/native.py (4 additions, 2 deletions)

@@ -426,11 +426,13 @@ def _do_all_reduce(self, tensor: torch.Tensor, op: str = "SUM") -> torch.Tensor:
         dist.all_reduce(tensor, reduce_op)
         return tensor

-    def _do_all_gather(self, tensor: torch.Tensor) -> torch.Tensor:
+    def _do_all_gather(self, tensor: torch.Tensor, group: Optional[Any] = None) -> torch.Tensor:
+        if group is not None and not isinstance(group, dist.ProcessGroup):
+            raise ValueError("Argument group should be list of int or ProcessGroup")
         if tensor.ndimension() == 0:
             tensor = tensor.unsqueeze(0)
         output = [torch.zeros_like(tensor) for _ in range(self.get_world_size())]
-        dist.all_gather(output, tensor)
+        dist.all_gather(output, tensor, group=group)
         return torch.cat(output, dim=0)

     def _do_new_group(self, ranks: List[int], **kwargs: Any) -> Any:
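For context, a minimal torch.distributed sketch (independent of ignite; `gather_over_subgroup` is a hypothetical helper) of gathering over a process subgroup, which is what the native backend above delegates to via `dist.all_gather(..., group=...)`:

```python
# Sketch assuming the default process group is already initialised
# (e.g. the script was launched with torchrun).
from typing import List

import torch
import torch.distributed as dist


def gather_over_subgroup(value: torch.Tensor, ranks: List[int]) -> torch.Tensor:
    group = dist.new_group(ranks=ranks)  # collective: every process must call this
    if dist.get_rank() not in ranks:
        return value  # ranks outside the group do not participate
    output = [torch.zeros_like(value) for _ in range(len(ranks))]
    dist.all_gather(output, value, group=group)
    return torch.cat(output, dim=0)
```

Sizing the output list to `len(ranks)` matters in general torch.distributed usage, because `dist.all_gather` expects one slot per member of the group it runs on.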
ignite/distributed/comp_models/xla.py (6 additions, 2 deletions)

@@ -144,12 +144,16 @@ def _do_all_reduce(self, tensor: torch.Tensor, op: str = "SUM") -> torch.Tensor:
         xm.all_reduce(op, [tensor])
         return tensor

-    def _do_all_gather(self, tensor: torch.Tensor) -> torch.Tensor:
+    def _do_all_gather(self, tensor: torch.Tensor, group: Optional[Any] = None) -> torch.Tensor:
         # from https://github.com/jysohn23/xla/blob/model-parallel-colab/Gather_Scatter_Broadcast_PyTorch_XLA.ipynb
+
+        if group is not None and (not isinstance(group, list) or not all(isinstance(item, int) for item in group)):
+            raise ValueError("Argument group should be list of int")
+
         group_size = self.get_world_size()
         output = torch.zeros((group_size,) + tensor.shape, dtype=tensor.dtype, device=tensor.device)
         output[self.get_rank() % group_size] = tensor
-        xm.all_reduce("sum", [output])
+        xm.all_reduce("sum", [output], groups=group)
         return output.reshape(-1, *output.shape[2:])

     def _do_new_group(self, ranks: List[int], **kwargs: Any) -> Any:
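The XLA branch emulates all_gather with a zero-padded buffer and a summed all_reduce. A plain-torch illustration of why that works (no XLA required; the two buffers stand in for two ranks):

```python
# Each "rank" writes its tensor into its own slot of a zero buffer; summing the
# buffers elementwise (what a sum all_reduce does across ranks) reproduces the
# concatenated result on every rank.
import torch

world_size = 2
t0, t1 = torch.tensor([10]), torch.tensor([11])  # per-rank tensors

buf0 = torch.zeros((world_size,) + t0.shape, dtype=t0.dtype)
buf1 = torch.zeros((world_size,) + t1.shape, dtype=t1.dtype)
buf0[0] = t0  # rank 0 fills slot 0
buf1[1] = t1  # rank 1 fills slot 1

gathered = (buf0 + buf1).reshape(-1)
assert torch.equal(gathered, torch.tensor([10, 11]))
```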
ignite/distributed/utils.py (7 additions, 2 deletions)

@@ -340,7 +340,9 @@ def all_reduce(tensor: Union[torch.Tensor, float], op: str = "SUM") -> Union[tor
     return _model.all_reduce(tensor, op)


-def all_gather(tensor: Union[torch.Tensor, float, str]) -> Union[torch.Tensor, float, List[float], List[str]]:
+def all_gather(
+    tensor: Union[torch.Tensor, float, str], group: Optional[Union[Any, List[int]]] = None
+) -> Union[torch.Tensor, float, List[float], List[str]]:
     """Helper method to perform all gather operation.

     Args:
@@ -355,7 +357,10 @@ def all_gather(tensor: Union[torch.Tensor, float, str]) -> Union[torch.Tensor, f
     if _need_to_sync and isinstance(_model, _SerialModel):
         sync(temporary=True)

-    return _model.all_gather(tensor)
+    if isinstance(group, list) and all(isinstance(item, int) for item in group):
+        group = _model.new_group(group)
+
+    return _model.all_gather(tensor, group=group)


 def broadcast(
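A hedged usage sketch of the public helper after this change: `group` can be a plain list of ranks (converted internally with `new_group`) or a backend-specific group object. It assumes an already-initialised distributed context (for example under `idist.Parallel`); `gather_rank_ids` is an illustrative name, not part of ignite.

```python
import torch

import ignite.distributed as idist


def gather_rank_ids() -> torch.Tensor:
    # Gather this process's rank, but only across ranks 0 and 1.
    t = torch.tensor([idist.get_rank()], device=idist.device())
    return idist.all_gather(t, group=[0, 1])
```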
tests/ignite/distributed/utils/__init__.py (37 additions, 0 deletions)

@@ -161,6 +161,43 @@ def _test_distrib_all_gather(device):
         idist.all_reduce([0, 1, 2])


+def _test_distrib_all_gather_group(device):
+
+    if idist.get_world_size() > 1:
+        ranks = [0, 1]
+        rank = idist.get_rank()
+        bnd = idist.backend()
+
+        t = torch.tensor([rank], device=device)
+        group = idist.new_group(ranks)
+        if bnd in ("horovod"):
+            with pytest.raises(NotImplementedError, match=r"all_reduce with group for horovod is not implemented"):
+                res = idist.all_gather(t, group=group)
+        else:
+            res = idist.all_gather(t, group=group)
+            assert torch.equal(res, torch.tensor(ranks))
+
+        t = torch.tensor([rank], device=device)
+        if bnd in ("horovod"):
+            with pytest.raises(NotImplementedError, match=r"all_reduce with group for horovod is not implemented"):
+                res = idist.all_gather(t, group=ranks)
+        else:
+            res = idist.all_gather(t, group=ranks)
+            assert torch.equal(res, torch.tensor(ranks))
+
+        ranks = "abc"
+
+        if bnd in ("nccl", "gloo", "mpi"):
+            with pytest.raises(ValueError, match=r"Argument group should be list of int or ProcessGroup"):
+                res = idist.all_gather(t, group="abc")
+        elif bnd in ("xla-tpu"):
+            with pytest.raises(ValueError, match=r"Argument group should be list of int"):
+                res = idist.all_gather(t, group="abc")
+        elif bnd in ("horovod"):
+            with pytest.raises(NotImplementedError, match=r"all_reduce with group for horovod is not implemented"):
+                res = idist.all_gather(t, group="abc")
+
+
 def _test_distrib_broadcast(device):

     rank = idist.get_rank()
tests/ignite/distributed/utils/test_horovod.py (2 additions, 0 deletions)

@@ -8,6 +8,7 @@
 from tests.ignite.distributed.utils import (
     _test_distrib__get_max_length,
     _test_distrib_all_gather,
+    _test_distrib_all_gather_group,
     _test_distrib_all_reduce,
     _test_distrib_barrier,
     _test_distrib_broadcast,
@@ -163,6 +164,7 @@ def test_idist_all_gather_hvd(gloo_hvd_executor):
     device = "cpu" if not torch.cuda.is_available() else "cuda"
     np = 4 if not torch.cuda.is_available() else torch.cuda.device_count()
     gloo_hvd_executor(_test_distrib_all_gather, (device,), np=np, do_init=True)
+    gloo_hvd_executor(_test_distrib_all_gather_group, (device,), np=np, do_init=True)


 @pytest.mark.distributed
tests/ignite/distributed/utils/test_native.py (3 additions, 0 deletions)

@@ -9,6 +9,7 @@
 from tests.ignite.distributed.utils import (
     _test_distrib__get_max_length,
     _test_distrib_all_gather,
+    _test_distrib_all_gather_group,
     _test_distrib_all_reduce,
     _test_distrib_barrier,
     _test_distrib_broadcast,
@@ -244,6 +245,7 @@ def test_idist_all_gather_nccl(distributed_context_single_node_nccl):

     device = idist.device()
     _test_distrib_all_gather(device)
+    _test_distrib_all_gather_group(device)


 @pytest.mark.distributed
@@ -252,6 +254,7 @@ def test_idist_all_gather_gloo(distributed_context_single_node_gloo):

     device = idist.device()
     _test_distrib_all_gather(device)
+    _test_distrib_all_gather_group(device)


 @pytest.mark.distributed
tests/ignite/distributed/utils/test_xla.py (3 additions, 0 deletions)

@@ -6,6 +6,7 @@
 from ignite.distributed.utils import has_xla_support
 from tests.ignite.distributed.utils import (
     _test_distrib_all_gather,
+    _test_distrib_all_gather_group,
     _test_distrib_all_reduce,
     _test_distrib_barrier,
     _test_distrib_broadcast,
@@ -147,11 +148,13 @@ def test_idist_all_gather_xla():

     device = idist.device()
     _test_distrib_all_gather(device)
+    _test_distrib_all_gather_group(device)


 def _test_idist_all_gather_xla_in_child_proc(index):
     device = idist.device()
     _test_distrib_all_gather(device)
+    _test_distrib_all_gather_group(device)


 @pytest.mark.tpu