
Commit b6cb444

Modify to all_gather takes group
1 parent fd62e70 commit b6cb444

9 files changed: 53 additions & 14 deletions

ignite/distributed/comp_models/base.py

Lines changed: 9 additions & 5 deletions
@@ -212,11 +212,13 @@ def all_reduce(self, tensor: Union[torch.Tensor, float], op: str = "sum") -> Uni
 
         return cast(Union[torch.Tensor, float], self._collective_op(tensor, self._do_all_reduce, op))
 
-    def all_gather(self, tensor: Union[torch.Tensor, float, str]) -> Union[torch.Tensor, float, List[float], List[str]]:
+    def all_gather(
+        self, tensor: Union[torch.Tensor, float, str], group: Optional[Union[Any, List[int]]] = None
+    ) -> Union[torch.Tensor, float, List[float], List[str]]:
         if not isinstance(tensor, (torch.Tensor, Number, str)):
             raise TypeError(f"Unhandled input type {type(tensor)}")
 
-        return self._collective_op(tensor, self._do_all_gather)
+        return self._collective_op(tensor, self._do_all_gather, group=group)
 
     def broadcast(
         self, tensor: Union[torch.Tensor, float, str, None], src: int = 0, safe_mode: bool = False
@@ -268,7 +270,7 @@ def _do_all_reduce(self, tensor: torch.Tensor, op: str = "SUM") -> torch.Tensor:
         pass
 
     @abstractmethod
-    def _do_all_gather(self, tensor: torch.Tensor) -> torch.Tensor:
+    def _do_all_gather(self, tensor: torch.Tensor, group: Optional[Union[Any, List[int]]] = None) -> torch.Tensor:
         pass
 
     @abstractmethod
@@ -336,7 +338,9 @@ def spawn(*args: Any, **kwargs: Any) -> None:
     def all_reduce(self, tensor: Union[torch.Tensor, float], op: str = "SUM") -> Union[torch.Tensor, float]:
         return tensor
 
-    def all_gather(self, tensor: Union[torch.Tensor, float, str]) -> Union[torch.Tensor, float, List[float], List[str]]:
+    def all_gather(
+        self, tensor: Union[torch.Tensor, float, str], group: Optional[Union[Any, List[int]]] = None
+    ) -> Union[torch.Tensor, float, List[float], List[str]]:
         if isinstance(tensor, torch.Tensor):
             return tensor
         return cast(Union[List[float], List[str]], [tensor])
@@ -351,7 +355,7 @@ def broadcast(
     def _do_all_reduce(self, tensor: torch.Tensor, op: str = "SUM") -> torch.Tensor:
         return tensor
 
-    def _do_all_gather(self, tensor: torch.Tensor) -> torch.Tensor:
+    def _do_all_gather(self, tensor: torch.Tensor, group: Optional[Union[Any, List[int]]] = None) -> torch.Tensor:
         return tensor
 
     def _do_broadcast(self, tensor: torch.Tensor, src: int) -> torch.Tensor:
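
The base class threads the new optional group through _collective_op down to each backend's _do_all_gather hook. A minimal sketch of that dispatch pattern, using a hypothetical, heavily simplified stand-in for ComputationModel (the real _collective_op also handles float and str payloads):

import torch
from typing import Any, Callable, List, Optional, Union


class _SketchModel:
    # Hypothetical stand-in: _collective_op here only illustrates how the
    # `group` keyword is forwarded unchanged to the backend-specific hook.
    def _collective_op(self, tensor: torch.Tensor, fn: Callable, **kwargs: Any) -> torch.Tensor:
        return fn(tensor, **kwargs)

    def _do_all_gather(self, tensor: torch.Tensor, group: Optional[Union[Any, List[int]]] = None) -> torch.Tensor:
        # Serial fallback: with a single process there is nothing to gather.
        return tensor

    def all_gather(self, tensor: torch.Tensor, group: Optional[Union[Any, List[int]]] = None) -> torch.Tensor:
        return self._collective_op(tensor, self._do_all_gather, group=group)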

ignite/distributed/comp_models/horovod.py

Lines changed: 7 additions & 2 deletions
@@ -1,12 +1,13 @@
 import warnings
-from typing import Any, Callable, cast, Mapping, Optional, Tuple
+from typing import Any, Callable, cast, List, Mapping, Optional, Tuple, Union
 
 import torch
 
 from ignite.distributed.comp_models.base import ComputationModel
 
 try:
     import horovod.torch as hvd
+    from horovod.common.process_sets import ProcessSet
 
     try:
         # old API
@@ -184,9 +185,13 @@ def _do_manual_all_reduce(self, tensor: torch.Tensor, op: Any) -> torch.Tensor:
         # output can also torch min/max_return_type: (min/max_vals, indices)
         return reduced_res[0]
 
-    def _do_all_gather(self, tensor: torch.Tensor) -> torch.Tensor:
+    def _do_all_gather(self, tensor: torch.Tensor, group: Optional[Union[Any, List[int]]] = None) -> torch.Tensor:
+        if group and not isinstance(group, ProcessSet):
+            raise ValueError("group should be list of int or ProcessSet")
         if tensor.ndimension() == 0:
             tensor = tensor.unsqueeze(0)
+        if group is not None:
+            return hvd.allgather(tensor, process_set=group)
         return hvd.allgather(tensor)
 
     def _do_broadcast(self, tensor: torch.Tensor, src: int) -> torch.Tensor:
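
For the Horovod backend, group must already be a ProcessSet by the time it reaches _do_all_gather. A sketch of how a caller might build one directly with Horovod, assuming Horovod 0.23+ with dynamic process sets enabled (e.g. HOROVOD_DYNAMIC_PROCESS_SETS=1); ranks [0, 1] are illustrative:

import horovod.torch as hvd
import torch

hvd.init()

# With dynamic process sets enabled, a set can be registered after init;
# otherwise, process sets are passed to hvd.init() up front.
ps = hvd.add_process_set([0, 1])

t = torch.tensor([hvd.rank()])
if hvd.rank() in (0, 1):
    # Only member ranks take part in collectives on this process set.
    gathered = hvd.allgather(t, process_set=ps)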

ignite/distributed/comp_models/native.py

Lines changed: 4 additions & 2 deletions
@@ -426,11 +426,13 @@ def _do_all_reduce(self, tensor: torch.Tensor, op: str = "SUM") -> torch.Tensor:
         dist.all_reduce(tensor, reduce_op)
         return tensor
 
-    def _do_all_gather(self, tensor: torch.Tensor) -> torch.Tensor:
+    def _do_all_gather(self, tensor: torch.Tensor, group: Optional[Union[Any, List[int]]] = None) -> torch.Tensor:
+        if group is not None and not isinstance(group, dist.ProcessGroup):
+            raise ValueError("Group should be list of int or ProcessGroup")
         if tensor.ndimension() == 0:
             tensor = tensor.unsqueeze(0)
         output = [torch.zeros_like(tensor) for _ in range(self.get_world_size())]
-        dist.all_gather(output, tensor)
+        dist.all_gather(output, tensor, group=group)
         return torch.cat(output, dim=0)
 
     def _do_broadcast(self, tensor: torch.Tensor, src: int) -> torch.Tensor:
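
For the native torch.distributed backend, a plain list of ranks is converted upstream (in ignite.distributed.utils.all_gather, shown below) into a ProcessGroup. A minimal standalone sketch of the same gather-within-a-subgroup pattern in raw torch.distributed, assuming the default process group is already initialised (e.g. via torchrun); ranks [0, 1] are illustrative:

import torch
import torch.distributed as dist

# dist.new_group must be called collectively by every process,
# even those not in the new group.
group = dist.new_group([0, 1])

t = torch.tensor([dist.get_rank()])
if dist.get_rank() in (0, 1):
    # Only group members participate in the collective itself.
    output = [torch.zeros_like(t) for _ in range(2)]
    dist.all_gather(output, t, group=group)
    result = torch.cat(output, dim=0)  # tensor([0, 1]) on ranks 0 and 1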

ignite/distributed/comp_models/xla.py

Lines changed: 7 additions & 3 deletions
@@ -1,4 +1,4 @@
-from typing import Any, Callable, cast, Mapping, Optional, Tuple
+from typing import Any, Callable, cast, List, Mapping, Optional, Tuple, Union
 
 import torch
 
@@ -144,12 +144,16 @@ def _do_all_reduce(self, tensor: torch.Tensor, op: str = "SUM") -> torch.Tensor:
         xm.all_reduce(op, [tensor])
         return tensor
 
-    def _do_all_gather(self, tensor: torch.Tensor) -> torch.Tensor:
+    def _do_all_gather(self, tensor: torch.Tensor, group: Optional[Union[Any, List[int]]] = None) -> torch.Tensor:
         # from https://github.com/jysohn23/xla/blob/model-parallel-colab/Gather_Scatter_Broadcast_PyTorch_XLA.ipynb
+
+        if not self._check_group_type(group):
+            raise ValueError("group should be list of int or list of list of int")
+
         group_size = self.get_world_size()
         output = torch.zeros((group_size,) + tensor.shape, dtype=tensor.dtype, device=tensor.device)
         output[self.get_rank() % group_size] = tensor
-        xm.all_reduce("sum", [output])
+        xm.all_reduce("sum", [output], groups=group)
         return output.reshape(-1, *output.shape[2:])
 
     def _do_broadcast(self, tensor: torch.Tensor, src: int) -> torch.Tensor:
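
On XLA, xm.all_reduce takes a groups argument as a list of replica groups (a list of lists of ranks), which is why the validation above also accepts list-of-list-of-int. A sketch of the zero-fill-then-sum trick this backend uses to emulate all_gather, extracted into a free function for illustration (groups=[[0, 1]] would restrict it to ranks 0 and 1):

import torch
import torch_xla.core.xla_model as xm


def gather_via_all_reduce(tensor: torch.Tensor, world_size: int, rank: int, groups=None) -> torch.Tensor:
    # Each replica writes its slice into a zero tensor; a summed all_reduce
    # then assembles the full gather across the participating replicas.
    output = torch.zeros((world_size,) + tensor.shape, dtype=tensor.dtype, device=tensor.device)
    output[rank % world_size] = tensor
    xm.all_reduce("sum", [output], groups=groups)
    return output.reshape(-1, *output.shape[2:])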

ignite/distributed/utils.py

Lines changed: 7 additions & 2 deletions
@@ -339,7 +339,9 @@ def all_reduce(tensor: Union[torch.Tensor, float], op: str = "SUM") -> Union[tor
     return _model.all_reduce(tensor, op)
 
 
-def all_gather(tensor: Union[torch.Tensor, float, str]) -> Union[torch.Tensor, float, List[float], List[str]]:
+def all_gather(
+    tensor: Union[torch.Tensor, float, str], group: Optional[Union[Any, List[int]]] = None
+) -> Union[torch.Tensor, float, List[float], List[str]]:
     """Helper method to perform all gather operation.
 
     Args:
@@ -354,7 +356,10 @@ def all_gather(tensor: Union[torch.Tensor, float, str]) -> Union[torch.Tensor, f
     if _need_to_sync and isinstance(_model, _SerialModel):
         sync(temporary=True)
 
-    return _model.all_gather(tensor)
+    if isinstance(group, list) and all(isinstance(item, int) for item in group):
+        group = _model.new_group(group)
+
+    return _model.all_gather(tensor, group=group)
 
 
 def broadcast(
tests/ignite/distributed/utils/__init__.py

Lines changed: 11 additions & 0 deletions
@@ -161,6 +161,17 @@ def _test_distrib_all_gather(device):
         idist.all_reduce([0, 1, 2])
 
 
+def _test_distrib_all_gather_group(device):
+
+    if idist.get_world_size() > 1:
+        rank = idist.get_rank()
+        group = [0, 1]
+
+        t = torch.tensor([rank], device=idist.device())
+        res = idist.all_gather(t, group=group)
+        assert torch.equal(res, torch.tensor(group))
+
+
 def _test_distrib_broadcast(device):
 
     rank = idist.get_rank()
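
The new test gathers each rank's id within group [0, 1] and checks the result. Outside the pytest fixtures used below, a similar check could be driven with idist.Parallel; a sketch with an illustrative gloo backend and 2 processes:

import torch

import ignite.distributed as idist


def check(local_rank):
    t = torch.tensor([idist.get_rank()], device=idist.device())
    res = idist.all_gather(t, group=[0, 1])
    assert torch.equal(res.cpu(), torch.tensor([0, 1]))


if __name__ == "__main__":
    # Spawns 2 local processes and runs `check` in each of them.
    with idist.Parallel(backend="gloo", nproc_per_node=2) as parallel:
        parallel.run(check)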

tests/ignite/distributed/utils/test_horovod.py

Lines changed: 2 additions & 0 deletions
@@ -8,6 +8,7 @@
 from tests.ignite.distributed.utils import (
     _test_distrib__get_max_length,
     _test_distrib_all_gather,
+    _test_distrib_all_gather_group,
     _test_distrib_all_reduce,
     _test_distrib_barrier,
     _test_distrib_broadcast,
@@ -162,6 +163,7 @@ def test_idist_all_gather_hvd(gloo_hvd_executor):
     device = "cpu" if not torch.cuda.is_available() else "cuda"
     np = 4 if not torch.cuda.is_available() else torch.cuda.device_count()
     gloo_hvd_executor(_test_distrib_all_gather, (device,), np=np, do_init=True)
+    gloo_hvd_executor(_test_distrib_all_gather_group, (device,), np=np, do_init=True)
 
 
 @pytest.mark.distributed

tests/ignite/distributed/utils/test_native.py

Lines changed: 3 additions & 0 deletions
@@ -9,6 +9,7 @@
 from tests.ignite.distributed.utils import (
     _test_distrib__get_max_length,
     _test_distrib_all_gather,
+    _test_distrib_all_gather_group,
     _test_distrib_all_reduce,
     _test_distrib_barrier,
     _test_distrib_broadcast,
@@ -228,6 +229,7 @@ def test_idist_all_gather_nccl(distributed_context_single_node_nccl):
 
     device = idist.device()
     _test_distrib_all_gather(device)
+    _test_distrib_all_gather_group(device)
 
 
 @pytest.mark.distributed
@@ -236,6 +238,7 @@ def test_idist_all_gather_gloo(distributed_context_single_node_gloo):
 
     device = idist.device()
     _test_distrib_all_gather(device)
+    _test_distrib_all_gather_group(device)
 
 
 @pytest.mark.distributed

tests/ignite/distributed/utils/test_xla.py

Lines changed: 3 additions & 0 deletions
@@ -6,6 +6,7 @@
 from ignite.distributed.utils import has_xla_support
 from tests.ignite.distributed.utils import (
     _test_distrib_all_gather,
+    _test_distrib_all_gather_group,
     _test_distrib_all_reduce,
     _test_distrib_barrier,
     _test_distrib_broadcast,
@@ -138,11 +139,13 @@ def test_idist_all_gather_xla():
 
     device = idist.device()
     _test_distrib_all_gather(device)
+    _test_distrib_all_gather_group(device)
 
 
 def _test_idist_all_gather_xla_in_child_proc(index):
     device = idist.device()
     _test_distrib_all_gather(device)
+    _test_distrib_all_gather_group(device)
 
 
 @pytest.mark.tpu
