[device_mesh] replace dim_group_info with group_name (pytorch#150898)

wanchaol · pytorchmergebot · commit 9df9d9ded001 · 2025-05-13T17:16:45.000Z
As titled, there's no need to maintain a dim_group_info anymore; we can simply maintain a list of group_name instead. This will simplify the logic. Pull Request resolved: pytorch#150898. Approved by: https://github.com/tianyu-l, https://github.com/fegin
diff --git a/test/distributed/_composable/fsdp/test_fully_shard_init.py b/test/distributed/_composable/fsdp/test_fully_shard_init.py
@@ -840,7 +840,7 @@ def test_1d_process_group_init(self):
         # since the ref has a parent mesh, while the `from_group` one does not
         self.assertEqual(dp_mesh.mesh, ref_dp_mesh.mesh)
         self.assertEqual(dp_mesh._coordinate_on_dim, ref_dp_mesh._coordinate_on_dim)
-        self.assertEqual(dp_mesh._dim_group_infos, ref_dp_mesh._dim_group_infos)
+        self.assertEqual(dp_mesh._dim_group_names, ref_dp_mesh._dim_group_names)
 
         # Check 1D FSDP forward/backward parity over the DP mesh
         # NOTE: We cannot use 2D DTensor-based training here because the DP
@@ -916,12 +916,6 @@ def test_2d_process_group_init(self):
         )
         self.assertEqual(mesh.mesh, ref_mesh.mesh)
         self.assertEqual(mesh._coordinate_on_dim, ref_mesh._coordinate_on_dim)
-        for (_, ranks, _), (_, ref_ranks, _) in zip(
-            mesh._dim_group_infos, ref_mesh._dim_group_infos
-        ):
-            # Since we manually constructed new subgroups, the test and ref
-            # groups are not the same
-            self.assertEqual(ranks, ref_ranks)
         for mesh_dim_name in mesh_dim_names:
             child_mesh = mesh[mesh_dim_name]
             ref_child_mesh = ref_mesh[mesh_dim_name]
diff --git a/test/distributed/test_device_mesh.py b/test/distributed/test_device_mesh.py
@@ -3,6 +3,7 @@
 import os
 
 import torch
+import torch.distributed as dist
 import torch.distributed._functional_collectives as funcol
 from torch._subclasses.fake_tensor import FakeTensorMode
 from torch.distributed.device_mesh import _mesh_resources, DeviceMesh, init_device_mesh
@@ -197,7 +198,7 @@ def test_fake_pg_device_mesh(self):
         local_tensor = torch.randn(2, 8)
         global_tensor = funcol.all_gather_tensor(
             local_tensor, gather_dim=0, group=(mesh, 0)
-        )
+        ).wait()
         self.assertEqual(global_tensor.shape, (self.world_size * 2, 8))
 
     @with_comms
@@ -208,7 +209,7 @@ def test_from_group_with_global_pg(self):
         mesh_pg = ref_global_mesh.get_group()
         global_mesh = DeviceMesh.from_group(mesh_pg, self.device_type)
         self.assertEqual(ref_global_mesh, global_mesh)
-        self.assertEqual(ref_global_mesh._dim_group_infos, global_mesh._dim_group_infos)
+        self.assertEqual(ref_global_mesh._dim_group_names, global_mesh._dim_group_names)
         self.assertEqual(
             ref_global_mesh._coordinate_on_dim, global_mesh._coordinate_on_dim
         )
@@ -217,7 +218,7 @@ def test_from_group_with_global_pg(self):
             mesh_pg, self.device_type, mesh=torch.arange(self.world_size)
         )
         self.assertEqual(ref_global_mesh, global_mesh)
-        self.assertEqual(ref_global_mesh._dim_group_infos, global_mesh._dim_group_infos)
+        self.assertEqual(ref_global_mesh._dim_group_names, global_mesh._dim_group_names)
         self.assertEqual(
             ref_global_mesh._coordinate_on_dim, global_mesh._coordinate_on_dim
         )
@@ -396,24 +397,20 @@ def test_from_group_with_mesh_shape_3d(self):
             mesh_dim_names=("dp_replicate", "dp_shard"),
         )
 
-        ref_mesh_dp_dim_group_infos = ref_mesh._dim_group_infos[:2]
-        for (_, ref_ranks, _), (_, ranks, _) in zip(
-            ref_mesh_dp_dim_group_infos, dp_mesh._dim_group_infos
-        ):
-            self.assertEqual(ref_ranks, ranks)
+        ref_mesh_dp_dim_group_names = ref_mesh._dim_group_names[:2]
+        self.assertEqual(ref_mesh_dp_dim_group_names, dp_mesh._dim_group_names[:2])
         # Cannot check directly for mesh equality since parent meshes are not
         # the same since the ref's parent mesh is 3D
         self.assertEqual(dp_mesh["dp_replicate"].mesh, ref_mesh["dp_replicate"].mesh)
-        for (_, ref_ranks, _), (_, ranks, _) in zip(
-            dp_mesh["dp_replicate"]._dim_group_infos,
-            ref_mesh["dp_replicate"]._dim_group_infos,
-        ):
-            self.assertEqual(ref_ranks, ranks)
+        self.assertEqual(
+            dp_mesh["dp_replicate"]._dim_group_names,
+            ref_mesh["dp_replicate"]._dim_group_names,
+        )
         self.assertEqual(dp_mesh["dp_shard"].mesh, ref_mesh["dp_shard"].mesh)
-        for (_, ref_ranks, _), (_, ranks, _) in zip(
-            dp_mesh["dp_shard"]._dim_group_infos, ref_mesh["dp_shard"]._dim_group_infos
-        ):
-            self.assertEqual(ref_ranks, ranks)
+        self.assertEqual(
+            dp_mesh["dp_shard"]._dim_group_names,
+            ref_mesh["dp_shard"]._dim_group_names,
+        )
 
     @with_comms()
     def test_from_group_with_mesh_shape_2d(self):
@@ -456,12 +453,13 @@ def test_from_group_with_mesh_shape_2d(self):
             mesh_dim_names=("dp_replicate", "dp_shard"),
         )
 
-        ref_mesh_dp_dim_group_infos = ref_mesh._dim_group_infos[:2]
-        for (_, ref_ranks, _), (_, ranks, _) in zip(
-            ref_mesh_dp_dim_group_infos, dp_mesh._dim_group_infos
+        # self.assertEqual(ref_mesh._dim_group_names, dp_mesh._dim_group_names)
+        for mesh_dim_group, ref_mesh_dim_group in zip(
+            dp_mesh.get_all_groups(), ref_mesh.get_all_groups()
         ):
-            self.assertEqual(ref_ranks, ranks)
-
+            mesh_dim_group_ranks = dist.get_process_group_ranks(mesh_dim_group)
+            ref_mesh_dim_group_ranks = dist.get_process_group_ranks(ref_mesh_dim_group)
+            self.assertEqual(mesh_dim_group_ranks, ref_mesh_dim_group_ranks)
         # check both the 2d mesh and the submeshes are exactly the same.
         self.assertEqual(dp_mesh, ref_mesh)
         self.assertEqual(dp_mesh["dp_replicate"], ref_mesh["dp_replicate"])
diff --git a/torch/distributed/_functional_collectives.py b/torch/distributed/_functional_collectives.py
@@ -731,8 +731,10 @@ def cast_listint(x):
             "Only 1D mesh is supported, pass in (DeviceMesh, int) together if mesh > 1D"
         )
         # TODO: it should run collective in the whole mesh instead of dim 0
-        tag, rankset, _ = group._dim_group_infos[0]
+        pg = group.get_group()
+        rankset = dist.get_process_group_ranks(pg)
         group_size = len(rankset)
+        tag = tag or c10d._get_group_tag(pg)
     elif isinstance(group, tuple):
         if (
             len(group) == 2
@@ -741,8 +743,10 @@ def cast_listint(x):
         ):
             dmesh = group[0]
             dim = group[1]
-            tag, rankset, _ = dmesh._dim_group_infos[dim]
+            pg = dmesh.get_group(dim)
+            rankset = dist.get_process_group_ranks(pg)
             group_size = len(rankset)
+            tag = tag or c10d._get_group_tag(pg)
         else:
             raise ValueError("Invalid tuple for group must be (DeviceMesh, int)")
     else:
@@ -767,7 +771,7 @@ def _resolve_group_name(group: RANK_TYPES, tag: str = "") -> str:
         assert group.ndim == 1, (
             "Only 1D mesh is supported, pass in (DeviceMesh, int) together if mesh > 1D"
         )
-        return group._dim_group_infos[0][2]
+        return group._dim_group_names[0]
     elif isinstance(group, tuple):
         if (
             len(group) == 2
@@ -776,7 +780,7 @@ def _resolve_group_name(group: RANK_TYPES, tag: str = "") -> str:
         ):
             dmesh = group[0]
             dim = group[1]
-            return dmesh._dim_group_infos[dim][2]
+            return dmesh._dim_group_names[dim]
         else:
             raise ValueError("Invalid tuple for group must be (DeviceMesh, int)")
     elif isinstance(group, list):
diff --git a/torch/distributed/device_mesh.py b/torch/distributed/device_mesh.py
@@ -38,9 +38,8 @@ def _init_device_mesh_stub():
 else:
     from torch._C._distributed_c10d import Backend as C10dBackend
     from torch.distributed.distributed_c10d import (
-        _find_pg_by_ranks_and_tag,
         _get_default_group,
-        _get_group_tag,
+        _resolve_process_group,
         get_backend,
         get_process_group_ranks,
         get_rank,
@@ -103,7 +102,7 @@ def create_sub_mesh(
             mesh_tensor = device_mesh.mesh
             # slice_dim_idx could be differnt from submesh_dims, as we may need to flatten out some dims.
             slice_dim_idx = []
-            slice_dim_group_info = []
+            slice_dim_group_name = []
             # keep track of the number of dims that have been flattened so we can get the correct slice_dim_idx in the
             # flattened mesh tensor.
             num_dims_flatten = 0
@@ -121,15 +120,15 @@ def create_sub_mesh(
                     # then the final slice_dim_idx should be [0, 1, 2].
                     slice_dim_idx.append(mesh_dim_indices[0] - num_dims_flatten)
                     num_dims_flatten += len(mesh_dim_indices) - 1
-                    slice_dim_group_info.append(
+                    slice_dim_group_name.append(
                         self.root_to_flatten_mapping[device_mesh][
                             mesh_dim_name
-                        ]._dim_group_infos[0]
+                        ]._dim_group_names[0]
                     )
                 else:
                     slice_dim_idx.append(mesh_dim_indices[0] - num_dims_flatten)
-                    slice_dim_group_info.append(
-                        device_mesh._dim_group_infos[mesh_dim_indices[0]]
+                    slice_dim_group_name.append(
+                        device_mesh._dim_group_names[mesh_dim_indices[0]]
                     )
 
             # mesh_tensor has already been flattened if needed. So mesh_tensor.ndim <= device_mesh.mesh.ndim now.
@@ -155,7 +154,7 @@ def create_sub_mesh(
                 if cur_rank in mesh_nd:
                     res_submesh = submesh
 
-            res_submesh._dim_group_infos = slice_dim_group_info  # type: ignore[possibly-undefined]
+            res_submesh._dim_group_names = slice_dim_group_name  # type: ignore[possibly-undefined]
             self.child_to_root_mapping[res_submesh] = device_mesh
 
             return res_submesh
@@ -360,8 +359,8 @@ def _get_all_submeshes(
                     mesh_dim_names=(mesh_dim_name,),
                     _init_backend=False,
                 )
-                submesh._dim_group_infos = (
-                    [device_mesh._dim_group_infos[mesh_dim]]
+                submesh._dim_group_names = (
+                    [device_mesh._dim_group_names[mesh_dim]]
                     if cur_rank in mesh_1d
                     else []
                 )
@@ -496,13 +495,10 @@ def _get_or_create_default_group(self):
             return _get_default_group()
 
         def _init_process_groups(self):
-            # tag/ranks/group_name associated with each mesh dimension, each
+            # group_name associated with each mesh dimension, each
             # mesh dimension should have one sub-group per rank
             #
-            # TODO(yifu): remove tag and ranks once we fully migrate to native
-            # functional collectives. See details in:
-            # https://github.com/pytorch/pytorch/issues/93173#issuecomment-1907095208
-            dim_group_infos: list[tuple[str, list[int], str]] = []
+            dim_group_names: list[str] = []
             default_group = _get_default_group()
 
             if self.mesh.ndim == 1 and self.mesh.numel() == get_world_size():
@@ -519,13 +515,7 @@ def _init_process_groups(self):
                     and get_backend(default_group) == "gloo"
                     else default_group
                 )
-                dim_group_infos.append(
-                    (
-                        _get_group_tag(dim_group),
-                        ranks,
-                        dim_group.group_name,
-                    )
-                )
+                dim_group_names.append(dim_group.group_name)
             else:
                 # create sub pgs base on the mesh argument specified
                 for dim in range(self.mesh.ndim):
@@ -579,10 +569,9 @@ def _init_process_groups(self):
                         has_split_group = True
 
                     # If the subgroup has been already created through `split_group`, we simply loop over `pg_ranks_by_dim`
-                    # and append the `(group_tag, subgroup_ranks, and group_name)` tuple to the `dim_group_infos` list when
-                    # the current rank is in the subgroup.
+                    # and append the `group_name` to the `dim_group_names` list when the current rank is in the subgroup.
                     # Otherwise, we use `new_group` instead of `split_group` to create subgroups by looping over `pg_ranks_by_dim`
-                    # along with appending information to the `dim_group_infos` list whenever necessary.
+                    # along with appending information to the `dim_group_names` list whenever necessary.
                     for dim_mesh in pg_ranks_by_dim:
                         subgroup_ranks = dim_mesh.tolist()
 
@@ -599,19 +588,13 @@ def _init_process_groups(self):
 
                         # only add to dim_groups if the current rank in the subgroup
                         if self.get_rank() in subgroup_ranks:
-                            if len(dim_group_infos) > dim:
+                            if len(dim_group_names) > dim:
                                 raise RuntimeError(
                                     f"Each device mesh dimension should get only one process group, but got {self.get_rank()} "
                                     f"in {subgroup_ranks}!"
                                 )
-                            dim_group_infos.append(
-                                (
-                                    _get_group_tag(not_none(dim_group)),
-                                    subgroup_ranks,
-                                    dim_group.group_name,
-                                )
-                            )
-            self._dim_group_infos = dim_group_infos
+                            dim_group_names.append(dim_group.group_name)
+            self._dim_group_names = dim_group_names
 
         def __enter__(self) -> "DeviceMesh":
             # set this mesh as the current mesh in mesh env
@@ -745,7 +728,7 @@ def get_group(self, mesh_dim: Optional[Union[int, str]] = None) -> ProcessGroup:
             Returns:
                 A :class:`ProcessGroup` object.
             """
-            if not hasattr(self, "_dim_group_infos"):
+            if not hasattr(self, "_dim_group_names"):
                 raise RuntimeError("DeviceMesh process groups not initialized!")
 
             if self.mesh.ndim > 1 and mesh_dim is None:
@@ -758,28 +741,25 @@ def get_group(self, mesh_dim: Optional[Union[int, str]] = None) -> ProcessGroup:
 
             # Quick return if the current device_mesh is a 1D mesh.
             if self.mesh.ndim == 1 and mesh_dim is None:
-                return not_none(
-                    _find_pg_by_ranks_and_tag(*self._dim_group_infos[0][:2])  # type: ignore[index]
-                )
+                return not_none(_resolve_process_group(self._dim_group_names[0]))
 
             root_mesh = _mesh_resources.get_root_mesh(self)
             root_to_flatten_mapping = _mesh_resources.root_to_flatten_mapping.get(
                 root_mesh, None
             )
             if root_to_flatten_mapping and mesh_dim in root_to_flatten_mapping.keys():
-                dim_group_infos = root_to_flatten_mapping[
+                dim_group_name = root_to_flatten_mapping[
                     mesh_dim  # type: ignore[index]
-                ]._dim_group_infos[0][:2]
-                return not_none(_find_pg_by_ranks_and_tag(*dim_group_infos))
+                ]._dim_group_names[0]
+                return not_none(_resolve_process_group(dim_group_name))
             else:
                 mesh_dim = (
                     _mesh_resources.get_mesh_dim_by_name(self, mesh_dim)
                     if isinstance(mesh_dim, str)
                     else mesh_dim
                 )
-                return not_none(
-                    _find_pg_by_ranks_and_tag(*self._dim_group_infos[mesh_dim][:2])  # type: ignore[index]
-                )
+                assert isinstance(mesh_dim, int)
+                return not_none(_resolve_process_group(self._dim_group_names[mesh_dim]))
 
         def get_all_groups(self) -> list[ProcessGroup]:
             """
@@ -852,9 +832,7 @@ def from_group(
                     mesh_dim_names=mesh_dim_names,
                     _init_backend=False,
                 )
-                device_mesh._dim_group_infos = [
-                    (_get_group_tag(group), group_ranks, group.group_name)
-                ]
+                device_mesh._dim_group_names = [group.group_name]
                 return device_mesh
 
             # nD scenario
@@ -880,14 +858,7 @@ def from_group(
             device_mesh = DeviceMesh(
                 device_type, mesh, mesh_dim_names=mesh_dim_names, _init_backend=False
             )
-            device_mesh._dim_group_infos = [
-                (
-                    _get_group_tag(group),
-                    get_process_group_ranks(group),
-                    group.group_name,
-                )
-                for group in groups
-            ]
+            device_mesh._dim_group_names = [group.group_name for group in groups]
             return device_mesh
 
         def size(self, mesh_dim: Optional[int] = None) -> int: