Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ def __init__(
# 8*world_size bytes where world_size is at most 8. Allocating 8MB
# is enough for 131072 such tuples. The largest model I've seen needs
# fewer than 10000 registered tuples.
self.rank_data = torch.empty(
self.rank_data = torch.zeros(
8 * 1024 * 1024, dtype=torch.uint8, device=self.device
)
self._ptr = ops.init_custom_ar(
Expand All @@ -194,14 +194,14 @@ def __init__(
else:
# meta data buffers need to be "uncached" for signal on MI200
self.meta = ops.allocate_meta_buffer(ops.meta_size() + max_size)
self.buffer = torch.empty(max_size, dtype=torch.uint8, device=self.device)
self.buffer = torch.zeros(max_size, dtype=torch.uint8, device=self.device)
handle = ops.get_meta_buffer_ipc_handle(self.meta)
shard_data = (
bytes(handle), # ipc handle to base ptr
0, # offset of base ptr
)
handles, offsets = self._gather_ipc_meta(shard_data)
self.rank_data = torch.empty(
self.rank_data = torch.zeros(
8 * 1024 * 1024, dtype=torch.uint8, device=self.device
)
self._ptr = ops.init_custom_ar(
Expand Down Expand Up @@ -350,14 +350,14 @@ def should_custom_ar(self, inp: torch.Tensor):
# or, in the context of cuda graphs, register_graph_buffers
def all_reduce_reg(self, inp: torch.Tensor, out: Optional[torch.Tensor] = None):
    """Run ``ops.all_reduce_reg`` on ``inp`` and return the output tensor.

    Args:
        inp: Tensor to reduce.
            NOTE(review): presumably ``inp`` must already be IPC registered
            (the sibling ``all_reduce_unreg`` covers the unregistered
            case) -- confirm against callers.
        out: Optional destination tensor. When ``None``, a zero-filled
            tensor with the same shape/dtype/device as ``inp`` is allocated.

    Returns:
        The tensor that was handed to the op as its output.
    """
    if out is None:
        # Allocate the output exactly once. A preceding torch.empty_like
        # would be dead code: its result is overwritten immediately.
        out = torch.zeros_like(inp)
    ops.all_reduce_reg(self._ptr, inp, out)
    return out

# all reduce, assuming inp tensor is NOT IPC registered
def all_reduce_unreg(self, inp: torch.Tensor, out: Optional[torch.Tensor] = None):
    """Run ``ops.all_reduce_unreg`` on ``inp`` and return the output tensor.

    Intended for inputs that are NOT IPC registered; ``self.buffer`` is
    passed to the op alongside ``inp``.
    NOTE(review): presumably the op stages ``inp`` through ``self.buffer``
    -- confirm in the ops implementation.

    Args:
        inp: Tensor to reduce.
        out: Optional destination tensor. When ``None``, a zero-filled
            tensor with the same shape/dtype/device as ``inp`` is allocated.

    Returns:
        The tensor that was handed to the op as its output.
    """
    if out is None:
        # Allocate the output exactly once. A preceding torch.empty_like
        # would be dead code: its result is overwritten immediately.
        out = torch.zeros_like(inp)
    ops.all_reduce_unreg(self._ptr, inp, self.buffer, out)
    return out

Expand All @@ -375,7 +375,7 @@ def all_reduce(
buffer.
"""
if out is None:
out = torch.empty_like(inp)
out = torch.zeros_like(inp)
if registered:
ops.all_reduce(self._ptr, inp, out, 0, 0)
else:
Expand All @@ -398,7 +398,7 @@ def custom_all_reduce(self, input: torch.Tensor) -> Optional[torch.Tensor]:
else:
# If warm up, mimic the allocation pattern since custom
# allreduce is out-of-place.
return torch.empty_like(input)
return torch.zeros_like(input)
else:
if _is_hip:
# note: outside of cuda graph context,
Expand Down
Loading