-
Notifications
You must be signed in to change notification settings - Fork 5.2k
Revert "Fix different device type adjustment in PP" #8141
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -699,14 +699,14 @@ def send_object(self, obj: Any, dst: int) -> None: | |||||||||||||
| ) | ||||||||||||||
|
|
||||||||||||||
| # Serialize object to tensor and get the size as well | ||||||||||||||
| object_tensor = torch.frombuffer(pickle.dumps(obj), dtype=torch.uint8).to( | ||||||||||||||
| device=self.device | ||||||||||||||
| object_tensor = torch.frombuffer(pickle.dumps(obj), dtype=torch.uint8).cuda( | ||||||||||||||
| device=torch.cuda.current_device() | ||||||||||||||
| ) | ||||||||||||||
|
|
||||||||||||||
| size_tensor = torch.tensor( | ||||||||||||||
| [object_tensor.numel()], | ||||||||||||||
| dtype=torch.long, | ||||||||||||||
| device=self.device, | ||||||||||||||
| device=torch.cuda.current_device(), | ||||||||||||||
| ) | ||||||||||||||
|
|
||||||||||||||
| # Send object size | ||||||||||||||
|
|
@@ -731,7 +731,9 @@ def recv_object(self, src: int) -> Any: | |||||||||||||
| src != self.rank_in_group | ||||||||||||||
| ), "Invalid source rank. Source rank is the same as the current rank." | ||||||||||||||
|
|
||||||||||||||
| size_tensor = torch.empty(1, dtype=torch.long, device=self.device) | ||||||||||||||
| size_tensor = torch.empty( | ||||||||||||||
| 1, dtype=torch.long, device=torch.cuda.current_device() | ||||||||||||||
|
Comment on lines
+734
to
+735
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Consider caching the result of `torch.cuda.current_device()` in a local variable to avoid redundant calls.
Suggested change
|
||||||||||||||
| ) | ||||||||||||||
|
|
||||||||||||||
| # Receive object size | ||||||||||||||
| rank_size = torch.distributed.recv( | ||||||||||||||
|
|
@@ -742,7 +744,7 @@ def recv_object(self, src: int) -> Any: | |||||||||||||
| object_tensor = torch.empty( # type: ignore[call-overload] | ||||||||||||||
| size_tensor.item(), # type: ignore[arg-type] | ||||||||||||||
| dtype=torch.uint8, | ||||||||||||||
| device=self.device, | ||||||||||||||
| device=torch.cuda.current_device(), | ||||||||||||||
| ) | ||||||||||||||
|
|
||||||||||||||
| rank_object = torch.distributed.recv( | ||||||||||||||
|
|
||||||||||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1100,15 +1100,15 @@ def broadcast_pyobj( | |
| rank: int, | ||
| dist_group: Optional[torch.distributed.ProcessGroup] = None, | ||
| src: int = 0, | ||
| device: Optional[str] = None, | ||
| force_cpu_device: bool = True, | ||
| ): | ||
| """Broadcast inputs from src rank to all other ranks with torch.dist backend. | ||
| The `rank` here refer to the source rank on global process group (regardless | ||
| of dist_group argument). | ||
| """ | ||
|
|
||
| if device is None: | ||
| device = get_device() | ||
| device = torch.device( | ||
| "cuda" if torch.cuda.is_available() and not force_cpu_device else "cpu" | ||
| ) | ||
|
|
||
| if rank == src: | ||
| if len(data) == 0: | ||
|
|
@@ -1148,38 +1148,44 @@ def point_to_point_pyobj( | |
| group: Optional[torch.distributed.ProcessGroup] = None, | ||
| src: int = 0, | ||
| dst: int = 1, | ||
| device: Optional[str] = None, | ||
| ): | ||
| """Send data from src to dst in group using DeviceToDevice communication.""" | ||
| if device is None: | ||
| device = get_device() | ||
|
|
||
| if rank == src: | ||
| if len(data) == 0: | ||
| tensor_size = torch.tensor([0], dtype=torch.long, device=device) | ||
| tensor_size = torch.tensor( | ||
| [0], dtype=torch.long, device=torch.cuda.current_device() | ||
| ) | ||
| dist.send(tensor_size, dst=dst, group=group) | ||
| else: | ||
| serialized_data = pickle.dumps(data) | ||
| size = len(serialized_data) | ||
| tensor_data = torch.ByteTensor( | ||
| np.frombuffer(serialized_data, dtype=np.uint8) | ||
| ).to( | ||
| device=device | ||
| ) # Move to Device | ||
| tensor_size = torch.tensor([size], dtype=torch.long, device=device) | ||
| ).cuda( | ||
| device=torch.cuda.current_device() | ||
|
Comment on lines
+1165
to
+1166
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Consider caching the result of `torch.cuda.current_device()` in a local variable:
current_device = torch.cuda.current_device()
tensor_data = torch.ByteTensor(
np.frombuffer(serialized_data, dtype=np.uint8)
).cuda(
device=current_device
) # Move to GPU |
||
| ) # Move to GPU | ||
| tensor_size = torch.tensor( | ||
| [size], dtype=torch.long, device=torch.cuda.current_device() | ||
|
Comment on lines
+1168
to
+1169
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. |
||
| ) | ||
|
|
||
| dist.send(tensor_size, dst=dst, group=group) | ||
| dist.send(tensor_data, dst=dst, group=group) | ||
| return data | ||
|
|
||
| elif rank == dst: | ||
| tensor_size = torch.tensor([0], dtype=torch.long, device=device) | ||
| tensor_size = torch.tensor( | ||
| [0], dtype=torch.long, device=torch.cuda.current_device() | ||
|
Comment on lines
+1177
to
+1178
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. |
||
| ) | ||
| dist.recv(tensor_size, src=src, group=group) | ||
| size = tensor_size.item() | ||
|
|
||
| if size == 0: | ||
| return [] | ||
|
|
||
| tensor_data = torch.empty(size, dtype=torch.uint8, device=device) | ||
| tensor_data = torch.empty( | ||
| size, dtype=torch.uint8, device=torch.cuda.current_device() | ||
| ) | ||
| dist.recv(tensor_data, src=src, group=group) | ||
|
|
||
| serialized_data = bytes( | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Consider caching the result of `torch.cuda.current_device()` in a local variable to avoid redundant calls. This can improve readability and potentially performance.