# Copyright 2025 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
from tensordict import TensorDict

from verl.utils import tensordict_utils as tu
from verl.utils.device import (
    is_cuda_available,
    is_npu_available,
)

if is_cuda_available:
    from flash_attn.bert_padding import pad_input, unpad_input
elif is_npu_available:
    from transformers.integrations.npu_flash_attention import pad_input, unpad_input
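# NOTE: both branches bind pad_input/unpad_input; the NPU integration shipped
# with transformers mirrors the flash_attn helper signatures, so the functions
# below are backend-agnostic.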


def left_right_2_no_padding(data: TensorDict) -> TensorDict:
    """
    Convert a TensorDict from left-right padding to the no-padding (packed) format.

    Args:
        data: TensorDict with "input_ids", "attention_mask", "response_mask", "position_ids"

    Returns:
        data: TensorDict with
            - Tensors: nested (jagged) tensors "input_ids", "loss_mask", "position_ids"
            - NonTensorData: "max_seq_len", "max_response_len", "indices"

    Note:
        1. The returned input_ids/position_ids/loss_mask are nested tensors.
        2. "attention_mask" and "responses" are removed from the returned data; "response_mask" is kept.
    """
    assert "input_ids" in data, "input_ids is required in left-right padding data"
    assert "attention_mask" in data, "attention_mask is required in left-right padding data"
    assert "response_mask" in data, "response_mask is required in left-right padding data"
    assert "position_ids" in data, "position_ids is required in left-right padding data"

    input_ids = data.pop("input_ids")
    attention_mask = data.pop("attention_mask")
    response_mask = data["response_mask"]
    if "responses" in data:
        _ = data.pop("responses")

    max_seq_len, max_response_len = input_ids.shape[1], response_mask.shape[1]
    tu.assign_non_tensor_data(data, "max_seq_len", max_seq_len)
    tu.assign_non_tensor_data(data, "max_response_len", max_response_len)

    # Strip padding tokens; keep the flattened indices so the inverse transform
    # (no_padding_2_padding) can scatter values back into a padded layout.
    input_ids_rmpad, indices, cu_seqlens, *_ = unpad_input(input_ids.unsqueeze(-1), attention_mask)
    tu.assign_non_tensor_data(data, "indices", indices)

    input_ids_nested = torch.nested.nested_tensor_from_jagged(input_ids_rmpad.squeeze(-1), offsets=cu_seqlens)

    seq_lens = cu_seqlens.diff().tolist()
    response_lens = response_mask.sum(dim=1).tolist()

    # Per-sequence position ids restart from 0; the loss mask marks only the
    # trailing response tokens of each packed sequence.
    position_ids_list = []
    loss_mask_list = []
    for seq_len, response_len in zip(seq_lens, response_lens, strict=False):
        position_ids_list.append(torch.arange(seq_len, device=input_ids.device))
        loss_mask = torch.zeros(seq_len, dtype=torch.bool, device=input_ids.device)
        assert seq_len >= response_len, f"{seq_len=} is less than {response_len=}"
        if response_len > 0:  # guard: with response_len == 0, [-0:] would mark the whole sequence
            loss_mask[-response_len:] = 1
        loss_mask_list.append(loss_mask)

    position_ids_nested = torch.nested.as_nested_tensor(position_ids_list, layout=torch.jagged)
    loss_mask_nested = torch.nested.as_nested_tensor(loss_mask_list, layout=torch.jagged)

    data["input_ids"] = input_ids_nested
    data["position_ids"] = position_ids_nested
    data["loss_mask"] = loss_mask_nested

    return data


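# A minimal sketch (comments only, not part of the module API) of how the
# packing above behaves; handy for reasoning about shapes without flash_attn.
# All tensor values are made up for illustration:
#
#   input_ids      = torch.tensor([[0, 0, 11, 12], [21, 22, 23, 24]])
#   attention_mask = torch.tensor([[0, 0, 1, 1], [1, 1, 1, 1]])
#   flat = input_ids[attention_mask.bool()]  # tensor([11, 12, 21, 22, 23, 24])
#   seq_lens = attention_mask.sum(dim=1)     # tensor([2, 4])
#   cu_seqlens = torch.cat([seq_lens.new_zeros(1), seq_lens.cumsum(0)])  # tensor([0, 2, 6])
#   nested = torch.nested.nested_tensor_from_jagged(flat, offsets=cu_seqlens)
#   nested[0]  # tensor([11, 12]); nested[1] is tensor([21, 22, 23, 24])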
def no_padding_2_padding(nested_tensor: torch.Tensor, data: TensorDict) -> torch.Tensor:
    """
    Convert a NestedTensor from the no-padding format back to right-padded format.

    Args:
        nested_tensor: NestedTensor in no-padding (jagged) format
        data: TensorDict with
            - Tensors: nested (jagged) tensors such as "input_ids", "loss_mask", "position_ids"
            - NonTensorData: "max_seq_len", "max_response_len", "indices"

    Returns:
        values: regular tensor right-padded to max_response_len
    """
    assert "indices" in data, "indices is required in no-padding data"
    assert "max_seq_len" in data, "max_seq_len is required in no-padding data"
    assert "max_response_len" in data, "max_response_len is required in no-padding data"

    indices = tu.get_non_tensor_data(data=data, key="indices", default=None)
    max_seq_len = tu.get_non_tensor_data(data=data, key="max_seq_len", default=2048)
    max_response_len = tu.get_non_tensor_data(data=data, key="max_response_len", default=1024)
    batch_size = nested_tensor.size(0)

    # Scatter the flat (packed) values back into a padded (bsz, max_seq_len)
    # layout using the indices recorded by left_right_2_no_padding.
    values = nested_tensor.values()
    full_values = pad_input(
        hidden_states=values.unsqueeze(-1),
        indices=indices,
        batch=batch_size,
        seqlen=max_seq_len,
    )
    # Keep only the response region; the one-position left shift aligns each
    # position's output with the next (response) token, as in next-token prediction.
    values = full_values.squeeze(-1)[:, -max_response_len - 1 : -1]  # (bsz, response_length)

    return values
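

if __name__ == "__main__":
    # Minimal round-trip smoke test (a sketch, not a maintained entry point).
    # It assumes an environment where pad_input/unpad_input were importable
    # above, i.e. CUDA with flash_attn installed or an NPU with the
    # transformers integration; all tensor values are made up.
    batch = TensorDict(
        {
            "input_ids": torch.tensor([[0, 0, 11, 12], [21, 22, 23, 24]]),
            "attention_mask": torch.tensor([[0, 0, 1, 1], [1, 1, 1, 1]]),
            "response_mask": torch.tensor([[1, 1], [1, 1]]),
            "position_ids": torch.tensor([[0, 0, 0, 1], [0, 1, 2, 3]]),
        },
        batch_size=[2],
    )
    packed = left_right_2_no_padding(batch)
    # Round-trip the packed input_ids back to a right-padded response block.
    restored = no_padding_2_padding(packed["input_ids"], packed)
    assert restored.shape == (2, 2), f"unexpected shape: {restored.shape}"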