 
 from verl.utils.megatron_utils import unwrap_model
 
-from .util import postprocess_packed_seqs, preprocess_packed_seqs, recover_left_padding, remove_left_padding
+from .util import (
+    postprocess_packed_seqs,
+    postprocess_packed_seqs_no_padding,
+    preprocess_packed_seqs,
+    preprocess_packed_seqs_no_padding,
+    recover_left_padding,
+    remove_left_padding,
+)
 
 
 def gptmodel_forward(
@@ -37,13 +44,16 @@ def gptmodel_forward(
     if pack_seqs:
         batch_size, seq_len = attention_mask.shape[:2]
         input_ids_rmpad, packed_seq_params = preprocess_packed_seqs(input_ids, attention_mask, pre_process=pre_process)
+        print(f"input_ids_rmpad shape: {input_ids_rmpad.shape}, packed_seq_params: {packed_seq_params}")
         input_ids_rmpad = input_ids_rmpad.contiguous()
         output_orig = model(
             input_ids=input_ids_rmpad,
             attention_mask=None,
             position_ids=position_ids,
             packed_seq_params=packed_seq_params,
         )
+        print(f"output_orig: {output_orig}")
+
         if post_process and logits_processor is not None:
             args = {
                 k: preprocess_packed_seqs(v, attention_mask, pre_process=True)[0]
@@ -146,3 +156,55 @@ def gptmodel_forward_qwen2_5_vl(
     if value_model and post_process:
         output = output[..., 0]
     return output
+
+
+def gptmodel_forward_no_padding(
+    model,
+    input_ids,
+    value_model=False,
+    pack_seqs=True,
+    logits_processor=None,
+    logits_processor_args: dict = None,
+    **kwargs,
+):
170+ """Default forward pass for GPT models with optional sequence packing."""
+    pre_process = unwrap_model(model).pre_process
+    post_process = unwrap_model(model).post_process
+    if pack_seqs:
+        batch_size = input_ids.shape[0]
+        input_ids_rmpad, packed_seq_params = preprocess_packed_seqs_no_padding(input_ids, pre_process=pre_process)
+        input_ids_rmpad = input_ids_rmpad.contiguous()
+        output_orig = model(
+            input_ids=input_ids_rmpad,
+            attention_mask=None,
+            position_ids=None,
+            packed_seq_params=packed_seq_params,
+        )
+
+        if post_process and logits_processor is not None:
+            args = {
+                k: preprocess_packed_seqs_no_padding(v, pre_process=True)[0] for k, v in logits_processor_args.items()
+            }
+            output_dict = logits_processor(output_orig, **args)
+            # print(f'gptmodel_forward_no_padding: {output_dict=}')
+            output = {
+                k: postprocess_packed_seqs_no_padding(
+                    v, packed_seq_params, input_ids, batch_size, post_process=post_process
+                )
+                for k, v in output_dict.items()
+            }
+        else:
+            output = postprocess_packed_seqs_no_padding(
+                output_orig, packed_seq_params, input_ids, batch_size, post_process=post_process
+            )
+    else:
+        raise NotImplementedError("gptmodel_forward_no_padding only supports packed sequences")
+
+    if value_model and post_process:
+        # output = output[..., 0]
+        # With nested tensors, the advanced indexing above fails during backward, i.e.
+        # ValueError: NestedTensor _nested_select_backward_default(grad_output: t, self: jt_all, dim: any, index: any)
+        # so we use `squeeze` to remove the last dimension instead.
+        output = output.squeeze(-1)
+
+    return output
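
For intuition, the preprocess/postprocess helpers used above revolve around the same idea as any padding-free packing scheme: concatenate the variable-length sequences of a batch into one flat token stream and keep cumulative sequence lengths (cu_seqlens) so the batch can be split apart again afterwards. The sketch below shows that round trip in plain PyTorch; the names pack_without_padding and unpack are illustrative only and are not the verl utilities, which additionally produce the packed_seq_params object handed to the Megatron model.

import torch


def pack_without_padding(seqs: list[torch.Tensor]):
    """Concatenate variable-length 1-D token tensors into one flat stream.

    Returns the packed tokens plus cumulative sequence lengths (cu_seqlens),
    the bookkeeping a packed-attention kernel needs to keep the sequences
    from attending to each other.
    """
    lengths = torch.tensor([s.numel() for s in seqs], dtype=torch.int32)
    cu_seqlens = torch.zeros(len(seqs) + 1, dtype=torch.int32)
    cu_seqlens[1:] = torch.cumsum(lengths, dim=0)
    return torch.cat(seqs), cu_seqlens  # packed shape: (total_tokens,)


def unpack(packed: torch.Tensor, cu_seqlens: torch.Tensor):
    """Inverse of pack_without_padding: split the flat stream back per sample."""
    bounds = cu_seqlens.tolist()
    return [packed[bounds[i] : bounds[i + 1]] for i in range(len(bounds) - 1)]


if __name__ == "__main__":
    seqs = [torch.tensor([1, 2, 3]), torch.tensor([4, 5]), torch.tensor([6, 7, 8, 9])]
    packed, cu_seqlens = pack_without_padding(seqs)
    print(packed.tolist())      # [1, 2, 3, 4, 5, 6, 7, 8, 9]
    print(cu_seqlens.tolist())  # [0, 3, 5, 9]
    assert all(torch.equal(a, b) for a, b in zip(unpack(packed, cu_seqlens), seqs))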
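The logits_processor hook in gptmodel_forward_no_padding lets the caller reduce the raw model output before it is unpacked, so only small per-token tensors (rather than full vocab-size logits) pass through postprocess_packed_seqs_no_padding. A hypothetical processor of that shape is sketched below, assuming 2-D (tokens, vocab) logits; label_logprobs_processor is an assumed example, not one of verl's actual processors, and the real callables may take different arguments.

import torch
import torch.nn.functional as F


def label_logprobs_processor(logits: torch.Tensor, labels: torch.Tensor) -> dict:
    """Hypothetical logits_processor: collapse (tokens, vocab) logits into the
    per-token log-probabilities of the given labels, returned as a dict so the
    caller can postprocess each entry separately."""
    logp = F.log_softmax(logits.float(), dim=-1)
    return {"log_probs": logp.gather(-1, labels.unsqueeze(-1)).squeeze(-1)}


if __name__ == "__main__":
    logits = torch.randn(9, 32)           # 9 packed tokens, vocab size 32
    labels = torch.randint(0, 32, (9,))
    out = label_logprobs_processor(logits, labels)
    print(out["log_probs"].shape)         # torch.Size([9])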