@@ -72,7 +72,6 @@ class VLMoEMeta:
     image_index: paddle.Tensor
     token_type_ids: paddle.Tensor
     image_token_num: paddle.Tensor
-    num_image_patch_id: paddle.Tensor
 
     def __str__(self):
         return (
@@ -500,13 +499,11 @@ def prepare_vl_moe_meta(
         ids_remove_padding: paddle.Tensor,
     ) -> VLMoEMeta:
 
-        image_mask = ids_remove_padding >= self.im_patch_id
+        image_mask = ids_remove_padding == self.im_patch_id
         token_type_ids = image_mask.cast("int32")
         image_token_num = image_mask.sum()
         token_num = ids_remove_padding.shape[0]
         text_token_num = paddle.maximum((token_num - image_token_num), paddle.ones([], dtype="int64"))
-        num_image_patch_id = ids_remove_padding == self.im_patch_id
-        num_image_patch_id = num_image_patch_id.cast("int32").sum()
 
         # The scenario requiring padding is CUDA graph, thus we only need to pad the maximum capture size.
         self._cuda_graph_buffers["token_type_ids"][: self.fd_config.graph_opt_config.max_capture_size].fill_(-1)
@@ -520,7 +517,6 @@ def prepare_vl_moe_meta(
             image_index=self._cuda_graph_buffers["image_index"][:token_num],
             token_type_ids=self._cuda_graph_buffers["token_type_ids"][:token_num],
             image_token_num=self._cuda_graph_buffers["image_token_num"],
-            num_image_patch_id=num_image_patch_id,
         )
 
     def get_input_embeddings(self, ids_remove_padding: paddle.Tensor) -> paddle.Tensor:
@@ -791,7 +787,7 @@ def forward(
         input_embeddings = self.get_input_embeddings(
             ids_remove_padding=ids_remove_padding,
             image_features=image_features,
-            image_token_num=vl_moe_meta.num_image_patch_id.item(),
+            image_token_num=vl_moe_meta.image_token_num.item(),
         )
         self._input_embeddings.copy_(input_embeddings, False)
 
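Note on the change: once image_mask is built with ==, image_mask.sum() already counts the image-patch tokens, so the separate num_image_patch_id field (previously (ids_remove_padding == self.im_patch_id).cast("int32").sum()) duplicated image_token_num and could be dropped. Below is a minimal standalone sketch of that equivalence, not part of the PR; the im_patch_id value and the token ids are made up for illustration.

import paddle

im_patch_id = 100  # hypothetical image-patch token id, for illustration only
ids_remove_padding = paddle.to_tensor([7, 100, 100, 3, 100], dtype="int64")

image_mask = ids_remove_padding == im_patch_id       # per-token image mask, as in the updated code
image_token_num = image_mask.sum()                   # the count the PR keeps
num_image_patch_id = image_mask.cast("int32").sum()  # the duplicate count the PR removes

assert image_token_num.item() == num_image_patch_id.item() == 3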