diff --git a/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py index d222342b51..f5c167c32a 100644 --- a/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py +++ b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py @@ -491,7 +491,7 @@ def generate_prior_tokens( condition_grid = image_grid_thw[:-1] prior_token_image_embed = self.vision_language_encoder.get_image_features( inputs["pixel_values"], condition_grid - ) + ).pooler_output prior_token_image_embed = torch.cat(prior_token_image_embed, dim=0) flat_prior_token_image_ids = self.vision_language_encoder.get_image_tokens( prior_token_image_embed, condition_grid @@ -859,7 +859,7 @@ def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput: preprocessed_images = ( None if isinstance(first_prompt, str) - else first_prompt.get("additional_information", {}).get("preprocessed_image") + else [first_prompt.get("additional_information", {}).get("preprocessed_image")] ) condition_images = ( None