diff --git a/doc/source/models/builtin/llm/index.rst b/doc/source/models/builtin/llm/index.rst index 4410b6153f..83d5382865 100644 --- a/doc/source/models/builtin/llm/index.rst +++ b/doc/source/models/builtin/llm/index.rst @@ -346,6 +346,11 @@ The following is a list of built-in LLM in Xinference: - 32768 - Marco-o1: Towards Open Reasoning Models for Open-Ended Solutions + * - :ref:`mineru2.5-2509-1.2b <models_llm_mineru2.5-2509-1.2b>` + - chat, vision + - 32768 + - MinerU2.5-2509-1.2B is a vision language model for document understanding. + * - :ref:`minicpm-2b-dpo-bf16 <models_llm_minicpm-2b-dpo-bf16>` - chat - 4096 @@ -868,6 +873,8 @@ The following is a list of built-in LLM in Xinference: marco-o1 + mineru2.5-2509-1.2b + minicpm-2b-dpo-bf16 minicpm-2b-dpo-fp16 diff --git a/doc/source/models/builtin/llm/mineru2.5-2509-1.2b.rst b/doc/source/models/builtin/llm/mineru2.5-2509-1.2b.rst new file mode 100644 index 0000000000..dc228cf857 --- /dev/null +++ b/doc/source/models/builtin/llm/mineru2.5-2509-1.2b.rst @@ -0,0 +1,31 @@ +.. _models_llm_mineru2.5-2509-1.2b: + +======================================== +MinerU2.5-2509-1.2B +======================================== + +- **Context Length:** 32768 +- **Model Name:** MinerU2.5-2509-1.2B +- **Languages:** en, zh +- **Abilities:** chat, vision +- **Description:** MinerU2.5-2509-1.2B is a vision language model for document understanding. 
+ +Specifications +^^^^^^^^^^^^^^ + + +Model Spec 1 (pytorch, 1_2 Billion) +++++++++++++++++++++++++++++++++++++++++ + +- **Model Format:** pytorch +- **Model Size (in billions):** 1_2 +- **Quantizations:** none +- **Engines**: Transformers +- **Model ID:** opendatalab/MinerU2.5-2509-1.2B +- **Model Hubs**: `Hugging Face <https://huggingface.co/opendatalab/MinerU2.5-2509-1.2B>`__, `ModelScope <https://modelscope.cn/models/opendatalab/MinerU2.5-2509-1.2B>`__ + +Execute the following command to launch the model, remember to replace ``${quantization}`` with your +chosen quantization method from the options listed above:: + + xinference launch --model-engine ${engine} --model-name MinerU2.5-2509-1.2B --size-in-billions 1_2 --model-format pytorch --quantization ${quantization} + diff --git a/xinference/model/image/ocr/__init__.py b/xinference/model/image/ocr/__init__.py index f7be36bfd1..c7613a5aa2 100644 --- a/xinference/model/image/ocr/__init__.py +++ b/xinference/model/image/ocr/__init__.py @@ -15,7 +15,6 @@ from .deepseek_ocr import DeepSeekOCRModel from .got_ocr2 import GotOCR2Model from .hunyuan_ocr import HunyuanOCRModel -from .mineru import MinerUModel from .mlx import MLXDeepSeekOCRModel from .ocr_family import SUPPORTED_ENGINES from .paddleocr_vl import PaddleOCRVLModel @@ -23,7 +22,6 @@ VLLMDeepSeekOCRModel, VLLMGotOCR2Model, VLLMHunyuanOCRModel, - VLLMMinerUModel, VLLMPaddleOCRVLModel, ) @@ -31,7 +29,6 @@ "DeepSeekOCRModel", "GotOCR2Model", "HunyuanOCRModel", - "MinerUModel", "PaddleOCRVLModel", ] @@ -41,12 +38,10 @@ def register_builtin_ocr_engines() -> None: DeepSeekOCRModel, GotOCR2Model, HunyuanOCRModel, - MinerUModel, PaddleOCRVLModel, ] SUPPORTED_ENGINES["vllm"] = [ VLLMDeepSeekOCRModel, VLLMHunyuanOCRModel, - VLLMMinerUModel, ] SUPPORTED_ENGINES["mlx"] = [MLXDeepSeekOCRModel] diff --git a/xinference/model/image/ocr/mineru.py b/xinference/model/image/ocr/mineru.py deleted file mode 100644 index efc4fac5ec..0000000000 --- a/xinference/model/image/ocr/mineru.py +++ /dev/null @@ -1,288 +0,0 @@ -# Copyright 2022-2026 XProbe Inc. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json -import logging -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union - -import PIL.Image - -if TYPE_CHECKING: - from ..core import ImageModelFamilyV2 - -from .ocr_family import OCRModel - -logger = logging.getLogger(__name__) - - -class MinerUModel(OCRModel): - """MinerU Vision-Language Model for document parsing and OCR. - - MinerU2.5 is a 1.2B parameter vision-language model designed for - efficient high-resolution document parsing. It employs a two-stage - strategy: global layout analysis followed by fine-grained content recognition. 
- """ - - required_libs = ("transformers",) - - @classmethod - def match(cls, model_family: "ImageModelFamilyV2") -> bool: - model_name = model_family.model_name - return model_name.startswith("MinerU") - - def __init__( - self, - model_uid: str, - model_path: Optional[str] = None, - device: Optional[str] = None, - model_spec: Optional["ImageModelFamilyV2"] = None, - **kwargs, - ): - self.model_family = model_spec - self._model_uid = model_uid - self._model_path = model_path - self._device = device - # model info when loading - self._model = None - self._processor = None - self._client = None - # info - self._model_spec = model_spec - self._abilities = model_spec.model_ability or [] # type: ignore - self._kwargs = kwargs - - @property - def model_ability(self): - return self._abilities - - def load(self): - import torch - from transformers import AutoProcessor, Qwen2VLForConditionalGeneration - - try: - from mineru_vl_utils import MinerUClient - except ImportError: - raise ImportError( - "mineru-vl-utils is required for MinerU models. 
" - "Please install it with: pip install 'mineru-vl-utils[transformers]'" - ) - - logger.info(f"Loading MinerU model from {self._model_path}") - - try: - # Determine device and dtype - if self._device == "cpu": - device_map = "cpu" - dtype = torch.float32 - else: - device_map = "auto" - dtype = ( - torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16 - ) - - # Get torch_dtype from kwargs if specified - torch_dtype = self._kwargs.get("torch_dtype", dtype) - if isinstance(torch_dtype, str): - torch_dtype = getattr(torch, torch_dtype, dtype) - - # Load model with Qwen2VL architecture - self._model = Qwen2VLForConditionalGeneration.from_pretrained( - self._model_path, - torch_dtype=torch_dtype, - device_map=device_map, - trust_remote_code=True, - ) - - # Load processor - try: - self._processor = AutoProcessor.from_pretrained( - self._model_path, - trust_remote_code=True, - use_fast=True, - ) - except ValueError: - # Fallback for when AutoProcessor cannot identify the processor type - from transformers import Qwen2VLProcessor - - self._processor = Qwen2VLProcessor.from_pretrained( - self._model_path, - trust_remote_code=True, - use_fast=True, - ) - - # Create MinerU client - self._client = MinerUClient( - backend="transformers", - model=self._model, - processor=self._processor, - ) - - logger.info( - f"MinerU model loaded successfully with device_map={device_map}, dtype={torch_dtype}" - ) - except Exception as e: - logger.error(f"Failed to load MinerU model: {e}") - raise - - def ocr( - self, - image: Union[PIL.Image.Image, List[PIL.Image.Image]], - **kwargs, - ) -> Union[str, List[str], Dict[str, Any]]: - """ - Perform document parsing and OCR using MinerU vision-language model. 
- - Args: - image: PIL Image or list of PIL Images - **kwargs: Additional parameters including: - - output_format: Output format ('markdown', 'json', 'text'), default: 'markdown' - - return_dict: Whether to return a dictionary with metadata (default: False) - - extract_mode: Extraction mode ('two_step', 'single_step'), default: 'two_step' - - Returns: - Document content as string, list of strings, or dict - """ - logger.info("MinerU OCR kwargs: %s", kwargs) - - if self._client is None: - raise RuntimeError("Model not loaded. Please call load() first.") - - # Extract parameters - output_format = kwargs.get("output_format", "markdown") - return_dict = kwargs.get("return_dict", False) - extract_mode = kwargs.get("extract_mode", "two_step") - - # Handle single image input - if isinstance(image, PIL.Image.Image): - result = self._process_single(image, output_format, extract_mode) - if return_dict: - return { - "text": result, - "model": "mineru", - "output_format": output_format, - "success": True, - } - return result - - # Handle batch image input - elif isinstance(image, list): - results = [ - self._process_single(img, output_format, extract_mode) for img in image - ] - if return_dict: - return { - "text": results, - "model": "mineru", - "output_format": output_format, - "success": True, - "num_images": len(results), - } - return results - - else: - raise ValueError("Input must be a PIL Image or list of PIL Images") - - def _process_single( - self, image: PIL.Image.Image, output_format: str, extract_mode: str - ) -> str: - """Process a single image with MinerU.""" - assert self._client is not None, "Client not loaded. Call load() first." 
- - # Convert image to RGB if needed - if image.mode in ["RGBA", "CMYK"]: - image = image.convert("RGB") - - try: - # Use two-step extraction for better accuracy (default) - if extract_mode == "two_step": - extracted_blocks = self._client.two_step_extract(image) - else: - # Single step extraction (faster but less accurate) - extracted_blocks = self._client.extract(image) - - # Format output based on requested format - if output_format == "json": - return json.dumps(extracted_blocks, ensure_ascii=False, indent=2) - elif output_format == "text": - return self._blocks_to_text(extracted_blocks) - else: # markdown (default) - return self._blocks_to_markdown(extracted_blocks) - - except Exception as e: - logger.error(f"MinerU processing failed: {e}") - raise - - def _blocks_to_markdown(self, blocks: Any) -> str: - """Convert extracted blocks to markdown format.""" - if isinstance(blocks, str): - return blocks - - if isinstance(blocks, dict): - # Handle dict response with content - if "content" in blocks: - return str(blocks["content"]) - if "text" in blocks: - return str(blocks["text"]) - if "markdown" in blocks: - return str(blocks["markdown"]) - return json.dumps(blocks, ensure_ascii=False, indent=2) - - if isinstance(blocks, list): - result_parts = [] - for block in blocks: - if isinstance(block, str): - result_parts.append(block) - elif isinstance(block, dict): - block_type = block.get("type", "text") - content = block.get("content", block.get("text", "")) - - if block_type == "title": - level = block.get("level", 1) - result_parts.append(f"{'#' * level} {content}") - elif block_type == "table": - result_parts.append(str(content)) - elif block_type == "formula": - result_parts.append(f"$${content}$$") - elif block_type == "image": - caption = block.get("caption", "") - result_parts.append(f"![{caption}]({content})") - else: - result_parts.append(str(content)) - return "\n\n".join(result_parts) - - return str(blocks) - - def _blocks_to_text(self, blocks: Any) -> str: 
- """Convert extracted blocks to plain text format.""" - if isinstance(blocks, str): - return blocks - - if isinstance(blocks, dict): - if "content" in blocks: - return str(blocks["content"]) - if "text" in blocks: - return str(blocks["text"]) - return json.dumps(blocks, ensure_ascii=False) - - if isinstance(blocks, list): - result_parts = [] - for block in blocks: - if isinstance(block, str): - result_parts.append(block) - elif isinstance(block, dict): - content = block.get("content", block.get("text", "")) - result_parts.append(str(content)) - return "\n".join(result_parts) - - return str(blocks) diff --git a/xinference/model/image/ocr/vllm.py b/xinference/model/image/ocr/vllm.py index d67d7d816a..29793f0cd1 100644 --- a/xinference/model/image/ocr/vllm.py +++ b/xinference/model/image/ocr/vllm.py @@ -21,7 +21,6 @@ from .deepseek_ocr import DeepSeekOCRModel from .got_ocr2 import GotOCR2Model from .hunyuan_ocr import HunyuanOCRModel -from .mineru import MinerUModel from .paddleocr_vl import PaddleOCRVLModel logger = logging.getLogger(__name__) @@ -307,44 +306,3 @@ def ocr( class VLLMPaddleOCRVLModel(PaddleOCRVLModel): required_libs = ("vllm",) - - -class VLLMMinerUModel(MinerUModel): - """vLLM-based MinerU model for faster inference.""" - - required_libs = ("vllm",) - - def load(self): - try: - from mineru_vl_utils import MinerUClient, MinerULogitsProcessor - except ImportError: - raise ImportError( - "mineru-vl-utils is required for MinerU models. 
" - "Please install it with: pip install 'mineru-vl-utils[vllm]'" - ) - - logger.info(f"Loading MinerU model with vLLM from {self._model_path}") - - vllm_kwargs = _sanitize_vllm_kwargs(self._kwargs) - - # Load vLLM model with MinerU logits processor - from vllm import LLM - - self._model = LLM( - model=self._model_path, - logits_processors=[MinerULogitsProcessor], - **vllm_kwargs, - ) - - # Create MinerU client with vLLM backend - self._client = MinerUClient( - backend="vllm-engine", - vllm_llm=self._model, - ) - - logger.info("MinerU model loaded successfully with vLLM backend") - - def stop(self): - _shutdown_vllm_model(self._model) - self._model = None - self._client = None diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json index 16ef78fc5f..ad4c6e4cc8 100644 --- a/xinference/model/llm/llm_family.json +++ b/xinference/model/llm/llm_family.json @@ -26028,5 +26028,66 @@ ], "featured": false, "updated_at": 1770196377 + }, + { + "model_name": "MinerU2.5-2509-1.2B", + "model_description": "MinerU2.5-2509-1.2B is a vision language model for document understanding.", + "context_length": 32768, + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "chat", + "vision" + ], + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": "1_2", + "model_src": { + "huggingface": { + "model_id": "opendatalab/MinerU2.5-2509-1.2B", + "model_revision": "main", + "quantizations": [ + "none" + ] + }, + "modelscope": { + "model_id": "opendatalab/MinerU2.5-2509-1.2B", + "model_revision": "master", + "quantizations": [ + "none" + ] + } + } + } + ], + "architectures": [ + "Qwen2VLForConditionalGeneration" + ], + "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is 
string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}", + "stop_token_ids": [ + 151645, + 151643 + ], + "stop": [ + "<|im_end|>", + "<|endoftext|>" + ], + "version": 2, + "virtualenv": { + "packages": [ + "transformers>=4.45.0 ; #engine# == \"Transformers\"", + "mineru-vl-utils[transformers] ; #engine# == \"Transformers\"", + "vllm_dependencies ; #engine# == \"vllm\"", + "qwen-vl-utils", + "#system_torch#", + "#system_numpy#", + "qwen_omni_utils" + ] + }, + "featured": false, + "updated_at": 1770103567 } ] diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py index a8c1acf63f..3f43fd0741 100644 --- a/xinference/model/llm/vllm/core.py +++ b/xinference/model/llm/vllm/core.py @@ -1860,6 +1860,53 @@ async def _gen_tokens_prompt( prompt_token_ids=token_ids, multi_modal_data=multi_modal_data ) + def _handle_base64_images(self, messages, temp_files): + import base64 + import re + import tempfile + + # Regex to match data URI scheme + data_uri_pattern = re.compile( + r"data:([a-zA-Z0-9]+/[a-zA-Z0-9-.+]+);base64,(.*)" + ) + + for msg in messages: + if isinstance(msg, dict) and isinstance(msg.get("content"), list): + for content in msg["content"]: + if isinstance(content, dict): + # check image_url + if "image_url" in content and isinstance( + content["image_url"], dict + ): + url = 
content["image_url"].get("url", "") + if isinstance(url, str) and url.startswith("data:"): + match = data_uri_pattern.match(url) + if match: + mime_type, b64_data = match.groups() + try: + # Create temp file + suffix = ".bin" + if "pdf" in mime_type: + suffix = ".pdf" + elif "png" in mime_type: + suffix = ".png" + elif "jpeg" in mime_type or "jpg" in mime_type: + suffix = ".jpg" + + with tempfile.NamedTemporaryFile( + delete=False, suffix=suffix + ) as tmp: + tmp.write(base64.b64decode(b64_data)) + content["image_url"]["url"] = tmp.name + temp_files.append(tmp.name) + logger.debug( + f"Decoded base64 content to temp file: {tmp.name}" + ) + except Exception as e: + logger.error( + f"Failed to decode base64 file: {e}" + ) + @vllm_check async def async_chat( self, @@ -1878,6 +1925,14 @@ async def async_chat( process_vision_info, ) + # Pre-process messages to handle base64 data URIs BEFORE transform + temp_files: List[str] = [] + if ( + "vision" in self.model_family.model_ability + or "omni" in self.model_family.model_ability + ): + self._handle_base64_images(messages, temp_files) + messages = self._transform_messages(messages) chat_template_kwargs = ( diff --git a/xinference/ui/gradio/chat_interface.py b/xinference/ui/gradio/chat_interface.py index 99d9d3bc4f..c54ce32dac 100644 --- a/xinference/ui/gradio/chat_interface.py +++ b/xinference/ui/gradio/chat_interface.py @@ -416,6 +416,7 @@ def add_text(history, bot, text, image, video, audio): video, audio, ) + if image: buffered = BytesIO() with PIL.Image.open(image) as img: diff --git a/xinference/ui/gradio/media_interface.py b/xinference/ui/gradio/media_interface.py index 1a4272b721..d7794d146c 100644 --- a/xinference/ui/gradio/media_interface.py +++ b/xinference/ui/gradio/media_interface.py @@ -1611,6 +1611,170 @@ def toggle_additional_outputs(enable_viz): return ocr_interface + def document_parsing_interface(self) -> "gr.Blocks": + """Document parsing interface that supports PDF file uploads (for MinerU).""" + + def 
parse_document( + file_path: str, + backend: str = "hybrid-auto-engine", + parse_method: str = "auto", + language: str = "ch", + output_format: str = "markdown", + progress=gr.Progress(), + ) -> str: + from ...client import RESTfulClient + + if not file_path: + return "**Error**: Please upload a PDF or image file." + + client = RESTfulClient(self.endpoint) + client._set_token(self.access_token) + model = client.get_model(self.model_uid) + + if not hasattr(model, "ocr"): + return "**Error**: Model does not support OCR/document parsing." + + progress(0.1, desc="Reading file...") + + try: + # Read file content + with open(file_path, "rb") as f: + file_bytes = f.read() + progress(0.3, desc="Processing document...") + + # Call model's ocr method + response = model.ocr( + image=file_bytes, + backend=backend, + parse_method=parse_method, + language=language, + output_format=output_format, + return_dict=True, + ) + + progress(0.9, desc="Formatting output...") + + if isinstance(response, dict): + if response.get("success"): + result = response.get( + "markdown", response.get("text", "No content extracted") + ) + return result or "No content extracted" + else: + return f"**Error**: {response.get('error', 'Unknown error')}" + elif isinstance(response, str): + return response + else: + return str(response) + + except Exception as e: + logger.error(f"Document parsing error: {e}") + import traceback + + error_details = traceback.format_exc() + logger.error(f"Full traceback: {error_details}") + return f"""**Document Parsing Error** + +``` +{str(e)} +``` + +**Debug Info:** +- File: {file_path} +- Backend: {backend} +- Parse Method: {parse_method} +- Language: {language} +""" + finally: + progress(1.0, desc="Complete") + + with gr.Blocks() as doc_parsing_interface: + gr.Markdown(f"### 📄 Document Parsing with {self.model_name}") + gr.Markdown( + "Upload PDF or image files for high-precision document parsing to Markdown/JSON." 
+ ) + + with gr.Row(): + with gr.Column(scale=1): + # File upload that accepts PDF and images + file_input = gr.File( + label="Upload Document (PDF or Image)", + file_types=[ + ".pdf", + ".png", + ".jpg", + ".jpeg", + ".webp", + ".bmp", + ".gif", + ], + type="filepath", + ) + + gr.Markdown(f"**Current Model:** {self.model_name}") + + # MinerU-specific configuration + backend = gr.Dropdown( + choices=[ + "pipeline", # General mode + "vlm-auto-engine", # Local VLM high accuracy + "hybrid-auto-engine", # Hybrid mode (recommended) + ], + value="hybrid-auto-engine", + label="Backend", + info="pipeline: General, vlm: High accuracy (local), hybrid: Recommended", + ) + + parse_method = gr.Dropdown( + choices=["auto", "txt", "ocr"], + value="auto", + label="Parse Method", + info="auto: Auto-detect, txt: Text extraction, ocr: OCR for scanned documents", + ) + + language = gr.Dropdown( + choices=[ + "ch", # Chinese + "en", # English + "chinese_cht", # Traditional Chinese + ], + value="ch", + label="Document Language", + info="Select the primary language of your document", + ) + + output_format = gr.Dropdown( + choices=["markdown", "json"], + value="markdown", + label="Output Format", + ) + + parse_btn = gr.Button("Parse Document", variant="primary") + + with gr.Column(scale=1): + with gr.Group(elem_classes="output-container"): + gr.Markdown("### 📄 Parsing Results") + + result_output = gr.Markdown( + value="Parsed content will be displayed here...", + elem_classes="output-text", + container=False, + ) + + parse_btn.click( + fn=parse_document, + inputs=[ + file_input, + backend, + parse_method, + language, + output_format, + ], + outputs=result_output, + ) + + return doc_parsing_interface + def build_main_interface(self) -> "gr.Blocks": if self.model_type == "image": if "ocr" in self.model_ability: @@ -1731,6 +1895,9 @@ def build_main_interface(self) -> "gr.Blocks": if "ocr" in self.model_ability: with gr.Tab("OCR"): self.ocr_interface() + if "document-parsing" in 
self.model_ability: + with gr.Tab("Document Parsing"): + self.document_parsing_interface() if "text2image" in self.model_ability: with gr.Tab("Text to Image"): self.text2image_interface()