diff --git a/doc/source/models/builtin/llm/index.rst b/doc/source/models/builtin/llm/index.rst
index 4410b6153f..83d5382865 100644
--- a/doc/source/models/builtin/llm/index.rst
+++ b/doc/source/models/builtin/llm/index.rst
@@ -346,6 +346,11 @@ The following is a list of built-in LLM in Xinference:
      - 32768
      - Marco-o1: Towards Open Reasoning Models for Open-Ended Solutions
 
+   * - :ref:`mineru2.5-2509-1.2b <models_llm_mineru2.5-2509-1.2b>`
+     - chat, vision
+     - 32768
+     - MinerU2.5-2509-1.2B is a vision language model for document understanding.
+
    * - :ref:`minicpm-2b-dpo-bf16 <models_llm_minicpm-2b-dpo-bf16>`
      - chat
      - 4096
@@ -868,6 +873,8 @@ The following is a list of built-in LLM in Xinference:
   
    marco-o1
   
+   mineru2.5-2509-1.2b
+  
    minicpm-2b-dpo-bf16
   
    minicpm-2b-dpo-fp16
diff --git a/doc/source/models/builtin/llm/mineru2.5-2509-1.2b.rst b/doc/source/models/builtin/llm/mineru2.5-2509-1.2b.rst
new file mode 100644
index 0000000000..dc228cf857
--- /dev/null
+++ b/doc/source/models/builtin/llm/mineru2.5-2509-1.2b.rst
@@ -0,0 +1,31 @@
+.. _models_llm_mineru2.5-2509-1.2b:
+
+========================================
+MinerU2.5-2509-1.2B
+========================================
+
+- **Context Length:** 32768
+- **Model Name:** MinerU2.5-2509-1.2B
+- **Languages:** en, zh
+- **Abilities:** chat, vision
+- **Description:** MinerU2.5-2509-1.2B is a vision language model for document understanding.
+
+Specifications
+^^^^^^^^^^^^^^
+
+
+Model Spec 1 (pytorch, 1_2 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 1_2
+- **Quantizations:** none
+- **Engines**: Transformers
+- **Model ID:** opendatalab/MinerU2.5-2509-1.2B
+- **Model Hubs**:  `Hugging Face <https://huggingface.co/opendatalab/MinerU2.5-2509-1.2B>`__, `ModelScope <https://modelscope.cn/models/opendatalab/MinerU2.5-2509-1.2B>`__
+
+Execute the following command to launch the model. Remember to replace ``${engine}`` and
+``${quantization}`` with your chosen engine and quantization from the options listed above::
+
+   xinference launch --model-engine ${engine} --model-name MinerU2.5-2509-1.2B --size-in-billions 1_2 --model-format pytorch --quantization ${quantization}
+
diff --git a/xinference/model/image/ocr/__init__.py b/xinference/model/image/ocr/__init__.py
index f7be36bfd1..c7613a5aa2 100644
--- a/xinference/model/image/ocr/__init__.py
+++ b/xinference/model/image/ocr/__init__.py
@@ -15,7 +15,6 @@
 from .deepseek_ocr import DeepSeekOCRModel
 from .got_ocr2 import GotOCR2Model
 from .hunyuan_ocr import HunyuanOCRModel
-from .mineru import MinerUModel
 from .mlx import MLXDeepSeekOCRModel
 from .ocr_family import SUPPORTED_ENGINES
 from .paddleocr_vl import PaddleOCRVLModel
@@ -23,7 +22,6 @@
     VLLMDeepSeekOCRModel,
     VLLMGotOCR2Model,
     VLLMHunyuanOCRModel,
-    VLLMMinerUModel,
     VLLMPaddleOCRVLModel,
 )
 
@@ -31,7 +29,6 @@
     "DeepSeekOCRModel",
     "GotOCR2Model",
     "HunyuanOCRModel",
-    "MinerUModel",
     "PaddleOCRVLModel",
 ]
 
@@ -41,12 +38,10 @@ def register_builtin_ocr_engines() -> None:
         DeepSeekOCRModel,
         GotOCR2Model,
         HunyuanOCRModel,
-        MinerUModel,
         PaddleOCRVLModel,
     ]
     SUPPORTED_ENGINES["vllm"] = [
         VLLMDeepSeekOCRModel,
         VLLMHunyuanOCRModel,
-        VLLMMinerUModel,
     ]
     SUPPORTED_ENGINES["mlx"] = [MLXDeepSeekOCRModel]
diff --git a/xinference/model/image/ocr/mineru.py b/xinference/model/image/ocr/mineru.py
deleted file mode 100644
index efc4fac5ec..0000000000
--- a/xinference/model/image/ocr/mineru.py
+++ /dev/null
@@ -1,288 +0,0 @@
-# Copyright 2022-2026 XProbe Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import json
-import logging
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
-
-import PIL.Image
-
-if TYPE_CHECKING:
-    from ..core import ImageModelFamilyV2
-
-from .ocr_family import OCRModel
-
-logger = logging.getLogger(__name__)
-
-
-class MinerUModel(OCRModel):
-    """MinerU Vision-Language Model for document parsing and OCR.
-
-    MinerU2.5 is a 1.2B parameter vision-language model designed for
-    efficient high-resolution document parsing. It employs a two-stage
-    strategy: global layout analysis followed by fine-grained content recognition.
-    """
-
-    required_libs = ("transformers",)
-
-    @classmethod
-    def match(cls, model_family: "ImageModelFamilyV2") -> bool:
-        model_name = model_family.model_name
-        return model_name.startswith("MinerU")
-
-    def __init__(
-        self,
-        model_uid: str,
-        model_path: Optional[str] = None,
-        device: Optional[str] = None,
-        model_spec: Optional["ImageModelFamilyV2"] = None,
-        **kwargs,
-    ):
-        self.model_family = model_spec
-        self._model_uid = model_uid
-        self._model_path = model_path
-        self._device = device
-        # model info when loading
-        self._model = None
-        self._processor = None
-        self._client = None
-        # info
-        self._model_spec = model_spec
-        self._abilities = model_spec.model_ability or []  # type: ignore
-        self._kwargs = kwargs
-
-    @property
-    def model_ability(self):
-        return self._abilities
-
-    def load(self):
-        import torch
-        from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
-
-        try:
-            from mineru_vl_utils import MinerUClient
-        except ImportError:
-            raise ImportError(
-                "mineru-vl-utils is required for MinerU models. "
-                "Please install it with: pip install 'mineru-vl-utils[transformers]'"
-            )
-
-        logger.info(f"Loading MinerU model from {self._model_path}")
-
-        try:
-            # Determine device and dtype
-            if self._device == "cpu":
-                device_map = "cpu"
-                dtype = torch.float32
-            else:
-                device_map = "auto"
-                dtype = (
-                    torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
-                )
-
-            # Get torch_dtype from kwargs if specified
-            torch_dtype = self._kwargs.get("torch_dtype", dtype)
-            if isinstance(torch_dtype, str):
-                torch_dtype = getattr(torch, torch_dtype, dtype)
-
-            # Load model with Qwen2VL architecture
-            self._model = Qwen2VLForConditionalGeneration.from_pretrained(
-                self._model_path,
-                torch_dtype=torch_dtype,
-                device_map=device_map,
-                trust_remote_code=True,
-            )
-
-            # Load processor
-            try:
-                self._processor = AutoProcessor.from_pretrained(
-                    self._model_path,
-                    trust_remote_code=True,
-                    use_fast=True,
-                )
-            except ValueError:
-                # Fallback for when AutoProcessor cannot identify the processor type
-                from transformers import Qwen2VLProcessor
-
-                self._processor = Qwen2VLProcessor.from_pretrained(
-                    self._model_path,
-                    trust_remote_code=True,
-                    use_fast=True,
-                )
-
-            # Create MinerU client
-            self._client = MinerUClient(
-                backend="transformers",
-                model=self._model,
-                processor=self._processor,
-            )
-
-            logger.info(
-                f"MinerU model loaded successfully with device_map={device_map}, dtype={torch_dtype}"
-            )
-        except Exception as e:
-            logger.error(f"Failed to load MinerU model: {e}")
-            raise
-
-    def ocr(
-        self,
-        image: Union[PIL.Image.Image, List[PIL.Image.Image]],
-        **kwargs,
-    ) -> Union[str, List[str], Dict[str, Any]]:
-        """
-        Perform document parsing and OCR using MinerU vision-language model.
-
-        Args:
-            image: PIL Image or list of PIL Images
-            **kwargs: Additional parameters including:
-                - output_format: Output format ('markdown', 'json', 'text'), default: 'markdown'
-                - return_dict: Whether to return a dictionary with metadata (default: False)
-                - extract_mode: Extraction mode ('two_step', 'single_step'), default: 'two_step'
-
-        Returns:
-            Document content as string, list of strings, or dict
-        """
-        logger.info("MinerU OCR kwargs: %s", kwargs)
-
-        if self._client is None:
-            raise RuntimeError("Model not loaded. Please call load() first.")
-
-        # Extract parameters
-        output_format = kwargs.get("output_format", "markdown")
-        return_dict = kwargs.get("return_dict", False)
-        extract_mode = kwargs.get("extract_mode", "two_step")
-
-        # Handle single image input
-        if isinstance(image, PIL.Image.Image):
-            result = self._process_single(image, output_format, extract_mode)
-            if return_dict:
-                return {
-                    "text": result,
-                    "model": "mineru",
-                    "output_format": output_format,
-                    "success": True,
-                }
-            return result
-
-        # Handle batch image input
-        elif isinstance(image, list):
-            results = [
-                self._process_single(img, output_format, extract_mode) for img in image
-            ]
-            if return_dict:
-                return {
-                    "text": results,
-                    "model": "mineru",
-                    "output_format": output_format,
-                    "success": True,
-                    "num_images": len(results),
-                }
-            return results
-
-        else:
-            raise ValueError("Input must be a PIL Image or list of PIL Images")
-
-    def _process_single(
-        self, image: PIL.Image.Image, output_format: str, extract_mode: str
-    ) -> str:
-        """Process a single image with MinerU."""
-        assert self._client is not None, "Client not loaded. Call load() first."
-
-        # Convert image to RGB if needed
-        if image.mode in ["RGBA", "CMYK"]:
-            image = image.convert("RGB")
-
-        try:
-            # Use two-step extraction for better accuracy (default)
-            if extract_mode == "two_step":
-                extracted_blocks = self._client.two_step_extract(image)
-            else:
-                # Single step extraction (faster but less accurate)
-                extracted_blocks = self._client.extract(image)
-
-            # Format output based on requested format
-            if output_format == "json":
-                return json.dumps(extracted_blocks, ensure_ascii=False, indent=2)
-            elif output_format == "text":
-                return self._blocks_to_text(extracted_blocks)
-            else:  # markdown (default)
-                return self._blocks_to_markdown(extracted_blocks)
-
-        except Exception as e:
-            logger.error(f"MinerU processing failed: {e}")
-            raise
-
-    def _blocks_to_markdown(self, blocks: Any) -> str:
-        """Convert extracted blocks to markdown format."""
-        if isinstance(blocks, str):
-            return blocks
-
-        if isinstance(blocks, dict):
-            # Handle dict response with content
-            if "content" in blocks:
-                return str(blocks["content"])
-            if "text" in blocks:
-                return str(blocks["text"])
-            if "markdown" in blocks:
-                return str(blocks["markdown"])
-            return json.dumps(blocks, ensure_ascii=False, indent=2)
-
-        if isinstance(blocks, list):
-            result_parts = []
-            for block in blocks:
-                if isinstance(block, str):
-                    result_parts.append(block)
-                elif isinstance(block, dict):
-                    block_type = block.get("type", "text")
-                    content = block.get("content", block.get("text", ""))
-
-                    if block_type == "title":
-                        level = block.get("level", 1)
-                        result_parts.append(f"{'#' * level} {content}")
-                    elif block_type == "table":
-                        result_parts.append(str(content))
-                    elif block_type == "formula":
-                        result_parts.append(f"$${content}$$")
-                    elif block_type == "image":
-                        caption = block.get("caption", "")
-                        result_parts.append(f"![{caption}]({content})")
-                    else:
-                        result_parts.append(str(content))
-            return "\n\n".join(result_parts)
-
-        return str(blocks)
-
-    def _blocks_to_text(self, blocks: Any) -> str:
-        """Convert extracted blocks to plain text format."""
-        if isinstance(blocks, str):
-            return blocks
-
-        if isinstance(blocks, dict):
-            if "content" in blocks:
-                return str(blocks["content"])
-            if "text" in blocks:
-                return str(blocks["text"])
-            return json.dumps(blocks, ensure_ascii=False)
-
-        if isinstance(blocks, list):
-            result_parts = []
-            for block in blocks:
-                if isinstance(block, str):
-                    result_parts.append(block)
-                elif isinstance(block, dict):
-                    content = block.get("content", block.get("text", ""))
-                    result_parts.append(str(content))
-            return "\n".join(result_parts)
-
-        return str(blocks)
diff --git a/xinference/model/image/ocr/vllm.py b/xinference/model/image/ocr/vllm.py
index d67d7d816a..29793f0cd1 100644
--- a/xinference/model/image/ocr/vllm.py
+++ b/xinference/model/image/ocr/vllm.py
@@ -21,7 +21,6 @@
 from .deepseek_ocr import DeepSeekOCRModel
 from .got_ocr2 import GotOCR2Model
 from .hunyuan_ocr import HunyuanOCRModel
-from .mineru import MinerUModel
 from .paddleocr_vl import PaddleOCRVLModel
 
 logger = logging.getLogger(__name__)
@@ -307,44 +306,3 @@ def ocr(
 
 class VLLMPaddleOCRVLModel(PaddleOCRVLModel):
     required_libs = ("vllm",)
-
-
-class VLLMMinerUModel(MinerUModel):
-    """vLLM-based MinerU model for faster inference."""
-
-    required_libs = ("vllm",)
-
-    def load(self):
-        try:
-            from mineru_vl_utils import MinerUClient, MinerULogitsProcessor
-        except ImportError:
-            raise ImportError(
-                "mineru-vl-utils is required for MinerU models. "
-                "Please install it with: pip install 'mineru-vl-utils[vllm]'"
-            )
-
-        logger.info(f"Loading MinerU model with vLLM from {self._model_path}")
-
-        vllm_kwargs = _sanitize_vllm_kwargs(self._kwargs)
-
-        # Load vLLM model with MinerU logits processor
-        from vllm import LLM
-
-        self._model = LLM(
-            model=self._model_path,
-            logits_processors=[MinerULogitsProcessor],
-            **vllm_kwargs,
-        )
-
-        # Create MinerU client with vLLM backend
-        self._client = MinerUClient(
-            backend="vllm-engine",
-            vllm_llm=self._model,
-        )
-
-        logger.info("MinerU model loaded successfully with vLLM backend")
-
-    def stop(self):
-        _shutdown_vllm_model(self._model)
-        self._model = None
-        self._client = None
diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json
index 16ef78fc5f..ad4c6e4cc8 100644
--- a/xinference/model/llm/llm_family.json
+++ b/xinference/model/llm/llm_family.json
@@ -26028,5 +26028,66 @@
     ],
     "featured": false,
     "updated_at": 1770196377
+  },
+  {
+    "model_name": "MinerU2.5-2509-1.2B",
+    "model_description": "MinerU2.5-2509-1.2B is a vision language model for document understanding.",
+    "context_length": 32768,
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat",
+      "vision"
+    ],
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": "1_2",
+        "model_src": {
+          "huggingface": {
+            "model_id": "opendatalab/MinerU2.5-2509-1.2B",
+            "model_revision": "main",
+            "quantizations": [
+              "none"
+            ]
+          },
+          "modelscope": {
+            "model_id": "opendatalab/MinerU2.5-2509-1.2B",
+            "model_revision": "master",
+            "quantizations": [
+              "none"
+            ]
+          }
+        }
+      }
+    ],
+    "architectures": [
+      "Qwen2VLForConditionalGeneration"
+    ],
+    "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
+    "stop_token_ids": [
+      151645,
+      151643
+    ],
+    "stop": [
+      "<|im_end|>",
+      "<|endoftext|>"
+    ],
+    "version": 2,
+    "virtualenv": {
+      "packages": [
+        "transformers>=4.45.0 ; #engine# == \"Transformers\"",
+        "mineru-vl-utils[transformers] ; #engine# == \"Transformers\"",
+        "vllm_dependencies ; #engine# == \"vllm\"",
+        "qwen-vl-utils",
+        "#system_torch#",
+        "#system_numpy#",
+        "qwen_omni_utils"
+      ]
+    },
+    "featured": false,
+    "updated_at": 1770103567
   }
 ]
diff --git a/xinference/model/llm/vllm/core.py b/xinference/model/llm/vllm/core.py
index a8c1acf63f..3f43fd0741 100644
--- a/xinference/model/llm/vllm/core.py
+++ b/xinference/model/llm/vllm/core.py
@@ -1860,6 +1860,53 @@ async def _gen_tokens_prompt(
             prompt_token_ids=token_ids, multi_modal_data=multi_modal_data
         )
 
+    def _handle_base64_images(self, messages, temp_files):
+        import base64
+        import re
+        import tempfile
+
+        # Regex to match data URI scheme
+        data_uri_pattern = re.compile(
+            r"data:([a-zA-Z0-9]+/[a-zA-Z0-9-.+]+);base64,(.*)"
+        )
+
+        for msg in messages:
+            if isinstance(msg, dict) and isinstance(msg.get("content"), list):
+                for content in msg["content"]:
+                    if isinstance(content, dict):
+                        # check image_url
+                        if "image_url" in content and isinstance(
+                            content["image_url"], dict
+                        ):
+                            url = content["image_url"].get("url", "")
+                            if isinstance(url, str) and url.startswith("data:"):
+                                match = data_uri_pattern.match(url)
+                                if match:
+                                    mime_type, b64_data = match.groups()
+                                    try:
+                                        # Create temp file
+                                        suffix = ".bin"
+                                        if "pdf" in mime_type:
+                                            suffix = ".pdf"
+                                        elif "png" in mime_type:
+                                            suffix = ".png"
+                                        elif "jpeg" in mime_type or "jpg" in mime_type:
+                                            suffix = ".jpg"
+
+                                        with tempfile.NamedTemporaryFile(
+                                            delete=False, suffix=suffix
+                                        ) as tmp:
+                                            tmp.write(base64.b64decode(b64_data))
+                                            content["image_url"]["url"] = tmp.name
+                                            temp_files.append(tmp.name)
+                                            logger.debug(
+                                                f"Decoded base64 content to temp file: {tmp.name}"
+                                            )
+                                    except Exception as e:
+                                        logger.error(
+                                            f"Failed to decode base64 file: {e}"
+                                        )
+
     @vllm_check
     async def async_chat(
         self,
@@ -1878,6 +1925,14 @@ async def async_chat(
                 process_vision_info,
             )
 
+            # Pre-process messages to handle base64 data URIs BEFORE transform
+            temp_files: List[str] = []
+            if (
+                "vision" in self.model_family.model_ability
+                or "omni" in self.model_family.model_ability
+            ):
+                self._handle_base64_images(messages, temp_files)
+
             messages = self._transform_messages(messages)
 
             chat_template_kwargs = (
diff --git a/xinference/ui/gradio/chat_interface.py b/xinference/ui/gradio/chat_interface.py
index 99d9d3bc4f..c54ce32dac 100644
--- a/xinference/ui/gradio/chat_interface.py
+++ b/xinference/ui/gradio/chat_interface.py
@@ -416,6 +416,7 @@ def add_text(history, bot, text, image, video, audio):
                 video,
                 audio,
             )
+
             if image:
                 buffered = BytesIO()
                 with PIL.Image.open(image) as img:
diff --git a/xinference/ui/gradio/media_interface.py b/xinference/ui/gradio/media_interface.py
index 1a4272b721..d7794d146c 100644
--- a/xinference/ui/gradio/media_interface.py
+++ b/xinference/ui/gradio/media_interface.py
@@ -1611,6 +1611,170 @@ def toggle_additional_outputs(enable_viz):
 
         return ocr_interface
 
+    def document_parsing_interface(self) -> "gr.Blocks":
+        """Build the Gradio UI that parses an uploaded PDF/image via ``model.ocr``."""
+
+        def parse_document(
+            file_path: str,
+            backend: str = "hybrid-auto-engine",
+            parse_method: str = "auto",
+            language: str = "ch",
+            output_format: str = "markdown",
+            progress=gr.Progress(),
+        ) -> str:
+            from ...client import RESTfulClient
+
+            if not file_path:
+                return "**Error**: Please upload a PDF or image file."
+
+            client = RESTfulClient(self.endpoint)
+            client._set_token(self.access_token)
+            model = client.get_model(self.model_uid)
+
+            if not hasattr(model, "ocr"):
+                return "**Error**: Model does not support OCR/document parsing."
+
+            progress(0.1, desc="Reading file...")
+
+            try:
+                # Read the upload as raw bytes (it may be a PDF rather than an image)
+                with open(file_path, "rb") as f:
+                    file_bytes = f.read()
+                progress(0.3, desc="Processing document...")
+
+                # Parsing options are forwarded to model.ocr as keyword arguments
+                response = model.ocr(
+                    image=file_bytes,
+                    backend=backend,
+                    parse_method=parse_method,
+                    language=language,
+                    output_format=output_format,
+                    return_dict=True,
+                )
+
+                progress(0.9, desc="Formatting output...")
+
+                if isinstance(response, dict):
+                    if response.get("success"):
+                        result = response.get(
+                            "markdown", response.get("text", "No content extracted")
+                        )
+                        return result or "No content extracted"
+                    else:
+                        return f"**Error**: {response.get('error', 'Unknown error')}"
+                elif isinstance(response, str):
+                    return response
+                else:
+                    return str(response)
+
+            except Exception as e:
+                logger.error(f"Document parsing error: {e}")
+                import traceback
+
+                error_details = traceback.format_exc()
+                logger.error(f"Full traceback: {error_details}")
+                return f"""**Document Parsing Error**
+
+```
+{str(e)}
+```
+
+**Debug Info:**
+- File: {file_path}
+- Backend: {backend}
+- Parse Method: {parse_method}
+- Language: {language}
+"""
+            finally:
+                progress(1.0, desc="Complete")
+
+        with gr.Blocks() as doc_parsing_interface:
+            gr.Markdown(f"### 📄 Document Parsing with {self.model_name}")
+            gr.Markdown(
+                "Upload PDF or image files for high-precision document parsing to Markdown/JSON."
+            )
+
+            with gr.Row():
+                with gr.Column(scale=1):
+                    # File upload that accepts PDF and images
+                    file_input = gr.File(
+                        label="Upload Document (PDF or Image)",
+                        file_types=[
+                            ".pdf",
+                            ".png",
+                            ".jpg",
+                            ".jpeg",
+                            ".webp",
+                            ".bmp",
+                            ".gif",
+                        ],
+                        type="filepath",
+                    )
+
+                    gr.Markdown(f"**Current Model:** {self.model_name}")
+
+                    # MinerU-specific configuration
+                    backend = gr.Dropdown(
+                        choices=[
+                            "pipeline",  # General mode
+                            "vlm-auto-engine",  # Local VLM high accuracy
+                            "hybrid-auto-engine",  # Hybrid mode (recommended)
+                        ],
+                        value="hybrid-auto-engine",
+                        label="Backend",
+                        info="pipeline: General, vlm: High accuracy (local), hybrid: Recommended",
+                    )
+
+                    parse_method = gr.Dropdown(
+                        choices=["auto", "txt", "ocr"],
+                        value="auto",
+                        label="Parse Method",
+                        info="auto: Auto-detect, txt: Text extraction, ocr: OCR for scanned documents",
+                    )
+
+                    language = gr.Dropdown(
+                        choices=[
+                            "ch",  # Chinese
+                            "en",  # English
+                            "chinese_cht",  # Traditional Chinese
+                        ],
+                        value="ch",
+                        label="Document Language",
+                        info="Select the primary language of your document",
+                    )
+
+                    output_format = gr.Dropdown(
+                        choices=["markdown", "json"],
+                        value="markdown",
+                        label="Output Format",
+                    )
+
+                    parse_btn = gr.Button("Parse Document", variant="primary")
+
+                with gr.Column(scale=1):
+                    with gr.Group(elem_classes="output-container"):
+                        gr.Markdown("### 📄 Parsing Results")
+
+                        result_output = gr.Markdown(
+                            value="Parsed content will be displayed here...",
+                            elem_classes="output-text",
+                            container=False,
+                        )
+
+            parse_btn.click(
+                fn=parse_document,
+                inputs=[
+                    file_input,
+                    backend,
+                    parse_method,
+                    language,
+                    output_format,
+                ],
+                outputs=result_output,
+            )
+
+        return doc_parsing_interface
+
     def build_main_interface(self) -> "gr.Blocks":
         if self.model_type == "image":
             if "ocr" in self.model_ability:
@@ -1731,6 +1895,9 @@ def build_main_interface(self) -> "gr.Blocks":
             if "ocr" in self.model_ability:
                 with gr.Tab("OCR"):
                     self.ocr_interface()
+            if "document-parsing" in self.model_ability:
+                with gr.Tab("Document Parsing"):
+                    self.document_parsing_interface()
             if "text2image" in self.model_ability:
                 with gr.Tab("Text to Image"):
                     self.text2image_interface()