-
Notifications
You must be signed in to change notification settings - Fork 135
Closed
Description
Proposal: gRPC Service Definition from the REST API
I've been working on a gRPC service definition that maps 1:1 to the current REST API. I'd like to propose it to the community and offer to maintain both the data format spec and a mapping layer between the gRPC and REST APIs.
Motivation
A gRPC interface would provide a streaming HTTP/2 alternative to the REST API, enabling significant throughput improvements for pipeline processing workloads.
Approach
- All data types, relationships, and endpoints map 1:1 to the existing REST API.
- The gRPC definitions follow Buf linting and Google API Design standards, with dedicated request/response wrappers per RPC for backward compatibility.
- The goal is to always stay in sync with the REST API, with the eventual aim of offering a server-side gRPC API as well.
- gRPC-specific streaming RPCs can augment existing calls where it makes sense (e.g., watch/polling patterns for async tasks).
Maintenance
I understand the spec changes frequently. To keep up, I plan to introduce tooling to help automate gRPC definition updates as the REST API evolves.
Below is my initial proposal. It covers both the Docling document format and the serve API.
syntax = "proto3";
package ai.docling.core.v1;
option java_multiple_files = true;
option java_outer_classname = "DoclingDocumentProto";
option java_package = "ai.docling.core.v1";
// Docling Document Structure Protocol Buffers Definition
//
// This proto file defines the complete structure for documents processed by Docling,
// providing a 1:1 mapping of the Docling JSON schema to protobuf. Docling is an
// advanced document parsing system that extracts rich semantic structure from PDFs
// and other document formats.
//
// Document Structure Overview:
// ===========================
//
// A DoclingDocument contains:
// - Hierarchical structure (body, groups)
// - Text content (titles, headers, paragraphs, lists, code, formulas)
// - Visual elements (pictures with AI-generated descriptions)
// - Tabular data (tables with cell-level structure)
// - Form data (key-value pairs, form fields)
// - Page metadata (size, images)
// - Provenance tracking (bounding boxes, page numbers)
// Root message: one complete document as parsed by Docling.
message DoclingDocument {
  // Identifier of the schema this document follows, used for versioning
  // (e.g., "docling_document_v2").
  optional string schema_name = 1;

  // Version number of the Docling schema that was used.
  optional string version = 2;

  // Human-readable document name or title.
  string name = 3;

  // Source-document metadata (file info, hash, etc.).
  optional DocumentOrigin origin = 4;

  // Root body group holding the main document structure.
  GroupItem body = 5;

  // Further groups for logical sections (chapters, sections, etc.).
  repeated GroupItem groups = 6;

  // Every text item in the document (titles, paragraphs, lists, etc.).
  repeated BaseTextItem texts = 7;

  // Every picture/image item in the document.
  repeated PictureItem pictures = 8;

  // Every table item in the document.
  repeated TableItem tables = 9;

  // Key-value pairs extracted from forms or structured data.
  repeated KeyValueItem key_value_items = 10;

  // Form elements detected in the document.
  repeated FormItem form_items = 11;

  // Page metadata keyed by page number; the key is a string.
  map<string, PageItem> pages = 12;
}
// Metadata describing the source file a document was parsed from.
message DocumentOrigin {
  // MIME type of the source file.
  string mimetype = 1;

  // Hash of the source file's binary content, for integrity verification.
  string binary_hash = 2;

  // Original filename of the source document.
  string filename = 3;

  // URI/URL the document was retrieved from, when known.
  optional string uri = 4;
}
// Semantic layer of the document in which a piece of content appears.
enum ContentLayer {
  // Zero value; what a reader sees when no layer was set.
  CONTENT_LAYER_UNSPECIFIED = 0;

  CONTENT_LAYER_BODY = 1;
  CONTENT_LAYER_FURNITURE = 2;
  CONTENT_LAYER_BACKGROUND = 3;
  CONTENT_LAYER_INVISIBLE = 4;
  CONTENT_LAYER_NOTES = 5;
}
// Semantic type of a group within the document hierarchy.
enum GroupLabel {
  // Zero value; what a reader sees when no label was set.
  GROUP_LABEL_UNSPECIFIED = 0;

  GROUP_LABEL_LIST = 1;
  GROUP_LABEL_ORDERED_LIST = 2;
  GROUP_LABEL_CHAPTER = 3;
  GROUP_LABEL_SECTION = 4;
  GROUP_LABEL_SHEET = 5;
  GROUP_LABEL_SLIDE = 6;
  GROUP_LABEL_FORM_AREA = 7;
  GROUP_LABEL_KEY_VALUE_AREA = 8;
  GROUP_LABEL_COMMENT_SECTION = 9;
  GROUP_LABEL_INLINE = 10;
  GROUP_LABEL_PICTURE_AREA = 11;
}
// A logical grouping of document elements.
message GroupItem {
  // Reference string for this group itself.
  string self_ref = 1;

  // Reference to the parent item, if any.
  optional RefItem parent = 2;

  // References to the child items.
  repeated RefItem children = 3;

  // Content layer the group belongs to.
  ContentLayer content_layer = 4;

  // Optional metadata attached to the group.
  optional BaseMeta meta = 5;

  // Optional group name.
  optional string name = 6;

  // Semantic type of the group.
  GroupLabel label = 7;
}
// A reference to another item in the document, expressed as a JSON Pointer.
message RefItem {
  // The JSON Pointer string.
  string ref = 1;
}
// Metadata fields shared by most document items.
message BaseMeta {
  // Optional summary of the item.
  optional SummaryMetaField summary = 1;
}
// An AI-generated text summary, optionally with a confidence value.
message SummaryMetaField {
  // Confidence attached to the summary, when reported.
  optional double confidence = 1;

  // Identifier of whatever produced the summary, when reported.
  optional string created_by = 2;

  // The summary text.
  string text = 3;
}
// Semantic type of a document content item.
enum DocItemLabel {
  // Zero value; what a reader sees when no label was set.
  DOC_ITEM_LABEL_UNSPECIFIED = 0;

  DOC_ITEM_LABEL_CAPTION = 1;
  DOC_ITEM_LABEL_CHART = 2;
  DOC_ITEM_LABEL_CHECKBOX_SELECTED = 3;
  DOC_ITEM_LABEL_CHECKBOX_UNSELECTED = 4;
  DOC_ITEM_LABEL_CODE = 5;
  DOC_ITEM_LABEL_DOCUMENT_INDEX = 6;
  DOC_ITEM_LABEL_EMPTY_VALUE = 7;
  DOC_ITEM_LABEL_FOOTNOTE = 8;
  DOC_ITEM_LABEL_FORM = 9;
  DOC_ITEM_LABEL_FORMULA = 10;
  DOC_ITEM_LABEL_GRADING_SCALE = 11;
  DOC_ITEM_LABEL_HANDWRITTEN_TEXT = 12;
  DOC_ITEM_LABEL_KEY_VALUE_REGION = 13;
  DOC_ITEM_LABEL_LIST_ITEM = 14;
  DOC_ITEM_LABEL_PAGE_FOOTER = 15;
  DOC_ITEM_LABEL_PAGE_HEADER = 16;
  DOC_ITEM_LABEL_PARAGRAPH = 17;
  DOC_ITEM_LABEL_PICTURE = 18;
  DOC_ITEM_LABEL_REFERENCE = 19;
  DOC_ITEM_LABEL_SECTION_HEADER = 20;
  DOC_ITEM_LABEL_TABLE = 21;
  DOC_ITEM_LABEL_TEXT = 22;
  DOC_ITEM_LABEL_TITLE = 23;
}
// Vertical positioning of text relative to the line.
enum Script {
  // Zero value; what a reader sees when no script was set.
  SCRIPT_UNSPECIFIED = 0;

  SCRIPT_BASELINE = 1;
  SCRIPT_SUB = 2;
  SCRIPT_SUPER = 3;
}
// Text formatting/styling flags for a text item.
message Formatting {
  // Style flags; each is false when unset.
  bool bold = 1;
  bool italic = 2;
  bool underline = 3;
  bool strikethrough = 4;

  // Vertical positioning of the text.
  Script script = 5;
}
// Union wrapper: holds exactly one kind of text-based document item.
message BaseTextItem {
  // The concrete text item; at most one member is set.
  oneof item {
    TitleItem title = 1;
    SectionHeaderItem section_header = 2;
    ListItem list_item = 3;
    CodeItem code = 4;
    FormulaItem formula = 5;
    TextItem text = 6;
  }
}
// Fields shared by every text-based item; embedded as `base` in each of them.
message TextItemBase {
  // Reference string for this item itself.
  string self_ref = 1;

  // Reference to the parent item, if any.
  optional RefItem parent = 2;

  // References to the child items.
  repeated RefItem children = 3;

  // Content layer the item belongs to.
  ContentLayer content_layer = 4;

  // Optional metadata attached to the item.
  optional BaseMeta meta = 5;

  // Semantic type of the item.
  DocItemLabel label = 6;

  // Locations of this content in the source document.
  repeated ProvenanceItem prov = 7;

  // NOTE(review): presumably the untouched source text, with `text` being the
  // processed form - confirm against the REST schema.
  string orig = 8;
  string text = 9;

  // Optional styling of the text.
  optional Formatting formatting = 10;

  // Optional hyperlink attached to the text.
  optional string hyperlink = 11;
}
// A document title or major heading.
message TitleItem {
  // Common text-item fields.
  TextItemBase base = 1;
}
// A section header together with its level in the heading hierarchy.
message SectionHeaderItem {
  // Common text-item fields.
  TextItemBase base = 1;

  // Hierarchical level of the header.
  int32 level = 2;
}
// One entry of a bulleted or numbered list.
message ListItem {
  // Common text-item fields.
  TextItemBase base = 1;

  // Whether the entry is enumerated (numbered list).
  bool enumerated = 2;

  // Optional marker string of the entry.
  optional string marker = 3;
}
// A code block, with floating-element metadata and an optional language tag.
message CodeItem {
  // Common text-item fields.
  TextItemBase base = 1;

  // Floating-element metadata.
  // NOTE(review): `base` already carries a `meta` field of type BaseMeta;
  // confirm that having both is intended.
  optional FloatingMeta meta = 2;

  // References to caption, reference and footnote items.
  repeated RefItem captions = 3;
  repeated RefItem references = 4;
  repeated RefItem footnotes = 5;

  // Optional image of the code block.
  optional ImageRef image = 6;

  // Optional programming language of the code.
  optional string code_language = 7;
}
// A mathematical formula or equation.
message FormulaItem {
  // Common text-item fields.
  TextItemBase base = 1;
}
// Generic text content (paragraphs, captions, etc.).
message TextItem {
  // Common text-item fields.
  TextItemBase base = 1;
}
// Where a piece of content is located in the source document.
message ProvenanceItem {
  // Page number the content is on.
  int32 page_no = 1;

  // Region of the page occupied by the content.
  BoundingBox bbox = 2;

  // Character span of the content.
  // NOTE(review): presumably a [start, end] pair - confirm against the REST
  // schema.
  repeated int32 charspan = 3;
}
// A rectangular region expressed in page coordinates.
message BoundingBox {
  // NOTE(review): presumably left / top / right / bottom edges - confirm.
  double l = 1;
  double t = 2;
  double r = 3;
  double b = 4;

  // Optional coordinate-origin identifier for the values above.
  optional string coord_origin = 5;
}
// Reference to an embedded image plus its basic properties.
message ImageRef {
  // MIME type of the image.
  string mimetype = 1;

  // Resolution of the image in DPI.
  int32 dpi = 2;

  // Dimensions of the image.
  Size size = 3;

  // URI of the image data.
  string uri = 4;
}
// Two-dimensional extent: a width and a height.
message Size {
  double width = 1;
  double height = 2;
}
// PictureItem represents an image or figure in the document.
message PictureItem {
  // Reference string for this item itself. Declared without `optional`, the
  // same as on GroupItem, TextItemBase, TableItem, KeyValueItem and FormItem.
  string self_ref = 1;

  // Reference to the parent item, if any. Marked `optional` to match the
  // sibling item messages; field number and wire format are unchanged.
  optional RefItem parent = 2;

  // References to the child items.
  repeated RefItem children = 3;

  // Content layer the picture belongs to.
  ContentLayer content_layer = 4;

  // Optional picture-specific metadata.
  optional PictureMeta meta = 5;

  // Label of the item.
  // NOTE(review): a plain string here, whereas TextItemBase.label uses the
  // DocItemLabel enum - confirm which form is intended.
  string label = 6;

  // Locations of the picture in the source document.
  repeated ProvenanceItem prov = 7;

  // References to caption, reference and footnote items.
  repeated RefItem captions = 8;
  repeated RefItem references = 9;
  repeated RefItem footnotes = 10;

  // Optional image data for the picture.
  optional ImageRef image = 11;
}
// Metadata for pictures, including the results of AI analysis.
// Every member is optional.
message PictureMeta {
  optional SummaryMetaField summary = 1;
  optional DescriptionMetaField description = 2;
  optional PictureClassificationMetaField classification = 3;
  optional MoleculeMetaField molecule = 4;
  optional TabularChartMetaField tabular_chart = 5;
}
// An AI-generated detailed description.
message DescriptionMetaField {
  // Confidence attached to the description, when reported.
  optional double confidence = 1;

  // Identifier of whatever produced the description, when reported.
  optional string created_by = 2;

  // The description text.
  string text = 3;
}
// AI classification results for a picture.
message PictureClassificationMetaField {
  // The individual class predictions.
  repeated PictureClassificationPrediction predictions = 1;
}
// One classification result for a picture.
message PictureClassificationPrediction {
  // Confidence attached to the prediction, when reported.
  optional double confidence = 1;

  // Identifier of whatever produced the prediction, when reported.
  optional string created_by = 2;

  // Name of the predicted class.
  string class_name = 3;
}
// Chemical structure information attached to a picture.
message MoleculeMetaField {
  // Confidence attached to the result, when reported.
  optional double confidence = 1;

  // Identifier of whatever produced the result, when reported.
  optional string created_by = 2;

  // NOTE(review): presumably a SMILES string - confirm against the REST
  // schema.
  string smi = 3;
}
// Chart data extracted from an image, in tabular form.
message TabularChartMetaField {
  // Confidence attached to the extraction, when reported.
  optional double confidence = 1;

  // Identifier of whatever produced the extraction, when reported.
  optional string created_by = 2;

  // Optional chart title.
  optional string title = 3;

  // The chart's data as a table.
  TableData chart_data = 4;
}
// Metadata for floating elements (tables, code blocks, figures).
// Both members are optional.
message FloatingMeta {
  optional SummaryMetaField summary = 1;
  optional DescriptionMetaField description = 2;
}
// A table in the document, including its full cell structure.
message TableItem {
  // Reference string for this item itself.
  string self_ref = 1;

  // Reference to the parent item, if any.
  optional RefItem parent = 2;

  // References to the child items.
  repeated RefItem children = 3;

  // Content layer the table belongs to.
  ContentLayer content_layer = 4;

  // Optional floating-element metadata.
  optional FloatingMeta meta = 5;

  // Label of the item (plain string).
  string label = 6;

  // Locations of the table in the source document.
  repeated ProvenanceItem prov = 7;

  // References to caption, reference and footnote items.
  repeated RefItem captions = 8;
  repeated RefItem references = 9;
  repeated RefItem footnotes = 10;

  // Optional image of the table.
  optional ImageRef image = 11;

  // Structure and content of the table.
  TableData data = 12;
}
// Complete structure and content of a table.
message TableData {
  // The table's cells as a flat list.
  repeated TableCell table_cells = 1;

  // Number of rows and columns.
  int32 num_rows = 2;
  int32 num_cols = 3;

  // The table's cells arranged as rows.
  repeated TableRow grid = 4;
}
// One row of a table.
message TableRow {
  // Cells of the row.
  repeated TableCell cells = 1;
}
// One cell of a table.
message TableCell {
  // Region occupied by the cell.
  BoundingBox bbox = 1;

  // Number of rows / columns the cell spans.
  int32 row_span = 2;
  int32 col_span = 3;

  // Start/end offsets of the cell along the row axis.
  int32 start_row_offset_idx = 4;
  int32 end_row_offset_idx = 5;

  // Start/end offsets of the cell along the column axis.
  int32 start_col_offset_idx = 6;
  int32 end_col_offset_idx = 7;

  // Text content of the cell.
  string text = 8;

  // Role flags; each is false when unset.
  bool column_header = 9;
  bool row_header = 10;
  bool row_section = 11;
  bool fillable = 12;
}
// A key-value pair extracted from forms.
message KeyValueItem {
  // Reference string for this item itself.
  string self_ref = 1;

  // Reference to the parent item, if any.
  optional RefItem parent = 2;

  // References to the child items.
  repeated RefItem children = 3;

  // Content layer the item belongs to.
  ContentLayer content_layer = 4;

  // Optional floating-element metadata.
  optional FloatingMeta meta = 5;

  // Label of the item (plain string).
  string label = 6;

  // Locations of the item in the source document.
  repeated ProvenanceItem prov = 7;

  // References to caption, reference and footnote items.
  repeated RefItem captions = 8;
  repeated RefItem references = 9;
  repeated RefItem footnotes = 10;

  // Optional image of the item.
  optional ImageRef image = 11;

  // Key-value relationships expressed as a graph.
  GraphData graph = 12;
}
// Key-value relationships modelled as a graph of cells and links.
message GraphData {
  // Nodes of the graph.
  repeated GraphCell cells = 1;

  // Edges of the graph.
  repeated GraphLink links = 2;
}
// Role a cell plays in a key-value graph.
enum GraphCellLabel {
  // Zero value; what a reader sees when no label was set.
  GRAPH_CELL_LABEL_UNSPECIFIED = 0;

  GRAPH_CELL_LABEL_KEY = 1;
  GRAPH_CELL_LABEL_VALUE = 2;
  GRAPH_CELL_LABEL_CHECKBOX = 3;
}
// A node of the key-value graph.
message GraphCell {
  // Role of the cell.
  GraphCellLabel label = 1;

  // Identifier of the cell; GraphLink refers to cells by such ids.
  int32 cell_id = 2;

  // Text fields of the cell.
  string text = 3;
  string orig = 4;

  // Optional location of the cell in the source document.
  optional ProvenanceItem prov = 5;

  // Optional reference to a document item.
  optional RefItem item_ref = 6;
}
// Kind of relationship a link expresses between two cells.
enum GraphLinkLabel {
  // Zero value; what a reader sees when no label was set.
  GRAPH_LINK_LABEL_UNSPECIFIED = 0;

  GRAPH_LINK_LABEL_TO_VALUE = 1;
  GRAPH_LINK_LABEL_TO_KEY = 2;
  GRAPH_LINK_LABEL_TO_PARENT = 3;
  GRAPH_LINK_LABEL_TO_CHILD = 4;
}
// An edge of the key-value graph.
message GraphLink {
  // Kind of relationship.
  GraphLinkLabel label = 1;

  // Ids of the cells at the two ends of the edge.
  int32 source_cell_id = 2;
  int32 target_cell_id = 3;
}
// A form element in the document.
message FormItem {
  // Reference string for this item itself.
  string self_ref = 1;

  // Reference to the parent item, if any.
  optional RefItem parent = 2;

  // References to the child items.
  repeated RefItem children = 3;

  // Content layer the item belongs to.
  ContentLayer content_layer = 4;

  // Optional floating-element metadata.
  optional FloatingMeta meta = 5;

  // Label of the item (plain string).
  string label = 6;

  // Locations of the item in the source document.
  repeated ProvenanceItem prov = 7;

  // References to caption, reference and footnote items.
  repeated RefItem captions = 8;
  repeated RefItem references = 9;
  repeated RefItem footnotes = 10;

  // Optional image of the item.
  optional ImageRef image = 11;

  // Form relationships expressed as a graph.
  GraphData graph = 12;
}
// PageItem represents metadata about a single page in the document.
message PageItem {
  // Dimensions of the page.
  Size size = 1;

  // Optional image of the page.
  optional ImageRef image = 2;

  // Page number.
  int32 page_no = 3;
}
Serve types:
syntax = "proto3";

package ai.docling.serve.v1;

import "ai/docling/core/v1/docling_document.proto";
import "google/protobuf/struct.proto";

option java_multiple_files = true;
option java_outer_classname = "DoclingServeTypesProto";
option java_package = "ai.docling.serve.v1";
// ============================================================================
// Enumerations - 1:1 mapping of the REST API enum types
// ============================================================================
// Document formats Docling accepts as input.
enum InputFormat {
  // Zero value; what a reader sees when no format was set.
  INPUT_FORMAT_UNSPECIFIED = 0;

  INPUT_FORMAT_ASCIIDOC = 1;
  INPUT_FORMAT_AUDIO = 2;
  INPUT_FORMAT_CSV = 3;
  INPUT_FORMAT_DOCX = 4;
  INPUT_FORMAT_HTML = 5;
  INPUT_FORMAT_IMAGE = 6;
  INPUT_FORMAT_JSON_DOCLING = 7;
  INPUT_FORMAT_MD = 8;
  INPUT_FORMAT_METS_GBS = 9;
  INPUT_FORMAT_PDF = 10;
  INPUT_FORMAT_PPTX = 11;
  INPUT_FORMAT_XLSX = 12;
  INPUT_FORMAT_XML_JATS = 13;
  INPUT_FORMAT_XML_USPTO = 14;
}
// Formats a converted document can be exported to.
enum OutputFormat {
  // Zero value; what a reader sees when no format was set.
  OUTPUT_FORMAT_UNSPECIFIED = 0;

  OUTPUT_FORMAT_DOCTAGS = 1;
  OUTPUT_FORMAT_HTML = 2;
  OUTPUT_FORMAT_HTML_SPLIT_PAGE = 3;
  OUTPUT_FORMAT_JSON = 4;
  OUTPUT_FORMAT_MD = 5;
  OUTPUT_FORMAT_TEXT = 6;
}
// Selects which OCR engine to use.
enum OcrEngine {
  // Zero value; what a reader sees when no engine was set.
  OCR_ENGINE_UNSPECIFIED = 0;

  OCR_ENGINE_AUTO = 1;
  OCR_ENGINE_EASYOCR = 2;
  OCR_ENGINE_OCRMAC = 3;
  OCR_ENGINE_RAPIDOCR = 4;
  OCR_ENGINE_TESSEROCR = 5;
  OCR_ENGINE_TESSERACT = 6;
}
// Selects the backend used to process PDF input.
enum PdfBackend {
  // Zero value; what a reader sees when no backend was set.
  PDF_BACKEND_UNSPECIFIED = 0;

  PDF_BACKEND_DLPARSE_V1 = 1;
  PDF_BACKEND_DLPARSE_V2 = 2;
  PDF_BACKEND_DLPARSE_V4 = 3;
  PDF_BACKEND_PYPDFIUM2 = 4;
}
// Mode used for table structure extraction.
enum TableFormerMode {
  // Zero value; what a reader sees when no mode was set.
  TABLE_FORMER_MODE_UNSPECIFIED = 0;

  TABLE_FORMER_MODE_ACCURATE = 1;
  TABLE_FORMER_MODE_FAST = 2;
}
// Selects the pipeline used to process a document.
enum ProcessingPipeline {
  // Zero value; what a reader sees when no pipeline was set.
  PROCESSING_PIPELINE_UNSPECIFIED = 0;

  PROCESSING_PIPELINE_ASR = 1;
  PROCESSING_PIPELINE_STANDARD = 2;
  PROCESSING_PIPELINE_VLM = 3;
}
// How images are referenced in exported output.
enum ImageRefMode {
  // Zero value; what a reader sees when no mode was set.
  IMAGE_REF_MODE_UNSPECIFIED = 0;

  IMAGE_REF_MODE_EMBEDDED = 1;
  IMAGE_REF_MODE_PLACEHOLDER = 2;
  IMAGE_REF_MODE_REFERENCED = 3;
}
// Preset vision-language models that can be selected by name.
enum VlmModelType {
  // Zero value; what a reader sees when no preset was set.
  VLM_MODEL_TYPE_UNSPECIFIED = 0;

  VLM_MODEL_TYPE_SMOLDOCLING = 1;
  VLM_MODEL_TYPE_SMOLDOCLING_VLLM = 2;
  VLM_MODEL_TYPE_GRANITE_VISION = 3;
  VLM_MODEL_TYPE_GRANITE_VISION_VLLM = 4;
  VLM_MODEL_TYPE_GRANITE_VISION_OLLAMA = 5;
  VLM_MODEL_TYPE_GOT_OCR_2 = 6;
}
// Format in which a VLM model returns its output.
enum ResponseFormat {
  // Zero value; what a reader sees when no format was set.
  RESPONSE_FORMAT_UNSPECIFIED = 0;

  RESPONSE_FORMAT_DOCTAGS = 1;
  RESPONSE_FORMAT_MARKDOWN = 2;
  RESPONSE_FORMAT_HTML = 3;
  RESPONSE_FORMAT_OTSL = 4;
  RESPONSE_FORMAT_PLAINTEXT = 5;
}
// Framework used to run a local VLM model.
enum InferenceFramework {
  // Zero value; what a reader sees when no framework was set.
  INFERENCE_FRAMEWORK_UNSPECIFIED = 0;

  INFERENCE_FRAMEWORK_MLX = 1;
  INFERENCE_FRAMEWORK_TRANSFORMERS = 2;
  INFERENCE_FRAMEWORK_VLLM = 3;
}
// Which transformers auto-model class to use.
enum TransformersModelType {
  // Zero value; what a reader sees when no type was set.
  TRANSFORMERS_MODEL_TYPE_UNSPECIFIED = 0;

  TRANSFORMERS_MODEL_TYPE_AUTOMODEL = 1;
  TRANSFORMERS_MODEL_TYPE_AUTOMODEL_VISION2SEQ = 2;
  TRANSFORMERS_MODEL_TYPE_AUTOMODEL_CAUSALLM = 3;
  TRANSFORMERS_MODEL_TYPE_AUTOMODEL_IMAGETEXTTOTEXT = 4;
}
// Status of an asynchronous task.
enum TaskStatus {
  // Zero value; what a reader sees when no status was set.
  TASK_STATUS_UNSPECIFIED = 0;

  TASK_STATUS_PENDING = 1;
  TASK_STATUS_STARTED = 2;
  TASK_STATUS_SUCCESS = 3;
  TASK_STATUS_FAILURE = 4;
}
// ============================================================================
// Source Types - where documents come from
// ============================================================================
// A document input source; exactly one concrete source kind is carried.
message Source {
  // The concrete source; at most one member is set.
  oneof source {
    FileSource file = 1;
    HttpSource http = 2;
    S3Source s3 = 3;
  }
}
// A document supplied inline as base64-encoded content.
message FileSource {
  // File content, base64-encoded.
  string base64_string = 1;

  // Original name of the file.
  string filename = 2;
}
// A document to be fetched from an HTTP URL.
message HttpSource {
  // URL the document is fetched from.
  string url = 1;

  // Optional HTTP headers to send (e.g., authentication).
  map<string, string> headers = 2;
}
// S3Source provides a document from an S3-compatible store.
message S3Source {
  // Endpoint of the S3-compatible store.
  string endpoint = 1;

  // Credentials for the store. `secret_key` is sensitive: keep it out of
  // logs and traces.
  string access_key = 2;
  string secret_key = 3;

  // Bucket to read from.
  string bucket = 4;

  // Optional key prefix inside the bucket.
  optional string key_prefix = 5;

  // Whether to verify SSL certificates. Declared `optional` so the server
  // can tell "not set" apart from an explicit `false`; with implicit
  // presence an omitted value would read as `false`. Wire-compatible with
  // the plain `bool` form.
  // NOTE(review): confirm the REST default so the server can apply it when
  // this field is absent.
  optional bool verify_ssl = 6;
}
// ============================================================================
// Target Types - where results go
// ============================================================================
// A destination for results; exactly one concrete target kind is carried.
message Target {
  // The concrete target; at most one member is set.
  oneof target {
    InBodyTarget inbody = 1;
    PutTarget put = 2;
    S3Target s3 = 3;
    ZipTarget zip = 4;
  }
}
// Results are returned in the response body (the default). Carries no fields.
message InBodyTarget {}
// Results are delivered with an HTTP PUT.
message PutTarget {
  // URL the results are sent to.
  string url = 1;
}
// S3Target sends results to an S3-compatible store.
message S3Target {
  // Endpoint of the S3-compatible store.
  string endpoint = 1;

  // Credentials for the store. `secret_key` is sensitive: keep it out of
  // logs and traces.
  string access_key = 2;
  string secret_key = 3;

  // Bucket to write to.
  string bucket = 4;

  // Optional key prefix inside the bucket.
  optional string key_prefix = 5;

  // Whether to verify SSL certificates. Declared `optional` so the server
  // can tell "not set" apart from an explicit `false`; with implicit
  // presence an omitted value would read as `false`. Wire-compatible with
  // the plain `bool` form.
  // NOTE(review): confirm the REST default so the server can apply it when
  // this field is absent.
  optional bool verify_ssl = 6;
}
// Results are returned as a ZIP archive. Carries no fields.
message ZipTarget {}
// ============================================================================
// Convert Options
// ============================================================================
// Settings for describing pictures with a locally run VLM.
message PictureDescriptionLocal {
  // Hugging Face repository ID of the model.
  string repo_id = 1;

  // Prompt passed to the model, if any.
  optional string prompt = 2;

  // Free-form generation config parameters, if any.
  map<string, google.protobuf.Value> generation_config = 3;
}
// Settings for describing pictures with a VLM reached over an API.
message PictureDescriptionApi {
  // URL of the API endpoint.
  string url = 1;

  // HTTP headers to send, if any.
  map<string, string> headers = 2;

  // Free-form model parameters, if any.
  map<string, google.protobuf.Value> params = 3;

  // Request timeout, in seconds.
  optional double timeout = 4;

  // Maximum number of concurrent requests.
  optional int32 concurrency = 5;

  // Prompt passed to the model, if any.
  optional string prompt = 6;
}
// Settings for a locally run vision-language model in the VLM pipeline.
// Every scalar member is optional.
message VlmModelLocal {
  optional string repo_id = 1;
  optional string prompt = 2;
  optional int32 scale = 3;
  optional ResponseFormat response_format = 4;
  optional InferenceFramework inference_framework = 5;
  optional TransformersModelType transformers_model_type = 6;

  // Free-form extra generation config.
  map<string, google.protobuf.Value> extra_generation_config = 7;
}
// Settings for an API-based vision-language model in the VLM pipeline.
message VlmModelApi {
  optional string url = 1;

  // HTTP headers and free-form model parameters.
  map<string, string> headers = 2;
  map<string, google.protobuf.Value> params = 3;

  optional double timeout = 4;
  optional int32 concurrency = 5;
  optional string prompt = 6;
  optional int32 scale = 7;
  optional ResponseFormat response_format = 8;
}
// All conversion settings of the REST API, mirrored field for field.
message ConvertDocumentOptions {
  // Input format(s) to accept.
  repeated InputFormat from_formats = 1;

  // Output format(s) to produce.
  repeated OutputFormat to_formats = 2;

  // How images are exported.
  optional ImageRefMode image_export_mode = 3;

  // Turn OCR processing on.
  optional bool do_ocr = 4;

  // Replace existing text with the OCR output.
  optional bool force_ocr = 5;

  // Which OCR engine to use.
  optional OcrEngine ocr_engine = 6;

  // Language codes for OCR.
  repeated string ocr_lang = 7;

  // Which PDF backend to use.
  optional PdfBackend pdf_backend = 8;

  // Table structure mode.
  optional TableFormerMode table_mode = 9;

  // Match table cells to the PDF cells.
  optional bool table_cell_matching = 10;

  // Which processing pipeline to run.
  optional ProcessingPipeline pipeline = 11;

  // Range of pages to process (1-indexed).
  repeated int32 page_range = 12;

  // Timeout per document, in seconds.
  optional double document_timeout = 13;

  // Stop at the first error.
  optional bool abort_on_error = 14;

  // Extract the structure of tables.
  optional bool do_table_structure = 15;

  // Extract images.
  optional bool include_images = 16;

  // Scale factor applied to images.
  optional double images_scale = 17;

  // Placeholder emitted at page breaks in Markdown output.
  optional string md_page_break_placeholder = 18;

  // Turn on OCR enrichment for code.
  optional bool do_code_enrichment = 19;

  // Turn on OCR enrichment for formulas.
  optional bool do_formula_enrichment = 20;

  // Turn on picture classification.
  optional bool do_picture_classification = 21;

  // Turn on picture description.
  optional bool do_picture_description = 22;

  // Minimum area percentage a picture needs to be processed.
  optional double picture_description_area_threshold = 23;

  // NOTE(review): the two fields below are documented as mutually exclusive
  // but are not members of a `oneof`, so the schema does not enforce it.

  // Local VLM for picture description (mutually exclusive with api).
  optional PictureDescriptionLocal picture_description_local = 24;

  // API VLM for picture description (mutually exclusive with local).
  optional PictureDescriptionApi picture_description_api = 25;

  // NOTE(review): the three fields below are documented as mutually
  // exclusive but are not members of a `oneof`. The VlmModelLocal and
  // VlmModelApi messages are not referenced here; fields 27 and 28 are plain
  // strings - confirm that is intended.

  // Preset VLM model (mutually exclusive with local/api).
  optional VlmModelType vlm_pipeline_model = 26;

  // Local VLM model string (mutually exclusive with api/preset).
  optional string vlm_pipeline_model_local = 27;

  // API VLM model string (mutually exclusive with local/preset).
  optional string vlm_pipeline_model_api = 28;
}
// ============================================================================
// Error types
// ============================================================================
// A processing error reported by one specific component.
message ErrorItem {
  // Type of the component that reported the error.
  string component_type = 1;

  // The error message.
  string error_message = 2;

  // Name of the module involved.
  string module_name = 3;
}
// ============================================================================
// Document response types
// ============================================================================
// The converted document, in each of the output formats that were produced.
message DocumentResponse {
  // Name of the source document file.
  string filename = 1;

  // Complete DoclingDocument structure (the JSON output).
  optional ai.docling.core.v1.DoclingDocument json_content = 2;

  // Markdown rendering.
  optional string md_content = 3;

  // HTML rendering.
  optional string html_content = 4;

  // Plain-text rendering.
  optional string text_content = 5;

  // DocTags rendering.
  optional string doctags_content = 6;
}
// Document format used inside chunk responses. It has the same fields as
// DocumentResponse.
message ExportDocumentResponse {
  // Name of the source document file.
  string filename = 1;

  // One optional member per output format.
  optional ai.docling.core.v1.DoclingDocument json_content = 2;
  optional string md_content = 3;
  optional string html_content = 4;
  optional string text_content = 5;
  optional string doctags_content = 6;
}
// An exported document bundled with its status and any errors.
message Document {
  // Optional kind discriminator.
  optional string kind = 1;

  // The exported document.
  ExportDocumentResponse content = 2;

  // Status of the document, as a string.
  string status = 3;

  // Errors reported for the document.
  repeated ErrorItem errors = 4;
}
// ============================================================================
// Task types
// ============================================================================
// Progress counters for an asynchronous task.
message TaskStatusMetadata {
  // Total number of documents in the task.
  int64 num_docs = 1;

  // How many documents have been processed, succeeded and failed.
  int64 num_processed = 2;
  int64 num_succeeded = 3;
  int64 num_failed = 4;
}
// ============================================================================
// Request / Response Domain Models (Moved from docling_serve.proto)
// ============================================================================
// Request body of POST /v1/convert/source.
message ConvertDocumentRequest {
  // Documents to convert.
  repeated Source sources = 1;

  // Conversion settings.
  ConvertDocumentOptions options = 2;

  // Where results go, if specified.
  optional Target target = 3;
}
// Response body of the REST convert endpoint.
message ConvertDocumentResponse {
  // The converted document.
  DocumentResponse document = 1;

  // Errors reported during conversion.
  repeated ErrorItem errors = 2;

  // Processing time.
  double processing_time = 3;

  // Status of the conversion, as a string.
  string status = 4;

  // Timing values keyed by name.
  map<string, double> timings = 5;
}
// Settings for the hierarchical chunker.
message HierarchicalChunkerOptions {
  // Both flags are false when unset.
  bool use_markdown_tables = 1;
  bool include_raw_text = 2;
}
// Settings for the hybrid chunker.
message HybridChunkerOptions {
  // Both flags are false when unset.
  bool use_markdown_tables = 1;
  bool include_raw_text = 2;

  // Optional maximum number of tokens.
  optional int32 max_tokens = 3;

  // Optional tokenizer identifier.
  optional string tokenizer = 4;

  // Optional peer-merging switch.
  optional bool merge_peers = 5;
}
// Request body of POST /v1/chunk/hierarchical/source.
message HierarchicalChunkRequest {
  // Documents to chunk.
  repeated Source sources = 1;

  // Conversion settings.
  ConvertDocumentOptions convert_options = 2;

  // Where results go, if specified.
  optional Target target = 3;

  // Whether to include the converted document in the response.
  bool include_converted_doc = 4;

  // Chunker settings.
  HierarchicalChunkerOptions chunking_options = 5;
}
// Request body of POST /v1/chunk/hybrid/source.
message HybridChunkRequest {
  // Documents to chunk.
  repeated Source sources = 1;

  // Conversion settings.
  ConvertDocumentOptions convert_options = 2;

  // Where results go, if specified.
  optional Target target = 3;

  // Whether to include the converted document in the response.
  bool include_converted_doc = 4;

  // Chunker settings.
  HybridChunkerOptions chunking_options = 5;
}
// One document chunk as returned in a chunk response.
message Chunk {
  // Name of the file the chunk comes from.
  string filename = 1;

  // Index of the chunk.
  int32 chunk_index = 2;

  // Chunk text.
  string text = 3;

  // Raw text, when included.
  optional string raw_text = 4;

  // Token count, when reported.
  optional int32 num_tokens = 5;

  // Headings and captions associated with the chunk.
  repeated string headings = 6;
  repeated string captions = 7;

  // Document items associated with the chunk, as strings.
  repeated string doc_items = 8;

  // Page numbers associated with the chunk.
  repeated int32 page_numbers = 9;

  // Additional string metadata.
  map<string, string> metadata = 10;
}
// Response body of the REST chunk endpoints.
message ChunkDocumentResponse {
  // The produced chunks.
  repeated Chunk chunks = 1;

  // The documents, with status and errors.
  repeated Document documents = 2;

  // Processing time.
  double processing_time = 3;
}
// Request of GET /v1/status/poll/{taskId}.
message TaskStatusPollRequest {
  // Id of the task to poll.
  string task_id = 1;

  // How long to wait, in seconds; 0 means no wait (immediate poll).
  double wait_time = 2;
}
// Response body of the REST task status endpoint.
message TaskStatusPollResponse {
  // Id of the task.
  string task_id = 1;

  // Type of the task, when reported.
  optional string task_type = 2;

  // Current status of the task.
  TaskStatus task_status = 3;

  // Position of the task, when reported.
  optional int64 task_position = 4;

  // Progress counters, when reported.
  optional TaskStatusMetadata task_meta = 5;
}
// Request of GET /v1/result/{taskId}.
message TaskResultRequest {
  // Id of the task whose result is requested.
  string task_id = 1;
}
// ClearResponse mirrors the REST clear response body.
message ClearResponse {
  // Status string of the clear operation, when reported.
  optional string status = 1;
}
Docling-serve
syntax = "proto3";

package ai.docling.serve.v1;

import "ai/docling/serve/v1/docling_serve_types.proto";

option java_multiple_files = true;
option java_outer_classname = "DoclingServeProto";
option java_package = "ai.docling.serve.v1";
// ============================================================================
// Service Definition
// ============================================================================
// gRPC counterpart of the docling-serve REST API: each REST endpoint is
// mapped to an RPC.
//
// Every RPC has its own request and response message, as required by Buf
// linting and the Google gRPC style guide.
service DoclingServeService {
  // Health check. Mirrors GET /health.
  rpc Health(HealthRequest) returns (HealthResponse);

  // Synchronous document conversion. Mirrors POST /v1/convert/source.
  rpc ConvertSource(ConvertSourceRequest) returns (ConvertSourceResponse);

  // Asynchronous document conversion: submits the task and returns its
  // status immediately.
  rpc ConvertSourceAsync(ConvertSourceAsyncRequest)
      returns (ConvertSourceAsyncResponse);

  // Synchronous hierarchical chunking.
  // Mirrors POST /v1/chunk/hierarchical/source.
  rpc ChunkHierarchicalSource(ChunkHierarchicalSourceRequest)
      returns (ChunkHierarchicalSourceResponse);

  // Synchronous hybrid chunking. Mirrors POST /v1/chunk/hybrid/source.
  rpc ChunkHybridSource(ChunkHybridSourceRequest)
      returns (ChunkHybridSourceResponse);

  // Asynchronous hierarchical chunking.
  // Mirrors POST /v1/chunk/hierarchical/source/async.
  rpc ChunkHierarchicalSourceAsync(ChunkHierarchicalSourceAsyncRequest)
      returns (ChunkHierarchicalSourceAsyncResponse);

  // Asynchronous hybrid chunking.
  // Mirrors POST /v1/chunk/hybrid/source/async.
  rpc ChunkHybridSourceAsync(ChunkHybridSourceAsyncRequest)
      returns (ChunkHybridSourceAsyncResponse);

  // Polls the status of an async task.
  // Mirrors GET /v1/status/poll/{taskId}.
  rpc PollTaskStatus(PollTaskStatusRequest) returns (PollTaskStatusResponse);

  // Fetches the result of a convert task.
  // Mirrors GET /v1/result/{taskId} for convert tasks.
  rpc GetConvertResult(GetConvertResultRequest)
      returns (GetConvertResultResponse);

  // Fetches the result of a chunk task.
  // Mirrors GET /v1/result/{taskId} for chunk tasks.
  rpc GetChunkResult(GetChunkResultRequest) returns (GetChunkResultResponse);

  // Clears converters. Mirrors GET /v1/clear/converters.
  rpc ClearConverters(ClearConvertersRequest)
      returns (ClearConvertersResponse);

  // Clears stale results. Mirrors GET /v1/clear/results.
  rpc ClearResults(ClearResultsRequest) returns (ClearResultsResponse);

  // Streaming document conversion: results are sent per source as each one
  // completes. This has no REST equivalent; it uses gRPC server streaming.
  rpc ConvertSourceStream(ConvertSourceStreamRequest)
      returns (stream ConvertSourceStreamResponse);

  // --- Watch RPCs (server-managed polling via streaming) ---
  // Each Watch RPC submits the task, then polls internally and streams every
  // status update until the task completes or fails. The client only reads
  // the stream and does not run a poll loop of its own.

  // Watch a convert task: submit, then stream status updates until done.
  rpc WatchConvertSource(WatchConvertSourceRequest)
      returns (stream WatchConvertSourceResponse);

  // Watch a hierarchical chunk task: submit, then stream status updates
  // until done.
  rpc WatchChunkHierarchicalSource(WatchChunkHierarchicalSourceRequest)
      returns (stream WatchChunkHierarchicalSourceResponse);

  // Watch a hybrid chunk task: submit, then stream status updates until
  // done.
  rpc WatchChunkHybridSource(WatchChunkHybridSourceRequest)
      returns (stream WatchChunkHybridSourceResponse);
}
// ============================================================================
// RPC Request / Response Wrappers
// ============================================================================
// --- Health ---

// Request of the Health RPC. Carries no fields.
message HealthRequest {}

// Response of the Health RPC.
message HealthResponse {
  // Status string, when reported.
  optional string status = 1;
}
// --- Convert ---

// Request of the ConvertSource RPC.
message ConvertSourceRequest {
  // The wrapped convert request.
  ConvertDocumentRequest request = 1;
}

// Response of the ConvertSource RPC.
message ConvertSourceResponse {
  // The wrapped convert response.
  ConvertDocumentResponse response = 1;
}

// Request of the ConvertSourceAsync RPC.
message ConvertSourceAsyncRequest {
  // The wrapped convert request.
  ConvertDocumentRequest request = 1;
}

// Response of the ConvertSourceAsync RPC.
message ConvertSourceAsyncResponse {
  // Status of the submitted task.
  TaskStatusPollResponse response = 1;
}

// Request of the ConvertSourceStream RPC.
message ConvertSourceStreamRequest {
  // The wrapped convert request.
  ConvertDocumentRequest request = 1;
}

// One streamed message of the ConvertSourceStream RPC.
message ConvertSourceStreamResponse {
  // The wrapped convert response.
  ConvertDocumentResponse response = 1;
}
// --- Chunk ---

// Request of the ChunkHierarchicalSource RPC.
message ChunkHierarchicalSourceRequest {
  // The wrapped hierarchical chunk request.
  HierarchicalChunkRequest request = 1;
}

// Response of the ChunkHierarchicalSource RPC.
message ChunkHierarchicalSourceResponse {
  // The wrapped chunk response.
  ChunkDocumentResponse response = 1;
}

// Request of the ChunkHybridSource RPC.
message ChunkHybridSourceRequest {
  // The wrapped hybrid chunk request.
  HybridChunkRequest request = 1;
}

// Response of the ChunkHybridSource RPC.
message ChunkHybridSourceResponse {
  // The wrapped chunk response.
  ChunkDocumentResponse response = 1;
}

// Request of the ChunkHierarchicalSourceAsync RPC.
message ChunkHierarchicalSourceAsyncRequest {
  // The wrapped hierarchical chunk request.
  HierarchicalChunkRequest request = 1;
}

// Response of the ChunkHierarchicalSourceAsync RPC.
message ChunkHierarchicalSourceAsyncResponse {
  // Status of the submitted task.
  TaskStatusPollResponse response = 1;
}

// Request of the ChunkHybridSourceAsync RPC.
message ChunkHybridSourceAsyncRequest {
  // The wrapped hybrid chunk request.
  HybridChunkRequest request = 1;
}

// Response of the ChunkHybridSourceAsync RPC.
message ChunkHybridSourceAsyncResponse {
  // Status of the submitted task.
  TaskStatusPollResponse response = 1;
}
// --- Task ---

// Request of the PollTaskStatus RPC.
message PollTaskStatusRequest {
  // The wrapped poll request.
  TaskStatusPollRequest request = 1;
}

// Response of the PollTaskStatus RPC.
message PollTaskStatusResponse {
  // The wrapped task status.
  TaskStatusPollResponse response = 1;
}

// Request of the GetConvertResult RPC.
message GetConvertResultRequest {
  // The wrapped result request.
  TaskResultRequest request = 1;
}

// Response of the GetConvertResult RPC.
message GetConvertResultResponse {
  // The wrapped convert response.
  ConvertDocumentResponse response = 1;
}

// Request of the GetChunkResult RPC.
message GetChunkResultRequest {
  // The wrapped result request.
  TaskResultRequest request = 1;
}

// Response of the GetChunkResult RPC.
message GetChunkResultResponse {
  // The wrapped chunk response.
  ChunkDocumentResponse response = 1;
}
// --- Clear ---

// Request of the ClearConverters RPC. Carries no fields.
message ClearConvertersRequest {}

// Response of the ClearConverters RPC.
message ClearConvertersResponse {
  // The wrapped clear response.
  ClearResponse response = 1;
}

// Request of the ClearResults RPC.
message ClearResultsRequest {
  // Age threshold in seconds: results older than this are cleared
  // (default: 3600).
  optional double older_than = 1;
}

// Response of the ClearResults RPC.
message ClearResultsResponse {
  // The wrapped clear response.
  ClearResponse response = 1;
}
// --- Watch ---

// Request of the WatchConvertSource RPC.
message WatchConvertSourceRequest {
  // The wrapped convert request.
  ConvertDocumentRequest request = 1;
}

// One streamed status update of the WatchConvertSource RPC.
message WatchConvertSourceResponse {
  // The wrapped task status.
  TaskStatusPollResponse response = 1;
}

// Request of the WatchChunkHierarchicalSource RPC.
message WatchChunkHierarchicalSourceRequest {
  // The wrapped hierarchical chunk request.
  HierarchicalChunkRequest request = 1;
}

// One streamed status update of the WatchChunkHierarchicalSource RPC.
message WatchChunkHierarchicalSourceResponse {
  // The wrapped task status.
  TaskStatusPollResponse response = 1;
}

// Request of the WatchChunkHybridSource RPC.
message WatchChunkHybridSourceRequest {
  // The wrapped hybrid chunk request.
  HybridChunkRequest request = 1;
}

// One streamed status update of the WatchChunkHybridSource RPC.
message WatchChunkHybridSourceResponse {
  // The wrapped task status.
  TaskStatusPollResponse response = 1;
}
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels