diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index 30d4e50c4b..56d5e9415f 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -84,6 +84,7 @@ class OcrOptions(BaseOptions): bitmap_area_threshold: float = ( 0.05 # percentage of the area for a bitmap to processed with OCR ) + force_low_confidence_ocr: bool = False # If enabled low confidence programmatic cells are processed with OCR class OcrAutoOptions(OcrOptions): diff --git a/docling/models/base_ocr_model.py b/docling/models/base_ocr_model.py index 31f44ae0a6..6c19a4d207 100644 --- a/docling/models/base_ocr_model.py +++ b/docling/models/base_ocr_model.py @@ -90,11 +90,23 @@ def find_ocr_rects(size, bitmap_rects): bitmap_rects = page._backend.get_bitmap_rects() else: bitmap_rects = [] + + force_ocr_on_low_confidence_cells = False + + if self.options.force_low_confidence_ocr: + for cell in page.cells: + if cell.confidence == 0.0: + force_ocr_on_low_confidence_cells = True + break + coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects) # return full-page rectangle if page is dominantly covered with bitmaps - if self.options.force_full_page_ocr or coverage > max( - BITMAP_COVERAGE_TRESHOLD, self.options.bitmap_area_threshold + if ( + self.options.force_full_page_ocr + or coverage + > max(BITMAP_COVERAGE_TRESHOLD, self.options.bitmap_area_threshold) + or force_ocr_on_low_confidence_cells ): return [ BoundingBox( @@ -145,6 +157,13 @@ def post_process_cells(self, ocr_cells: List[TextCell], page: Page) -> None: # Get existing cells from the read-only property existing_cells = page.cells + force_ocr_on_low_confidence_cells = False + if self.options.force_low_confidence_ocr: + existing_cells_length = len(existing_cells) + existing_cells = [cell for cell in existing_cells if cell.confidence != 0.0] + if len(existing_cells) < existing_cells_length: + force_ocr_on_low_confidence_cells = True + # Combine existing and OCR cells with overlap filtering final_cells = self._combine_cells(existing_cells, ocr_cells) @@ -158,7 +177,7 @@ def post_process_cells(self, ocr_cells: List[TextCell], page: Page) -> None: # unreliable. Filter out cells where from_ocr=False, keeping any OCR- # generated cells. This ensures downstream components (e.g., table # structure model) fall back to OCR-extracted textline cells. - if self.options.force_full_page_ocr: + if self.options.force_full_page_ocr or force_ocr_on_low_confidence_cells: page.parsed_page.word_cells = [ c for c in page.parsed_page.word_cells if c.from_ocr ] diff --git a/docling/models/page_preprocessing_model.py b/docling/models/page_preprocessing_model.py index a1d4949021..8740f97304 100644 --- a/docling/models/page_preprocessing_model.py +++ b/docling/models/page_preprocessing_model.py @@ -27,7 +27,7 @@ def __init__(self, options: PagePreprocessingOptions): self.options = options # Pre-compiled regex patterns for efficiency - self.GLYPH_RE = re.compile(r"GLYPH<[0-9A-Fa-f]+>") + self.GLYPH_RE = re.compile(r"GLYPH<[^>]+>") # anything between < and > self.SLASH_G_RE = re.compile(r"(?:/G\d+){2,}") self.FRAG_RE = re.compile(r"\b[A-Za-z](?:/[a-z]{1,3}\.[a-z]{1,3}){2,}\b") self.SLASH_NUMBER_GARBAGE_RE = re.compile( @@ -76,6 +76,7 @@ def _parse_page_cells(self, conv_res: ConversionResult, page: Page) -> Page: text_scores = [] for c in page.cells: score = self.rate_text_quality(c.text) + c.confidence = score text_scores.append(score) with warnings.catch_warnings():