Skip to content

Commit 9b9ef09

Browse files
committed
style changes
1 parent 59cdda5 commit 9b9ef09

4 files changed

Lines changed: 41 additions & 45 deletions

File tree

docling_core/transforms/chunker/hybrid_chunker.py

Lines changed: 19 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
)
4040
from docling_core.types import DoclingDocument
4141

42+
4243
def _get_default_tokenizer():
4344
from docling_core.transforms.chunker.tokenizer.huggingface import (
4445
HuggingFaceTokenizer,
@@ -216,9 +217,8 @@ def _split_by_doc_items(self, doc_chunk: DocChunk, doc_serializer: BaseDocSerial
216217

217218
def _split_using_plain_text(
218219
self,
219-
doc_chunk: DocChunk,
220-
doc_serializer:ChunkingDocSerializer,
221-
220+
doc_chunk: DocChunk,
221+
doc_serializer: ChunkingDocSerializer,
222222
) -> list[DocChunk]:
223223
lengths = self._doc_chunk_length(doc_chunk)
224224
if lengths.total_len <= self.max_tokens:
@@ -227,7 +227,7 @@ def _split_using_plain_text(
227227
# How much room is there for text after subtracting out the headers and
228228
# captions:
229229
available_length = self.max_tokens - lengths.other_len
230-
230+
231231
if available_length <= 0:
232232
warnings.warn(
233233
"Headers and captions for this chunk are longer than the total "
@@ -238,38 +238,40 @@ def _split_using_plain_text(
238238
new_chunk.meta.captions = None
239239
new_chunk.meta.headings = None
240240
return self._split_using_plain_text(doc_chunk=new_chunk, doc_serializer=doc_serializer)
241-
242-
segments = self.segment(doc_chunk,available_length,doc_serializer)
241+
242+
segments = self.segment(doc_chunk, available_length, doc_serializer)
243243
chunks = [DocChunk(text=s, meta=doc_chunk.meta) for s in segments]
244244
return chunks
245245

246-
def segment(self, doc_chunk: DocChunk, available_length: int, doc_serializer:ChunkingDocSerializer) -> list[str]:
246+
def segment(self, doc_chunk: DocChunk, available_length: int, doc_serializer: ChunkingDocSerializer) -> list[str]:
247247
segments = []
248-
if self.duplicate_table_header and len(doc_chunk.meta.doc_items) == 1 and isinstance(doc_chunk.meta.doc_items[0], TableItem):
249-
248+
if (
249+
self.duplicate_table_header
250+
and len(doc_chunk.meta.doc_items) == 1
251+
and isinstance(doc_chunk.meta.doc_items[0], TableItem)
252+
):
250253
header_lines, body_lines = doc_serializer.table_serializer.get_header_and_body_lines(
251-
table_text=doc_chunk.text)
254+
table_text=doc_chunk.text
255+
)
252256
from docling_core.transforms.chunker.line_chunker import LineBasedTokenChunker
257+
253258
line_chunker = LineBasedTokenChunker(
254-
tokenizer=self.tokenizer,
255-
max_tokens=available_length,
256-
prefix="\n".join(header_lines)
259+
tokenizer=self.tokenizer, max_tokens=available_length, prefix="\n".join(header_lines)
257260
)
258261
segments = line_chunker.chunk_text(lines=body_lines)
259262
else:
260263
sem_chunker = semchunk.chunkerify(self.tokenizer.get_tokenizer(), chunk_size=available_length)
261-
segments= sem_chunker.chunk(doc_chunk.text)
262-
return segments
264+
segments = sem_chunker.chunk(doc_chunk.text)
265+
return segments
263266

264-
265267
def _merge_chunks_with_matching_metadata(self, chunks: list[DocChunk]):
266268
output_chunks = []
267269
window_start = 0
268270
window_end = 0 # an inclusive index
269271
num_chunks = len(chunks)
270272
while window_end < num_chunks:
271273
chunk = chunks[window_end]
272-
headings = chunk.meta.headings
274+
headings = chunk.meta.headings
273275
ready_to_append = False
274276
if window_start == window_end:
275277
current_headings = headings

docling_core/transforms/chunker/line_chunker.py

Lines changed: 14 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
ChunkingSerializerProvider,
1414
)
1515
from docling_core.transforms.serializer.base import (
16-
BaseSerializerProvider,
16+
BaseSerializerProvider,
1717
)
1818

1919

@@ -27,27 +27,26 @@ class LineBasedTokenChunker(BaseChunker):
2727
resolved from the tokenizer
2828
prefix: the text that should appear at the beginning of each chunk; default is an empty string
2929
"""
30+
3031
model_config = ConfigDict(arbitrary_types_allowed=True)
3132
tokenizer: BaseTokenizer = Field(default_factory=_get_default_tokenizer)
3233
prefix: str = ""
3334
prefix_len: int = Field(default=0, init=False)
3435
serializer_provider: BaseSerializerProvider = ChunkingSerializerProvider()
35-
36+
3637
@property
3738
def max_tokens(self) -> int:
3839
"""Get maximum number of tokens allowed."""
3940
return self.tokenizer.get_max_tokens()
40-
41+
4142
def model_post_init(self, __context) -> None:
42-
4343
self.prefix_len = self.tokenizer.count_tokens(self.prefix)
4444
if self.prefix_len >= self.max_tokens:
4545
warnings.warn(
4646
f"Chunks prefix: {self.prefix} is too long for chunk size {self.max_tokens} and will be ignored"
4747
)
4848
self.prefix = ""
4949
self.prefix_len = 0
50-
5150

5251
def chunk(self, dl_doc: DoclingDocument, **kwargs: Any) -> Iterator[BaseChunk]:
5352
"""Chunk the provided document using line-based token-aware chunking.
@@ -59,16 +58,16 @@ def chunk(self, dl_doc: DoclingDocument, **kwargs: Any) -> Iterator[BaseChunk]:
5958
Iterator[BaseChunk]: iterator over extracted chunks
6059
"""
6160
my_doc_ser = self.serializer_provider.get_serializer(doc=dl_doc)
62-
61+
6362
# Serialize the entire document to get the text
6463
ser_res = my_doc_ser.serialize()
65-
64+
6665
if not ser_res.text:
6766
return
68-
67+
6968
# Use chunk_text to split the text into chunks
70-
text_chunks = self.chunk_text(lines = ser_res.text.splitlines(True))
71-
69+
text_chunks = self.chunk_text(lines=ser_res.text.splitlines(True))
70+
7271
# Yield DocChunk objects for each text chunk
7372
for chunk_text in text_chunks:
7473
yield DocChunk(
@@ -109,10 +108,7 @@ def chunk_text(self, lines: list[str]) -> list[str]:
109108

110109
# Remaining is too large even for an empty chunk → split it.
111110
# Split off the first segment that fits into current.
112-
take, remaining = self.split_by_token_limit(
113-
remaining,
114-
available
115-
)
111+
take, remaining = self.split_by_token_limit(remaining, available)
116112

117113
# Add the taken part
118114
current += "\n" + take
@@ -131,7 +127,6 @@ def chunk_text(self, lines: list[str]) -> list[str]:
131127

132128
return chunks
133129

134-
135130
def split_by_token_limit(
136131
self,
137132
text: str,
@@ -142,7 +137,7 @@ def split_by_token_limit(
142137
Split `text` into (head, tail) where `head` has at most `token_limit` tokens,
143138
and `tail` is the remainder. Uses binary search on character indices to minimize
144139
calls to `count_tokens`.
145-
140+
146141
Parameters
147142
----------
148143
text : str
@@ -190,10 +185,10 @@ def split_by_token_limit(
190185
# Optionally adjust to a previous whitespace boundary without violating the limit
191186
if prefer_word_boundary:
192187
# Search backwards from best_idx to find whitespace; keep within token limit.
193-
194-
last_space_index= text[:best_idx].rfind(" ")
188+
189+
last_space_index = text[:best_idx].rfind(" ")
195190
if last_space_index > 0:
196191
best_idx = last_space_index
197-
192+
198193
head, tail = text[:best_idx], text[best_idx:]
199194
return head, tail

docling_core/transforms/serializer/base.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -82,15 +82,15 @@ def get_header_and_body_lines(
8282
self,
8383
*,
8484
table_text: str,
85-
**kwargs: Any,
85+
**kwargs: Any,
8686
) -> tuple[list[str], list[str]]:
8787
"""Get header lines and body lines from the table.
88-
88+
8989
Returns:
9090
A tuple of (header_lines, body_lines) where header_lines is a list
9191
of strings representing table headers and body_lines is a list of
9292
strings representing table body rows.
93-
93+
9494
Default implementation returns empty header lines and all content in body lines.
9595
"""
9696
# default: empty headers, all content in body

docling_core/transforms/serializer/markdown.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -356,29 +356,28 @@ def get_header_and_body_lines(
356356
self,
357357
*,
358358
table_text: str,
359-
360359
**kwargs: Any,
361360
) -> tuple[list[str], list[str]]:
362361
"""Get header lines and body lines from the markdown table.
363-
362+
364363
Returns:
365364
A tuple of (header_lines, body_lines) where header_lines contains
366365
the header row and separator row, and body_lines contains the data rows.
367366
"""
368-
367+
369368
lines = [line for line in table_text.split("\n") if line.strip()]
370-
369+
371370
if len(lines) < 2:
372371
# Not enough lines for a proper markdown table (need at least header + separator)
373372
return [], lines
374-
373+
375374
# In markdown tables:
376375
# Line 0: Header row
377376
# Line 1: Separator row (with dashes)
378377
# Lines 2+: Body rows
379378
header_lines = lines[:2]
380379
body_lines = lines[2:]
381-
380+
382381
return header_lines, body_lines
383382

384383
@staticmethod

0 commit comments

Comments
 (0)