)
from docling_core.types import DoclingDocument

+
def _get_default_tokenizer():
    from docling_core.transforms.chunker.tokenizer.huggingface import (
        HuggingFaceTokenizer,
@@ -216,9 +217,8 @@ def _split_by_doc_items(self, doc_chunk: DocChunk, doc_serializer: BaseDocSerial

    def _split_using_plain_text(
        self,
-        doc_chunk : DocChunk ,
-        doc_serializer :ChunkingDocSerializer ,
-
+        doc_chunk: DocChunk,
+        doc_serializer: ChunkingDocSerializer,
    ) -> list[DocChunk]:
        lengths = self._doc_chunk_length(doc_chunk)
        if lengths.total_len <= self.max_tokens:
@@ -227,7 +227,7 @@ def _split_using_plain_text(
        # How much room is there for text after subtracting out the headers and
        # captions:
        available_length = self.max_tokens - lengths.other_len
-
+
        if available_length <= 0:
            warnings.warn(
                "Headers and captions for this chunk are longer than the total "
@@ -238,38 +238,40 @@ def _split_using_plain_text(
            new_chunk.meta.captions = None
            new_chunk.meta.headings = None
            return self._split_using_plain_text(doc_chunk=new_chunk, doc_serializer=doc_serializer)
-
-        segments = self.segment(doc_chunk,available_length,doc_serializer)
+
+        segments = self.segment(doc_chunk, available_length, doc_serializer)
        chunks = [DocChunk(text=s, meta=doc_chunk.meta) for s in segments]
        return chunks

-    def segment(self, doc_chunk: DocChunk, available_length: int, doc_serializer:ChunkingDocSerializer) -> list[str]:
+    def segment(self, doc_chunk: DocChunk, available_length: int, doc_serializer: ChunkingDocSerializer) -> list[str]:
        segments = []
-        if self.duplicate_table_header and len(doc_chunk.meta.doc_items) == 1 and isinstance(doc_chunk.meta.doc_items[0], TableItem):
-
+        if (
+            self.duplicate_table_header
+            and len(doc_chunk.meta.doc_items) == 1
+            and isinstance(doc_chunk.meta.doc_items[0], TableItem)
+        ):
            header_lines, body_lines = doc_serializer.table_serializer.get_header_and_body_lines(
-                table_text=doc_chunk.text)
+                table_text=doc_chunk.text
+            )
            from docling_core.transforms.chunker.line_chunker import LineBasedTokenChunker
+
            line_chunker = LineBasedTokenChunker(
-                tokenizer=self.tokenizer,
-                max_tokens=available_length,
-                prefix="\n".join(header_lines)
+                tokenizer=self.tokenizer, max_tokens=available_length, prefix="\n".join(header_lines)
            )
            segments = line_chunker.chunk_text(lines=body_lines)
        else:
            sem_chunker = semchunk.chunkerify(self.tokenizer.get_tokenizer(), chunk_size=available_length)
-            segments = sem_chunker.chunk(doc_chunk.text)
-        return segments
+            segments = sem_chunker.chunk(doc_chunk.text)
+        return segments

-
    def _merge_chunks_with_matching_metadata(self, chunks: list[DocChunk]):
        output_chunks = []
        window_start = 0
        window_end = 0  # an inclusive index
        num_chunks = len(chunks)
        while window_end < num_chunks:
            chunk = chunks[window_end]
-            headings = chunk.meta.headings
+            headings = chunk.meta.headings
            ready_to_append = False
            if window_start == window_end:
                current_headings = headings
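
For reference, a minimal sketch of the header-duplication idea behind the duplicate_table_header branch above, assuming a naive whitespace token counter. The names here (count_tokens, chunk_lines_with_prefix) are illustrative stand-ins, not docling_core's actual LineBasedTokenChunker API:

def count_tokens(text: str) -> int:
    # Crude whitespace proxy for a real tokenizer (assumption, not docling's).
    return len(text.split())

def chunk_lines_with_prefix(lines: list[str], max_tokens: int, prefix: str) -> list[str]:
    # Split lines into chunks of at most max_tokens, re-seeding every chunk
    # with the serialized table header so each fragment stays self-describing.
    chunks: list[str] = []
    current: list[str] = [prefix] if prefix else []
    budget = count_tokens(prefix)  # tokens consumed by the repeated header
    used = budget
    for line in lines:
        cost = count_tokens(line)
        # Flush when the next line would overflow and the chunk already holds
        # at least one body line (a single oversized line is emitted as-is).
        if current and used + cost > max_tokens and used > budget:
            chunks.append("\n".join(current))
            current = [prefix] if prefix else []
            used = budget
        current.append(line)
        used += cost
    if current and used > budget:
        chunks.append("\n".join(current))
    return chunks

Seeding each new chunk with the header rows keeps every table fragment interpretable on its own, at the cost of a few repeated tokens per chunk; the available_length > 0 guard earlier in _split_using_plain_text is what ensures the prefix itself leaves room for body lines.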