88from pydantic import BaseModel , ConfigDict , Field , computed_field , model_validator
99
1010from docling_core .transforms .chunker .hierarchical_chunker import (
11- ChunkingDocSerializer ,
1211 ChunkingSerializerProvider ,
1312)
14- from docling_core .transforms .serializer .base import BaseDocSerializer
1513from docling_core .transforms .chunker .tokenizer .base import BaseTokenizer
16- from docling_core .types .doc .document import SectionHeaderItem , TitleItem , TableItem
14+ from docling_core .types .doc .document import SectionHeaderItem , TitleItem
1715
1816try :
1917 import semchunk
3937)
4038from docling_core .types import DoclingDocument
4139
40+
4241def _get_default_tokenizer ():
4342 from docling_core .transforms .chunker .tokenizer .huggingface import (
4443 HuggingFaceTokenizer ,
@@ -62,7 +61,6 @@ class HybridChunker(BaseChunker):
6261 model_config = ConfigDict (arbitrary_types_allowed = True )
6362
6463 tokenizer : BaseTokenizer = Field (default_factory = _get_default_tokenizer )
65- duplicate_table_header : bool = True
6664 merge_peers : bool = True
6765
6866 serializer_provider : BaseSerializerProvider = ChunkingSerializerProvider ()
@@ -216,9 +214,7 @@ def _split_by_doc_items(self, doc_chunk: DocChunk, doc_serializer: BaseDocSerial
216214
217215 def _split_using_plain_text (
218216 self ,
219- doc_chunk : DocChunk ,
220- doc_serializer :ChunkingDocSerializer ,
221-
217+ doc_chunk : DocChunk ,
222218 ) -> list [DocChunk ]:
223219 lengths = self ._doc_chunk_length (doc_chunk )
224220 if lengths .total_len <= self .max_tokens :
@@ -227,7 +223,7 @@ def _split_using_plain_text(
227223 # How much room is there for text after subtracting out the headers and
228224 # captions:
229225 available_length = self .max_tokens - lengths .other_len
230-
226+ sem_chunker = semchunk . chunkerify ( self . tokenizer . get_tokenizer (), chunk_size = available_length )
231227 if available_length <= 0 :
232228 warnings .warn (
233229 "Headers and captions for this chunk are longer than the total "
@@ -237,39 +233,20 @@ def _split_using_plain_text(
237233 new_chunk = DocChunk (** doc_chunk .export_json_dict ())
238234 new_chunk .meta .captions = None
239235 new_chunk .meta .headings = None
240- return self ._split_using_plain_text (doc_chunk = new_chunk , doc_serializer = doc_serializer )
241-
242- segments = self . segment ( doc_chunk , available_length , doc_serializer )
236+ return self ._split_using_plain_text (doc_chunk = new_chunk )
237+ text = doc_chunk . text
238+ segments = sem_chunker . chunk ( text )
243239 chunks = [DocChunk (text = s , meta = doc_chunk .meta ) for s in segments ]
244240 return chunks
245241
246- def segment (self , doc_chunk : DocChunk , available_length : int , doc_serializer :ChunkingDocSerializer ) -> list [str ]:
247- segments = []
248- if self .duplicate_table_header and len (doc_chunk .meta .doc_items ) == 1 and isinstance (doc_chunk .meta .doc_items [0 ], TableItem ):
249-
250- header_lines , body_lines = doc_serializer .table_serializer .get_header_and_body_lines (
251- table_text = doc_chunk .text )
252- from docling_core .transforms .chunker .line_chunker import LineBasedTokenChunker
253- line_chunker = LineBasedTokenChunker (
254- tokenizer = self .tokenizer ,
255- max_tokens = available_length ,
256- prefix = "\n " .join (header_lines )
257- )
258- segments = line_chunker .chunk_text (lines = body_lines )
259- else :
260- sem_chunker = semchunk .chunkerify (self .tokenizer .get_tokenizer (), chunk_size = available_length )
261- segments = sem_chunker .chunk (doc_chunk .text )
262- return segments
263-
264-
265242 def _merge_chunks_with_matching_metadata (self , chunks : list [DocChunk ]):
266243 output_chunks = []
267244 window_start = 0
268245 window_end = 0 # an inclusive index
269246 num_chunks = len (chunks )
270247 while window_end < num_chunks :
271248 chunk = chunks [window_end ]
272- headings = chunk .meta .headings
249+ headings = chunk .meta .headings
273250 ready_to_append = False
274251 if window_start == window_end :
275252 current_headings = headings
@@ -329,7 +306,7 @@ def chunk(
329306 ** kwargs ,
330307 ) # type: ignore
331308 res = [x for c in res for x in self ._split_by_doc_items (c , doc_serializer = my_doc_ser )]
332- res = [x for c in res for x in self ._split_using_plain_text (c , doc_serializer = my_doc_ser )]
309+ res = [x for c in res for x in self ._split_using_plain_text (c )]
333310 if self .merge_peers :
334311 res = self ._merge_chunks_with_matching_metadata (res )
335312 return iter (res )
0 commit comments