88from pydantic import BaseModel , ConfigDict , Field , computed_field , model_validator
99
1010from docling_core .transforms .chunker .hierarchical_chunker import (
11+ ChunkingDocSerializer ,
1112 ChunkingSerializerProvider ,
1213)
14+ from docling_core .transforms .serializer .base import BaseDocSerializer
1315from docling_core .transforms .chunker .tokenizer .base import BaseTokenizer
14- from docling_core .types .doc .document import SectionHeaderItem , TitleItem
16+ from docling_core .types .doc .document import SectionHeaderItem , TitleItem , TableItem
1517
1618try :
1719 import semchunk
3739)
3840from docling_core .types import DoclingDocument
3941
40-
4142def _get_default_tokenizer ():
4243 from docling_core .transforms .chunker .tokenizer .huggingface import (
4344 HuggingFaceTokenizer ,
@@ -61,6 +62,7 @@ class HybridChunker(BaseChunker):
6162 model_config = ConfigDict (arbitrary_types_allowed = True )
6263
6364 tokenizer : BaseTokenizer = Field (default_factory = _get_default_tokenizer )
65+ duplicate_table_header : bool = True
6466 merge_peers : bool = True
6567
6668 serializer_provider : BaseSerializerProvider = ChunkingSerializerProvider ()
@@ -214,7 +216,9 @@ def _split_by_doc_items(self, doc_chunk: DocChunk, doc_serializer: BaseDocSerial
214216
215217 def _split_using_plain_text (
216218 self ,
217- doc_chunk : DocChunk ,
219+ doc_chunk : DocChunk ,
220+ doc_serializer :ChunkingDocSerializer ,
221+
218222 ) -> list [DocChunk ]:
219223 lengths = self ._doc_chunk_length (doc_chunk )
220224 if lengths .total_len <= self .max_tokens :
@@ -223,7 +227,7 @@ def _split_using_plain_text(
223227 # How much room is there for text after subtracting out the headers and
224228 # captions:
225229 available_length = self .max_tokens - lengths .other_len
226- sem_chunker = semchunk . chunkerify ( self . tokenizer . get_tokenizer (), chunk_size = available_length )
230+
227231 if available_length <= 0 :
228232 warnings .warn (
229233 "Headers and captions for this chunk are longer than the total "
@@ -233,20 +237,39 @@ def _split_using_plain_text(
233237 new_chunk = DocChunk (** doc_chunk .export_json_dict ())
234238 new_chunk .meta .captions = None
235239 new_chunk .meta .headings = None
236- return self ._split_using_plain_text (doc_chunk = new_chunk )
237- text = doc_chunk . text
238- segments = sem_chunker . chunk ( text )
240+ return self ._split_using_plain_text (doc_chunk = new_chunk , doc_serializer = doc_serializer )
241+
242+ segments = self . segment ( doc_chunk , available_length , doc_serializer )
239243 chunks = [DocChunk (text = s , meta = doc_chunk .meta ) for s in segments ]
240244 return chunks
241245
def segment(self, doc_chunk: DocChunk, available_length: int, doc_serializer: ChunkingDocSerializer) -> list[str]:
    """Split the text of ``doc_chunk`` into string segments.

    Two strategies are used:

    * When ``self.duplicate_table_header`` is truthy and the chunk holds
      exactly one doc item that is a ``TableItem``, the chunk text is split
      into header and body lines by the serializer's table serializer, and
      the body lines are chunked by a ``LineBasedTokenChunker`` that is
      given the joined header lines as ``prefix`` (presumably so every
      segment repeats the table header -- confirm against
      ``LineBasedTokenChunker``).
    * Otherwise the text is chunked with ``semchunk``.

    Args:
        doc_chunk: Chunk whose ``text`` is to be split.
        available_length: Token budget passed to the underlying chunker
            (``max_tokens`` / ``chunk_size``).
        doc_serializer: Serializer whose ``table_serializer`` provides
            ``get_header_and_body_lines``.

    Returns:
        The segments produced by the selected chunker.
    """
    # Short-circuit order matters: doc items are only inspected when the
    # header-duplication option is enabled.
    split_as_table = (
        self.duplicate_table_header
        and len(doc_chunk.meta.doc_items) == 1
        and isinstance(doc_chunk.meta.doc_items[0], TableItem)
    )

    if not split_as_table:
        text_splitter = semchunk.chunkerify(self.tokenizer.get_tokenizer(), chunk_size=available_length)
        return text_splitter.chunk(doc_chunk.text)

    table_header, table_body = doc_serializer.table_serializer.get_header_and_body_lines(
        table_text=doc_chunk.text
    )

    # Imported at call time, as in the original code.
    from docling_core.transforms.chunker.line_chunker import LineBasedTokenChunker

    # NOTE(review): the separator is a newline followed by a space, exactly as
    # it appears in the reviewed text -- confirm the trailing space is intended.
    table_splitter = LineBasedTokenChunker(
        tokenizer=self.tokenizer,
        max_tokens=available_length,
        prefix="\n ".join(table_header),
    )
    return table_splitter.chunk_text(lines=table_body)
263+
264+
242265 def _merge_chunks_with_matching_metadata (self , chunks : list [DocChunk ]):
243266 output_chunks = []
244267 window_start = 0
245268 window_end = 0 # an inclusive index
246269 num_chunks = len (chunks )
247270 while window_end < num_chunks :
248271 chunk = chunks [window_end ]
249- headings = chunk .meta .headings
272+ headings = chunk .meta .headings
250273 ready_to_append = False
251274 if window_start == window_end :
252275 current_headings = headings
@@ -306,7 +329,7 @@ def chunk(
306329 ** kwargs ,
307330 ) # type: ignore
308331 res = [x for c in res for x in self ._split_by_doc_items (c , doc_serializer = my_doc_ser )]
309- res = [x for c in res for x in self ._split_using_plain_text (c )]
332+ res = [x for c in res for x in self ._split_using_plain_text (c , doc_serializer = my_doc_ser )]
310333 if self .merge_peers :
311334 res = self ._merge_chunks_with_matching_metadata (res )
312335 return iter (res )
0 commit comments