Commit 0642a07

duplicate table headers

1 parent 6c3a8f7 · commit 0642a07

3 files changed: 1,243 additions & 10 deletions
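This commit makes HybridChunker repeat a table's header rows on every chunk cut from an oversized table: when a chunk holds exactly one TableItem and exceeds the token budget, the serialized table is split into header and body lines, and the header is prepended to each resulting segment so every piece stays readable on its own. A minimal usage sketch, assuming a DoclingDocument that contains a table too large for one chunk (the input file name is hypothetical; the flag and its default come from the diff below):

from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
from docling_core.types import DoclingDocument

# Hypothetical input: any DoclingDocument containing an oversized table,
# e.g. one exported earlier via save_as_json.
doc = DoclingDocument.load_from_json("document.json")

# duplicate_table_header is the field added in this commit; it defaults to
# True, so header duplication is active unless explicitly disabled.
chunker = HybridChunker(duplicate_table_header=True)

for chunk in chunker.chunk(dl_doc=doc):
    # each chunk cut from the oversized table starts with the same
    # serialized header lines
    print(chunk.text)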

File tree

docling_core/transforms/chunker/hybrid_chunker.py

Lines changed: 32 additions & 9 deletions
@@ -8,10 +8,12 @@
 from pydantic import BaseModel, ConfigDict, Field, computed_field, model_validator
 
 from docling_core.transforms.chunker.hierarchical_chunker import (
+    ChunkingDocSerializer,
     ChunkingSerializerProvider,
 )
+from docling_core.transforms.serializer.base import BaseDocSerializer
 from docling_core.transforms.chunker.tokenizer.base import BaseTokenizer
-from docling_core.types.doc.document import SectionHeaderItem, TitleItem
+from docling_core.types.doc.document import SectionHeaderItem, TitleItem, TableItem
 
 try:
     import semchunk
@@ -37,7 +39,6 @@
 )
 from docling_core.types import DoclingDocument
 
-
 def _get_default_tokenizer():
     from docling_core.transforms.chunker.tokenizer.huggingface import (
         HuggingFaceTokenizer,
@@ -61,6 +62,7 @@ class HybridChunker(BaseChunker):
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
     tokenizer: BaseTokenizer = Field(default_factory=_get_default_tokenizer)
+    duplicate_table_header: bool = True
     merge_peers: bool = True
 
     serializer_provider: BaseSerializerProvider = ChunkingSerializerProvider()
@@ -214,7 +216,9 @@ def _split_by_doc_items(self, doc_chunk: DocChunk, doc_serializer: BaseDocSerial
 
     def _split_using_plain_text(
         self,
-        doc_chunk: DocChunk,
+        doc_chunk: DocChunk,
+        doc_serializer: ChunkingDocSerializer,
+
     ) -> list[DocChunk]:
         lengths = self._doc_chunk_length(doc_chunk)
         if lengths.total_len <= self.max_tokens:
@@ -223,7 +227,7 @@
         # How much room is there for text after subtracting out the headers and
         # captions:
         available_length = self.max_tokens - lengths.other_len
-        sem_chunker = semchunk.chunkerify(self.tokenizer.get_tokenizer(), chunk_size=available_length)
+
        if available_length <= 0:
             warnings.warn(
                 "Headers and captions for this chunk are longer than the total "
@@ -233,20 +237,39 @@
             new_chunk = DocChunk(**doc_chunk.export_json_dict())
             new_chunk.meta.captions = None
             new_chunk.meta.headings = None
-            return self._split_using_plain_text(doc_chunk=new_chunk)
-        text = doc_chunk.text
-        segments = sem_chunker.chunk(text)
+            return self._split_using_plain_text(doc_chunk=new_chunk, doc_serializer=doc_serializer)
+
+        segments = self.segment(doc_chunk, available_length, doc_serializer)
         chunks = [DocChunk(text=s, meta=doc_chunk.meta) for s in segments]
         return chunks
 
+    def segment(self, doc_chunk: DocChunk, available_length: int, doc_serializer: ChunkingDocSerializer) -> list[str]:
+        segments = []
+        if self.duplicate_table_header and len(doc_chunk.meta.doc_items) == 1 and isinstance(doc_chunk.meta.doc_items[0], TableItem):
+
+            header_lines, body_lines = doc_serializer.table_serializer.get_header_and_body_lines(
+                table_text=doc_chunk.text)
+            from docling_core.transforms.chunker.line_chunker import LineBasedTokenChunker
+            line_chunker = LineBasedTokenChunker(
+                tokenizer=self.tokenizer,
+                max_tokens=available_length,
+                prefix="\n".join(header_lines)
+            )
+            segments = line_chunker.chunk_text(lines=body_lines)
+        else:
+            sem_chunker = semchunk.chunkerify(self.tokenizer.get_tokenizer(), chunk_size=available_length)
+            segments = sem_chunker.chunk(doc_chunk.text)
+        return segments
+
+
     def _merge_chunks_with_matching_metadata(self, chunks: list[DocChunk]):
         output_chunks = []
         window_start = 0
         window_end = 0 # an inclusive index
         num_chunks = len(chunks)
         while window_end < num_chunks:
             chunk = chunks[window_end]
-            headings = chunk.meta.headings
+            headings = chunk.meta.headings
             ready_to_append = False
             if window_start == window_end:
                 current_headings = headings
@@ -306,7 +329,7 @@ def chunk(
             **kwargs,
         ) # type: ignore
         res = [x for c in res for x in self._split_by_doc_items(c, doc_serializer=my_doc_ser)]
-        res = [x for c in res for x in self._split_using_plain_text(c)]
+        res = [x for c in res for x in self._split_using_plain_text(c, doc_serializer=my_doc_ser)]
         if self.merge_peers:
             res = self._merge_chunks_with_matching_metadata(res)
         return iter(res)
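LineBasedTokenChunker itself is not part of the diff shown here; it presumably lives in one of the other two files changed in this commit (the local import points at docling_core/transforms/chunker/line_chunker.py). As a rough sketch of the mechanism, only the constructor arguments and the chunk_text(lines=...) call are taken from the call site above; the packing logic is an assumption:

from pydantic import BaseModel, ConfigDict

from docling_core.transforms.chunker.tokenizer.base import BaseTokenizer

class LineBasedTokenChunker(BaseModel):
    """Hypothetical sketch: pack lines into chunks of at most max_tokens,
    re-emitting a fixed prefix (the table header) at the top of each."""

    model_config = ConfigDict(arbitrary_types_allowed=True)

    tokenizer: BaseTokenizer
    max_tokens: int
    prefix: str = ""  # e.g. the table's header row plus separator row

    def chunk_text(self, lines: list[str]) -> list[str]:
        head = [self.prefix] if self.prefix else []
        chunks: list[str] = []
        body: list[str] = []
        for line in lines:
            candidate = "\n".join(head + body + [line])
            if body and self.tokenizer.count_tokens(candidate) > self.max_tokens:
                # budget exhausted: flush the current chunk and start a
                # fresh one that again begins with the header prefix
                chunks.append("\n".join(head + body))
                body = []
            body.append(line)
        if body:
            chunks.append("\n".join(head + body))
        return chunks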

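Likewise, get_header_and_body_lines is a new table-serializer helper whose body is not in this diff. Assuming markdown-serialized tables (a header row followed by a |---|---| separator), it could be as simple as:

def get_header_and_body_lines(table_text: str) -> tuple[list[str], list[str]]:
    # Hypothetical stand-in for the method called on
    # doc_serializer.table_serializer in the diff above.
    lines = [ln for ln in table_text.splitlines() if ln.strip()]
    # a markdown separator row consists only of '|', '-', ':' and spaces
    if len(lines) >= 2 and set(lines[1]) <= set("|-: "):
        return lines[:2], lines[2:]
    return [], lines  # no recognizable header: nothing to duplicate

The payoff of re-emitting the header on every segment is retrieval-time context: a body row like | 42 | 3.1 | means nothing without its column names, at the cost of a few duplicated header tokens per chunk.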