Skip to content

Commit a50392e

Browse files
committed
Revert "duplicat table headers"
This reverts commit 5d17bda.
1 parent 5d17bda commit a50392e

3 files changed

Lines changed: 10 additions & 1243 deletions

File tree

docling_core/transforms/chunker/hybrid_chunker.py

Lines changed: 9 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,10 @@
88
from pydantic import BaseModel, ConfigDict, Field, computed_field, model_validator
99

1010
from docling_core.transforms.chunker.hierarchical_chunker import (
11-
ChunkingDocSerializer,
1211
ChunkingSerializerProvider,
1312
)
14-
from docling_core.transforms.serializer.base import BaseDocSerializer
1513
from docling_core.transforms.chunker.tokenizer.base import BaseTokenizer
16-
from docling_core.types.doc.document import SectionHeaderItem, TitleItem, TableItem
14+
from docling_core.types.doc.document import SectionHeaderItem, TitleItem
1715

1816
try:
1917
import semchunk
@@ -39,6 +37,7 @@
3937
)
4038
from docling_core.types import DoclingDocument
4139

40+
4241
def _get_default_tokenizer():
4342
from docling_core.transforms.chunker.tokenizer.huggingface import (
4443
HuggingFaceTokenizer,
@@ -62,7 +61,6 @@ class HybridChunker(BaseChunker):
6261
model_config = ConfigDict(arbitrary_types_allowed=True)
6362

6463
tokenizer: BaseTokenizer = Field(default_factory=_get_default_tokenizer)
65-
duplicate_table_header: bool = True
6664
merge_peers: bool = True
6765

6866
serializer_provider: BaseSerializerProvider = ChunkingSerializerProvider()
@@ -216,9 +214,7 @@ def _split_by_doc_items(self, doc_chunk: DocChunk, doc_serializer: BaseDocSerial
216214

217215
def _split_using_plain_text(
218216
self,
219-
doc_chunk: DocChunk,
220-
doc_serializer:ChunkingDocSerializer,
221-
217+
doc_chunk: DocChunk,
222218
) -> list[DocChunk]:
223219
lengths = self._doc_chunk_length(doc_chunk)
224220
if lengths.total_len <= self.max_tokens:
@@ -227,7 +223,7 @@ def _split_using_plain_text(
227223
# How much room is there for text after subtracting out the headers and
228224
# captions:
229225
available_length = self.max_tokens - lengths.other_len
230-
226+
sem_chunker = semchunk.chunkerify(self.tokenizer.get_tokenizer(), chunk_size=available_length)
231227
if available_length <= 0:
232228
warnings.warn(
233229
"Headers and captions for this chunk are longer than the total "
@@ -237,39 +233,20 @@ def _split_using_plain_text(
237233
new_chunk = DocChunk(**doc_chunk.export_json_dict())
238234
new_chunk.meta.captions = None
239235
new_chunk.meta.headings = None
240-
return self._split_using_plain_text(doc_chunk=new_chunk, doc_serializer=doc_serializer)
241-
242-
segments = self.segment(doc_chunk,available_length,doc_serializer)
236+
return self._split_using_plain_text(doc_chunk=new_chunk)
237+
text = doc_chunk.text
238+
segments = sem_chunker.chunk(text)
243239
chunks = [DocChunk(text=s, meta=doc_chunk.meta) for s in segments]
244240
return chunks
245241

246-
def segment(self, doc_chunk: DocChunk, available_length: int, doc_serializer:ChunkingDocSerializer) -> list[str]:
247-
segments = []
248-
if self.duplicate_table_header and len(doc_chunk.meta.doc_items) == 1 and isinstance(doc_chunk.meta.doc_items[0], TableItem):
249-
250-
header_lines, body_lines = doc_serializer.table_serializer.get_header_and_body_lines(
251-
table_text=doc_chunk.text)
252-
from docling_core.transforms.chunker.line_chunker import LineBasedTokenChunker
253-
line_chunker = LineBasedTokenChunker(
254-
tokenizer=self.tokenizer,
255-
max_tokens=available_length,
256-
prefix="\n".join(header_lines)
257-
)
258-
segments = line_chunker.chunk_text(lines=body_lines)
259-
else:
260-
sem_chunker = semchunk.chunkerify(self.tokenizer.get_tokenizer(), chunk_size=available_length)
261-
segments= sem_chunker.chunk(doc_chunk.text)
262-
return segments
263-
264-
265242
def _merge_chunks_with_matching_metadata(self, chunks: list[DocChunk]):
266243
output_chunks = []
267244
window_start = 0
268245
window_end = 0 # an inclusive index
269246
num_chunks = len(chunks)
270247
while window_end < num_chunks:
271248
chunk = chunks[window_end]
272-
headings = chunk.meta.headings
249+
headings = chunk.meta.headings
273250
ready_to_append = False
274251
if window_start == window_end:
275252
current_headings = headings
@@ -329,7 +306,7 @@ def chunk(
329306
**kwargs,
330307
) # type: ignore
331308
res = [x for c in res for x in self._split_by_doc_items(c, doc_serializer=my_doc_ser)]
332-
res = [x for c in res for x in self._split_using_plain_text(c, doc_serializer=my_doc_ser)]
309+
res = [x for c in res for x in self._split_using_plain_text(c)]
333310
if self.merge_peers:
334311
res = self._merge_chunks_with_matching_metadata(res)
335312
return iter(res)

0 commit comments

Comments (0)