Skip to content

Commit 9b9ef09

Browse files
committed
style changes
1 parent 59cdda5 commit 9b9ef09

4 files changed

Lines changed: 41 additions & 45 deletions

File tree

docling_core/transforms/chunker/hybrid_chunker.py

Lines changed: 19 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
)
4040
from docling_core.types import DoclingDocument
4141

42+
4243
def _get_default_tokenizer():
4344
from docling_core.transforms.chunker.tokenizer.huggingface import (
4445
HuggingFaceTokenizer,
@@ -216,9 +217,8 @@ def _split_by_doc_items(self, doc_chunk: DocChunk, doc_serializer: BaseDocSerial
216217

217218
def _split_using_plain_text(
218219
self,
219-
doc_chunk: DocChunk,
220-
doc_serializer:ChunkingDocSerializer,
221-
220+
doc_chunk: DocChunk,
221+
doc_serializer: ChunkingDocSerializer,
222222
) -> list[DocChunk]:
223223
lengths = self._doc_chunk_length(doc_chunk)
224224
if lengths.total_len <= self.max_tokens:
@@ -227,7 +227,7 @@ def _split_using_plain_text(
227227
# How much room is there for text after subtracting out the headers and
228228
# captions:
229229
available_length = self.max_tokens - lengths.other_len
230-
230+
231231
if available_length <= 0:
232232
warnings.warn(
233233
"Headers and captions for this chunk are longer than the total "
@@ -238,38 +238,40 @@ def _split_using_plain_text(
238238
new_chunk.meta.captions = None
239239
new_chunk.meta.headings = None
240240
return self._split_using_plain_text(doc_chunk=new_chunk, doc_serializer=doc_serializer)
241-
242-
segments = self.segment(doc_chunk,available_length,doc_serializer)
241+
242+
segments = self.segment(doc_chunk, available_length, doc_serializer)
243243
chunks = [DocChunk(text=s, meta=doc_chunk.meta) for s in segments]
244244
return chunks
245245

246-
def segment(self, doc_chunk: DocChunk, available_length: int, doc_serializer:ChunkingDocSerializer) -> list[str]:
246+
def segment(self, doc_chunk: DocChunk, available_length: int, doc_serializer: ChunkingDocSerializer) -> list[str]:
247247
segments = []
248-
if self.duplicate_table_header and len(doc_chunk.meta.doc_items) == 1 and isinstance(doc_chunk.meta.doc_items[0], TableItem):
249-
248+
if (
249+
self.duplicate_table_header
250+
and len(doc_chunk.meta.doc_items) == 1
251+
and isinstance(doc_chunk.meta.doc_items[0], TableItem)
252+
):
250253
header_lines, body_lines = doc_serializer.table_serializer.get_header_and_body_lines(
251-
table_text=doc_chunk.text)
254+
table_text=doc_chunk.text
255+
)
252256
from docling_core.transforms.chunker.line_chunker import LineBasedTokenChunker
257+
253258
line_chunker = LineBasedTokenChunker(
254-
tokenizer=self.tokenizer,
255-
max_tokens=available_length,
256-
prefix="\n".join(header_lines)
259+
tokenizer=self.tokenizer, max_tokens=available_length, prefix="\n".join(header_lines)
257260
)
258261
segments = line_chunker.chunk_text(lines=body_lines)
259262
else:
260263
sem_chunker = semchunk.chunkerify(self.tokenizer.get_tokenizer(), chunk_size=available_length)
261-
segments= sem_chunker.chunk(doc_chunk.text)
262-
return segments
264+
segments = sem_chunker.chunk(doc_chunk.text)
265+
return segments
263266

264-
265267
def _merge_chunks_with_matching_metadata(self, chunks: list[DocChunk]):
266268
output_chunks = []
267269
window_start = 0
268270
window_end = 0 # an inclusive index
269271
num_chunks = len(chunks)
270272
while window_end < num_chunks:
271273
chunk = chunks[window_end]
272-
headings = chunk.meta.headings
274+
headings = chunk.meta.headings
273275
ready_to_append = False
274276
if window_start == window_end:
275277
current_headings = headings

docling_core/transforms/chunker/line_chunker.py

Lines changed: 14 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
ChunkingSerializerProvider,
1414
)
1515
from docling_core.transforms.serializer.base import (
16-
BaseSerializerProvider,
16+
BaseSerializerProvider,
1717
)
1818

1919

@@ -27,27 +27,26 @@ class LineBasedTokenChunker(BaseChunker):
2727
resolved from the tokenizer
2828
prefix: the text that should appear at the beginning of each chunk; default is an empty string
2929
"""
30+
3031
model_config = ConfigDict(arbitrary_types_allowed=True)
3132
tokenizer: BaseTokenizer = Field(default_factory=_get_default_tokenizer)
3233
prefix: str = ""
3334
prefix_len: int = Field(default=0, init=False)
3435
serializer_provider: BaseSerializerProvider = ChunkingSerializerProvider()
35-
36+
3637
@property
3738
def max_tokens(self) -> int:
3839
"""Get maximum number of tokens allowed."""
3940
return self.tokenizer.get_max_tokens()
40-
41+
4142
def model_post_init(self, __context) -> None:
42-
4343
self.prefix_len = self.tokenizer.count_tokens(self.prefix)
4444
if self.prefix_len >= self.max_tokens:
4545
warnings.warn(
4646
f"Chunks prefix: {self.prefix} is too long for chunk size {self.max_tokens} and will be ignored"
4747
)
4848
self.prefix = ""
4949
self.prefix_len = 0
50-
5150

5251
def chunk(self, dl_doc: DoclingDocument, **kwargs: Any) -> Iterator[BaseChunk]:
5352
"""Chunk the provided document using line-based token-aware chunking.
@@ -59,16 +58,16 @@ def chunk(self, dl_doc: DoclingDocument, **kwargs: Any) -> Iterator[BaseChunk]:
5958
Iterator[BaseChunk]: iterator over extracted chunks
6059
"""
6160
my_doc_ser = self.serializer_provider.get_serializer(doc=dl_doc)
62-
61+
6362
# Serialize the entire document to get the text
6463
ser_res = my_doc_ser.serialize()
65-
64+
6665
if not ser_res.text:
6766
return
68-
67+
6968
# Use chunk_text to split the text into chunks
70-
text_chunks = self.chunk_text(lines = ser_res.text.splitlines(True))
71-
69+
text_chunks = self.chunk_text(lines=ser_res.text.splitlines(True))
70+
7271
# Yield DocChunk objects for each text chunk
7372
for chunk_text in text_chunks:
7473
yield DocChunk(
@@ -109,10 +108,7 @@ def chunk_text(self, lines: list[str]) -> list[str]:
109108

110109
# Remaining is too large even for an empty chunk → split it.
111110
# Split off the first segment that fits into current.
112-
take, remaining = self.split_by_token_limit(
113-
remaining,
114-
available
115-
)
111+
take, remaining = self.split_by_token_limit(remaining, available)
116112

117113
# Add the taken part
118114
current += "\n" + take
@@ -131,7 +127,6 @@ def chunk_text(self, lines: list[str]) -> list[str]:
131127

132128
return chunks
133129

134-
135130
def split_by_token_limit(
136131
self,
137132
text: str,
@@ -142,7 +137,7 @@ def split_by_token_limit(
142137
Split `text` into (head, tail) where `head` has at most `token_limit` tokens,
143138
and `tail` is the remainder. Uses binary search on character indices to minimize
144139
calls to `count_tokens`.
145-
140+
146141
Parameters
147142
----------
148143
text : str
@@ -190,10 +185,10 @@ def split_by_token_limit(
190185
# Optionally adjust to a previous whitespace boundary without violating the limit
191186
if prefer_word_boundary:
192187
# Search backwards from best_idx to find whitespace; keep within token limit.
193-
194-
last_space_index= text[:best_idx].rfind(" ")
188+
189+
last_space_index = text[:best_idx].rfind(" ")
195190
if last_space_index > 0:
196191
best_idx = last_space_index
197-
192+
198193
head, tail = text[:best_idx], text[best_idx:]
199194
return head, tail

docling_core/transforms/serializer/base.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -82,15 +82,15 @@ def get_header_and_body_lines(
8282
self,
8383
*,
8484
table_text: str,
85-
**kwargs: Any,
85+
**kwargs: Any,
8686
) -> tuple[list[str], list[str]]:
8787
"""Get header lines and body lines from the table.
88-
88+
8989
Returns:
9090
A tuple of (header_lines, body_lines) where header_lines is a list
9191
of strings representing table headers and body_lines is a list of
9292
strings representing table body rows.
93-
93+
9494
Default implementation returns empty header lines and all content in body lines.
9595
"""
9696
# default: empty headers, all content in body

docling_core/transforms/serializer/markdown.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -356,29 +356,28 @@ def get_header_and_body_lines(
356356
self,
357357
*,
358358
table_text: str,
359-
360359
**kwargs: Any,
361360
) -> tuple[list[str], list[str]]:
362361
"""Get header lines and body lines from the markdown table.
363-
362+
364363
Returns:
365364
A tuple of (header_lines, body_lines) where header_lines contains
366365
the header row and separator row, and body_lines contains the data rows.
367366
"""
368-
367+
369368
lines = [line for line in table_text.split("\n") if line.strip()]
370-
369+
371370
if len(lines) < 2:
372371
# Not enough lines for a proper markdown table (need at least header + separator)
373372
return [], lines
374-
373+
375374
# In markdown tables:
376375
# Line 0: Header row
377376
# Line 1: Separator row (with dashes)
378377
# Lines 2+: Body rows
379378
header_lines = lines[:2]
380379
body_lines = lines[2:]
381-
380+
382381
return header_lines, body_lines
383382

384383
@staticmethod

0 commit comments

Comments
 (0)