-
Notifications
You must be signed in to change notification settings - Fork 135
feat: table aware chunking #527
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
PeterStaar-IBM
merged 19 commits into
docling-project:main
from
odelliab:table_aware_chunking
Mar 4, 2026
Merged
Changes from all commits
Commits
Show all changes
19 commits
Select commit
Hold shift + click to select a range
5cc61d9
line_chunker
odelliab 91b43f9
split table to header and body
odelliab 5d17bda
duplicat table headers
odelliab a50392e
Revert "duplicat table headers"
odelliab e589429
Revert "split table to header and body"
odelliab 30c72a9
Revert "line_chunker"
odelliab 510e949
line chunker
odelliab 6c3a8f7
split table to header and body
odelliab 0642a07
duplicate table headers
odelliab 59cdda5
DCO Remediation Commit for odelliab <[email protected]>
odelliab 9b9ef09
style changes
odelliab b3699e3
pre-commit fixes
odelliab 9d393df
expected output name change
odelliab 1c5ac39
DCO Remediation Commit for odelliab <[email protected]>
odelliab d5001ca
Apply suggestion from @ceberam
odelliab 759f0b5
Apply suggestions from code review
odelliab 29bd5cf
address review comments
odelliab 5c30d89
pre-commit fixes
odelliab df5ab71
refactor: move get_default_tokenizer to huggingface module
ceberam File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,222 @@ | ||
| import warnings | ||
| from collections.abc import Iterator | ||
| from functools import cached_property | ||
| from typing import Annotated, Any | ||
|
|
||
| from pydantic import ConfigDict, Field, computed_field | ||
|
|
||
| from docling_core.transforms.chunker import BaseChunk, BaseChunker, DocChunk, DocMeta | ||
| from docling_core.transforms.chunker.hierarchical_chunker import ( | ||
| ChunkingSerializerProvider, | ||
| ) | ||
| from docling_core.transforms.chunker.tokenizer.base import BaseTokenizer | ||
| from docling_core.transforms.chunker.tokenizer.huggingface import get_default_tokenizer | ||
| from docling_core.transforms.serializer.base import ( | ||
| BaseSerializerProvider, | ||
| ) | ||
| from docling_core.types import DoclingDocument | ||
|
|
||
|
|
||
class LineBasedTokenChunker(BaseChunker):
    """Tokenization-aware chunker that preserves line boundaries.

    This chunker serializes the document content into text and attempts to keep lines
    intact within chunks. It only splits a line if it exceeds the maximum token limit on
    its own. This is particularly useful for structured content like tables, code, or logs
    where line boundaries are semantically important.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    tokenizer: Annotated[
        BaseTokenizer,
        Field(
            default_factory=get_default_tokenizer,
            description="The tokenizer to use; either instantiated object or name or path of respective pretrained model",
        ),
    ]

    prefix: Annotated[
        str,
        Field(
            default="",
            description="Text that appears at the beginning of each chunk. Useful for adding context like table headers",
        ),
    ]

    serializer_provider: Annotated[
        BaseSerializerProvider,
        Field(
            default_factory=ChunkingSerializerProvider,
            description="Provider for document serialization during chunking",
        ),
    ]

    @computed_field  # type: ignore[misc]
    @cached_property
    def prefix_len(self) -> int:
        """Cached token count of the prefix, computed during initialization.

        Returns 0 (and warns) when the prefix alone would consume the whole
        token budget, signalling that the prefix must be ignored.
        """
        token_count = self.tokenizer.count_tokens(self.prefix)
        if token_count >= self.max_tokens:
            warnings.warn(
                f"Chunks prefix: {self.prefix} is too long for chunk size {self.max_tokens} and will be ignored"
            )
            return 0
        return token_count

    @property
    def max_tokens(self) -> int:
        """Get maximum number of tokens allowed in a chunk. If not set, limit is resolved from the tokenizer."""
        return self.tokenizer.get_max_tokens()

    def model_post_init(self, __context: Any) -> None:
        """Eagerly validate the prefix and drop it when it is unusable."""
        # Trigger computation of prefix_len to validate prefix length
        _ = self.prefix_len
        if self.prefix_len == 0 and self.prefix:
            # If prefix_len is 0 but prefix exists, it means prefix was too long
            self.prefix = ""

    def chunk(self, dl_doc: DoclingDocument, **kwargs: Any) -> Iterator[BaseChunk]:
        """Chunk the provided document using line-based token-aware chunking.

        Args:
            dl_doc (DoclingDocument): document to chunk

        Yields:
            Iterator[BaseChunk]: iterator over extracted chunks
        """
        my_doc_ser = self.serializer_provider.get_serializer(doc=dl_doc)

        # Serialize the entire document to get the text
        ser_res = my_doc_ser.serialize()

        if not ser_res.text:
            return

        # Use chunk_text to split the text into chunks; splitlines(True) keeps
        # the newline on each line so chunk texts reproduce the original breaks.
        text_chunks = self.chunk_text(lines=ser_res.text.splitlines(True))

        # Yield DocChunk objects for each text chunk
        for chunk_text in text_chunks:
            yield DocChunk(
                text=chunk_text,
                meta=DocMeta(
                    doc_items=ser_res.get_unique_doc_items(),
                    headings=None,
                    origin=dl_doc.origin,
                ),
            )

    def chunk_text(self, lines: list[str]) -> list[str]:
        """Pack `lines` into chunks of at most `max_tokens` tokens each.

        Lines are kept intact whenever possible; a line is split only when it
        cannot fit into a fresh chunk on its own (prefix included).

        Args:
            lines: document text split into lines (line endings included).

        Returns:
            list[str]: chunk texts, each starting with `self.prefix`.
        """
        chunks = []
        current = self.prefix
        current_len = self.prefix_len

        for line in lines:
            remaining = line

            while True:
                line_tokens = self.tokenizer.count_tokens(remaining)
                available = self.max_tokens - current_len

                # If the remaining part fits entirely into current chunk → append and stop
                if line_tokens <= available:
                    current += remaining
                    current_len += line_tokens
                    break

                # Remaining does NOT fit into current chunk.
                # If it CAN fit into a fresh chunk → flush current and start new one.
                if line_tokens + self.prefix_len <= self.max_tokens:
                    chunks.append(current)
                    current = self.prefix
                    current_len = self.prefix_len
                    # loop continues to retry fitting `remaining`
                    continue

                # Remaining is too large even for an empty chunk → split it.
                # Split off the first segment that fits into current.
                take, remaining = self.split_by_token_limit(remaining, available)

                # Add the taken part.
                # NOTE(review): this inserts a "\n" that was not present in the
                # serialized text between `current` and `take` — confirm intended.
                current += "\n" + take

                # flush the current chunk (full). No need to add `take`'s token
                # count to current_len: it is unconditionally reset right below.
                chunks.append(current)
                current = self.prefix
                current_len = self.prefix_len

            # end while for this line

        # push final chunk if non-empty
        if current != self.prefix:
            chunks.append(current)

        return chunks

    def split_by_token_limit(
        self,
        text: str,
        token_limit: int,
        prefer_word_boundary: bool = True,
    ) -> tuple[str, str]:
        """
        Split `text` into (head, tail) where `head` has at most `token_limit` tokens,
        and `tail` is the remainder. Uses binary search on character indices to minimize
        calls to `count_tokens`.

        Parameters
        ----------
        text : str
            Input string to split.
        token_limit: int
            Maximum number of tokens allowed in the head.
        prefer_word_boundary : bool
            If True, try to end the head on a whitespace boundary (without violating
            the token limit). If no boundary exists in range, fall back to the
            exact max index found by search.

        Returns
        -------
        (head, tail) : tuple[str, str]
            `head` contains at most `token_limit` tokens, `tail` is the remaining suffix.
            If `token_limit <= 0`, returns ("", text).
        """
        if token_limit <= 0 or not text:
            return "", text

        # if the whole text already fits, return as is.
        if self.tokenizer.count_tokens(text) <= token_limit:
            return text, ""

        # Binary search over character indices [0, len(text)]
        lo, hi = 0, len(text)
        best_idx: int | None = None

        while lo <= hi:
            mid = (lo + hi) // 2
            head = text[:mid]
            tok_count = self.tokenizer.count_tokens(head)

            if tok_count <= token_limit:
                best_idx = mid  # feasible; try to extend
                lo = mid + 1
            else:
                hi = mid - 1

        if best_idx is None or best_idx <= 0:
            # Even the first character exceeds the limit (e.g., tokenizer behavior).
            # Return nothing in head, everything in tail.
            return "", text

        # Optionally adjust to a previous whitespace boundary without violating the
        # limit (a shorter head cannot need more tokens for typical tokenizers).
        if prefer_word_boundary:
            last_space_index = text[:best_idx].rfind(" ")
            # Bug fix: require a strictly positive index. With `>= 0`, a space at
            # position 0 collapsed the head to "" and returned ("", text), so the
            # caller (`chunk_text`) made no progress and looped forever on any
            # over-long line that starts with a space. `best_idx > 0` is already
            # guaranteed here, so keeping it always makes progress.
            if last_space_index > 0:
                best_idx = last_space_index

        head, tail = text[:best_idx], text[best_idx:]
        return head, tail
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.