19 changes: 19 additions & 0 deletions presidio-analyzer/presidio_analyzer/chunkers/__init__.py
@@ -0,0 +1,19 @@
"""Text chunking strategies for handling long texts."""
Collaborator:

Looking at these classes, I'm thinking about applying the Factory pattern here.

The idea is that users would define the chunker type via a string in their YAML config (e.g., chunker_type: "character"), and the factory would instantiate the appropriate chunker for them.

This would align with how Presidio handles other configs.

WDYT?

Author:

Good idea for future extensibility! However, I'd suggest keeping the current approach for now:

- Only one chunker exists, so a factory would be premature abstraction.
- The current API is flexible: users can already pass any chunker via the text_chunker parameter in GLiNERRecognizer.
- YAGNI principle: we can add the factory pattern once we actually have multiple chunker types that need runtime selection.

I have made sure the current design allows for future chunker implementations (e.g., sentence-based, semantic-based); we can introduce a factory then. The current design doesn't prevent that future addition.
Does that make sense?
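For illustration, the factory the reviewer describes might look roughly like the sketch below. Everything here is hypothetical: the registry and create_chunker are invented names, and nothing in this sketch is part of the PR.

from presidio_analyzer.chunkers import BaseTextChunker, CharacterBasedTextChunker

# Hypothetical registry mapping YAML config strings to chunker classes
_CHUNKER_REGISTRY = {"character": CharacterBasedTextChunker}


def create_chunker(chunker_type: str, **kwargs) -> BaseTextChunker:
    """Instantiate a chunker from a config value such as chunker_type: 'character'."""
    try:
        return _CHUNKER_REGISTRY[chunker_type](**kwargs)
    except KeyError:
        raise ValueError(f"Unknown chunker_type: {chunker_type!r}") from None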


from presidio_analyzer.chunkers.base_chunker import BaseTextChunker
from presidio_analyzer.chunkers.character_based_text_chunker import (
    CharacterBasedTextChunker,
)
from presidio_analyzer.chunkers.chunking_utils import (
    deduplicate_overlapping_entities,
    predict_with_chunking,
    process_text_in_chunks,
)

__all__ = [
    "BaseTextChunker",
    "CharacterBasedTextChunker",
    "predict_with_chunking",
    "process_text_in_chunks",
    "deduplicate_overlapping_entities",
]
16 changes: 16 additions & 0 deletions presidio-analyzer/presidio_analyzer/chunkers/base_chunker.py
@@ -0,0 +1,16 @@
"""Abstract base class for text chunking strategies."""
from abc import ABC, abstractmethod
from typing import List


class BaseTextChunker(ABC):
"""Abstract base class for text chunking strategies."""

@abstractmethod
def chunk(self, text: str) -> List[str]:
"""Split text into chunks.

:param text: The input text to split
:return: List of text chunks
"""
pass
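As a concrete illustration of extending this ABC, a hypothetical sentence-based chunker (mentioned as possible future work in the review thread above) might look like this; the regex split is a stand-in for a real sentence tokenizer, and none of this is part of the PR:

import re
from typing import List

from presidio_analyzer.chunkers import BaseTextChunker


class SentenceBasedTextChunker(BaseTextChunker):
    """Hypothetical sentence-level chunker, for illustration only."""

    def chunk(self, text: str) -> List[str]:
        # Split after sentence-ending punctuation followed by whitespace;
        # a real implementation would use a proper sentence tokenizer.
        sentences = re.split(r"(?<=[.!?])\s+", text)
        return [s for s in sentences if s]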
86 changes: 86 additions & 0 deletions presidio-analyzer/presidio_analyzer/chunkers/character_based_text_chunker.py
@@ -0,0 +1,86 @@
"""Character-based text chunker with word boundary preservation.
Collaborator:

Could you add some logs in debug mode?

Author (jedheaj314, Dec 3, 2025):

Done!


Based on gliner-spacy implementation:
https://github.com/theirstory/gliner-spacy/blob/main/gliner_spacy/pipeline.py#L60-L96
"""
import logging
from typing import List

from presidio_analyzer.chunkers.base_chunker import BaseTextChunker

logger = logging.getLogger("presidio-analyzer")


class CharacterBasedTextChunker(BaseTextChunker):
    """Character-based text chunker with word boundary preservation."""

    def __init__(self, chunk_size: int, chunk_overlap: int = 0):
        """Initialize the character-based text chunker.

        Note: Chunks may slightly exceed chunk_size to preserve complete words.
        When this occurs, the actual overlap may vary from the specified value.

        :param chunk_size: Target maximum characters per chunk (must be > 0)
        :param chunk_overlap: Target characters to overlap between chunks
            (must be >= 0 and < chunk_size)
        """
        if chunk_size <= 0:
Collaborator:

Should we extract the validation logic into a separate validation function?

Author:

This is a good point, but following YAGNI I don't think we need to right now. The validation is only a couple of lines and very straightforward. Extracting it would add complexity without real benefit, since there's only one chunker type currently and no other code needs this validation. We can refactor later if we add more chunker implementations that share validation logic.

logger.error("Invalid chunk_size: %d. Must be greater than 0.", chunk_size)
raise ValueError("chunk_size must be greater than 0")
if chunk_overlap < 0 or chunk_overlap >= chunk_size:
logger.error(
"Invalid chunk_overlap. Must be non-negative and less than chunk_size"
)
raise ValueError(
"chunk_overlap must be non-negative and less than chunk_size"
)

self.chunk_size = chunk_size
self.chunk_overlap = chunk_overlap

def chunk(self, text: str) -> List[str]:
"""Split text into overlapping chunks at word boundaries.

Chunks are extended to the nearest word boundary (space or newline)
to avoid splitting words. This means chunks may slightly exceed
chunk_size. For texts without spaces (e.g., CJK languages), chunks
may extend to end of text.

:param text: The input text to chunk
:return: List of text chunks with overlap
"""
if not text:
logger.debug("Empty text provided, returning empty chunk list")
return []

logger.debug(
"Chunking text: length=%d, chunk_size=%d, overlap=%d",
len(text),
self.chunk_size,
self.chunk_overlap,
)

chunks = []
Collaborator:

What would happen with languages that have no spaces or \n? Should we log a warning?

Author (jedheaj314, Dec 3, 2025):

Good catch, but I don't think a warning is needed. The docstring already documents this ("For texts without spaces (e.g., CJK languages), chunks may extend to end of text"), and a warning would just add unnecessary noise. We also have a unit test documenting the behaviour for devs using this.

Most real-world CJK text has punctuation or newlines that serve as boundaries. For pure spaceless text, not splitting mid-character is the right choice to avoid corrupting Unicode. If CJK truncation becomes a real issue, we can add character-based fallback chunking or other chunking approaches as a future enhancement.
WDYT?

        start = 0

        while start < len(text):
            # Calculate end position
            end = (
                start + self.chunk_size
                if start + self.chunk_size < len(text)
                else len(text)
            )

            # Extend to complete word boundary (space or newline)
            while end < len(text) and text[end] not in [" ", "\n"]:
Collaborator:

Should we extract this list to a constant-level parameter and consider more word-boundary characters?
Should we give users the option to extend this list via config or something else?

Author (jedheaj314, Dec 3, 2025):

This is a good suggestion; I can certainly consider doing this in another PR, would that be okay?
The current simple implementation solves the immediate problem (GLiNER truncation). We can enhance boundary detection as a separate feature if there's actual demand.

                end += 1

            chunks.append(text[start:end])

            # Move start position with overlap (stop if we've covered all text)
            if end >= len(text):
                break
            start = end - self.chunk_overlap

        logger.debug("Created %d chunks from text", len(chunks))
        return chunks
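To make the boundary behavior concrete, tracing the loop above with a small input gives:

chunker = CharacterBasedTextChunker(chunk_size=20, chunk_overlap=5)
print(chunker.chunk("The quick brown fox jumps over the lazy dog"))
# ['The quick brown fox jumps', 'jumps over the lazy dog']
# The first chunk runs past 20 characters to finish the word "jumps", and
# the second starts 5 characters back (end - chunk_overlap), re-covering "jumps".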
102 changes: 102 additions & 0 deletions presidio-analyzer/presidio_analyzer/chunkers/chunking_utils.py
@@ -0,0 +1,102 @@
"""Utility functions for processing text with chunking strategies."""
Collaborator:

Did we consider either (1) reusing an existing splitter (LangChain / NLTK / spaCy / HF tokenizers) or (2) at least aligning our implementation with their separator hierarchy pattern (paragraph → line → word → char)?

Author:

Good point! We considered this but chose a simple custom implementation for several reasons; please check the commit message for the justification and the approaches considered.
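For context, the separator-hierarchy pattern the reviewer mentions (as used by LangChain's RecursiveCharacterTextSplitter) tries coarse separators first and falls back to finer ones. A minimal sketch of the idea, not part of this PR and deliberately simplified (real splitters also merge small pieces and add overlap):

from typing import List, Sequence


def recursive_split(
    text: str,
    max_len: int,
    separators: Sequence[str] = ("\n\n", "\n", " ", ""),
) -> List[str]:
    # Try the coarsest separator first; fall back to finer ones,
    # and finally to a hard character split.
    if len(text) <= max_len:
        return [text]
    sep, rest = separators[0], separators[1:]
    if sep == "":
        return [text[i : i + max_len] for i in range(0, len(text), max_len)]
    out: List[str] = []
    buf = ""
    for piece in text.split(sep):
        candidate = piece if not buf else buf + sep + piece
        if len(candidate) <= max_len:
            buf = candidate
            continue
        if buf:
            out.append(buf)
        if len(piece) > max_len:
            # Piece is still too long: recurse with the finer separators
            out.extend(recursive_split(piece, max_len, rest))
            buf = ""
        else:
            buf = piece
    if buf:
        out.append(buf)
    return out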

from typing import Any, Callable, Dict, List

from presidio_analyzer.chunkers.base_chunker import BaseTextChunker


def predict_with_chunking(
    text: str,
    predict_func: Callable[[str], List[Dict[str, Any]]],
    chunker: BaseTextChunker,
) -> List[Dict[str, Any]]:
    """Process text with automatic chunking for long texts.

    For short text (<= chunker.chunk_size), calls predict_func directly.
    For long text, chunks it and merges predictions with deduplication.

    :param text: Input text to process
    :param predict_func: Function that takes text and returns predictions
    :param chunker: Text chunking strategy (contains chunk_size and chunk_overlap)
    :return: List of predictions with correct offsets
    """
    if len(text) <= chunker.chunk_size:
        return predict_func(text)

    predictions = process_text_in_chunks(
        text=text,
        chunker=chunker,
        process_func=predict_func,
    )
    return deduplicate_overlapping_entities(predictions)


def process_text_in_chunks(
    text: str,
    chunker: BaseTextChunker,
    process_func: Callable[[str], List[Dict[str, Any]]],
) -> List[Dict[str, Any]]:
    """Process text in chunks and adjust entity offsets.

    :param text: Input text to process
    :param chunker: Text chunking strategy
    :param process_func: Function that takes chunk text and returns predictions
    :return: List of predictions with adjusted offsets
    """
    chunks = chunker.chunk(text)
    all_predictions = []
    offset = 0

    for chunk in chunks:
        chunk_predictions = process_func(chunk)

        # Adjust offsets to match original text position
        for pred in chunk_predictions:
            pred["start"] += offset
Collaborator:

Should we validate that predictions have the required keys, or catch the exception if one chunk fails and log a warning?

Author:

I'd say not adding validation or error handling here is the right approach:

- Type hints define the contract: callers must provide the correct format.
- Fail fast is better: if predictions are malformed, a KeyError immediately shows where the bug is.
- It's consistent with the existing code.
- Performance: this runs for every prediction, so validation adds unnecessary overhead.

If a chunk fails, it's a recognizer bug that needs fixing, not something to silently skip.

pred["end"] += offset

all_predictions.extend(chunk_predictions)
offset += len(chunk) - chunker.chunk_overlap

return all_predictions

def deduplicate_overlapping_entities(
predictions: List[Dict[str, Any]], overlap_threshold: float = 0.5
) -> List[Dict[str, Any]]:
"""Remove duplicate entities from overlapping chunks.

:param predictions: List of predictions with 'start', 'end', 'label',
'score'
:param overlap_threshold: Overlap ratio threshold to consider duplicates
(default: 0.5)
:return: Deduplicated list of predictions sorted by position
"""
if not predictions:
return predictions

# Sort by score descending to keep highest scoring entities
sorted_preds = sorted(predictions, key=lambda p: p["score"], reverse=True)
unique = []

for pred in sorted_preds:
Collaborator:

For n predictions this is O(n²); could we optimize it using a library or a more sophisticated approach?

Author (jedheaj314, Dec 3, 2025):

This is a very good observation! I have discussed this with Sharon H, and I've covered this and the justification in the commit message as well.

TL;DR: I'd suggest keeping the current simple implementation for now since:

- It's readable and maintainable.
- Performance is acceptable for typical entity counts.
- Adding a dependency just for this would increase complexity.

WDYT?

        is_duplicate = False
        for kept in unique:
            # Check if same entity type and overlapping positions
            if pred["label"] == kept["label"]:
                overlap_start = max(pred["start"], kept["start"])
                overlap_end = min(pred["end"], kept["end"])

                if overlap_start < overlap_end:
                    # Calculate overlap ratio
                    overlap_len = overlap_end - overlap_start
                    pred_len = pred["end"] - pred["start"]
                    kept_len = kept["end"] - kept["start"]

                    # Check if overlap exceeds threshold
                    if overlap_len / min(pred_len, kept_len) > overlap_threshold:
                        is_duplicate = True
                        break

        if not is_duplicate:
            unique.append(pred)

    # Sort by position for consistent output
    return sorted(unique, key=lambda p: p["start"])
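To see the utilities working together, here is an illustrative run with a stub predictor. The regex "model" and all names below are invented for the example; only the utility functions come from this file:

import re

from presidio_analyzer.chunkers import (
    CharacterBasedTextChunker,
    predict_with_chunking,
)


def fake_predict(chunk):
    # Stand-in for a model call: flags each capitalized word as PERSON
    return [
        {"start": m.start(), "end": m.end(), "label": "PERSON", "score": 0.9}
        for m in re.finditer(r"\b[A-Z][a-z]+\b", chunk)
    ]


chunker = CharacterBasedTextChunker(chunk_size=50, chunk_overlap=10)
long_text = "Alice met Bob. " * 20  # 300 chars, so the chunking path is taken
results = predict_with_chunking(long_text, fake_predict, chunker)
# Offsets in `results` refer to positions in long_text; entities detected
# twice in overlapping regions are collapsed by deduplicate_overlapping_entities.

On the reviewer's O(n²) point, one dependency-free middle ground, sketched here rather than proposed for the PR, is to bucket kept entities by label so each candidate is only compared against same-label entities; the worst case is unchanged, but typical comparison counts drop substantially:

from collections import defaultdict


def deduplicate_bucketed(predictions, overlap_threshold=0.5):
    # Same greedy highest-score-wins policy as above, but kept entities are
    # bucketed by label so candidates skip cross-label comparisons entirely.
    sorted_preds = sorted(predictions, key=lambda p: p["score"], reverse=True)
    kept_by_label = defaultdict(list)
    for pred in sorted_preds:
        for kept in kept_by_label[pred["label"]]:
            overlap = min(pred["end"], kept["end"]) - max(pred["start"], kept["start"])
            if overlap > 0 and overlap / min(
                pred["end"] - pred["start"], kept["end"] - kept["start"]
            ) > overlap_threshold:
                break  # duplicate of a higher-scoring entity; drop it
        else:
            kept_by_label[pred["label"]].append(pred)
    return sorted(
        (p for preds in kept_by_label.values() for p in preds),
        key=lambda p: p["start"],
    )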
@@ -1,12 +1,17 @@
 import json
 import logging
-from typing import Dict, List, Optional
+from typing import Any, Dict, List, Optional
 
 from presidio_analyzer import (
     AnalysisExplanation,
     LocalRecognizer,
     RecognizerResult,
 )
+from presidio_analyzer.chunkers import (
+    BaseTextChunker,
+    CharacterBasedTextChunker,
+    predict_with_chunking,
+)
 from presidio_analyzer.nlp_engine import NerModelConfiguration, NlpArtifacts
 
 try:
@@ -35,6 +40,9 @@ def __init__(
         multi_label: bool = False,
         threshold: float = 0.30,
         map_location: str = "cpu",
+        chunk_size: int = 250,
+        chunk_overlap: int = 50,
+        text_chunker: Optional[BaseTextChunker] = None,
     ):
         """GLiNER model based entity recognizer.
 
@@ -54,6 +62,12 @@ def __init__(
         :param threshold: The threshold for the model's output
             (see GLiNER's documentation)
         :param map_location: The device to use for the model
+        :param chunk_size: Maximum character length for text chunks
+            (default: 250)
+        :param chunk_overlap: Characters to overlap between chunks
+            (default: 50)
+        :param text_chunker: Custom text chunking strategy. If None, uses
+            CharacterBasedTextChunker
 
 
         """
@@ -86,6 +100,15 @@ def __init__(
         self.flat_ner = flat_ner
         self.multi_label = multi_label
         self.threshold = threshold
+        self.chunk_size = chunk_size
+        self.chunk_overlap = chunk_overlap
+
+        # Use provided chunker or default to CharacterBasedTextChunker
+        self.text_chunker = (
+            text_chunker
+            if text_chunker is not None
+            else CharacterBasedTextChunker(chunk_size, chunk_overlap)
+        )
 
         self.gliner = None
 
@@ -121,13 +144,22 @@ def analyze(
         # combine the input labels as this model allows for ad-hoc labels
         labels = self.__create_input_labels(entities)
 
-        predictions = self.gliner.predict_entities(
+        # Process text with automatic chunking
+        def predict_func(text: str) -> List[Dict[str, Any]]:
+            return self.gliner.predict_entities(
+                text=text,
+                labels=labels,
+                flat_ner=self.flat_ner,
+                threshold=self.threshold,
+                multi_label=self.multi_label,
+            )
+
+        predictions = predict_with_chunking(
             text=text,
-            labels=labels,
-            flat_ner=self.flat_ner,
-            threshold=self.threshold,
-            multi_label=self.multi_label,
+            predict_func=predict_func,
+            chunker=self.text_chunker,
         )
 
         recognizer_results = []
         for prediction in predictions:
             presidio_entity = self.model_to_presidio_entity_mapping.get(
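Putting the new constructor parameters together, usage might look like the sketch below. chunk_size, chunk_overlap, and text_chunker come from this diff; the import path and model identifier are assumptions based on Presidio's usual layout:

from presidio_analyzer.chunkers import CharacterBasedTextChunker
from presidio_analyzer.predefined_recognizers import GLiNERRecognizer  # assumed import path

# Default chunking: 250-character chunks with 50 characters of overlap
recognizer = GLiNERRecognizer(
    model_name="urchade/gliner_multi_pii-v1",  # assumed model identifier
    chunk_size=250,
    chunk_overlap=50,
)

# Or inject a custom chunking strategy via the new text_chunker parameter
recognizer = GLiNERRecognizer(
    model_name="urchade/gliner_multi_pii-v1",  # assumed model identifier
    text_chunker=CharacterBasedTextChunker(chunk_size=400, chunk_overlap=80),
)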