PR #1805: Fix GLiNER truncating text (issue #1569)
Base branch: main. Changes from all commits:
6c82ee7, b04d9c7, e0eb745, 71fb611, c986737, ea49b70, 83e2bd4, 5553245, 0d53ce1, c1ae52f, 560021c, 1556d73, 9324450
File: presidio_analyzer/chunkers/__init__.py (new file, +19 lines)

```python
"""Text chunking strategies for handling long texts."""
from presidio_analyzer.chunkers.base_chunker import BaseTextChunker
from presidio_analyzer.chunkers.character_based_text_chunker import (
    CharacterBasedTextChunker,
)
from presidio_analyzer.chunkers.chunking_utils import (
    deduplicate_overlapping_entities,
    predict_with_chunking,
    process_text_in_chunks,
)

__all__ = [
    "BaseTextChunker",
    "CharacterBasedTextChunker",
    "predict_with_chunking",
    "process_text_in_chunks",
    "deduplicate_overlapping_entities",
]
```
File: presidio_analyzer/chunkers/base_chunker.py (new file, +16 lines)

```python
"""Abstract base class for text chunking strategies."""
from abc import ABC, abstractmethod
from typing import List


class BaseTextChunker(ABC):
    """Abstract base class for text chunking strategies."""

    @abstractmethod
    def chunk(self, text: str) -> List[str]:
        """Split text into chunks.

        :param text: The input text to split
        :return: List of text chunks
        """
        pass
```
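To illustrate how this abstract base class is meant to be extended, here is a minimal standalone sketch of a custom chunker. The `LineBasedTextChunker` name and its behavior are illustrative assumptions (not part of this PR), and the ABC is restated inline so the sketch runs on its own:

```python
# Hypothetical example of implementing the BaseTextChunker contract.
# LineBasedTextChunker is illustrative only; the ABC is restated inline
# so the sketch is self-contained.
from abc import ABC, abstractmethod
from typing import List


class BaseTextChunker(ABC):
    """Abstract base class for text chunking strategies."""

    @abstractmethod
    def chunk(self, text: str) -> List[str]:
        """Split text into chunks."""


class LineBasedTextChunker(BaseTextChunker):
    """Toy chunker that treats each non-empty line as one chunk."""

    def chunk(self, text: str) -> List[str]:
        return [line for line in text.splitlines() if line.strip()]


chunker = LineBasedTextChunker()
print(chunker.chunk("John lives in Oslo.\n\nCall 555-0100."))
# ['John lives in Oslo.', 'Call 555-0100.']
```

Any concrete strategy only needs to implement `chunk`; the utilities below accept whatever subclass is passed in.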
File: presidio_analyzer/chunkers/character_based_text_chunker.py (new file, +86 lines)

```python
"""Character-based text chunker with word boundary preservation.
```
Collaborator: Could you add some logging in debug mode?

Author: Done!
```python
Based on gliner-spacy implementation:
https://github.com/theirstory/gliner-spacy/blob/main/gliner_spacy/pipeline.py#L60-L96
"""
import logging
from typing import List

from presidio_analyzer.chunkers.base_chunker import BaseTextChunker

logger = logging.getLogger("presidio-analyzer")


class CharacterBasedTextChunker(BaseTextChunker):
    """Character-based text chunker with word boundary preservation."""

    def __init__(self, chunk_size: int, chunk_overlap: int = 0):
        """Initialize the character-based text chunker.

        Note: Chunks may slightly exceed chunk_size to preserve complete words.
        When this occurs, the actual overlap may vary from the specified value.

        :param chunk_size: Target maximum characters per chunk (must be > 0)
        :param chunk_overlap: Target characters to overlap between chunks
            (must be >= 0 and < chunk_size)
        """
        if chunk_size <= 0:
```
Collaborator: Should we extract the validation logic into a separate validation function?

Author: Good point, but following YAGNI I don't think we need to. The validation is only two lines and very straightforward. Extracting it would add complexity without real benefit, since there is only one chunker type currently and no other code needs this validation. We can refactor later if we add more chunker implementations that share validation logic.
```python
            logger.error("Invalid chunk_size: %d. Must be greater than 0.", chunk_size)
            raise ValueError("chunk_size must be greater than 0")
        if chunk_overlap < 0 or chunk_overlap >= chunk_size:
            logger.error(
                "Invalid chunk_overlap. Must be non-negative and less than chunk_size"
            )
            raise ValueError(
                "chunk_overlap must be non-negative and less than chunk_size"
            )

        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def chunk(self, text: str) -> List[str]:
        """Split text into overlapping chunks at word boundaries.

        Chunks are extended to the nearest word boundary (space or newline)
        to avoid splitting words. This means chunks may slightly exceed
        chunk_size. For texts without spaces (e.g., CJK languages), chunks
        may extend to end of text.

        :param text: The input text to chunk
        :return: List of text chunks with overlap
        """
        if not text:
            logger.debug("Empty text provided, returning empty chunk list")
            return []

        logger.debug(
            "Chunking text: length=%d, chunk_size=%d, overlap=%d",
            len(text),
            self.chunk_size,
            self.chunk_overlap,
        )

        chunks = []
```
Collaborator: What would happen with languages that have no spaces or "\n"? Should we log a warning?

Author: Good catch, but no warning is needed. The docstring already documents this ("For texts without spaces (e.g., CJK languages), chunks may extend to end of text"), and a warning would just add unnecessary noise. Most real-world CJK text has punctuation or newlines to serve as boundaries. For purely spaceless text, not splitting mid-character is the right choice to avoid corrupting Unicode. If CJK truncation becomes a real issue, we can add character-based fallback chunking or other chunking approaches as a future enhancement.
```python
        start = 0

        while start < len(text):
            # Calculate end position
            end = (
                start + self.chunk_size
                if start + self.chunk_size < len(text)
                else len(text)
            )

            # Extend to complete word boundary (space or newline)
            while end < len(text) and text[end] not in [" ", "\n"]:
```
Collaborator: Should we extract this into a constant-level parameter and consider additional word boundaries?

Author: This is a good suggestion; I can certainly consider doing this in another PR, would that be okay?
```python
                end += 1

            chunks.append(text[start:end])

            # Move start position with overlap (stop if we've covered all text)
            if end >= len(text):
                break
            start = end - self.chunk_overlap

        logger.debug("Created %d chunks from text", len(chunks))
        return chunks
```
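To make the boundary-extension and overlap behavior concrete, here is a minimal standalone sketch that mirrors the chunking loop above (re-implemented inline rather than imported, so the numbers can be checked without installing the package):

```python
# Standalone re-implementation of the chunking loop above, for illustration.
# Mirrors CharacterBasedTextChunker.chunk; not imported from presidio.
from typing import List


def chunk_text(text: str, chunk_size: int, chunk_overlap: int = 0) -> List[str]:
    chunks = []
    start = 0
    while start < len(text):
        end = min(start + chunk_size, len(text))
        # Extend forward to the next space/newline so words are never split
        while end < len(text) and text[end] not in (" ", "\n"):
            end += 1
        chunks.append(text[start:end])
        if end >= len(text):
            break
        start = end - chunk_overlap
    return chunks


print(chunk_text("alpha beta gamma delta", chunk_size=10, chunk_overlap=3))
# ['alpha beta', 'eta gamma delta']
```

Note that only chunk *ends* are aligned to word boundaries: the overlap window steps back a fixed number of characters, so the second chunk can begin mid-word ("eta" from "beta"). Deduplication downstream is what reconciles entities seen twice in the overlap region.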
File: presidio_analyzer/chunkers/chunking_utils.py (new file, +102 lines)

```python
"""Utility functions for processing text with chunking strategies."""
```
Collaborator: Did we consider either (1) reusing an existing splitter (LangChain / NLTK / spaCy / HF tokenizers) or (2) at least aligning our implementation with their separator hierarchy pattern (paragraph → line → word → char)?

Author: Good point! We considered this but chose a simple custom implementation for several reasons.
```python
from typing import Any, Callable, Dict, List

from presidio_analyzer.chunkers.base_chunker import BaseTextChunker


def predict_with_chunking(
    text: str,
    predict_func: Callable[[str], List[Dict[str, Any]]],
    chunker: BaseTextChunker,
) -> List[Dict[str, Any]]:
    """Process text with automatic chunking for long texts.

    For short text (<= chunker.chunk_size), calls predict_func directly.
    For long text, chunks it and merges predictions with deduplication.

    :param text: Input text to process
    :param predict_func: Function that takes text and returns predictions
    :param chunker: Text chunking strategy (contains chunk_size and chunk_overlap)
    :return: List of predictions with correct offsets
    """
    if len(text) <= chunker.chunk_size:
        return predict_func(text)

    predictions = process_text_in_chunks(
        text=text,
        chunker=chunker,
        process_func=predict_func,
    )
    return deduplicate_overlapping_entities(predictions)


def process_text_in_chunks(
    text: str,
    chunker: BaseTextChunker,
    process_func: Callable[[str], List[Dict[str, Any]]],
) -> List[Dict[str, Any]]:
    """Process text in chunks and adjust entity offsets.

    :param text: Input text to process
    :param chunker: Text chunking strategy
    :param process_func: Function that takes chunk text and returns predictions
    :return: List of predictions with adjusted offsets
    """
    chunks = chunker.chunk(text)
    all_predictions = []
    offset = 0

    for chunk in chunks:
        chunk_predictions = process_func(chunk)

        # Adjust offsets to match original text position
        for pred in chunk_predictions:
            pred["start"] += offset
```
Collaborator: Should we validate that predictions have the required keys, or catch the exception if one chunk fails and log a warning?

Author: I'd say not adding validation or error handling here is the right approach. The type hints define the contract: callers must provide the correct format.
```python
            pred["end"] += offset

        all_predictions.extend(chunk_predictions)
        offset += len(chunk) - chunker.chunk_overlap

    return all_predictions


def deduplicate_overlapping_entities(
    predictions: List[Dict[str, Any]], overlap_threshold: float = 0.5
) -> List[Dict[str, Any]]:
    """Remove duplicate entities from overlapping chunks.

    :param predictions: List of predictions with 'start', 'end', 'label',
        'score'
    :param overlap_threshold: Overlap ratio threshold to consider duplicates
        (default: 0.5)
    :return: Deduplicated list of predictions sorted by position
    """
    if not predictions:
        return predictions

    # Sort by score descending to keep highest scoring entities
    sorted_preds = sorted(predictions, key=lambda p: p["score"], reverse=True)
    unique = []

    for pred in sorted_preds:
```
Collaborator: For n predictions this is O(n²); could we optimize it with a library or a more sophisticated approach?

Author: This is a very good observation! I have discussed this with Sharon H; TL;DR:
```python
        is_duplicate = False
        for kept in unique:
            # Check if same entity type and overlapping positions
            if pred["label"] == kept["label"]:
                overlap_start = max(pred["start"], kept["start"])
                overlap_end = min(pred["end"], kept["end"])

                if overlap_start < overlap_end:
                    # Calculate overlap ratio
                    overlap_len = overlap_end - overlap_start
                    pred_len = pred["end"] - pred["start"]
                    kept_len = kept["end"] - kept["start"]

                    # Check if overlap exceeds threshold
                    if overlap_len / min(pred_len, kept_len) > overlap_threshold:
                        is_duplicate = True
                        break

        if not is_duplicate:
            unique.append(pred)

    # Sort by position for consistent output
    return sorted(unique, key=lambda p: p["start"])
```
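A small standalone sketch of the deduplication rule above, showing how the same entity reported by two overlapping chunks collapses to the higher-scoring copy. The helper mirrors `deduplicate_overlapping_entities` inline (not imported from presidio), and the sample predictions are made up for illustration:

```python
# Standalone mirror of the deduplication logic above, for illustration only.
from typing import Any, Dict, List


def dedupe(preds: List[Dict[str, Any]], threshold: float = 0.5) -> List[Dict[str, Any]]:
    # Highest-scoring predictions win ties in the overlap region
    sorted_preds = sorted(preds, key=lambda p: p["score"], reverse=True)
    unique: List[Dict[str, Any]] = []
    for pred in sorted_preds:
        is_dup = False
        for kept in unique:
            if pred["label"] != kept["label"]:
                continue
            overlap = min(pred["end"], kept["end"]) - max(pred["start"], kept["start"])
            if overlap > 0:
                shorter = min(pred["end"] - pred["start"], kept["end"] - kept["start"])
                if overlap / shorter > threshold:
                    is_dup = True
                    break
        if not is_dup:
            unique.append(pred)
    return sorted(unique, key=lambda p: p["start"])


# The same PERSON span seen from two overlapping chunks, plus one LOCATION.
preds = [
    {"start": 12, "end": 20, "label": "PERSON", "score": 0.80},
    {"start": 12, "end": 20, "label": "PERSON", "score": 0.95},
    {"start": 40, "end": 52, "label": "LOCATION", "score": 0.70},
]
print(dedupe(preds))  # PERSON kept once (score 0.95), LOCATION untouched
```

The overlap ratio is computed against the shorter of the two spans, so a short entity fully contained in a longer one of the same label is also treated as a duplicate.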
Collaborator: Looking at these classes, I'm thinking about applying the Factory pattern here. The idea is that users would define the chunker type via a string in their YAML config (e.g., chunker_type: "character"), and the factory would instantiate the appropriate chunker for them. This would align with how Presidio handles other configs. WDYT?

Author: Good idea for future extensibility! However, I'd suggest keeping the current approach for now:
- Only one chunker exists, so a factory would be premature abstraction.
- The current API is flexible: users can already pass any chunker via the parameter in GLiNERRecognizer.
- YAGNI principle: we can add the factory pattern when we actually have multiple chunker types that need runtime selection.

I have made sure the current design allows future chunker implementations (e.g., sentence-based, semantic-based); we can introduce a factory then. The current design doesn't prevent that future addition. Does that make sense?
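For reference, a hypothetical sketch of the factory the collaborator proposes. All names here (`create_chunker`, the registry, the `"character"` key) are illustrative assumptions, not part of the PR, and the chunker classes are simplified inline stand-ins:

```python
# Hypothetical factory sketch for the discussion above; names are illustrative.
from typing import Dict, List, Type


class BaseTextChunker:  # simplified stand-in for the PR's ABC
    def chunk(self, text: str) -> List[str]:
        raise NotImplementedError


class CharacterBasedTextChunker(BaseTextChunker):  # simplified stand-in
    def __init__(self, chunk_size: int, chunk_overlap: int = 0):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap


# New chunker types would register themselves here
_CHUNKER_REGISTRY: Dict[str, Type[BaseTextChunker]] = {
    "character": CharacterBasedTextChunker,
}


def create_chunker(chunker_type: str, **kwargs) -> BaseTextChunker:
    """Instantiate a chunker from a type name, e.g. from YAML config."""
    try:
        cls = _CHUNKER_REGISTRY[chunker_type]
    except KeyError:
        raise ValueError(f"Unknown chunker_type: {chunker_type!r}")
    return cls(**kwargs)


chunker = create_chunker("character", chunk_size=250, chunk_overlap=50)
```

A registry dict like this keeps the factory open for extension (new entries) without modifying the creation function, which is what would make a later "sentence" or "semantic" chunker a one-line addition.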