"""Text chunking strategies for handling long texts.

Consolidated, conventionally formatted view of the
``presidio_analyzer.chunkers`` package:

* ``base_chunker.py`` - the abstract chunker interface
* ``character_based_text_chunker.py`` - character-window chunker, based on
  the gliner-spacy implementation:
  https://github.com/theirstory/gliner-spacy/blob/main/gliner_spacy/pipeline.py#L60-L96
* ``chunking_utils.py`` - helpers that run a predictor over chunks

``GLiNERRecognizer`` imports ``BaseTextChunker``,
``CharacterBasedTextChunker`` and ``predict_with_chunking`` from here.
"""
import logging
import re
from abc import ABC, abstractmethod
from typing import Any, Callable, Dict, List

logger = logging.getLogger("presidio-analyzer")

__all__ = [
    "BaseTextChunker",
    "CharacterBasedTextChunker",
    "predict_with_chunking",
    "process_text_in_chunks",
    "deduplicate_overlapping_entities",
]


# ---------------------------------------------------------------------------
# base_chunker.py
# ---------------------------------------------------------------------------
class BaseTextChunker(ABC):
    """Abstract base class for text chunking strategies.

    The only required member is :meth:`chunk`. The chunking helpers in this
    package treat ``chunk_size`` and ``chunk_overlap`` as *optional*
    attributes: when present they are used as hints, when absent the helpers
    locate each chunk in the original text instead.
    """

    @abstractmethod
    def chunk(self, text: str) -> List[str]:
        """Split text into chunks.

        Chunks are expected to be returned in document order. Offsets can
        only be mapped back exactly when each chunk is a verbatim slice of
        ``text``.

        :param text: The input text to split
        :return: List of text chunks
        """


# ---------------------------------------------------------------------------
# character_based_text_chunker.py
# ---------------------------------------------------------------------------
# A chunk may only end on one of these characters (space or newline).
_WORD_BOUNDARY = re.compile(r"[ \n]")


class CharacterBasedTextChunker(BaseTextChunker):
    """Character-based text chunker with word boundary preservation."""

    def __init__(self, chunk_size: int, chunk_overlap: int = 0):
        """Initialize the character-based text chunker.

        Note: Chunks may exceed chunk_size to preserve complete words.
        When this occurs, the actual overlap may vary from the specified
        value.

        :param chunk_size: Target maximum characters per chunk (must be > 0)
        :param chunk_overlap: Target characters to overlap between chunks
            (must be >= 0 and < chunk_size)
        :raises ValueError: If either argument is out of range
        """
        if chunk_size <= 0:
            logger.error(
                "Invalid chunk_size: %d. Must be greater than 0.", chunk_size
            )
            raise ValueError("chunk_size must be greater than 0")
        if chunk_overlap < 0 or chunk_overlap >= chunk_size:
            logger.error(
                "Invalid chunk_overlap. Must be non-negative and less than chunk_size"
            )
            raise ValueError(
                "chunk_overlap must be non-negative and less than chunk_size"
            )

        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def chunk(self, text: str) -> List[str]:
        """Split text into overlapping chunks at word boundaries.

        Each chunk starts ``chunk_overlap`` characters before the end of the
        previous one and is extended to the next space or newline so that no
        word is cut. Chunks may therefore exceed ``chunk_size``; for text
        without spaces or newlines (e.g., CJK languages) a chunk extends to
        the end of the text.

        :param text: The input text to chunk
        :return: List of text chunks with overlap (empty list for empty text)
        """
        if not text:
            logger.debug("Empty text provided, returning empty chunk list")
            return []

        text_len = len(text)
        logger.debug(
            "Chunking text: length=%d, chunk_size=%d, overlap=%d",
            text_len,
            self.chunk_size,
            self.chunk_overlap,
        )

        chunks = []
        start = 0
        while start < text_len:
            # Nominal window end, then push it forward to the first boundary
            # character at or after that position (or to the end of text).
            end = min(start + self.chunk_size, text_len)
            boundary = _WORD_BOUNDARY.search(text, end)
            end = boundary.start() if boundary else text_len

            chunks.append(text[start:end])

            if end >= text_len:
                break
            # chunk_overlap < chunk_size (validated in __init__), so start
            # always moves forward.
            start = end - self.chunk_overlap

        logger.debug("Created %d chunks from text", len(chunks))
        return chunks


# ---------------------------------------------------------------------------
# chunking_utils.py
# ---------------------------------------------------------------------------
def _locate_chunks(text: str, chunks: List[str], overlap: int) -> List[int]:
    """Return the start offset in ``text`` of every chunk.

    The position implied by fixed-stride arithmetic
    (``previous_start + len(previous_chunk) - overlap``) is tried first, so
    chunkers that follow that scheme get exactly the arithmetic result. If
    the chunk is not there, it is searched for after the previous chunk's
    start. If it is not a verbatim slice at all, the arithmetic position is
    used as a best effort.
    """
    starts = []
    prev_start = -1
    expected = 0
    for piece in chunks:
        floor = prev_start + 1  # chunks are in order: never look backwards
        start = text.find(piece, max(expected, floor))
        if start < 0:
            start = text.find(piece, floor)
        if start < 0:
            start = max(expected, 0)
        starts.append(start)
        prev_start = start
        expected = start + len(piece) - overlap
    return starts


def predict_with_chunking(
    text: str,
    predict_func: Callable[[str], List[Dict[str, Any]]],
    chunker: BaseTextChunker,
) -> List[Dict[str, Any]]:
    """Process text with automatic chunking for long texts.

    If the chunker exposes ``chunk_size`` and the text is not longer than it,
    ``predict_func`` is called directly on the whole text and its result is
    returned unchanged. Otherwise the text is chunked, offsets are mapped
    back to the original text and duplicates from overlapping chunks are
    removed.

    :param text: Input text to process
    :param predict_func: Function that takes text and returns predictions
    :param chunker: Text chunking strategy
    :return: List of predictions with offsets relative to ``text``
    """
    chunk_size = getattr(chunker, "chunk_size", None)
    if chunk_size is not None and len(text) <= chunk_size:
        return predict_func(text)

    predictions = process_text_in_chunks(
        text=text,
        chunker=chunker,
        process_func=predict_func,
    )
    return deduplicate_overlapping_entities(predictions)


def process_text_in_chunks(
    text: str,
    chunker: BaseTextChunker,
    process_func: Callable[[str], List[Dict[str, Any]]],
) -> List[Dict[str, Any]]:
    """Process text in chunks and adjust entity offsets.

    The dicts returned by ``process_func`` are not modified; shifted copies
    are returned, so a predictor that hands out shared or cached dicts is
    safe.

    :param text: Input text to process
    :param chunker: Text chunking strategy
    :param process_func: Function that takes chunk text and returns
        predictions with chunk-relative 'start' and 'end'
    :return: List of predictions with offsets relative to ``text``
    """
    chunks = list(chunker.chunk(text))
    # chunk_overlap is not part of the BaseTextChunker interface; it is only
    # a hint for where the next chunk is expected to start.
    overlap = getattr(chunker, "chunk_overlap", 0)
    starts = _locate_chunks(text, chunks, overlap)

    all_predictions = []
    for offset, piece in zip(starts, chunks):
        for pred in process_func(piece):
            all_predictions.append(
                {
                    **pred,
                    "start": pred["start"] + offset,
                    "end": pred["end"] + offset,
                }
            )
    return all_predictions


def _is_duplicate(
    pred: Dict[str, Any], kept: Dict[str, Any], overlap_threshold: float
) -> bool:
    """Tell whether ``pred`` repeats ``kept``.

    Two predictions are duplicates when they share a label and their overlap,
    relative to the shorter of the two spans, exceeds the threshold.
    """
    if pred["label"] != kept["label"]:
        return False
    overlap_len = min(pred["end"], kept["end"]) - max(pred["start"], kept["start"])
    if overlap_len <= 0:
        return False
    # A positive overlap implies both spans have positive length.
    shorter = min(pred["end"] - pred["start"], kept["end"] - kept["start"])
    return overlap_len / shorter > overlap_threshold


def deduplicate_overlapping_entities(
    predictions: List[Dict[str, Any]], overlap_threshold: float = 0.5
) -> List[Dict[str, Any]]:
    """Remove duplicate entities from overlapping chunks.

    Predictions are visited from highest to lowest score, so of any group of
    duplicates the highest scoring one is kept.

    :param predictions: List of predictions with 'start', 'end', 'label',
        'score'
    :param overlap_threshold: Overlap ratio above which two same-label
        predictions are considered duplicates (default: 0.5)
    :return: New, deduplicated list of predictions sorted by start position
    """
    unique = []
    for pred in sorted(predictions, key=lambda p: p["score"], reverse=True):
        if not any(_is_duplicate(pred, kept, overlap_threshold) for kept in unique):
            unique.append(pred)

    # Sort by position for consistent output
    return sorted(unique, key=lambda p: p["start"])
-35,6 +40,9 @@ def __init__( multi_label: bool = False, threshold: float = 0.30, map_location: str = "cpu", + chunk_size: int = 250, + chunk_overlap: int = 50, + text_chunker: Optional[BaseTextChunker] = None, ): """GLiNER model based entity recognizer. @@ -54,6 +62,12 @@ def __init__( :param threshold: The threshold for the model's output (see GLiNER's documentation) :param map_location: The device to use for the model + :param chunk_size: Maximum character length for text chunks + (default: 250) + :param chunk_overlap: Characters to overlap between chunks + (default: 50) + :param text_chunker: Custom text chunking strategy. If None, uses + CharacterBasedTextChunker """ @@ -86,6 +100,15 @@ def __init__( self.flat_ner = flat_ner self.multi_label = multi_label self.threshold = threshold + self.chunk_size = chunk_size + self.chunk_overlap = chunk_overlap + + # Use provided chunker or default to CharacterBasedTextChunker + self.text_chunker = ( + text_chunker + if text_chunker is not None + else CharacterBasedTextChunker(chunk_size, chunk_overlap) + ) self.gliner = None @@ -121,13 +144,22 @@ def analyze( # combine the input labels as this model allows for ad-hoc labels labels = self.__create_input_labels(entities) - predictions = self.gliner.predict_entities( + # Process text with automatic chunking + def predict_func(text: str) -> List[Dict[str, Any]]: + return self.gliner.predict_entities( + text=text, + labels=labels, + flat_ner=self.flat_ner, + threshold=self.threshold, + multi_label=self.multi_label, + ) + + predictions = predict_with_chunking( text=text, - labels=labels, - flat_ner=self.flat_ner, - threshold=self.threshold, - multi_label=self.multi_label, + predict_func=predict_func, + chunker=self.text_chunker, ) + recognizer_results = [] for prediction in predictions: presidio_entity = self.model_to_presidio_entity_mapping.get( diff --git a/presidio-analyzer/tests/test_character_based_text_chunker.py 
"""Tests for the text chunkers, the chunking utilities and GLiNER chunking.

Consolidated, conventionally formatted view of:

* ``tests/test_character_based_text_chunker.py``
* ``tests/test_chunking_utils.py``
* the chunking tests appended to ``tests/test_gliner_recognizer.py``
"""
import sys

import pytest

from presidio_analyzer.chunkers import (
    CharacterBasedTextChunker,
    deduplicate_overlapping_entities,
    process_text_in_chunks,
)


# ---------------------------------------------------------------------------
# tests/test_character_based_text_chunker.py
# ---------------------------------------------------------------------------
class TestCharacterBasedTextChunker:
    """Test CharacterBasedTextChunker implementation."""

    def test_empty_text(self):
        """Test chunking empty text."""
        chunker = CharacterBasedTextChunker(chunk_size=100, chunk_overlap=20)
        assert chunker.chunk("") == []

    def test_short_text(self):
        """Test text shorter than chunk_size."""
        chunker = CharacterBasedTextChunker(chunk_size=100, chunk_overlap=20)
        text = "This is a short text."
        assert chunker.chunk(text) == [text]

    def test_long_text_without_overlap(self):
        """Test long text with no overlap."""
        chunker = CharacterBasedTextChunker(chunk_size=3, chunk_overlap=0)
        text = "1 2 3 4"  # 7 chars
        result = chunker.chunk(text)
        # The second window ends inside "4" and is extended to end of text.
        assert result == ["1 2", " 3 4"]

    def test_long_text_with_overlap(self):
        """Test long text with overlap."""
        chunker = CharacterBasedTextChunker(chunk_size=5, chunk_overlap=2)
        text = "1 3 5 7 9"  # 9 chars: positions 0-8
        result = chunker.chunk(text)

        assert result == ["1 3 5", " 5 7 9"]
        # The last 2 chars of chunk 1 are the first 2 chars of chunk 2.
        assert result[0].endswith(" 5") and result[1].startswith(" 5")

    def test_word_boundary_preservation(self):
        """Test that chunks extend to word boundaries."""
        chunker = CharacterBasedTextChunker(chunk_size=8, chunk_overlap=2)
        text = "one two three four"
        result = chunker.chunk(text)
        # Position 8 is inside "three", so the chunk grows to 13 chars.
        assert result[0] == "one two three"
        assert len(result) == 2

    def test_exact_chunk_size(self):
        """Test text that's exactly chunk_size."""
        chunker = CharacterBasedTextChunker(chunk_size=5, chunk_overlap=2)
        text = "1 2 3"
        assert chunker.chunk(text) == [text]

    @pytest.mark.parametrize("chunk_size", [0, -10])
    def test_validation_invalid_chunk_size(self, chunk_size):
        """Test that chunk_size must be > 0."""
        with pytest.raises(ValueError, match="chunk_size must be greater than 0"):
            CharacterBasedTextChunker(chunk_size=chunk_size, chunk_overlap=5)

    @pytest.mark.parametrize(
        "chunk_size, chunk_overlap",
        [
            (100, -5),  # negative overlap
            (100, 100),  # overlap equal to chunk_size
            (50, 75),  # overlap greater than chunk_size
        ],
    )
    def test_validation_invalid_overlap(self, chunk_size, chunk_overlap):
        """Test that overlap must be >= 0 and < chunk_size."""
        with pytest.raises(
            ValueError,
            match="chunk_overlap must be non-negative and less than chunk_size",
        ):
            CharacterBasedTextChunker(
                chunk_size=chunk_size, chunk_overlap=chunk_overlap
            )

    def test_multiple_chunks_coverage(self):
        """Test that chunks cover entire text."""
        chunker = CharacterBasedTextChunker(chunk_size=5, chunk_overlap=1)
        text = "1 2 3 4 5 6"  # 11 chars: positions 0-10
        result = chunker.chunk(text)
        assert result == ["1 2 3", "3 4 5", "5 6"]
        # Every digit appears (overlap duplicates some in the joined string)
        all_text = "".join(result)
        for digit in ["1", "2", "3", "4", "5", "6"]:
            assert digit in all_text

    def test_newline_handling(self):
        """Test that newlines are preserved and treated as word boundaries."""
        chunker = CharacterBasedTextChunker(chunk_size=10, chunk_overlap=0)
        text = "line1\nline2\nline3"  # 17 chars
        result = chunker.chunk(text)
        # Chunk 1: "line1\nline2" (11 chars, extended to the newline at
        # position 11). Chunk 2: "\nline3" (remaining 6 chars).
        assert result == ["line1\nline2", "\nline3"]
        # Without overlap the chunks concatenate back to the input
        assert "".join(result) == text

    def test_default_parameters(self):
        """Test chunker with default overlap (0)."""
        chunker = CharacterBasedTextChunker(chunk_size=5)
        text = "1 2 3 4"  # 7 chars
        result = chunker.chunk(text)
        # Chunk 1: "1 2 3" (position 5 is already a space, no extension)
        # Chunk 2: starts at position 5: " 4"
        assert result == ["1 2 3", " 4"]

    def test_very_long_text(self):
        """Test chunking very long text."""
        chunker = CharacterBasedTextChunker(chunk_size=10, chunk_overlap=2)
        text = " ".join([str(i) for i in range(50)])  # "0 1 2 ... 49", 139 chars
        result = chunker.chunk(text)
        assert len(result) == 16
        assert result[0] == "0 1 2 3 4 5"
        assert result[-1] == "48 49"
        # Verify all numbers appear in chunks
        all_text = " ".join(result)
        for i in range(50):
            assert str(i) in all_text

    def test_real_world_example(self):
        """Test with real-world PII detection scenario."""
        chunker = CharacterBasedTextChunker(chunk_size=250, chunk_overlap=50)
        # Built from explicit pieces so the length does not depend on how
        # this file is indented.
        text = " ".join(
            [
                "John Smith's credit card number is 4532-1234-5678-9010.",
                "His social security number is 123-45-6789 and his email is "
                "john.smith@example.com.",
                "He lives at 123 Main Street, Anytown, ST 12345.",
                "For contact, his phone number is (555) 123-4567.",
                "Please keep this information confidential.",
            ]
        )
        assert len(text) > chunker.chunk_size
        result = chunker.chunk(text)
        assert len(result) == 2
        # All PII should be present across chunks
        all_text = " ".join(result)
        assert "4532-1234-5678-9010" in all_text
        assert "123-45-6789" in all_text
        assert "john.smith@example.com" in all_text
        assert "123-4567" in all_text

    def test_cjk_text_without_spaces(self):
        """Test CJK text without spaces extends to end of text."""
        chunker = CharacterBasedTextChunker(chunk_size=5, chunk_overlap=1)
        text = "你好世界测试"  # 6 Chinese characters, no spaces
        result = chunker.chunk(text)
        # No boundary character exists, so the first chunk runs to the end.
        assert result == [text]

    def test_unicode_emoji_handling(self):
        """Test Unicode characters and emojis are handled correctly."""
        chunker = CharacterBasedTextChunker(chunk_size=10, chunk_overlap=2)
        text = "Hello 👋 World 🌍 Test"
        result = chunker.chunk(text)
        all_text = "".join(result)
        assert "👋" in all_text
        assert "🌍" in all_text
        # Overlap may repeat fragments, but every word is present intact
        assert "Hello" in all_text
        assert "World" in all_text
        assert "Test" in all_text


# ---------------------------------------------------------------------------
# tests/test_chunking_utils.py
# ---------------------------------------------------------------------------
class TestProcessTextInChunks:
    """Test process_text_in_chunks utility function."""

    def test_short_text_no_chunking(self):
        """Test text shorter than chunk size yields a single chunk."""
        chunker = CharacterBasedTextChunker(chunk_size=100, chunk_overlap=20)
        text = "Short text"

        def predict_func(chunk):
            return [{"start": 0, "end": 5, "label": "PERSON", "score": 0.9}]

        result = process_text_in_chunks(text, chunker, predict_func)

        assert len(result) == 1
        assert result[0]["start"] == 0
        assert result[0]["end"] == 5

    def test_long_text_with_offset_adjustment(self):
        """Test offset adjustment for chunked text."""
        chunker = CharacterBasedTextChunker(chunk_size=20, chunk_overlap=5)
        text = "John Smith lives in New York City with Jane Doe"

        # Mock predict function that finds entities in each chunk
        def predict_func(chunk):
            if "John" in chunk:
                return [{"start": 0, "end": 10, "label": "PERSON", "score": 0.9}]
            elif "Jane" in chunk:
                idx = chunk.index("Jane")
                return [
                    {"start": idx, "end": idx + 8, "label": "PERSON", "score": 0.85}
                ]
            return []

        result = process_text_in_chunks(text, chunker, predict_func)

        # First entity is in the first chunk: offsets unchanged
        assert result[0]["start"] == 0
        assert result[0]["end"] == 10
        # Second entity comes from a later chunk: offsets mapped back
        assert text[result[1]["start"]:result[1]["end"]] == "Jane Doe"

    def test_empty_predictions(self):
        """Test handling of no predictions."""
        chunker = CharacterBasedTextChunker(chunk_size=50, chunk_overlap=10)
        text = "Some text without entities"

        result = process_text_in_chunks(text, chunker, lambda chunk: [])

        assert result == []


class TestDeduplicateOverlappingEntities:
    """Test deduplicate_overlapping_entities utility function."""

    def test_no_duplicates(self):
        """Test predictions with no overlap."""
        predictions = [
            {"start": 0, "end": 10, "label": "PERSON", "score": 0.9},
            {"start": 20, "end": 30, "label": "PERSON", "score": 0.85},
        ]

        result = deduplicate_overlapping_entities(predictions)

        assert len(result) == 2
        assert result[0]["start"] == 0
        assert result[1]["start"] == 20

    def test_exact_duplicates_keeps_highest_score(self):
        """Test exact duplicates keeps highest scoring entity."""
        predictions = [
            {"start": 0, "end": 10, "label": "PERSON", "score": 0.9},
            {"start": 0, "end": 10, "label": "PERSON", "score": 0.85},
        ]

        result = deduplicate_overlapping_entities(predictions)

        assert len(result) == 1
        assert result[0]["score"] == 0.9

    def test_overlapping_duplicates(self):
        """Test overlapping entities are deduplicated."""
        predictions = [
            {"start": 0, "end": 10, "label": "PERSON", "score": 0.9},
            {"start": 3, "end": 13, "label": "PERSON", "score": 0.85},
        ]

        result = deduplicate_overlapping_entities(predictions)

        # Overlap is 7 chars, ratio = 0.7 > 0.5 threshold
        assert len(result) == 1
        assert result[0]["score"] == 0.9

    def test_different_labels_not_deduplicated(self):
        """Test overlapping entities with different labels are kept."""
        predictions = [
            {"start": 0, "end": 10, "label": "PERSON", "score": 0.9},
            {"start": 5, "end": 15, "label": "LOCATION", "score": 0.85},
        ]

        result = deduplicate_overlapping_entities(predictions)

        assert len(result) == 2

    def test_low_overlap_not_deduplicated(self):
        """Test entities with low overlap are not deduplicated."""
        predictions = [
            {"start": 0, "end": 10, "label": "PERSON", "score": 0.9},
            {"start": 9, "end": 20, "label": "PERSON", "score": 0.85},
        ]

        result = deduplicate_overlapping_entities(predictions, overlap_threshold=0.6)

        # Overlap is only 1 char out of 10, ratio = 0.1, below threshold
        assert len(result) == 2

    def test_empty_predictions(self):
        """Test empty predictions list."""
        assert deduplicate_overlapping_entities([]) == []

    def test_sorted_by_position(self):
        """Test results are sorted by start position."""
        predictions = [
            {"start": 20, "end": 30, "label": "PERSON", "score": 0.9},
            {"start": 0, "end": 10, "label": "PERSON", "score": 0.85},
            {"start": 40, "end": 50, "label": "PERSON", "score": 0.95},
        ]

        result = deduplicate_overlapping_entities(predictions)

        assert [p["start"] for p in result] == [0, 20, 40]

    def test_custom_overlap_threshold(self):
        """Test custom overlap threshold."""
        predictions = [
            {"start": 0, "end": 10, "label": "PERSON", "score": 0.9},
            {"start": 5, "end": 15, "label": "PERSON", "score": 0.85},
        ]

        # With 0.3 threshold, should deduplicate (overlap ratio = 0.5)
        result = deduplicate_overlapping_entities(predictions, overlap_threshold=0.3)
        assert len(result) == 1

        # With 0.7 threshold, should keep both (overlap ratio = 0.5 < 0.7)
        result = deduplicate_overlapping_entities(predictions, overlap_threshold=0.7)
        assert len(result) == 2


# ---------------------------------------------------------------------------
# tests/test_gliner_recognizer.py (appended chunking tests)
#
# NOTE(review): GLiNERRecognizer and the mock_gliner fixture are provided by
# the part of test_gliner_recognizer.py that precedes these tests - confirm
# they stay in scope wherever these functions live.
# ---------------------------------------------------------------------------
def test_gliner_handles_long_text_with_chunking(mock_gliner):
    """Test that GLiNER chunks long text and adjusts entity offsets correctly."""
    if sys.version_info < (3, 10):
        pytest.skip("gliner requires Python >= 3.10")

    text = "John Smith lives here. " + ("x " * 120) + "Jane Doe works there."

    # Mock returns entities with positions relative to each chunk
    def mock_predict_entities(text, labels, flat_ner, threshold, multi_label):
        entities = []
        if "John Smith" in text:
            start = text.find("John Smith")
            entities.append(
                {"label": "person", "start": start, "end": start + 10, "score": 0.95}
            )
        if "Jane Doe" in text:
            start = text.find("Jane Doe")
            entities.append(
                {"label": "person", "start": start, "end": start + 8, "score": 0.93}
            )
        return entities

    mock_gliner.predict_entities.side_effect = mock_predict_entities

    gliner_recognizer = GLiNERRecognizer(
        entity_mapping={"person": "PERSON"},
        chunk_size=250,
    )
    gliner_recognizer.gliner = mock_gliner

    results = gliner_recognizer.analyze(text, ["PERSON"])

    # Verify chunking occurred (predict_entities called once per chunk)
    assert (
        mock_gliner.predict_entities.call_count == 2
    ), f"Expected 2 chunks, got {mock_gliner.predict_entities.call_count}"

    assert len(results) == 2, f"Expected 2 entities, found {len(results)}"

    # Verify both entities have correct offsets in original text
    assert text[results[0].start:results[0].end] == "John Smith"
    assert results[0].entity_type == "PERSON"
    assert results[0].score == 0.95

    assert text[results[1].start:results[1].end] == "Jane Doe"
    assert results[1].entity_type == "PERSON"
    assert results[1].score == 0.93


def test_gliner_detects_entity_split_across_chunk_boundary(mock_gliner):
    """Test that overlap catches entities split at chunk boundaries."""
    if sys.version_info < (3, 10):
        pytest.skip("gliner requires Python >= 3.10")

    # "Amanda" occupies 244-249 and is followed by a space at 250, so the
    # first chunk (0-250) ends between "Amanda" and "Williams". The second
    # chunk starts 50 chars earlier (200) and contains the full name.
    text = ("x " * 122) + "Amanda Williams" + (" x" * 100)

    seen_whole_entity = []

    def mock_predict_entities(text, labels, flat_ner, threshold, multi_label):
        entities = []
        seen_whole_entity.append("Amanda Williams" in text)
        if "Amanda Williams" in text:
            start = text.find("Amanda Williams")
            entities.append(
                {"label": "person", "start": start, "end": start + 15, "score": 0.92}
            )
        return entities

    mock_gliner.predict_entities.side_effect = mock_predict_entities

    gliner_recognizer = GLiNERRecognizer(
        entity_mapping={"person": "PERSON"},
        chunk_size=250,
        chunk_overlap=50,
    )
    gliner_recognizer.gliner = mock_gliner

    results = gliner_recognizer.analyze(text, ["PERSON"])

    # The first chunk must not contain the whole entity, otherwise this test
    # would not exercise the overlap at all.
    assert seen_whole_entity[0] is False
    assert len(results) == 1, f"Expected 1 entity, found {len(results)}"
    assert text[results[0].start:results[0].end] == "Amanda Williams"
    assert results[0].entity_type == "PERSON"


def test_gliner_deduplicates_entities_in_overlap_region(mock_gliner):
    """Test that duplicate entities from overlapping chunks are removed."""
    if sys.version_info < (3, 10):
        pytest.skip("gliner requires Python >= 3.10")

    # The first chunk covers 0-251 and the second starts at 201, so the
    # entity at 210-219 lies in the region both chunks share.
    text = ("x " * 105) + "Dr. Smith" + (" x" * 100)

    call_count = 0
    chunks_reporting_entity = 0

    def mock_predict_entities(text, labels, flat_ner, threshold, multi_label):
        nonlocal call_count, chunks_reporting_entity
        call_count += 1
        entities = []
        if "Dr. Smith" in text:
            chunks_reporting_entity += 1
            start = text.find("Dr. Smith")
            # Return slightly different scores to test that highest is kept
            score = 0.95 if call_count == 1 else 0.90
            entities.append(
                {"label": "person", "start": start, "end": start + 9, "score": score}
            )
        return entities

    mock_gliner.predict_entities.side_effect = mock_predict_entities

    gliner_recognizer = GLiNERRecognizer(
        entity_mapping={"person": "PERSON"},
        chunk_size=250,
        chunk_overlap=50,
    )
    gliner_recognizer.gliner = mock_gliner

    results = gliner_recognizer.analyze(text, ["PERSON"])

    # Both chunks were processed and both reported the entity
    assert mock_gliner.predict_entities.call_count == 2
    assert chunks_reporting_entity == 2

    # Only 1 result after deduplication (not 2)
    assert len(results) == 1, f"Expected 1 deduplicated entity, found {len(results)}"

    # Kept the one with highest score (0.95 from first chunk)
    assert results[0].score == 0.95
    assert text[results[0].start:results[0].end] == "Dr. Smith"