Skip to content
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
159315e
initial code for pydantic based validation for yaml files
omri374 Oct 19, 2025
cfc7c1b
Validation layer for YAML based configuration - cont'd
omri374 Nov 11, 2025
f7b54d4
Merge branch 'main' into omri/pydantic_validation
omri374 Nov 11, 2025
0fbd010
linting
omri374 Nov 11, 2025
c4841b5
Update presidio-analyzer/presidio_analyzer/input_validation/yaml_reco…
omri374 Nov 11, 2025
3939fc9
Update presidio-analyzer/presidio_analyzer/input_validation/yaml_reco…
omri374 Nov 11, 2025
d7cb69b
Update presidio-analyzer/tests/test_configuration_validator.py
omri374 Nov 11, 2025
3b61469
Update presidio-analyzer/presidio_analyzer/input_validation/schemas.py
omri374 Nov 11, 2025
251cefc
Update presidio-analyzer/presidio_analyzer/input_validation/schemas.py
omri374 Nov 11, 2025
1420bd5
Update presidio-analyzer/presidio_analyzer/input_validation/schemas.py
omri374 Nov 11, 2025
c677b79
Update presidio-analyzer/presidio_analyzer/input_validation/schemas.py
omri374 Nov 11, 2025
421e47d
Update presidio-analyzer/tests/test_recognizer_registry_provider.py
omri374 Nov 11, 2025
2901b13
Update presidio-analyzer/presidio_analyzer/input_validation/schemas.py
omri374 Nov 11, 2025
8b84370
Update presidio-analyzer/presidio_analyzer/input_validation/schemas.py
omri374 Nov 11, 2025
1cf7678
Merge branch 'main' into omri/pydantic_validation
omri374 Nov 17, 2025
108e3d0
ruff on the entire analyzer codebase
omri374 Nov 19, 2025
b492b79
Merge branch 'main' into omri/pydantic_validation
omri374 Nov 19, 2025
eb9f7c7
Merge remote-tracking branch 'origin/omri/pydantic_validation' into o…
omri374 Nov 19, 2025
bfd067b
Update presidio-analyzer/presidio_analyzer/input_validation/schemas.py
omri374 Nov 19, 2025
f2e7fd9
ruff and copilot review fixes
omri374 Nov 19, 2025
18df02e
merge
omri374 Nov 19, 2025
41328cc
Delete presidio-analyzer/test-output.xml
omri374 Nov 19, 2025
bd2d045
fixed bad test
omri374 Nov 19, 2025
11a8169
ruff
omri374 Nov 19, 2025
8054750
removed wrong test which assumes defaults
omri374 Nov 20, 2025
86baa12
Clean up comments in recognizers_loader_utils.py
omri374 Nov 20, 2025
24d491f
Merge branch 'main' into omri/pydantic_validation
omri374 Nov 30, 2025
08c15ac
updates to PR following review
omri374 Nov 30, 2025
a82a9b0
Merge branch 'main' into omri/pydantic_validation
omri374 Dec 1, 2025
4dd0675
added more tests
omri374 Dec 1, 2025
3bf8e7b
removed bandit from defender-for-devops
omri374 Dec 1, 2025
8850b97
merge with main
omri374 Dec 1, 2025
8f98b64
more unit tests
omri374 Dec 4, 2025
4ebee59
more unit tests
omri374 Dec 4, 2025
c17ee63
Update presidio-analyzer/presidio_analyzer/recognizer_registry/recogn…
omri374 Dec 4, 2025
92728a7
merge
omri374 Dec 4, 2025
3d06f1c
Merge branch 'main' into omri/pydantic_validation
dorlugasigal Dec 4, 2025
ea593d6
updates based on PR comments
omri374 Dec 7, 2025
fea6c06
Merge remote-tracking branch 'origin/omri/pydantic_validation' into o…
omri374 Dec 7, 2025
18084f8
updates based on PR comments
omri374 Dec 7, 2025
43ce284
Update presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_prov…
omri374 Dec 7, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions presidio-analyzer/presidio_analyzer/analyzer_engine_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def __init__(
def get_configuration(
self, conf_file: Optional[Union[Path, str]]
) -> Union[Dict[str, Any]]:
"""Retrieve the analyzer engine configuration from the provided file."""
"""Retrieve analyzer engine configuration from the provided file."""

if not conf_file:
default_conf_file = self._get_full_conf_path()
Expand All @@ -59,10 +59,18 @@ def get_configuration(
with open(self._get_full_conf_path()) as file:
configuration = yaml.safe_load(file)
except Exception:
print(f"Failed to parse file {conf_file}, resorting to default")
logger.warning(
f"Failed to parse file {conf_file}, resorting to default"
)
with open(self._get_full_conf_path()) as file:
configuration = yaml.safe_load(file)

# Validate configuration using Pydantic-based ConfigurationValidator
from presidio_analyzer.input_validation import ConfigurationValidator

ConfigurationValidator.validate_analyzer_configuration(configuration)
logger.debug("Analyzer configuration validation passed")

return configuration

def create_engine(self) -> AnalyzerEngine:
Expand Down
5 changes: 3 additions & 2 deletions presidio-analyzer/presidio_analyzer/analyzer_request.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,5 +37,6 @@ def __init__(self, req_data: Dict):
self.context = req_data.get("context")
self.allow_list = req_data.get("allow_list")
self.allow_list_match = req_data.get("allow_list_match", "exact")
self.regex_flags = req_data.get("regex_flags",
re.DOTALL | re.MULTILINE | re.IGNORECASE)
self.regex_flags = req_data.get(
"regex_flags", re.DOTALL | re.MULTILINE | re.IGNORECASE
)
19 changes: 19 additions & 0 deletions presidio-analyzer/presidio_analyzer/input_validation/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
"""Configuration validation module for Presidio."""

from .schemas import ConfigurationValidator
from .yaml_recognizer_models import (
BaseRecognizerConfig,
CustomRecognizerConfig,
LanguageContextConfig,
PredefinedRecognizerConfig,
RecognizerRegistryConfig,
)

# Names re-exported as the public API of this package: the validator class
# imported from .schemas and the config models from .yaml_recognizer_models.
__all__ = [
"ConfigurationValidator",
"BaseRecognizerConfig",
"CustomRecognizerConfig",
"LanguageContextConfig",
"PredefinedRecognizerConfig",
"RecognizerRegistryConfig",
]
143 changes: 143 additions & 0 deletions presidio-analyzer/presidio_analyzer/input_validation/schemas.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
import re
from pathlib import Path
from typing import Any, Dict, List, Union

from pydantic import ValidationError

from .yaml_recognizer_models import RecognizerRegistryConfig


class ConfigurationValidator:
    """Class for validating configurations using Pydantic-enabled classes.

    Every validator is a static method that raises ``ValueError`` when the
    input is invalid and returns the validated value otherwise.
    """

    # Two lowercase letters, optionally followed by '-' and two uppercase
    # letters (e.g. 'en' or 'en-US'). Always applied with fullmatch() so a
    # trailing newline cannot slip through the way it does with match() + '$'.
    _LANGUAGE_CODE_PATTERN = re.compile(r"[a-z]{2}(-[A-Z]{2})?")

    @staticmethod
    def validate_language_codes(languages: List[str]) -> List[str]:
        """Validate language codes format.

        :param languages: List of languages to validate.
        :return: The same ``languages`` object, unchanged.
        :raises ValueError: If ``languages`` is not a collection of strings
            or any entry does not look like 'en' or 'en-US'.
        """
        # A bare string would be iterated character by character and produce
        # a misleading per-character error; None would raise TypeError.
        if isinstance(languages, (str, bytes)) or not hasattr(languages, "__iter__"):
            raise ValueError(
                f"Languages must be a list of language codes, got: {languages!r}"
            )

        pattern = ConfigurationValidator._LANGUAGE_CODE_PATTERN
        for lang in languages:
            if not isinstance(lang, str) or not pattern.fullmatch(lang):
                raise ValueError(
                    f"Invalid language code format: {lang}. "
                    f"Expected format: 'en' or 'en-US'"
                )
        return languages

    @staticmethod
    def validate_file_path(file_path: Union[str, Path]) -> Path:
        """Validate that a file path exists and points to a file.

        Readability of the file is not checked here.

        :param file_path: Path to validate.
        :return: ``file_path`` converted to a ``Path``.
        :raises ValueError: If the path does not exist or is not a file.
        """
        path = Path(file_path)
        if not path.exists():
            raise ValueError(f"Configuration file does not exist: {path}")
        if not path.is_file():
            raise ValueError(f"Path is not a file: {path}")
        return path

    @staticmethod
    def validate_score_threshold(threshold: float) -> float:
        """Validate score threshold is within valid range.

        :param threshold: score threshold to validate.
        :return: The same ``threshold`` value, unchanged.
        :raises ValueError: If ``threshold`` is not a number (booleans are
            rejected too) or lies outside the closed interval [0.0, 1.0].
        """
        # bool is a subclass of int, so it has to be excluded explicitly;
        # non-numeric values would otherwise fail the comparison with a
        # TypeError instead of the ValueError callers expect.
        if isinstance(threshold, bool) or not isinstance(threshold, (int, float)):
            raise ValueError(f"Score threshold must be a number, got: {threshold!r}")
        # NaN fails both comparisons and is therefore rejected as well.
        if not 0.0 <= threshold <= 1.0:
            raise ValueError(
                f"Score threshold must be between 0.0 and 1.0, got: {threshold}"
            )
        return threshold

    @staticmethod
    def validate_nlp_configuration(config: Dict[str, Any]) -> Dict[str, Any]:
        """Validate NLP configuration structure.

        :param config: NLP Configuration to validate.
        :return: The same ``config`` object, unchanged.
        :raises ValueError: If ``config`` is not a dictionary, lacks
            'nlp_engine_name' or 'models', or 'models' is not a non-empty
            list of dictionaries each holding 'lang_code' and 'model_name'.
        """
        if not isinstance(config, dict):
            raise ValueError("NLP configuration must be a dictionary")

        required_fields = ["nlp_engine_name", "models"]
        missing_fields = [field for field in required_fields if field not in config]
        if missing_fields:
            raise ValueError(
                f"NLP configuration missing required fields: {missing_fields}"
            )

        # Validate models structure
        models = config["models"]
        if not isinstance(models, list) or not models:
            raise ValueError("Models must be a non-empty list")

        for model in models:
            if not isinstance(model, dict):
                raise ValueError("Each model must be a dictionary")
            if "lang_code" not in model or "model_name" not in model:
                raise ValueError("Each model must have 'lang_code' and 'model_name'")

        return config

    @staticmethod
    def validate_recognizer_registry_configuration(
        config: Dict[str, Any],
    ) -> Dict[str, Any]:
        """Validate recognizer registry configuration using Pydantic models.

        :param config: Recognizer registry configuration to validate.
        :return: The validated configuration dumped from the Pydantic model,
            including default values; or ``config`` itself when the basic
            fallback validation is used.
        :raises ValueError: If ``config`` is not a dictionary or fails
            validation.
        """
        # Unpacking a non-mapping with ** would raise TypeError; report it
        # the same way the other validators do.
        if not isinstance(config, dict):
            raise ValueError("Recognizer registry configuration must be a dictionary")

        try:
            # Use Pydantic model for validation
            validated_config = RecognizerRegistryConfig(**config)
            # Use model_dump() without exclude_unset to include default values
            return validated_config.model_dump(exclude_unset=False)
        except ValidationError as e:
            # Chain explicitly so the Pydantic error stays attached as the cause.
            raise ValueError(f"Invalid recognizer registry configuration: {e}") from e
        except ImportError:
            # NOTE(review): this branch is a leftover from an earlier
            # optimization attempt. It was intended to catch cases where
            # validation fails because a recognizer could not be found by the
            # import mechanism. It was questioned in review; it is kept here
            # so behavior is unchanged - consider removing it.
            # Fallback to basic validation if models not available
            return ConfigurationValidator._validate_recognizer_registry_basic(config)

    @staticmethod
    def _validate_recognizer_registry_basic(config: Dict[str, Any]) -> Dict[str, Any]:
        """Validate recognizer registry config without the Pydantic models.

        Checks only that ``config`` is a dictionary, that
        'supported_languages' (if present) holds valid language codes and
        that 'recognizers' (if present) is a list.

        :param config: Recognizer registry configuration to validate.
        :return: The same ``config`` object, unchanged.
        :raises ValueError: If any of the checks above fails.
        """
        if not isinstance(config, dict):
            raise ValueError("Recognizer registry configuration must be a dictionary")

        # Validate supported languages
        if "supported_languages" in config:
            ConfigurationValidator.validate_language_codes(
                config["supported_languages"]
            )

        # Validate recognizers list
        if "recognizers" in config and not isinstance(config["recognizers"], list):
            raise ValueError("Recognizers must be a list")

        return config

    @staticmethod
    def validate_analyzer_configuration(config: Dict[str, Any]) -> Dict[str, Any]:
        """Validate analyzer engine configuration.

        Each known key is validated only when it is present; unknown keys
        are ignored.

        :param config: Analyzer engine configuration to validate.
        :return: The same ``config`` object, unchanged (results of the
            nested validators are not merged back).
        :raises ValueError: If ``config`` is not a dictionary or any of the
            present sections fails its validator.
        """
        if not isinstance(config, dict):
            raise ValueError("Analyzer configuration must be a dictionary")

        # Validate supported languages if present
        if "supported_languages" in config:
            ConfigurationValidator.validate_language_codes(
                config["supported_languages"]
            )

        # Validate score threshold if present
        if "default_score_threshold" in config:
            ConfigurationValidator.validate_score_threshold(
                config["default_score_threshold"]
            )

        # Validate nested configurations
        if "nlp_configuration" in config:
            ConfigurationValidator.validate_nlp_configuration(
                config["nlp_configuration"]
            )

        if "recognizer_registry" in config:
            ConfigurationValidator.validate_recognizer_registry_configuration(
                config["recognizer_registry"]
            )

        return config
Loading
Loading