-
Notifications
You must be signed in to change notification settings - Fork 915
Add a validation layer for YAML based configuration #1780
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 16 commits
159315e
cfc7c1b
f7b54d4
0fbd010
c4841b5
3939fc9
d7cb69b
3b61469
251cefc
1420bd5
c677b79
421e47d
2901b13
8b84370
1cf7678
108e3d0
b492b79
eb9f7c7
bfd067b
f2e7fd9
18df02e
41328cc
bd2d045
11a8169
8054750
86baa12
24d491f
08c15ac
a82a9b0
4dd0675
3bf8e7b
8850b97
8f98b64
4ebee59
c17ee63
92728a7
3d06f1c
ea593d6
fea6c06
18084f8
43ce284
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,19 @@ | ||
| """Configuration validation module for Presidio.""" | ||
|
|
||
| from .schemas import ConfigurationValidator | ||
| from .yaml_recognizer_models import ( | ||
| BaseRecognizerConfig, | ||
| CustomRecognizerConfig, | ||
| LanguageContextConfig, | ||
| PredefinedRecognizerConfig, | ||
| RecognizerRegistryConfig, | ||
| ) | ||
|
|
||
| __all__ = [ | ||
| "ConfigurationValidator", | ||
| "BaseRecognizerConfig", | ||
| "CustomRecognizerConfig", | ||
| "LanguageContextConfig", | ||
| "PredefinedRecognizerConfig", | ||
| "RecognizerRegistryConfig", | ||
| ] |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,143 @@ | ||
| import re | ||
omri374 marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| from pathlib import Path | ||
| from typing import Any, Dict, List, Union | ||
|
|
||
| from pydantic import ValidationError | ||
|
|
||
| from .yaml_recognizer_models import RecognizerRegistryConfig | ||
|
|
||
|
|
||
class ConfigurationValidator:
    """Class for validating configurations using Pydantic-enabled classes.

    Every validator is a static method that raises ``ValueError`` when the
    input is invalid and returns a value on success, so results can be used
    inline by the caller.
    """

    # Two lowercase letters, optionally followed by "-" and two uppercase
    # letters (e.g. "en" or "en-US"). Compiled once and applied with
    # ``fullmatch`` so no anchors are needed.
    _LANGUAGE_CODE_PATTERN = re.compile(r"[a-z]{2}(-[A-Z]{2})?")

    @staticmethod
    def validate_language_codes(languages: List[str]) -> List[str]:
        """Validate language codes format.

        :param languages: List of languages to validate.
        :return: The same list, unchanged, when every entry is valid.
        :raises ValueError: If an entry is not a string of the form
            ``en`` or ``en-US``.
        """
        pattern = ConfigurationValidator._LANGUAGE_CODE_PATTERN
        for lang in languages:
            # fullmatch (unlike match with "$") also rejects a trailing
            # newline; the isinstance guard turns the TypeError that a
            # non-string entry would cause into a regular validation error.
            if not isinstance(lang, str) or not pattern.fullmatch(lang):
                raise ValueError(
                    f"Invalid language code format: {lang}. "
                    f"Expected format: 'en' or 'en-US'"
                )
        return languages

    @staticmethod
    def validate_file_path(file_path: Union[str, Path]) -> Path:
        """Validate that a file path exists and points to a file.

        Only existence and file-ness are checked; read permissions are not.

        :param file_path: Path to validate.
        :return: The path as a ``Path`` object.
        :raises ValueError: If the path does not exist or is not a file.
        """
        path = Path(file_path)
        if not path.exists():
            raise ValueError(f"Configuration file does not exist: {path}")
        if not path.is_file():
            raise ValueError(f"Path is not a file: {path}")
        return path

    @staticmethod
    def validate_score_threshold(threshold: float) -> float:
        """Validate score threshold is within valid range.

        :param threshold: score threshold to validate.
        :return: The same threshold when it lies in ``[0.0, 1.0]``.
        :raises ValueError: If the threshold is outside the range or cannot
            be compared with a number at all.
        """
        try:
            in_range = 0.0 <= threshold <= 1.0
        except TypeError as e:
            # Non-numeric input (e.g. a string from YAML) is a validation
            # failure, not a programming error.
            raise ValueError(
                f"Score threshold must be between 0.0 and 1.0, got: {threshold}"
            ) from e
        if not in_range:
            raise ValueError(
                f"Score threshold must be between 0.0 and 1.0, got: {threshold}"
            )
        return threshold

    @staticmethod
    def validate_nlp_configuration(config: Dict[str, Any]) -> Dict[str, Any]:
        """Validate NLP configuration structure.

        :param config: NLP Configuration to validate.
        :return: The same configuration dictionary, unchanged.
        :raises ValueError: If ``config`` is not a dictionary, lacks
            ``nlp_engine_name`` or ``models``, or ``models`` is not a
            non-empty list of dictionaries that each contain ``lang_code``
            and ``model_name``.
        """
        if not isinstance(config, dict):
            raise ValueError("NLP configuration must be a dictionary")

        required_fields = ["nlp_engine_name", "models"]
        missing_fields = [field for field in required_fields if field not in config]
        if missing_fields:
            raise ValueError(
                f"NLP configuration missing required fields: {missing_fields}"
            )

        # Validate models structure
        models = config["models"]
        if not isinstance(models, list) or not models:
            raise ValueError("Models must be a non-empty list")

        for model in models:
            if not isinstance(model, dict):
                raise ValueError("Each model must be a dictionary")
            if "lang_code" not in model or "model_name" not in model:
                raise ValueError("Each model must have 'lang_code' and 'model_name'")

        return config

    @staticmethod
    def validate_recognizer_registry_configuration(
        config: Dict[str, Any],
    ) -> Dict[str, Any]:
        """Validate recognizer registry configuration using Pydantic models.

        :param config: Recognizer registry configuration to validate.
        :return: The validated configuration dumped back to a dictionary,
            including fields that were filled in with default values.
        :raises ValueError: If ``config`` is not a dictionary or fails the
            ``RecognizerRegistryConfig`` validation.
        """
        if not isinstance(config, dict):
            # Without this guard ``**config`` raises a bare TypeError
            # (e.g. for an empty YAML section that loads as None).
            raise ValueError("Recognizer registry configuration must be a dictionary")

        try:
            validated_config = RecognizerRegistryConfig(**config)
        except ValidationError as e:
            # Chain the original error so the Pydantic details stay available.
            raise ValueError(f"Invalid recognizer registry configuration: {e}") from e

        # model_dump() without exclude_unset so default values are included.
        return validated_config.model_dump(exclude_unset=False)

    @staticmethod
    def _validate_recognizer_registry_basic(config: Dict[str, Any]) -> Dict[str, Any]:
        """Validate recognizer registry config without the Pydantic models.

        Checks only that ``config`` is a dictionary, that
        ``supported_languages`` (if present) holds valid language codes and
        that ``recognizers`` (if present) is a list.

        :param config: Recognizer registry configuration to validate.
        :return: The same configuration dictionary, unchanged.
        :raises ValueError: If any of the checks above fails.
        """
        if not isinstance(config, dict):
            raise ValueError("Recognizer registry configuration must be a dictionary")

        # Validate supported languages
        if "supported_languages" in config:
            ConfigurationValidator.validate_language_codes(
                config["supported_languages"]
            )

        # Validate recognizers list
        if "recognizers" in config and not isinstance(config["recognizers"], list):
            raise ValueError("Recognizers must be a list")

        return config

    @staticmethod
    def validate_analyzer_configuration(config: Dict[str, Any]) -> Dict[str, Any]:
        """Validate analyzer engine configuration.

        Each of ``supported_languages``, ``default_score_threshold``,
        ``nlp_configuration`` and ``recognizer_registry`` is validated only
        when the key is present.

        :param config: Analyzer configuration to validate.
        :return: The same configuration dictionary, unchanged.
        :raises ValueError: If ``config`` is not a dictionary or any present
            section fails its validator.
        """
        if not isinstance(config, dict):
            raise ValueError("Analyzer configuration must be a dictionary")

        # Validate supported languages if present
        if "supported_languages" in config:
            ConfigurationValidator.validate_language_codes(
                config["supported_languages"]
            )

        # Validate score threshold if present
        if "default_score_threshold" in config:
            ConfigurationValidator.validate_score_threshold(
                config["default_score_threshold"]
            )

        # Validate nested configurations
        if "nlp_configuration" in config:
            ConfigurationValidator.validate_nlp_configuration(
                config["nlp_configuration"]
            )

        if "recognizer_registry" in config:
            ConfigurationValidator.validate_recognizer_registry_configuration(
                config["recognizer_registry"]
            )

        return config
Uh oh!
There was an error while loading. Please reload this page.