Skip to content

Commit b4168eb

Browse files
authored
Add a validation layer for YAML based configuration (#1780)
1 parent dd1623f commit b4168eb

39 files changed

Lines changed: 3243 additions & 337 deletions

presidio-analyzer/presidio_analyzer/analyzer_engine_provider.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import yaml
66

77
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
8+
from presidio_analyzer.input_validation import ConfigurationValidator
89
from presidio_analyzer.nlp_engine import NlpEngine, NlpEngineProvider
910
from presidio_analyzer.recognizer_registry import RecognizerRegistryProvider
1011

@@ -29,14 +30,21 @@ def __init__(
2930
nlp_engine_conf_file: Optional[Union[Path, str]] = None,
3031
recognizer_registry_conf_file: Optional[Union[Path, str]] = None,
3132
):
33+
if analyzer_engine_conf_file:
34+
ConfigurationValidator.validate_file_path(analyzer_engine_conf_file)
35+
if nlp_engine_conf_file:
36+
ConfigurationValidator.validate_file_path(nlp_engine_conf_file)
37+
if recognizer_registry_conf_file:
38+
ConfigurationValidator.validate_file_path(recognizer_registry_conf_file)
39+
3240
self.configuration = self.get_configuration(conf_file=analyzer_engine_conf_file)
3341
self.nlp_engine_conf_file = nlp_engine_conf_file
3442
self.recognizer_registry_conf_file = recognizer_registry_conf_file
3543

3644
def get_configuration(
3745
self, conf_file: Optional[Union[Path, str]]
3846
) -> Union[Dict[str, Any]]:
39-
"""Retrieve the analyzer engine configuration from the provided file."""
47+
"""Retrieve analyzer engine configuration from the provided file."""
4048

4149
if not conf_file:
4250
default_conf_file = self._get_full_conf_path()
@@ -59,10 +67,15 @@ def get_configuration(
5967
with open(self._get_full_conf_path()) as file:
6068
configuration = yaml.safe_load(file)
6169
except Exception:
62-
print(f"Failed to parse file {conf_file}, resorting to default")
70+
logger.warning(
71+
f"Failed to parse file {conf_file}, resorting to default"
72+
)
6373
with open(self._get_full_conf_path()) as file:
6474
configuration = yaml.safe_load(file)
6575

76+
ConfigurationValidator.validate_analyzer_configuration(configuration)
77+
logger.debug("Analyzer configuration validation passed")
78+
6679
return configuration
6780

6881
def create_engine(self) -> AnalyzerEngine:

presidio-analyzer/presidio_analyzer/analyzer_request.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,5 +37,6 @@ def __init__(self, req_data: Dict):
3737
self.context = req_data.get("context")
3838
self.allow_list = req_data.get("allow_list")
3939
self.allow_list_match = req_data.get("allow_list_match", "exact")
40-
self.regex_flags = req_data.get("regex_flags",
41-
re.DOTALL | re.MULTILINE | re.IGNORECASE)
40+
self.regex_flags = req_data.get(
41+
"regex_flags", re.DOTALL | re.MULTILINE | re.IGNORECASE
42+
)
Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
supported_languages:
2+
- en
3+
default_score_threshold: 0
4+
nlp_configuration:
5+
nlp_engine_name: spacy
6+
models:
7+
- lang_code: en
8+
model_name: en_core_web_lg
9+
10+
ner_model_configuration:
11+
model_to_presidio_entity_mapping:
12+
PER: PERSON
13+
PERSON: PERSON
14+
NORP: NRP
15+
FAC: LOCATION
16+
LOC: LOCATION
17+
LOCATION: LOCATION
18+
GPE: LOCATION
19+
ORG: ORGANIZATION
20+
ORGANIZATION: ORGANIZATION
21+
DATE: DATE_TIME
22+
TIME: DATE_TIME
23+
24+
low_confidence_score_multiplier: 0.4
25+
low_score_entity_names:
26+
-
27+
labels_to_ignore:
28+
- ORG
29+
- ORGANIZATION # has many false positives
30+
- CARDINAL
31+
- EVENT
32+
- LANGUAGE
33+
- LAW
34+
- MONEY
35+
- ORDINAL
36+
- PERCENT
37+
- PRODUCT
38+
- QUANTITY
39+
- WORK_OF_ART
40+
41+
42+
recognizer_registry:
43+
# global_regex_flags: 26
44+
recognizers:
45+
# Recognizers listed here can either be loaded from the recognizers defined in code (type: predefined),
46+
# or created based on the provided configuration (type: custom).
47+
# For predefined:
48+
# - If only a recognizer name is provided, a predefined recognizer with this name and default parameters will be loaded.
49+
# - If a parameter isn't provided, its default value will be used.
50+
# For custom:
51+
# - See an example configuration here: https://github.com/microsoft/presidio/blob/main/presidio-analyzer/presidio_analyzer/conf/example_recognizers.yaml
52+
# - Custom pattern recognizers with this configuration can be added to this file, with type: custom
53+
# For recognizers supporting more than one language, an instance of the recognizer for each language will be created.
54+
# For example, see the CreditCardRecognizer definition below:
55+
- name: CreditCardRecognizer
56+
supported_languages:
57+
- language: en
58+
context: [credit, card, visa, mastercard, cc, amex, discover, jcb, diners, maestro, instapayment]
59+
type: predefined
60+
61+
- name: UsBankRecognizer
62+
type: predefined
63+
64+
- name: UsLicenseRecognizer
65+
type: predefined
66+
67+
- name: UsItinRecognizer
68+
type: predefined
69+
70+
- name: UsPassportRecognizer
71+
type: predefined
72+
73+
- name: UsSsnRecognizer
74+
type: predefined
75+
76+
- name: NhsRecognizer
77+
type: predefined
78+
79+
- name: UkNinoRecognizer
80+
type: predefined
81+
enabled: false
82+
83+
- name: SgFinRecognizer
84+
type: predefined
85+
enabled: false
86+
87+
- name: AuAbnRecognizer
88+
type: predefined
89+
enabled: false
90+
91+
- name: AuAcnRecognizer
92+
type: predefined
93+
enabled: false
94+
95+
- name: AuTfnRecognizer
96+
type: predefined
97+
enabled: false
98+
99+
- name: AuMedicareRecognizer
100+
type: predefined
101+
enabled: false
102+
103+
- name: InPanRecognizer
104+
type: predefined
105+
enabled: false
106+
107+
- name: InAadhaarRecognizer
108+
supported_languages:
109+
- en
110+
type: predefined
111+
enabled: false
112+
113+
- name: InVehicleRegistrationRecognizer
114+
type: predefined
115+
enabled: false
116+
117+
- name: InPassportRecognizer
118+
type: predefined
119+
enabled: false
120+
121+
- name: CryptoRecognizer
122+
type: predefined
123+
124+
- name: DateRecognizer
125+
type: predefined
126+
127+
- name: EmailRecognizer
128+
type: predefined
129+
130+
- name: IbanRecognizer
131+
type: predefined
132+
133+
- name: IpRecognizer
134+
type: predefined
135+
136+
- name: MedicalLicenseRecognizer
137+
type: predefined
138+
139+
- name: PhoneRecognizer
140+
type: predefined
141+
142+
- name: UrlRecognizer
143+
type: predefined
144+
145+
- name: InVoterRecognizer
146+
type: predefined
147+
enabled: false
148+
149+
- name: InGstinRecognizer
150+
type: predefined
151+
enabled: false
152+
153+
- name: SpacyRecognizer
154+
type: predefined

presidio-analyzer/presidio_analyzer/context_aware_enhancers/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
"""Context awareness modules."""
2+
23
from .context_aware_enhancer import ContextAwareEnhancer
34
from .lemma_context_aware_enhancer import LemmaContextAwareEnhancer
45

presidio-analyzer/presidio_analyzer/entity_recognizer.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
import logging
22
from abc import abstractmethod
3-
from typing import Dict, List, Optional, Tuple
3+
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple
44

55
from presidio_analyzer import RecognizerResult
6-
from presidio_analyzer.nlp_engine import NlpArtifacts
6+
7+
if TYPE_CHECKING:
8+
from presidio_analyzer.nlp_engine import NlpArtifacts
79

810
logger = logging.getLogger("presidio-analyzer")
911

@@ -74,7 +76,7 @@ def load(self) -> None:
7476

7577
@abstractmethod
7678
def analyze(
77-
self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts
79+
self, text: str, entities: List[str], nlp_artifacts: "NlpArtifacts"
7880
) -> List[RecognizerResult]:
7981
"""
8082
Analyze text to identify entities.
@@ -92,7 +94,7 @@ def enhance_using_context(
9294
text: str,
9395
raw_recognizer_results: List[RecognizerResult],
9496
other_raw_recognizer_results: List[RecognizerResult],
95-
nlp_artifacts: NlpArtifacts,
97+
nlp_artifacts: "NlpArtifacts",
9698
context: Optional[List[str]] = None,
9799
) -> List[RecognizerResult]:
98100
"""Enhance confidence score using context of the entity.
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
"""Configuration validation module for Presidio."""
2+
3+
from .language_validation import validate_language_codes
4+
from .schemas import ConfigurationValidator
5+
from .yaml_recognizer_models import (
6+
BaseRecognizerConfig,
7+
CustomRecognizerConfig,
8+
LanguageContextConfig,
9+
PredefinedRecognizerConfig,
10+
RecognizerRegistryConfig,
11+
)
12+
13+
__all__ = [
14+
"validate_language_codes",
15+
"ConfigurationValidator",
16+
"BaseRecognizerConfig",
17+
"CustomRecognizerConfig",
18+
"LanguageContextConfig",
19+
"PredefinedRecognizerConfig",
20+
"RecognizerRegistryConfig",
21+
]
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
from typing import List
2+
3+
import regex as re
4+
5+
6+
# Compiled once at import time. Matching is done with ``fullmatch`` so the
# whole string must conform; no explicit ``^``/``$`` anchors are needed, and a
# trailing newline (which ``$`` would tolerate) is rejected.
_LANGUAGE_CODE_PATTERN = re.compile(r"[a-z]{2}(-[A-Z]{2})?")


def validate_language_codes(languages: List[str]) -> None:
    """Validate language codes format.

    A valid code is two lowercase ASCII letters, optionally followed by a
    hyphen and two uppercase ASCII letters (for example ``en`` or ``en-US``).

    :param languages: List of languages to validate.
    :raises ValueError: If any entry is not a string or does not match the
        expected format. The first offending entry is reported.
    """
    for lang in languages:
        # Non-string entries (e.g. ``None`` from an empty YAML list item) are
        # reported as a validation error instead of leaking a TypeError from
        # the regex engine.
        is_valid = isinstance(lang, str) and (
            _LANGUAGE_CODE_PATTERN.fullmatch(lang) is not None
        )
        if not is_valid:
            raise ValueError(
                f"Invalid language code format: {lang}. "
                f"Expected format: 'en' or 'en-US'"
            )

0 commit comments

Comments
 (0)