Skip to content
Closed
Show file tree
Hide file tree
Changes from 12 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
b431cff
Refactor nlp_engine_provider.py to improve configuration handling and…
ShakutaiGit Mar 25, 2025
ee9feb9
Update NLP configuration to use default Spacy model settings
ShakutaiGit Mar 25, 2025
d7d465f
Add built-in default NLP configuration and improve error handling for…
ShakutaiGit Mar 25, 2025
715a98d
Enhance NLP configuration validation by adding a dedicated method and…
ShakutaiGit Mar 30, 2025
ad4ea4d
Merge branch 'main' into fix-1556-config-file-missing
ShakutaiGit Mar 30, 2025
368d32c
Add NER model configuration to test_stanza.yaml
ShakutaiGit Mar 30, 2025
2978fd5
Fix formatting in nlp_engine_provider.py and remove trailing whitespace
ShakutaiGit Mar 30, 2025
ce7b846
Merge branch 'main' into fix-1556-config-file-missing
omri374 Apr 3, 2025
8447862
Merge branch 'main' into fix-1556-config-file-missing
omri374 Apr 3, 2025
00d0d5a
Merge branch 'main' into fix-1556-config-file-missing
omri374 Apr 21, 2025
1ff6918
CR fixes
ShakutaiGit Apr 21, 2025
4f8479d
Merge branch 'fix-1556-config-file-missing' of https://github.com/mic…
ShakutaiGit Apr 21, 2025
ca9a0f4
CR fixes
ShakutaiGit Apr 27, 2025
6c82cfb
linting fix
ShakutaiGit Apr 27, 2025
6e05990
Merge branch 'main' of https://github.com/microsoft/presidio into fix…
ShakutaiGit Apr 27, 2025
46cea85
linting
ShakutaiGit Apr 27, 2025
073c2ba
Add missing newline at end of file in nlp_engine_provider.py
ShakutaiGit Apr 27, 2025
cd45852
Fix missing newline at end of file in nlp_engine_provider.py
ShakutaiGit Apr 27, 2025
3e36952
Merge branch 'main' into fix-1556-config-file-missing
omri374 Jul 16, 2025
5bf7d18
Merge branch 'main' into fix-1556-config-file-missing
omri374 Jul 16, 2025
338fb2e
Merge branch 'main' into fix-1556-config-file-missing
SharonHart Oct 29, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,13 @@
TransformersNlpEngine,
)

logger = logging.getLogger("presidio-analyzer")
# Built-in fallback NLP configuration: spaCy with the English
# "en_core_web_lg" model and an empty NER model configuration.
# NlpEngineProvider.__init__ assigns this same dict object (no copy) when the
# default conf file cannot be found, so it should be treated as read-only.
DEFAULT_BUILTIN_CONFIG = {
"nlp_engine_name": "spacy",
"models": [{"lang_code": "en", "model_name": "en_core_web_lg"}],
"ner_model_configuration": {},
}

logger = logging.getLogger("presidio-analyzer")

class NlpEngineProvider:
"""Create different NLP engines from configuration.
Expand Down Expand Up @@ -52,17 +57,21 @@ def __init__(
raise ValueError(
"Either conf_file or nlp_configuration should be provided, not both."
)

if nlp_configuration:
elif nlp_configuration:
self.nlp_configuration = nlp_configuration

if conf_file:
elif conf_file:
self.nlp_configuration = self._read_nlp_conf(conf_file)

if conf_file is None and nlp_configuration is None:
else:
conf_file = self._get_full_conf_path()
logger.debug(f"Reading default conf file from {conf_file}")
self.nlp_configuration = self._read_nlp_conf(conf_file)
try:
self.nlp_configuration = self._read_nlp_conf(conf_file)
except FileNotFoundError:
logger.warning(
f"Default config file '{conf_file}' not found. "
f"Falling back to built-in default: {DEFAULT_BUILTIN_CONFIG}"
)
self.nlp_configuration = DEFAULT_BUILTIN_CONFIG

def create_engine(self) -> NlpEngine:
"""Create an NLP engine instance."""
Expand Down Expand Up @@ -108,26 +117,15 @@ def create_engine(self) -> NlpEngine:

@staticmethod
def _read_nlp_conf(conf_file: Union[Path, str]) -> dict:
    """Read and validate the NLP configuration from a provided YAML file.

    :param conf_file: Path to the YAML configuration file.
    :return: The parsed configuration.
    :raises FileNotFoundError: If ``conf_file`` does not exist.
    :raises ValueError: If validation by ``_validate_yaml_config_format``
        fails (a required field is missing).
    """
    # A missing file is an error here; falling back to a built-in default is
    # the caller's decision, not this reader's.
    if not Path(conf_file).exists():
        raise FileNotFoundError(f"Configuration file {conf_file} not found.")

    with open(conf_file) as file:
        nlp_configuration = yaml.safe_load(file)

    NlpEngineProvider._validate_yaml_config_format(nlp_configuration)

    return nlp_configuration

Expand All @@ -137,3 +135,11 @@ def _get_full_conf_path(
) -> Path:
"""Return a Path to the default conf file."""
return Path(Path(__file__).parent.parent, "conf", default_conf_file)

@staticmethod
def _validate_yaml_config_format(nlp_configuration: Dict) -> None:
"""Validate the YAML configuration file format."""
required_fields = ["nlp_engine_name", "ner_model_configuration", "models"]
for field in required_fields:
if field not in nlp_configuration:
raise ValueError(f"Configuration file is missing '{field}'.")
1 change: 1 addition & 0 deletions presidio-analyzer/tests/conf/test_stanza.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ models:
-
lang_code: en
model_name: en
ner_model_configuration:
147 changes: 145 additions & 2 deletions presidio-analyzer/tests/test_nlp_engine_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ def mock_he_model():
"""
he = spacy.blank("he")
he.to_disk("he_test")
yield
import shutil
shutil.rmtree("he_test", ignore_errors=True)


@pytest.fixture(scope="module")
Expand All @@ -32,6 +35,9 @@ def mock_bn_model():
"""
bn = spacy.blank("bn")
bn.to_disk("bn_test")
yield
import shutil
shutil.rmtree("bn_test", ignore_errors=True)


@pytest.fixture(scope="session")
Expand Down Expand Up @@ -172,8 +178,7 @@ def test_when_both_conf_and_config_then_fail(mocker):


def test_when_labels_to_ignore_not_define_in_conf_file_default_into_empty_set(mocker):
    """Engine built from spacy_multilingual.yaml has no ``labels_to_ignore``."""
    # Anchor the path at this test file so the test does not depend on the
    # current working directory.
    conf_file = (
        Path(__file__).parent.parent
        / "presidio_analyzer"
        / "conf"
        / "spacy_multilingual.yaml"
    )
    engine = NlpEngineProvider(conf_file=conf_file).create_engine()
    assert len(engine.ner_model_configuration.labels_to_ignore) == 0

Expand Down Expand Up @@ -246,3 +251,141 @@ def test_nlp_engine_provider_init_through_nlp_engine_configuration():
engine = NlpEngineProvider().create_engine()
assert isinstance(engine, SpacyNlpEngine)
assert engine.engine_name == "spacy"


def test_init_only_conf_file(tmp_path):
    """A provider built from only ``conf_file`` exposes the parsed YAML."""
    # model_name must be nested under the list item; at column 0 it would be
    # parsed as a top-level key instead of part of the "en" model entry.
    yaml_content = (
        "nlp_engine_name: spacy\n"
        "models:\n"
        "  - lang_code: en\n"
        "    model_name: en_core_web_lg\n"
        "ner_model_configuration: {}\n"
    )
    yaml_file = tmp_path / "valid.yaml"
    yaml_file.write_text(yaml_content)

    provider = NlpEngineProvider(conf_file=str(yaml_file))

    # No manual unlink: the file lives in pytest's managed tmp_path directory.
    assert provider.nlp_configuration["nlp_engine_name"] == "spacy"


def test_init_only_nlp_configuration():
    """A provider built from only ``nlp_configuration`` exposes that config."""
    en_model = {"lang_code": "en", "model_name": "en_core_web_lg"}
    provider = NlpEngineProvider(
        nlp_configuration={
            "nlp_engine_name": "spacy",
            "models": [en_model],
            "ner_model_configuration": {},
        }
    )
    engine_name = provider.nlp_configuration["nlp_engine_name"]
    assert engine_name == "spacy"


def test_init_both_conf_file_and_nlp_configuration(tmp_path):
    """Passing both ``conf_file`` and ``nlp_configuration`` raises ValueError."""
    # model_name must be nested under the list item; at column 0 it would be
    # parsed as a top-level key instead of part of the "en" model entry.
    yaml_content = (
        "nlp_engine_name: spacy\n"
        "models:\n"
        "  - lang_code: en\n"
        "    model_name: en_core_web_lg\n"
        "ner_model_configuration: {}\n"
    )
    yaml_file = tmp_path / "valid.yaml"
    yaml_file.write_text(yaml_content)
    config = {
        "nlp_engine_name": "spacy",
        "models": [{"lang_code": "en", "model_name": "en_core_web_lg"}],
        "ner_model_configuration": {},
    }

    # No manual unlink afterwards: the file lives in pytest's managed
    # tmp_path directory.
    with pytest.raises(ValueError):
        NlpEngineProvider(conf_file=str(yaml_file), nlp_configuration=config)


def test_init_none_provided():
    """With no arguments the provider ends up with a spaCy configuration."""
    default_provider = NlpEngineProvider()
    configured_engine = default_provider.nlp_configuration["nlp_engine_name"]
    assert configured_engine == "spacy"


def test_init_conf_file_not_exist():
    """An explicit ``conf_file`` that does not exist raises FileNotFoundError."""
    missing_conf = "not_a_real_file.yaml"
    with pytest.raises(FileNotFoundError):
        NlpEngineProvider(conf_file=missing_conf)


def test_create_engine_missing_models():
    """``create_engine`` raises ValueError when the config has no ``models``."""
    provider = NlpEngineProvider(
        nlp_configuration={
            "nlp_engine_name": "spacy",
            "ner_model_configuration": {},
        }
    )
    with pytest.raises(ValueError):
        provider.create_engine()


def test_create_engine_missing_nlp_engine_name():
    """``create_engine`` raises ValueError without ``nlp_engine_name``."""
    en_model = {"lang_code": "en", "model_name": "en_core_web_lg"}
    provider = NlpEngineProvider(
        nlp_configuration={
            "models": [en_model],
            "ner_model_configuration": {},
        }
    )
    with pytest.raises(ValueError):
        provider.create_engine()


def test_create_engine_models_empty():
    """``create_engine`` raises ValueError when ``models`` is an empty list."""
    provider = NlpEngineProvider(
        nlp_configuration={
            "nlp_engine_name": "spacy",
            "models": [],
            "ner_model_configuration": {},
        }
    )
    with pytest.raises(ValueError):
        provider.create_engine()


def test_create_engine_invalid_ner_model_configuration():
    """``create_engine`` fails when ``ner_model_configuration`` is a string."""
    en_model = {"lang_code": "en", "model_name": "en_core_web_lg"}
    provider = NlpEngineProvider(
        nlp_configuration={
            "nlp_engine_name": "spacy",
            "models": [en_model],
            "ner_model_configuration": "not_a_dict",
        }
    )
    # The test only requires that some exception is raised.
    with pytest.raises(Exception):
        provider.create_engine()


def test_read_nlp_conf_file_not_found():
    """``_read_nlp_conf`` raises FileNotFoundError for a missing file."""
    missing_conf = "definitely_missing.yaml"
    with pytest.raises(FileNotFoundError):
        NlpEngineProvider._read_nlp_conf(missing_conf)


def test_read_nlp_conf_file_valid(tmp_path):
    """``_read_nlp_conf`` returns the parsed content of a valid YAML file."""
    # model_name must be nested under the list item; at column 0 it would be
    # parsed as a top-level key instead of part of the "en" model entry.
    yaml_content = (
        "nlp_engine_name: spacy\n"
        "models:\n"
        "  - lang_code: en\n"
        "    model_name: en_core_web_lg\n"
        "ner_model_configuration: {}\n"
    )
    yaml_file = tmp_path / "valid.yaml"
    yaml_file.write_text(yaml_content)

    config = NlpEngineProvider._read_nlp_conf(str(yaml_file))

    # No manual unlink: the file lives in pytest's managed tmp_path directory.
    assert config["nlp_engine_name"] == "spacy"
    assert config["models"] == [
        {"lang_code": "en", "model_name": "en_core_web_lg"}
    ]


def test_read_nlp_conf_file_invalid(tmp_path):
    """``_read_nlp_conf`` raises ValueError when a required field is missing."""
    # Only ner_model_configuration is left out; model_name stays nested under
    # the list item so the rest of the document is well-formed.
    yaml_content = (
        "nlp_engine_name: spacy\n"
        "models:\n"
        "  - lang_code: en\n"
        "    model_name: en_core_web_lg\n"
    )
    yaml_file = tmp_path / "invalid.yaml"
    yaml_file.write_text(yaml_content)

    # No manual unlink afterwards: the file lives in pytest's managed
    # tmp_path directory.
    with pytest.raises(ValueError):
        NlpEngineProvider._read_nlp_conf(str(yaml_file))


def test_get_full_conf_path_returns_path():
    """The default conf path has a file name ending in ``default.yaml``."""
    default_conf_path = NlpEngineProvider._get_full_conf_path()
    file_name = default_conf_path.name
    assert file_name.endswith("default.yaml")


def test_validate_yaml_config_format_missing_field():
    """Validation raises ValueError without ``ner_model_configuration``."""
    incomplete_config = {
        "nlp_engine_name": "spacy",
        "models": [],
    }
    with pytest.raises(ValueError):
        NlpEngineProvider._validate_yaml_config_format(incomplete_config)


def test_validate_yaml_config_format_all_fields():
    """Validation accepts a config that has all three required fields."""
    en_model = {"lang_code": "en", "model_name": "en_core_web_lg"}
    complete_config = {
        "nlp_engine_name": "spacy",
        "models": [en_model],
        "ner_model_configuration": {},
    }
    # Passing means no exception is raised.
    NlpEngineProvider._validate_yaml_config_format(complete_config)
Loading