Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
b431cff
Refactor nlp_engine_provider.py to improve configuration handling and…
ShakutaiGit Mar 25, 2025
ee9feb9
Update NLP configuration to use default Spacy model settings
ShakutaiGit Mar 25, 2025
d7d465f
Add built-in default NLP configuration and improve error handling for…
ShakutaiGit Mar 25, 2025
715a98d
Enhance NLP configuration validation by adding a dedicated method and…
ShakutaiGit Mar 30, 2025
ad4ea4d
Merge branch 'main' into fix-1556-config-file-missing
ShakutaiGit Mar 30, 2025
368d32c
Add NER model configuration to test_stanza.yaml
ShakutaiGit Mar 30, 2025
2978fd5
Fix formatting in nlp_engine_provider.py and remove trailing whitespace
ShakutaiGit Mar 30, 2025
ce7b846
Merge branch 'main' into fix-1556-config-file-missing
omri374 Apr 3, 2025
8447862
Merge branch 'main' into fix-1556-config-file-missing
omri374 Apr 3, 2025
00d0d5a
Merge branch 'main' into fix-1556-config-file-missing
omri374 Apr 21, 2025
1ff6918
CR fixes
ShakutaiGit Apr 21, 2025
4f8479d
Merge branch 'fix-1556-config-file-missing' of https://github.com/mic…
ShakutaiGit Apr 21, 2025
ca9a0f4
CR fixes
ShakutaiGit Apr 27, 2025
6c82cfb
linting fix
ShakutaiGit Apr 27, 2025
6e05990
Merge branch 'main' of https://github.com/microsoft/presidio into fix…
ShakutaiGit Apr 27, 2025
46cea85
linting
ShakutaiGit Apr 27, 2025
073c2ba
Add missing newline at end of file in nlp_engine_provider.py
ShakutaiGit Apr 27, 2025
cd45852
Fix missing newline at end of file in nlp_engine_provider.py
ShakutaiGit Apr 27, 2025
3e36952
Merge branch 'main' into fix-1556-config-file-missing
omri374 Jul 16, 2025
5bf7d18
Merge branch 'main' into fix-1556-config-file-missing
omri374 Jul 16, 2025
338fb2e
Merge branch 'main' into fix-1556-config-file-missing
SharonHart Oct 29, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,13 @@
TransformersNlpEngine,
)

logger = logging.getLogger("presidio-analyzer")
# Built-in configuration used as a fallback when the default conf file
# cannot be found: a single English spaCy model and no NER overrides.
DEFAULT_BUILTIN_CONFIG = dict(
    nlp_engine_name="spacy",
    models=[dict(lang_code="en", model_name="en_core_web_lg")],
    ner_model_configuration=dict(),
)

logger = logging.getLogger("presidio-analyzer")

class NlpEngineProvider:
"""Create different NLP engines from configuration.
Expand Down Expand Up @@ -62,11 +67,17 @@ def __init__(
if conf_file or conf_file == '':
self._validate_conf_file_path(conf_file)
self.nlp_configuration = self._read_nlp_conf(conf_file)

if conf_file is None and nlp_configuration is None:
else:
conf_file = self._get_full_conf_path()
logger.debug(f"Reading default conf file from {conf_file}")
self.nlp_configuration = self._read_nlp_conf(conf_file)
try:
self.nlp_configuration = self._read_nlp_conf(conf_file)
except FileNotFoundError:
logger.warning(
f"Default config file '{conf_file}' not found. "
f"Falling back to built-in default: {DEFAULT_BUILTIN_CONFIG}"
)
self.nlp_configuration = DEFAULT_BUILTIN_CONFIG

@staticmethod
def _validate_nlp_engines(nlp_engines: Tuple) -> None:
Expand Down Expand Up @@ -208,10 +219,7 @@ def _read_nlp_conf(conf_file: Union[Path, str]) -> dict:
with open(conf_file) as file:
nlp_configuration = yaml.safe_load(file)

if "ner_model_configuration" not in nlp_configuration:
logger.warning(
"configuration file is missing 'ner_model_configuration'. Using default"
)
NlpEngineProvider._validate_yaml_config_format(nlp_configuration)

return nlp_configuration

Expand All @@ -221,3 +229,44 @@ def _get_full_conf_path(
) -> Path:
"""Return a Path to the default conf file."""
return Path(Path(__file__).parent.parent, "conf", default_conf_file)

@staticmethod
def _validate_yaml_config_format(nlp_configuration: Dict) -> None:
    """Validate the structure of a configuration loaded from YAML.

    The configuration must be a mapping that contains ``nlp_engine_name``
    and ``models``. ``ner_model_configuration`` may be absent (or empty)
    only when the languages listed under ``supported_languages`` and
    ``recognizer_registry.supported_languages`` are empty or English only;
    in that case a warning is logged and nothing is raised.

    :param nlp_configuration: Parsed configuration, e.g. the result of
        ``yaml.safe_load``.
    :raises ValueError: If the configuration is not a mapping, if a
        mandatory key is missing, or if ``ner_model_configuration`` is
        missing while a non-English language is requested.
    """
    from collections.abc import Mapping

    logger = logging.getLogger("presidio-analyzer")

    # An empty YAML document loads as None, and a scalar/list document is
    # not a mapping: report both as invalid configuration rather than
    # letting a TypeError escape from the membership test below.
    if not isinstance(nlp_configuration, Mapping):
        raise ValueError(
            "Configuration file must be a mapping that defines "
            "'nlp_engine_name' and 'models'."
        )

    for key in ("nlp_engine_name", "models"):
        if key not in nlp_configuration:
            raise ValueError(f"Configuration file is missing '{key}'.")

    if nlp_configuration.get("ner_model_configuration"):
        return

    def _as_lang_set(value) -> set:
        """Normalize a ``supported_languages`` value to lower-case codes."""
        if not value:
            return set()
        # A bare string is one language, not an iterable of characters.
        if isinstance(value, str):
            value = [value]
        return {str(lang).lower() for lang in value}

    # ``recognizer_registry:`` with no value loads as None; treat it (and
    # any non-mapping value) the same as an absent section.
    registry = nlp_configuration.get("recognizer_registry")
    registry_langs = (
        registry.get("supported_languages")
        if isinstance(registry, Mapping)
        else None
    )

    requested_langs = _as_lang_set(
        nlp_configuration.get("supported_languages")
    ) | _as_lang_set(registry_langs)
    english_only = not requested_langs or requested_langs == {"en"}

    if english_only:
        logger.warning(
            "ner_model_configuration is missing, "
            "Default English configuration will be used."
        )
    else:
        raise ValueError(
            "Configuration file is missing 'ner_model_configuration', "
            "which is required when requested languages are not only English. "
            f"Detected languages: {sorted(requested_langs)}"
        )
230 changes: 230 additions & 0 deletions presidio-analyzer/tests/test_nlp_engine_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import pytest
import spacy
import shutil

from presidio_analyzer.nlp_engine import (
SpacyNlpEngine,
Expand All @@ -11,6 +12,10 @@
)
from presidio_analyzer.nlp_engine.transformers_nlp_engine import TransformersNlpEngine

def _write_yaml(tmp_path, content: str, name: str = "config.yaml") -> Path:
    """Write ``content`` to ``tmp_path / name`` and return the file path.

    The file is written as UTF-8 explicitly so the result does not depend
    on the platform's default locale encoding.

    :param tmp_path: Directory (a ``Path``) in which the file is created.
    :param content: Text to write.
    :param name: File name inside ``tmp_path``.
    :return: Path of the written file.
    """
    path = tmp_path / name
    path.write_text(content, encoding="utf-8")
    return path

@pytest.fixture(scope="module")
def mock_he_model():
Expand All @@ -21,6 +26,8 @@ def mock_he_model():
"""
he = spacy.blank("he")
he.to_disk("he_test")
yield
shutil.rmtree("he_test", ignore_errors=True)


@pytest.fixture(scope="module")
Expand All @@ -32,6 +39,8 @@ def mock_bn_model():
"""
bn = spacy.blank("bn")
bn.to_disk("bn_test")
yield
shutil.rmtree("bn_test", ignore_errors=True)


@pytest.fixture(scope="session")
Expand Down Expand Up @@ -171,6 +180,12 @@ def test_when_both_conf_and_config_then_fail(mocker):
NlpEngineProvider(conf_file=conf_file, nlp_configuration=nlp_configuration)


def test_when_labels_to_ignore_not_define_in_conf_file_default_into_empty_set(mocker):
    """Engine built from spacy_multilingual.yaml has no labels_to_ignore."""
    conf_dir = Path(__file__).parent.parent / "presidio_analyzer" / "conf"
    provider = NlpEngineProvider(conf_file=conf_dir / "spacy_multilingual.yaml")
    engine = provider.create_engine()
    assert len(engine.ner_model_configuration.labels_to_ignore) == 0


@pytest.mark.skip_engine("transformers_en")
def test_when_create_transformers_nlp_engine_then_succeeds(mocker):
mocker.patch(
Expand Down Expand Up @@ -241,6 +256,221 @@ def test_nlp_engine_provider_init_through_nlp_engine_configuration():
assert isinstance(engine, SpacyNlpEngine)
assert engine.engine_name == "spacy"


def test_create_engine_missing_ner_model_configuration_english_only():
    """English-only dict config without ner_model_configuration builds spaCy."""
    english_model = {"lang_code": "en", "model_name": "en_core_web_lg"}
    config = {"nlp_engine_name": "spacy", "models": [english_model]}

    engine = NlpEngineProvider(nlp_configuration=config).create_engine()

    assert isinstance(engine, SpacyNlpEngine)
    assert 'en' in engine.nlp
    assert isinstance(engine.nlp['en'], spacy.lang.en.English)


def test_create_engine_missing_ner_model_configuration_non_english(mocker):
    """German-only dict config without ner_model_configuration builds spaCy."""
    # Stub the model download helper so it returns None instead of running.
    mocker.patch(
        "presidio_analyzer.nlp_engine.SpacyNlpEngine._download_spacy_model_if_needed",
        return_value=None,
    )
    german_model = {"lang_code": "de", "model_name": "de_core_news_md"}
    config = {"nlp_engine_name": "spacy", "models": [german_model]}

    engine = NlpEngineProvider(nlp_configuration=config).create_engine()

    assert isinstance(engine, SpacyNlpEngine)
    assert 'de' in engine.nlp
    assert isinstance(engine.nlp['de'], spacy.lang.de.German)


def test_create_engine_missing_ner_model_configuration_mixed_languages(mocker):
    """English + German dict config without ner_model_configuration loads both."""
    # Stub the model download helper so it returns None instead of running.
    mocker.patch(
        "presidio_analyzer.nlp_engine.SpacyNlpEngine._download_spacy_model_if_needed",
        return_value=None,
    )
    models = [
        {"lang_code": "en", "model_name": "en_core_web_lg"},
        {"lang_code": "de", "model_name": "de_core_news_md"},
    ]
    config = {"nlp_engine_name": "spacy", "models": models}

    engine = NlpEngineProvider(nlp_configuration=config).create_engine()

    assert isinstance(engine, SpacyNlpEngine)
    assert set(engine.nlp.keys()) == {'en', 'de'}


def test_create_engine_missing_ner_model_configuration_empty_models():
    """An empty ``models`` list makes ``create_engine`` raise ValueError."""
    # ner_model_configuration is deliberately left out of the configuration.
    provider = NlpEngineProvider(
        nlp_configuration={"nlp_engine_name": "spacy", "models": []}
    )

    with pytest.raises(ValueError) as e:
        provider.create_engine()

    expected = "Configuration should include nlp_engine_name and models"
    assert expected in str(e.value)


def test_read_nlp_conf_file_invalid(tmp_path, caplog):
    """Reading a conf file without ner_model_configuration logs a warning."""
    # Spell the nesting out line by line so ``model_name`` stays inside the
    # model entry rather than becoming a top-level key.
    yaml_content = (
        "nlp_engine_name: spacy\n"
        "models:\n"
        "  - lang_code: en\n"
        "    model_name: en_core_web_lg\n"
    )
    yaml_file = tmp_path / "invalid.yaml"
    yaml_file.write_text(yaml_content)

    with caplog.at_level("WARNING"):
        config = NlpEngineProvider._read_nlp_conf(str(yaml_file))

    assert "ner_model_configuration is missing" in caplog.text
    assert config["nlp_engine_name"] == "spacy"
    yaml_file.unlink()


def test_supported_languages_only_en_warns_and_creates(tmp_path, caplog, mocker):
    """``supported_languages: [en]`` without NER config warns and still builds."""
    # Stub the model download helper so it returns None instead of running.
    mocker.patch(
        "presidio_analyzer.nlp_engine.SpacyNlpEngine._download_spacy_model_if_needed",
        return_value=None,
    )
    # Spell the nesting out line by line so ``model_name`` stays inside the
    # model entry rather than becoming a top-level key.
    yaml_content = (
        "nlp_engine_name: spacy\n"
        "models:\n"
        "  - lang_code: en\n"
        "    model_name: en_core_web_lg\n"
        "supported_languages:\n"
        "  - en\n"
    )
    conf_file = _write_yaml(tmp_path, yaml_content)

    # The warning is logged while the conf file is read in the constructor,
    # so the provider has to be created inside the capture context.
    with caplog.at_level("WARNING"):
        provider = NlpEngineProvider(conf_file=str(conf_file))
        engine = provider.create_engine()

    assert "ner_model_configuration is missing" in caplog.text
    assert isinstance(engine, SpacyNlpEngine)
    assert "en" in engine.nlp
    conf_file.unlink()


def test_supported_languages_non_en_raises(tmp_path, mocker):
    """A non-English ``supported_languages`` without NER config is rejected."""
    # Spell the nesting out line by line so ``model_name`` stays inside the
    # model entry rather than becoming a top-level key.
    yaml_content = (
        "nlp_engine_name: spacy\n"
        "models:\n"
        "  - lang_code: de\n"
        "    model_name: de_core_news_md\n"
        "supported_languages:\n"
        "  - de\n"
    )
    conf_file = _write_yaml(tmp_path, yaml_content)
    mocker.patch(
        "presidio_analyzer.nlp_engine.SpacyNlpEngine._download_spacy_model_if_needed",
        return_value=None,
    )

    with pytest.raises(ValueError) as excinfo:
        NlpEngineProvider(conf_file=str(conf_file)).create_engine()

    # Checked after the ``with`` block: code following the raising call
    # inside the block would never execute.
    assert "missing 'ner_model_configuration'" in str(excinfo.value)
    conf_file.unlink()



def test_recognizer_registry_only_en_warns_and_creates(tmp_path, caplog, mocker):
    """English-only ``recognizer_registry`` languages warn and still build."""
    # Stub the model download helper so it returns None instead of running.
    mocker.patch(
        "presidio_analyzer.nlp_engine.SpacyNlpEngine._download_spacy_model_if_needed",
        return_value=None,
    )
    # Spell the nesting out line by line so ``model_name`` and the registry's
    # ``supported_languages`` keep their parent keys.
    yaml_content = (
        "nlp_engine_name: spacy\n"
        "models:\n"
        "  - lang_code: en\n"
        "    model_name: en_core_web_lg\n"
        "recognizer_registry:\n"
        "  supported_languages:\n"
        "    - en\n"
    )
    conf_file = _write_yaml(tmp_path, yaml_content, "recog.yaml")

    # The warning is logged while the conf file is read in the constructor,
    # so the provider has to be created inside the capture context.
    with caplog.at_level("WARNING"):
        provider = NlpEngineProvider(conf_file=str(conf_file))
        engine = provider.create_engine()

    assert "ner_model_configuration is missing" in caplog.text
    assert isinstance(engine, SpacyNlpEngine)
    conf_file.unlink()



def test_recognizer_registry_non_en_raises(tmp_path, mocker):
    """A non-English ``recognizer_registry`` language without NER config fails."""
    # Spell the nesting out line by line so ``model_name`` and the registry's
    # ``supported_languages`` keep their parent keys.
    yaml_content = (
        "nlp_engine_name: spacy\n"
        "models:\n"
        "  - lang_code: en\n"
        "    model_name: en_core_web_lg\n"
        "recognizer_registry:\n"
        "  supported_languages:\n"
        "    - fr\n"
    )
    conf_file = _write_yaml(tmp_path, yaml_content, "recog2.yaml")
    mocker.patch(
        "presidio_analyzer.nlp_engine.SpacyNlpEngine._download_spacy_model_if_needed",
        return_value=None,
    )

    with pytest.raises(ValueError) as excinfo:
        NlpEngineProvider(conf_file=str(conf_file)).create_engine()

    # Checked after the ``with`` block: code following the raising call
    # inside the block would never execute.
    assert "missing 'ner_model_configuration'" in str(excinfo.value)
    conf_file.unlink()



def test_mixed_supported_and_recognizer_non_en_raises(tmp_path, mocker):
    """Languages from both sections are merged and reported in the error."""
    # Spell the nesting out line by line so ``model_name`` and the registry's
    # ``supported_languages`` keep their parent keys.
    yaml_content = (
        "nlp_engine_name: spacy\n"
        "models:\n"
        "  - lang_code: en\n"
        "    model_name: en_core_web_lg\n"
        "supported_languages:\n"
        "  - en\n"
        "recognizer_registry:\n"
        "  supported_languages:\n"
        "    - de\n"
    )
    conf_file = _write_yaml(tmp_path, yaml_content, "mixed.yaml")
    mocker.patch(
        "presidio_analyzer.nlp_engine.SpacyNlpEngine._download_spacy_model_if_needed",
        return_value=None,
    )

    with pytest.raises(ValueError) as excinfo:
        NlpEngineProvider(conf_file=str(conf_file)).create_engine()

    # Checked after the ``with`` block: code following the raising call
    # inside the block would never execute.
    assert "Detected languages: ['de', 'en']" in str(excinfo.value)
    conf_file.unlink()



def test_no_supported_or_recognizer_defaults_to_english(tmp_path, caplog, mocker):
    """With no language lists at all, a warning is logged and spaCy is built."""
    # Stub the model download helper so it returns None instead of running.
    mocker.patch(
        "presidio_analyzer.nlp_engine.SpacyNlpEngine._download_spacy_model_if_needed",
        return_value=None,
    )
    # Spell the nesting out line by line so ``model_name`` stays inside the
    # model entry rather than becoming a top-level key.
    yaml_content = (
        "nlp_engine_name: spacy\n"
        "models:\n"
        "  - lang_code: en\n"
        "    model_name: en_core_web_lg\n"
    )
    conf_file = _write_yaml(tmp_path, yaml_content, "none.yaml")

    # The warning is logged while the conf file is read in the constructor,
    # so the provider has to be created inside the capture context.
    with caplog.at_level("WARNING"):
        provider = NlpEngineProvider(conf_file=str(conf_file))
        engine = provider.create_engine()

    assert "ner_model_configuration is missing" in caplog.text
    assert isinstance(engine, SpacyNlpEngine)
    assert 'en' in engine.nlp
    conf_file.unlink()



def test_when_valid_nlp_engines_then_return_default_configuration():
nlp_engines = (SpacyNlpEngine, StanzaNlpEngine, TransformersNlpEngine)
Expand Down
Loading