diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py b/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py
index 921c87190..141e8d092 100644
--- a/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py
+++ b/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py
@@ -12,8 +12,13 @@
     TransformersNlpEngine,
 )
 
-logger = logging.getLogger("presidio-analyzer")
+DEFAULT_BUILTIN_CONFIG = {
+    "nlp_engine_name": "spacy",
+    "models": [{"lang_code": "en", "model_name": "en_core_web_lg"}],
+    "ner_model_configuration": {},
+}
 
+logger = logging.getLogger("presidio-analyzer")
 
 class NlpEngineProvider:
     """Create different NLP engines from configuration.
@@ -62,11 +67,18 @@ def __init__(
         if conf_file or conf_file == '':
             self._validate_conf_file_path(conf_file)
             self.nlp_configuration = self._read_nlp_conf(conf_file)
-
-        if conf_file is None and nlp_configuration is None:
+        # Only fall back to the default file when no inline configuration was given.
+        elif nlp_configuration is None:
             conf_file = self._get_full_conf_path()
             logger.debug(f"Reading default conf file from {conf_file}")
-            self.nlp_configuration = self._read_nlp_conf(conf_file)
+            try:
+                self.nlp_configuration = self._read_nlp_conf(conf_file)
+            except FileNotFoundError:
+                logger.warning(
+                    f"Default config file '{conf_file}' not found. "
+                    f"Falling back to built-in default: {DEFAULT_BUILTIN_CONFIG}"
+                )
+                self.nlp_configuration = DEFAULT_BUILTIN_CONFIG
 
     @staticmethod
     def _validate_nlp_engines(nlp_engines: Tuple) -> None:
@@ -208,10 +220,7 @@ def _read_nlp_conf(conf_file: Union[Path, str]) -> dict:
         with open(conf_file) as file:
             nlp_configuration = yaml.safe_load(file)
 
-        if "ner_model_configuration" not in nlp_configuration:
-            logger.warning(
-                "configuration file is missing 'ner_model_configuration'. Using default"
-            )
+        NlpEngineProvider._validate_yaml_config_format(nlp_configuration)
 
         return nlp_configuration
 
@@ -221,3 +230,50 @@ def _get_full_conf_path(
     ) -> Path:
         """Return a Path to the default conf file."""
         return Path(Path(__file__).parent.parent, "conf", default_conf_file)
+
+    @staticmethod
+    def _validate_yaml_config_format(nlp_configuration: Dict) -> None:
+        """Validate the structure of a configuration loaded from YAML.
+
+        :param nlp_configuration: Parsed content of the YAML configuration file.
+        :raises ValueError: If the content is not a mapping, if
+            'nlp_engine_name' or 'models' is missing, or if
+            'ner_model_configuration' is missing while a non-English
+            language is requested.
+        """
+        # An empty YAML file is parsed as None; reject it explicitly.
+        if not isinstance(nlp_configuration, dict):
+            raise ValueError("Configuration file must contain a YAML mapping.")
+
+        for key in ("nlp_engine_name", "models"):
+            if key not in nlp_configuration:
+                raise ValueError(f"Configuration file is missing '{key}'.")
+
+        if nlp_configuration.get("ner_model_configuration"):
+            return
+
+        cfg_langs = {
+            str(lang).lower()
+            for lang in nlp_configuration.get("supported_languages") or []
+        }
+
+        # 'recognizer_registry:' with no value is parsed as None, not {}.
+        registry = nlp_configuration.get("recognizer_registry") or {}
+        recog_langs = {
+            str(lang).lower() for lang in registry.get("supported_languages") or []
+        }
+
+        requested_langs = cfg_langs | recog_langs
+        english_only = not requested_langs or requested_langs == {"en"}
+
+        if english_only:
+            logger.warning(
+                "ner_model_configuration is missing, "
+                "Default English configuration will be used."
+            )
+        else:
+            raise ValueError(
+                "Configuration file is missing 'ner_model_configuration', "
+                "which is required when requested languages are not only English. "
+                f"Detected languages: {sorted(requested_langs)}"
+            )
diff --git a/presidio-analyzer/tests/test_nlp_engine_provider.py b/presidio-analyzer/tests/test_nlp_engine_provider.py
index b65f31522..309f0f827 100644
--- a/presidio-analyzer/tests/test_nlp_engine_provider.py
+++ b/presidio-analyzer/tests/test_nlp_engine_provider.py
@@ -3,6 +3,7 @@
 
 import pytest
 import spacy
+import shutil
 
 from presidio_analyzer.nlp_engine import (
     SpacyNlpEngine,
@@ -11,6 +12,10 @@
 )
 from presidio_analyzer.nlp_engine.transformers_nlp_engine import TransformersNlpEngine
 
+def _write_yaml(tmp_path, content: str, name: str = "config.yaml") -> Path:
+    path = tmp_path / name
+    path.write_text(content)
+    return path
 
 @pytest.fixture(scope="module")
 def mock_he_model():
@@ -21,6 +26,8 @@ def mock_he_model():
     """
     he = spacy.blank("he")
     he.to_disk("he_test")
+    yield
+    shutil.rmtree("he_test", ignore_errors=True)
 
 
 @pytest.fixture(scope="module")
@@ -32,6 +39,8 @@ def mock_bn_model():
     """
     bn = spacy.blank("bn")
     bn.to_disk("bn_test")
+    yield
+    shutil.rmtree("bn_test", ignore_errors=True)
 
 
 @pytest.fixture(scope="session")
@@ -171,6 +180,12 @@ def test_when_both_conf_and_config_then_fail(mocker):
         NlpEngineProvider(conf_file=conf_file, nlp_configuration=nlp_configuration)
 
 
+def test_when_labels_to_ignore_not_define_in_conf_file_default_into_empty_set(mocker):
+    conf_file = (Path(__file__).parent.parent/ "presidio_analyzer"/ "conf"/ "spacy_multilingual.yaml")
+    engine = NlpEngineProvider(conf_file=conf_file).create_engine()
+    assert len(engine.ner_model_configuration.labels_to_ignore) == 0
+
+
 @pytest.mark.skip_engine("transformers_en")
 def test_when_create_transformers_nlp_engine_then_succeeds(mocker):
     mocker.patch(
@@ -241,6 +256,221 @@ def test_nlp_engine_provider_init_through_nlp_engine_configuration():
     assert isinstance(engine, SpacyNlpEngine)
     assert engine.engine_name == "spacy"
 
+
+def test_create_engine_missing_ner_model_configuration_english_only():
+    config = {
+        "nlp_engine_name": "spacy",
+        "models": [
+            {"lang_code": "en", "model_name": "en_core_web_lg"}
+        ],
+    }
+    provider = NlpEngineProvider(nlp_configuration=config)
+    engine = provider.create_engine()
+    assert isinstance(engine, SpacyNlpEngine)
+    assert 'en' in engine.nlp
+    assert isinstance(engine.nlp['en'], spacy.lang.en.English)
+
+
+def test_create_engine_missing_ner_model_configuration_non_english(mocker):
+    mocker.patch(
+        "presidio_analyzer.nlp_engine.SpacyNlpEngine._download_spacy_model_if_needed",
+        return_value=None,
+    )
+    config = {
+        "nlp_engine_name": "spacy",
+        "models": [
+            {"lang_code": "de", "model_name": "de_core_news_md"}
+        ],
+    }
+    provider = NlpEngineProvider(nlp_configuration=config)
+    engine = provider.create_engine()
+    assert isinstance(engine, SpacyNlpEngine)
+    assert 'de' in engine.nlp
+    assert isinstance(engine.nlp['de'], spacy.lang.de.German)
+
+
+def test_create_engine_missing_ner_model_configuration_mixed_languages(mocker):
+    mocker.patch(
+        "presidio_analyzer.nlp_engine.SpacyNlpEngine._download_spacy_model_if_needed",
+        return_value=None,
+    )
+    config = {
+        "nlp_engine_name": "spacy",
+        "models": [
+            {"lang_code": "en", "model_name": "en_core_web_lg"},
+            {"lang_code": "de", "model_name": "de_core_news_md"}
+        ],
+    }
+    provider = NlpEngineProvider(nlp_configuration=config)
+    engine = provider.create_engine()
+    assert isinstance(engine, SpacyNlpEngine)
+    assert set(engine.nlp.keys()) == {'en', 'de'}
+
+
+def test_create_engine_missing_ner_model_configuration_empty_models():
+    config = {
+        "nlp_engine_name": "spacy",
+        "models": [],
+        # ner_model_configuration is missing
+    }
+    provider = NlpEngineProvider(nlp_configuration=config)
+    with pytest.raises(ValueError) as e:
+        provider.create_engine()
+    assert "Configuration should include nlp_engine_name and models" in str(e.value)
+
+
+def test_read_nlp_conf_file_invalid(tmp_path, caplog):
+    yaml_content = """
+nlp_engine_name: spacy
+models:
+  - lang_code: en
+    model_name: en_core_web_lg
+"""
+    yaml_file = tmp_path / "invalid.yaml"
+    yaml_file.write_text(yaml_content)
+
+    with caplog.at_level("WARNING"):
+        config = NlpEngineProvider._read_nlp_conf(str(yaml_file))
+    assert "ner_model_configuration is missing" in caplog.text
+    assert config["nlp_engine_name"] == "spacy"
+    yaml_file.unlink()
+
+
+def test_supported_languages_only_en_warns_and_creates(tmp_path, caplog, mocker):
+    mocker.patch(
+        "presidio_analyzer.nlp_engine.SpacyNlpEngine._download_spacy_model_if_needed",
+        return_value=None,
+    )
+    yaml_content = """
+nlp_engine_name: spacy
+models:
+  - lang_code: en
+    model_name: en_core_web_lg
+supported_languages:
+  - en
+"""
+    conf_file = _write_yaml(tmp_path, yaml_content)
+    provider = NlpEngineProvider(conf_file=str(conf_file))
+    with caplog.at_level("WARNING"):
+        engine = provider.create_engine()
+    assert "ner_model_configuration is missing" in caplog.text
+    assert isinstance(engine, SpacyNlpEngine)
+    assert "en" in engine.nlp
+    conf_file.unlink()
+
+
+def test_supported_languages_non_en_raises(tmp_path, mocker):
+    yaml_content = """
+nlp_engine_name: spacy
+models:
+  - lang_code: de
+    model_name: de_core_news_md
+supported_languages:
+  - de
+"""
+    conf_file = _write_yaml(tmp_path, yaml_content)
+    mocker.patch(
+        "presidio_analyzer.nlp_engine.SpacyNlpEngine._download_spacy_model_if_needed",
+        return_value=None,
+    )
+    with pytest.raises(ValueError) as excinfo:
+        NlpEngineProvider(conf_file=str(conf_file)).create_engine()
+    assert "missing 'ner_model_configuration'" in str(excinfo.value)
+    conf_file.unlink()
+
+
+
+def test_recognizer_registry_only_en_warns_and_creates(tmp_path, caplog, mocker):
+    mocker.patch(
+        "presidio_analyzer.nlp_engine.SpacyNlpEngine._download_spacy_model_if_needed",
+        return_value=None,
+    )
+    yaml_content = """
+nlp_engine_name: spacy
+models:
+  - lang_code: en
+    model_name: en_core_web_lg
+recognizer_registry:
+  supported_languages:
+    - en
+"""
+    conf_file = _write_yaml(tmp_path, yaml_content, "recog.yaml")
+    provider = NlpEngineProvider(conf_file=str(conf_file))
+    with caplog.at_level("WARNING"):
+        engine = provider.create_engine()
+    assert "ner_model_configuration is missing" in caplog.text
+    assert isinstance(engine, SpacyNlpEngine)
+    conf_file.unlink()
+
+
+
+def test_recognizer_registry_non_en_raises(tmp_path, mocker):
+    yaml_content = """
+nlp_engine_name: spacy
+models:
+  - lang_code: en
+    model_name: en_core_web_lg
+recognizer_registry:
+  supported_languages:
+    - fr
+"""
+    conf_file = _write_yaml(tmp_path, yaml_content, "recog2.yaml")
+    mocker.patch(
+        "presidio_analyzer.nlp_engine.SpacyNlpEngine._download_spacy_model_if_needed",
+        return_value=None,
+    )
+    with pytest.raises(ValueError) as excinfo:
+        NlpEngineProvider(conf_file=str(conf_file)).create_engine()
+    assert "missing 'ner_model_configuration'" in str(excinfo.value)
+    conf_file.unlink()
+
+
+
+def test_mixed_supported_and_recognizer_non_en_raises(tmp_path, mocker):
+    yaml_content = """
+nlp_engine_name: spacy
+models:
+  - lang_code: en
+    model_name: en_core_web_lg
+supported_languages:
+  - en
+recognizer_registry:
+  supported_languages:
+    - de
+"""
+    conf_file = _write_yaml(tmp_path, yaml_content, "mixed.yaml")
+    mocker.patch(
+        "presidio_analyzer.nlp_engine.SpacyNlpEngine._download_spacy_model_if_needed",
+        return_value=None,
+    )
+    with pytest.raises(ValueError) as excinfo:
+        NlpEngineProvider(conf_file=str(conf_file)).create_engine()
+    assert "Detected languages: ['de', 'en']" in str(excinfo.value)
+    conf_file.unlink()
+
+
+
+def test_no_supported_or_recognizer_defaults_to_english(tmp_path, caplog, mocker):
+    mocker.patch(
+        "presidio_analyzer.nlp_engine.SpacyNlpEngine._download_spacy_model_if_needed",
+        return_value=None,
+    )
+    yaml_content = """
+nlp_engine_name: spacy
+models:
+  - lang_code: en
+    model_name: en_core_web_lg
+"""
+    conf_file = _write_yaml(tmp_path, yaml_content, "none.yaml")
+    provider = NlpEngineProvider(conf_file=str(conf_file))
+    with caplog.at_level("WARNING"):
+        engine = provider.create_engine()
+    assert "ner_model_configuration is missing" in caplog.text
+    assert isinstance(engine, SpacyNlpEngine)
+    assert 'en' in engine.nlp
+    conf_file.unlink()
+
+
 
 def test_when_valid_nlp_engines_then_return_default_configuration():
     nlp_engines = (SpacyNlpEngine, StanzaNlpEngine, TransformersNlpEngine)