Skip to content

Commit 30586e3

Browse files
fix: Implement correct file filter syntax and ignore_rules for FDD
1 parent 35cb38f commit 30586e3

3 files changed

Lines changed: 74 additions & 7 deletions

File tree

datamasque/client/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,8 @@
6161
FileDiscoveryLocatorResult,
6262
FileDiscoveryMatch,
6363
FileDiscoveryResult,
64+
FileFilter,
65+
FileFilterMatchAgainst,
6466
FileRulesetGenerationRequest,
6567
ForeignKeyRef,
6668
InDataDiscoveryConfig,
@@ -146,6 +148,8 @@
146148
"FileDiscoveryLocatorResult",
147149
"FileDiscoveryMatch",
148150
"FileDiscoveryResult",
151+
"FileFilter",
152+
"FileFilterMatchAgainst",
149153
"FileId",
150154
"FileOrContent",
151155
"FileRulesetGenerationRequest",

datamasque/client/models/discovery.py

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
"""Typed request and response shapes for schema-discovery and ruleset-generation endpoints."""
22

3+
from enum import Enum
34
from typing import Any, Optional, Union
45

5-
from pydantic import BaseModel, ConfigDict, Field, field_validator
6+
from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
67

78
from datamasque.client.models.connection import ConnectionConfig, ConnectionId, unwrap_connection_id
89
from datamasque.client.models.data_selection import HashColumnsTableConfig, Locator, UserSelection
@@ -28,6 +29,7 @@ class InDataDiscoveryConfig(BaseModel):
2829
row_sample_size: Optional[int] = None
2930
custom_rules: Optional[list[InDataDiscoveryRule]] = None
3031
non_sensitive_rules: Optional[list[InDataDiscoveryRule]] = None
32+
ignore_rules: Optional[list[InDataDiscoveryRule]] = None
3133
force: Optional[bool] = None
3234

3335

@@ -86,6 +88,35 @@ def _unwrap_connection(cls, value: Any) -> Any:
8688
return unwrap_connection_id(value)
8789

8890

91+
class FileFilterMatchAgainst(str, Enum):
    """Target of an `include`/`skip` filter pattern: the whole path or only the filename."""

    # Mixing in `str` makes each member compare equal to its plain string value.
    path = "path"
    filename = "filename"
96+
97+
98+
class FileFilter(BaseModel):
    """
    A single `include` or `skip` filter for file data discovery.

    Exactly one of `glob` or `regex` must be set, and the one that is set must be a non-empty pattern.
    `match_against` selects whether the pattern is applied to the full path or just the filename
    (the server defaults to the full path when omitted).
    """

    model_config = ConfigDict(extra="forbid")

    glob: Optional[str] = None
    regex: Optional[str] = None
    match_against: Optional[FileFilterMatchAgainst] = None

    @model_validator(mode="after")
    def _check_glob_xor_regex(self) -> "FileFilter":
        """
        Reject filters that do not carry exactly one non-empty pattern.

        Raises:
            ValueError: If neither or both of `glob`/`regex` are provided,
                or if the single provided pattern is an empty string.
        """
        # Presence is judged with `is not None`, not truthiness: an empty string is not `None`,
        # so it would still be sent when dumping with `exclude_none=True`.
        # `glob=""` alongside a real `regex` therefore counts as "both set" and is rejected.
        provided = [pattern for pattern in (self.glob, self.regex) if pattern is not None]
        if len(provided) != 1 or not provided[0]:
            raise ValueError("A `FileFilter` must set exactly one of `glob` or `regex`.")
        return self
118+
119+
89120
class FileDataDiscoveryOptions(BaseModel):
90121
"""Run options nested under `FileDataDiscoveryRequest.options`."""
91122

@@ -117,8 +148,8 @@ class FileDataDiscoveryRequest(BaseModel):
117148
disable_global_ignored_keywords: Optional[bool] = None
118149
in_data_discovery: Optional[InDataDiscoveryConfig] = None
119150
recurse: Optional[bool] = None
120-
include: Optional[list[str]] = None
121-
skip: Optional[list[str]] = None
151+
include: Optional[list[FileFilter]] = None
152+
skip: Optional[list[FileFilter]] = None
122153
encoding: Optional[str] = None
123154
workers: Optional[int] = None
124155

tests/test_discovery.py

Lines changed: 36 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,18 @@
66

77
import pytest
88
import requests_mock
9+
from pydantic import ValidationError
910

1011
from datamasque.client import (
1112
DataMasqueClient,
1213
DiscoveryConfig,
1314
DiscoveryConfigId,
1415
FileDataDiscoveryOptions,
1516
FileDataDiscoveryRequest,
17+
FileFilter,
18+
FileFilterMatchAgainst,
1619
FileRulesetGenerationRequest,
20+
InDataDiscoveryConfig,
1721
RulesetGenerationRequest,
1822
RunId,
1923
SchemaDiscoveryPage,
@@ -808,8 +812,8 @@ def test_start_file_data_discovery_run_full(client):
808812
disable_global_ignored_keywords=False,
809813
in_data_discovery={"enabled": True, "row_sample_size": 50},
810814
recurse=True,
811-
include=["*.csv"],
812-
skip=["**/tmp/**"],
815+
include=[{"glob": "*.csv"}],
816+
skip=[{"regex": r".*/tmp/.*", "match_against": "path"}],
813817
encoding="utf-8",
814818
workers=4,
815819
)
@@ -833,13 +837,41 @@ def test_start_file_data_discovery_run_full(client):
833837
"disable_global_ignored_keywords": False,
834838
"in_data_discovery": {"enabled": True, "row_sample_size": 50},
835839
"recurse": True,
836-
"include": ["*.csv"],
837-
"skip": ["**/tmp/**"],
840+
"include": [{"glob": "*.csv"}],
841+
"skip": [{"regex": r".*/tmp/.*", "match_against": "path"}],
838842
"encoding": "utf-8",
839843
"workers": 4,
840844
}
841845

842846

847+
def test_file_filter_requires_exactly_one_of_glob_or_regex():
    """`FileFilter` accepts exactly one pattern field and rejects zero or two."""
    # Valid shapes: a lone glob, a lone regex, and a glob with an explicit match target.
    accepted = (
        {"glob": "*.csv"},
        {"regex": r".*\.csv"},
        {"glob": "*.csv", "match_against": FileFilterMatchAgainst.filename},
    )
    for kwargs in accepted:
        FileFilter(**kwargs)

    # Invalid shapes: no pattern at all, then both patterns at once.
    rejected = (
        {},
        {"glob": "*.csv", "regex": r".*\.csv"},
    )
    for kwargs in rejected:
        with pytest.raises(ValidationError, match="exactly one of `glob` or `regex`"):
            FileFilter(**kwargs)
858+
859+
860+
def test_file_data_discovery_ignore_rules_serialize():
    """`ignore_rules` set on `in_data_discovery` survives into the dumped payload."""
    in_data_config = InDataDiscoveryConfig(
        enabled=True,
        custom_rules=[{"name": "cc", "pattern": r"^1234"}],
        non_sensitive_rules=[{"pattern": r"^5555"}],
        ignore_rules=[{"pattern": r"^4321"}],
    )
    request = FileDataDiscoveryRequest(connection="conn-1", in_data_discovery=in_data_config)

    # Dump the same way the test module's other payload checks expect: JSON mode, `None` fields dropped.
    payload = request.model_dump(exclude_none=True, mode="json")

    expected_ignore_rules = [{"pattern": r"^4321"}]
    assert payload["in_data_discovery"]["ignore_rules"] == expected_ignore_rules
873+
874+
843875
def test_start_file_data_discovery_run_raises_on_non_201(client):
844876
with requests_mock.Mocker() as m:
845877
m.post(

0 commit comments

Comments
 (0)