14
14
# KIND, either express or implied. See the License for the
15
15
# specific language governing permissions and limitations
16
16
# under the License.
17
- from typing import Iterator , Optional
17
+ from typing import Iterator , Optional , Set
18
18
19
19
from pyiceberg .exceptions import ValidationException
20
20
from pyiceberg .expressions import BooleanExpression
21
21
from pyiceberg .expressions .visitors import ROWS_CANNOT_MATCH , _InclusiveMetricsEvaluator
22
22
from pyiceberg .manifest import ManifestContent , ManifestEntry , ManifestEntryStatus , ManifestFile
23
+ from pyiceberg .schema import Schema
23
24
from pyiceberg .table import Table
24
25
from pyiceberg .table .snapshots import Operation , Snapshot , ancestors_between
25
26
from pyiceberg .typedef import Record
26
27
27
- VALIDATE_DATA_FILES_EXIST_OPERATIONS = {Operation .OVERWRITE , Operation .REPLACE , Operation .DELETE }
28
+ VALIDATE_DATA_FILES_EXIST_OPERATIONS : Set [Operation ] = {Operation .OVERWRITE , Operation .REPLACE , Operation .DELETE }
29
+ VALIDATE_ADDED_DATA_FILES_OPERATIONS : Set [Operation ] = {Operation .APPEND , Operation .OVERWRITE }
28
30
29
31
30
32
def _validation_history (
@@ -77,6 +79,47 @@ def _validation_history(
77
79
return manifests_files , snapshots
78
80
79
81
82
def _filter_manifest_entries(
    entry: ManifestEntry,
    snapshot_ids: set[int],
    data_filter: Optional[BooleanExpression],
    partition_set: Optional[dict[int, set[Record]]],
    entry_status: Optional[ManifestEntryStatus],
    schema: Schema,
) -> bool:
    """Decide whether a manifest entry passes every requested criterion.

    The criteria are checked in a fixed order and the first one that rejects
    the entry ends the evaluation: snapshot membership, entry status, data
    filter, then partition membership. Criteria passed as ``None`` are skipped.

    Args:
        entry: Manifest entry under consideration
        snapshot_ids: Snapshot ids the entry must belong to
        data_filter: Optional expression evaluated against the entry's data file
        partition_set: Optional mapping of spec id to the partitions of interest
        entry_status: Optional status the entry must have
        schema: Schema handed to the metrics evaluator when a data filter is given

    Returns:
        True if the entry should be included, False otherwise
    """
    # Cheap attribute comparisons come first so the evaluator is only built when needed.
    if entry.snapshot_id not in snapshot_ids or (entry_status is not None and entry.status != entry_status):
        return False

    if data_filter is not None:
        metrics_verdict = _InclusiveMetricsEvaluator(schema, data_filter).eval(entry.data_file)
        if metrics_verdict is ROWS_CANNOT_MATCH:
            return False

    if partition_set is None:
        return True

    # The entry is kept only when its partition is listed under its own spec id.
    file_partition = entry.data_file.partition
    file_spec_id = entry.data_file.spec_id
    return file_spec_id in partition_set and file_partition in partition_set[file_spec_id]
121
+
122
+
80
123
def _deleted_data_files (
81
124
table : Table ,
82
125
starting_snapshot : Snapshot ,
@@ -108,27 +151,12 @@ def _deleted_data_files(
108
151
ManifestContent .DATA ,
109
152
)
110
153
111
- if data_filter is not None :
112
- evaluator = _InclusiveMetricsEvaluator (table .schema (), data_filter ).eval
113
-
114
154
for manifest in manifests :
115
155
for entry in manifest .fetch_manifest_entry (table .io , discard_deleted = False ):
116
- if entry .snapshot_id not in snapshot_ids :
117
- continue
118
-
119
- if entry .status != ManifestEntryStatus .DELETED :
120
- continue
121
-
122
- if data_filter is not None and evaluator (entry .data_file ) is ROWS_CANNOT_MATCH :
123
- continue
124
-
125
- if partition_set is not None :
126
- spec_id = entry .data_file .spec_id
127
- partition = entry .data_file .partition
128
- if spec_id not in partition_set or partition not in partition_set [spec_id ]:
129
- continue
130
-
131
- yield entry
156
+ if _filter_manifest_entries (
157
+ entry , snapshot_ids , data_filter , partition_set , ManifestEntryStatus .DELETED , table .schema ()
158
+ ):
159
+ yield entry
132
160
133
161
134
162
def _validate_deleted_data_files (
@@ -150,3 +178,60 @@ def _validate_deleted_data_files(
150
178
if any (conflicting_entries ):
151
179
conflicting_snapshots = {entry .snapshot_id for entry in conflicting_entries }
152
180
raise ValidationException (f"Deleted data files were found matching the filter for snapshots { conflicting_snapshots } !" )
181
+
182
+
183
def _added_data_files(
    table: Table,
    starting_snapshot: Snapshot,
    data_filter: Optional[BooleanExpression],
    partition_set: Optional[dict[int, set[Record]]],
    parent_snapshot: Optional[Snapshot],
) -> Iterator[ManifestEntry]:
    """Return manifest entries for data files added between the starting snapshot and parent snapshot.

    Args:
        table: Table to get the history from
        starting_snapshot: Starting snapshot to get the history from
        data_filter: Optional filter to match data files
        partition_set: Optional set of partitions to match data files
        parent_snapshot: Parent snapshot to get the history from

    Returns:
        Iterator of manifest entries for added data files matching the conditions.
        Nothing is yielded when there is no parent snapshot.
    """
    if parent_snapshot is None:
        return

    manifests, snapshot_ids = _validation_history(
        table,
        parent_snapshot,
        starting_snapshot,
        VALIDATE_ADDED_DATA_FILES_OPERATIONS,
        ManifestContent.DATA,
    )

    # The schema does not depend on the entry, so resolve it once instead of per entry.
    schema = table.schema()

    for manifest in manifests:
        # NOTE(review): fetch_manifest_entry is called with its default arguments here;
        # presumably that drops deleted entries - confirm against ManifestFile.
        for entry in manifest.fetch_manifest_entry(table.io):
            # No status is requested (None), so entries are not filtered by status here.
            if _filter_manifest_entries(entry, snapshot_ids, data_filter, partition_set, None, schema):
                yield entry
217
+
218
+
219
def _validate_added_data_files(
    table: Table,
    starting_snapshot: Snapshot,
    data_filter: Optional[BooleanExpression],
    parent_snapshot: Optional[Snapshot],
) -> None:
    """Validate that no files matching a filter have been added to the table since a starting snapshot.

    Args:
        table: Table to validate
        starting_snapshot: Snapshot current at the start of the operation
        data_filter: Expression used to find added data files
        parent_snapshot: Ending snapshot on the branch being validated

    Raises:
        ValidationException: If at least one added data file matching the filter is found.
    """
    # _added_data_files is a generator: materialise it so that checking for conflicts
    # does not consume entries that must still be reported in the error message.
    conflicting_entries = list(_added_data_files(table, starting_snapshot, data_filter, None, parent_snapshot))
    if conflicting_entries:
        conflicting_snapshots = {entry.snapshot_id for entry in conflicting_entries if entry.snapshot_id is not None}
        raise ValidationException(f"Added data files were found matching the filter for snapshots {conflicting_snapshots}!")
0 commit comments