"""This module provides the basic objects for the dataframe_validation"""
import json
from dataclasses import dataclass, field
from ast import literal_eval
import pandas as pd
import numpy as np
from utils import TypeEncoder
@dataclass
class StandardConstraints:
"""
Standard Constraints class provides a data constraints discovery.
"""
constraints: dict = field(default_factory=dict)
def __str__(self):
var = []
for key,val in self.constraints.items():
var.append(f'{key}: {val}')
return "\n".join(var)
def get_data_type(self, data: pd.DataFrame, colname: str) -> str:
"""Get column data types"""
return data[colname].dtype.name

    def is_nullable(self, data: pd.DataFrame, colname: str) -> bool:
        """Get nullable constraint True/False"""
        return data[colname].isna().any()

    def is_unique(self, data: pd.DataFrame, colname: str) -> bool:
        """Get unique constraint True/False"""
        return ~data[colname].duplicated().any()

    def max_length(self, data: pd.DataFrame, colname: str) -> int:
        """Get max length constraint"""
        return max(data[colname].map(str).map(len))

    def min_length(self, data: pd.DataFrame, colname: str) -> int:
        """Get min length constraint"""
        return min(data[colname].dropna().map(str).map(len))

    def value_range(self, data: pd.DataFrame, colname: str) -> set:
        """Get range of values constraint"""
        return set(data[colname])

    def min_value(self, data: pd.DataFrame, colname: str) -> float:
        """Get min value constraint"""
        return data[colname].min()

    def max_value(self, data: pd.DataFrame, colname: str) -> float:
        """Get max value constraint"""
        return data[colname].max()

    def min_date(self, data: pd.DataFrame, colname: str) -> str:
        """Get min date constraint"""
        if data[colname].notnull().any():
            val = data[colname].min().strftime("%Y-%m-%d")
        else:
            val = data[colname].min()
        return val

    def max_date(self, data: pd.DataFrame, colname: str) -> str:
        """Get max date constraint"""
        if data[colname].notnull().any():
            val = data[colname].max().strftime("%Y-%m-%d")
        else:
            val = data[colname].max()
        return val

    def generate_constraints(self, data: pd.DataFrame) -> dict:
        """
        Discover standard constraints based on the provided DataFrame
        :param data: a pandas DataFrame
        :return: a dict with constraints
        """
        all_cols = data.columns
        # separate string columns from category columns
        for col in all_cols:
            if issubclass(data[col].dtypes.type, np.object_) and (
                len(data[col].unique()) <= 20
            ):
                data[col] = data[col].astype("category")
            elif issubclass(data[col].dtypes.type, np.object_) and (
                len(data[col].unique()) > 20
            ):
                data[col] = data[col].astype(str)
        nr_cols = data.select_dtypes(include=["number"]).columns
        str_cols = data.select_dtypes(include=["string", "object"]).columns
        cat_cols = data.select_dtypes(include=["category"]).columns
        dt_cols = data.select_dtypes(include=["datetime64"]).columns
        for col in all_cols:
            self.constraints[col] = {
                "data_type": self.get_data_type(data, col),
                "nullable": self.is_nullable(data, col),
            }
        for col in cat_cols:
            self.constraints[col].update(
                {
                    "min_length": self.min_length(data, col),
                    "max_length": self.max_length(data, col),
                    "value_range": self.value_range(data, col),
                }
            )
        for col in str_cols:
            self.constraints[col].update(
                {
                    "unique": self.is_unique(data, col),
                    "min_length": self.min_length(data, col),
                    "max_length": self.max_length(data, col),
                }
            )
        for col in nr_cols:
            self.constraints[col].update(
                {
                    "min_value": self.min_value(data, col),
                    "max_value": self.max_value(data, col),
                }
            )
        for col in dt_cols:
            self.constraints[col].update(
                {
                    "min_date": self.min_date(data, col),
                    "max_date": self.max_date(data, col),
                }
            )
        return self.constraints

    def modify_constraint(self, column: str, modify_dict: dict) -> dict:
        """
        Modify a constraint for a specific column
        Parameters:
            column: a str with the DataFrame column name
            modify_dict: a dict with the constraint type as key
                and the constraint value as value
        Returns:
            A dict with the updated constraints
        """
        self.constraints[column].update(modify_dict)
        return self.constraints

    def save_as(self, save_as: str):
        """
        Save constraints to file
        :param save_as: a str with a csv or json file name
        :return: a csv or json file saved to local disk
        """
        if save_as.endswith(".json"):
            with open(save_as, "w", encoding="utf-8") as s_file:
                json.dump(self.constraints, s_file, indent=4, cls=TypeEncoder)
        elif save_as.endswith(".csv"):
            frame = pd.DataFrame(self.constraints).T
            frame.to_csv(save_as)
        else:
            raise ValueError("File name must end with '.json' or '.csv'")

    def read_constraints(self, file_name: str):
        """
        Read constraints from file
        :param file_name: a str with a csv or json file name
        :returns: a dict with constraint key, value pairs
        """
        if file_name.endswith(".json"):
            with open(file_name, "r", encoding="utf-8") as read_file:
                self.constraints = json.loads(read_file.read())
        elif file_name.endswith(".csv"):
            frame = pd.read_csv(file_name, index_col=0)
            frame["rules"] = [
                {k: v for k, v in m.items() if pd.notnull(v)}
                for m in frame.to_dict(orient="records")
            ]
            frame.loc[:, ["rules"]].groupby(frame.index)
            frame = frame.to_dict()["rules"]
            # loop over value_range to read the stored set back as a literal
            for _, val in frame.items():
                for key, _ in val.items():
                    if key == "value_range":
                        range_values = (
                            val["value_range"]
                            .replace("'", '"')
                            .replace("nan", "'nan'")
                        )
                        val["value_range"] = literal_eval(
                            literal_eval(json.dumps(range_values))
                        )
            self.constraints = frame
        return self.constraints


@dataclass
class CustomConstraints:
    """
    CustomConstraints class for storing custom constraint rules.
    """

    custom_constraints: list = field(default_factory=list)

    def add_custom_constraint(self, name: str, query: str) -> list:
        """
        Add a custom constraint
        :param name: a str with the name of the custom validation
        :param query: a pandas query str
        :return: an updated custom constraints list
        """
        new_constraint = {}
        new_constraint["name"] = name
        new_constraint["query"] = query
        if new_constraint in self.custom_constraints:
            print(f"{new_constraint} already exists.")
        else:
            self.custom_constraints.append(new_constraint)
        return self.custom_constraints

    def delete_custom_constraint(self, name: str) -> list:
        """
        Delete a custom constraint
        :param name: a str with the name of the custom rule
        :return: an updated custom constraints list
        """
        for constraint in self.custom_constraints:
            if constraint["name"] == name:
                self.custom_constraints.remove(constraint)
        return self.custom_constraints

    def view_custom_constraints(self):
        """
        Convert the list of custom constraints to a DataFrame
        :return: a DataFrame with all custom constraints
        """
        return pd.DataFrame(self.custom_constraints)

    def save_as(self, save_as: str):
        """
        Save constraints to file
        :param save_as: a str with a csv or json file name
        :returns: saves a csv or json file to local disk
        """
        if save_as.endswith(".json"):
            with open(save_as, "w", encoding="utf-8") as s_file:
                json.dump(
                    self.custom_constraints, s_file, indent=4, cls=TypeEncoder
                )
        elif save_as.endswith(".csv"):
            frame = pd.DataFrame(self.custom_constraints)
            frame.to_csv(save_as)
        else:
            raise ValueError("File name must end with '.json' or '.csv'")

    def read_constraints(self, file_name: str):
        """
        Read constraints from file
        :param file_name: a str with a csv or json file name
        :returns: a list with custom constraint dicts
        """
        if file_name.endswith(".json"):
            with open(file_name, "r", encoding="utf-8") as read_file:
                self.custom_constraints = json.loads(read_file.read())
        elif file_name.endswith(".csv"):
            frame = pd.read_csv(file_name, index_col=0)
            frame = frame.to_dict("records")
            self.custom_constraints = frame
        return self.custom_constraints
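

# A minimal usage sketch (added here for illustration, not part of the original
# module): the DataFrame below and its column names are hypothetical and serve
# only to show how the two classes are typically driven together.
if __name__ == "__main__":
    demo = pd.DataFrame(
        {
            "id": [1, 2, 3],
            "status": ["open", "closed", "open"],
            "created": pd.to_datetime(
                ["2021-01-01", "2021-02-15", "2021-03-30"]
            ),
        }
    )

    # Discover standard constraints from the data, then tighten one of them.
    standard = StandardConstraints()
    standard.generate_constraints(demo)
    standard.modify_constraint("id", {"min_value": 0})
    print(standard)

    # Register a custom rule as a pandas query string and review the rule set.
    custom = CustomConstraints()
    custom.add_custom_constraint("id_is_positive", "id > 0")
    print(custom.view_custom_constraints())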