datasets.py
from aequitas.flow.datasets import GenericDataset
from constants import (
VARIANTS,
SENSITIVE_COLUMN,
LABEL_COLUMN,
BOOL_COLUMNS,
CATEGORICAL_COLUMNS,
)
from typing import Literal, Optional
import pandas as pd


class IIDDataset(GenericDataset):
    def __init__(
        self,
        dataset: Literal["BankAccountFraud", "FolkTables"],
        variant: str,
        label_column: Optional[str] = None,
        sensitive_column: Optional[str] = None,
    ):
        """
        Generate an IID version of the desired Aequitas Dataset.

        Parameters
        ----------
        dataset : Literal["BankAccountFraud", "FolkTables"]
            The Aequitas Dataset.
        variant : str
            The variant of the dataset.
        label_column : Optional[str], optional
            The label column of the dataset. By default, None, which will use the
            dataset's default label column.
        sensitive_column : Optional[str], optional
            The sensitive column of the dataset. By default, None, which will use the
            dataset's default sensitive column.
        """
        if variant not in VARIANTS[dataset]:
            raise ValueError(
                f"For the {dataset} dataset, variant must be one of {VARIANTS[dataset]}"
            )
        if label_column is None:
            if dataset == "BankAccountFraud":
                label_column = LABEL_COLUMN[dataset]
            else:
                label_column = LABEL_COLUMN[dataset][variant]
        if sensitive_column is None:
            sensitive_column = SENSITIVE_COLUMN[dataset]
        train_path = f"data/{dataset}/{variant}/iid/train.csv"
        validation_path = f"data/{dataset}/{variant}/iid/validation.csv"
        test_path = f"data/{dataset}/{variant}/iid/test.csv"
        super().__init__(
            label_column=label_column,
            sensitive_column=sensitive_column,
            train_path=train_path,
            validation_path=validation_path,
            test_path=test_path,
            extension="csv",
        )
        self.dataset = dataset
        self.variant = variant

    def load_data(self) -> None:
        """Load the defined dataset."""
        super().load_data()
        if self.dataset == "BankAccountFraud":
            self.data[CATEGORICAL_COLUMNS[self.dataset]] = self.data[
                CATEGORICAL_COLUMNS[self.dataset]
            ].astype("category")
            self.data["customer_age_bin"] = self.data["customer_age_bin"].astype(
                "category"
            )
        else:
            self.data[CATEGORICAL_COLUMNS[self.dataset][self.variant]] = self.data[
                CATEGORICAL_COLUMNS[self.dataset][self.variant]
            ].astype("category")
        self.data[BOOL_COLUMNS[self.dataset]] = self.data[
            BOOL_COLUMNS[self.dataset]
        ].astype("bool")
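

# --- Illustrative usage sketch (not part of the original module) ------------
# A minimal example of loading one IID split. The "Base" variant name and the
# CSVs expected under data/BankAccountFraud/Base/iid/ are assumptions; any
# entry of VARIANTS["BankAccountFraud"] can be used in the same way.
def _example_iid_usage() -> None:
    ds = IIDDataset(dataset="BankAccountFraud", variant="Base")
    ds.load_data()  # loads the CSV splits and casts the column dtypes
    print(ds.data[ds.label_column].value_counts())

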
class NoisyDataset(IIDDataset):
    def __init__(
        self,
        dataset: Literal["BankAccountFraud", "FolkTables"],
        variant: str,
        noise_rates: dict[int, float],
        y_dependant: list[int],
        label_column: Optional[str] = None,
        sensitive_column: Optional[str] = None,
    ):
        """
        Generate a noisy copy of the IID version of the specified Aequitas Dataset.

        Parameters
        ----------
        dataset : Literal["BankAccountFraud", "FolkTables"]
            The Aequitas Dataset.
        variant : str
            The variant of the dataset.
        noise_rates : dict[int, float]
            The noise rates to be applied to each of the sensitive groups.
            Ex: {0: 0.05, 1: 0.1}
        y_dependant : list[int]
            The class(es) to be affected by the noise. Either [0], [1] or [0, 1].
        label_column : Optional[str], optional
            The label column of the dataset. By default, None, which will use the
            dataset's default label column.
        sensitive_column : Optional[str], optional
            The sensitive column of the dataset. By default, None, which will use the
            dataset's default sensitive column.
        """
        super().__init__(
            dataset=dataset,
            variant=variant,
            label_column=label_column,
            sensitive_column=sensitive_column,
        )
        self.noise_rates = noise_rates
        self.y_dependant = y_dependant

    def load_data(self) -> None:
        """Load the defined dataset."""
        super().load_data()
        # Replace the original labels of the training split with the
        # pre-generated noisy labels for the configured noise setting.
        noisy_labels = pd.read_csv(
            f"data/{self.dataset}/{self.variant}/noisy/"
            f"label_{self.y_dependant[0] if len(self.y_dependant) == 1 else 'both'}/"
            f"train_{self.noise_rates[0]}_{self.noise_rates[1]}.csv",
            index_col=0,
        )["0"]
        self.data.loc[noisy_labels.index, self.label_column] = noisy_labels
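

# --- Illustrative usage sketch (not part of the original module) ------------
# A minimal example of loading a noisy training set. The concrete values are
# assumptions: this expects a pre-generated noisy-label file at
# data/BankAccountFraud/Base/noisy/label_both/train_0.05_0.1.csv, matching
# the chosen noise_rates and y_dependant configuration.
def _example_noisy_usage() -> None:
    ds = NoisyDataset(
        dataset="BankAccountFraud",
        variant="Base",
        noise_rates={0: 0.05, 1: 0.1},  # noise rate per sensitive group
        y_dependant=[0, 1],  # both classes affected, so labels come from label_both
    )
    ds.load_data()  # loads the IID data, then swaps in the pre-generated noisy labels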