-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathDataHandler.py
More file actions
92 lines (54 loc) · 2.52 KB
/
DataHandler.py
File metadata and controls
92 lines (54 loc) · 2.52 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import os
import pandas
class DataHandler:
    """Load a tabular file into a shuffled DataFrame and split it into train/test parts.

    The file is read with pandas, its rows are shuffled once at construction
    time, and ``split_data`` then carves the shuffled frame into a leading
    training portion and a trailing, non-overlapping test portion.
    """

    # Supported extensions. The first entry (csv) is read with
    # ``pandas.read_csv``; every other entry is read with ``pandas.read_excel``.
    __file_formats = ['csv', 'xls', 'xlsx', 'xlsb', 'ods', 'xlsm']

    def __init__(self, file, extension: str, test_size: float = 0.2):
        """Read ``file`` and shuffle its rows.

        Args:
            file: Anything ``pandas.read_csv`` / ``pandas.read_excel`` accepts.
                It is handed to pandas unchanged; no existence check is done
                here, so a missing file surfaces as the pandas/OS error.
            extension: File extension selecting the reader. Matched
                case-insensitively; a leading dot is ignored.
            test_size: Fraction of rows reserved for testing, strictly
                between 0 and 1.

        Raises:
            ValueError: If ``test_size`` is not strictly between 0 and 1, or
                if ``extension`` is not one of the supported formats.
        """
        if not 0 < test_size < 1:
            raise ValueError(f'Invalid Test Size : {test_size}')

        normalized = extension.lower().lstrip('.')
        # Fail early with a clear message; without this check an unsupported
        # extension left ``self.data`` unset and crashed later with an
        # unrelated AttributeError.
        if normalized not in DataHandler.__file_formats:
            raise ValueError(
                f'Unsupported file extension : {extension} '
                f'(expected one of {DataHandler.__file_formats})'
            )

        if normalized == DataHandler.__file_formats[0]:
            frame = pandas.read_csv(file)
        else:
            frame = pandas.read_excel(file)

        self.test_size = test_size
        # ``split_data`` replaces ``test_size`` with a row count, so the
        # original fraction is remembered separately for repeated splits.
        self._test_fraction = test_size
        # frac=1 returns every row in random order, i.e. a full shuffle.
        self.data = frame.sample(frac=1)
        # Prints a summary of the loaded frame to stdout.
        self.data.info()

    def getDataFrame(self):
        """Return the shuffled DataFrame held by this handler."""
        return self.data

    def getColumns(self):
        """Return the column index of the loaded DataFrame."""
        return self.data.columns

    @staticmethod
    def check(file_type: str) -> bool:
        """Return True if ``file_type`` ends with a supported extension.

        The comparison is a case-insensitive suffix match against the
        supported extension names (without a dot), so both ``"data.csv"``
        and ``"csv"`` are accepted.
        """
        return file_type.lower().endswith(tuple(DataHandler.__file_formats))

    def split_data(self, label_column: str) -> None:
        """Split the shuffled frame into disjoint train and test portions.

        After the call the following attributes are set:
        ``label_name``, ``classes`` (distinct label values as returned by
        ``value_counts().keys()``), ``test_size`` (now the number of test
        rows), ``train_features`` / ``test_features`` (all columns except the
        label), ``train_labels`` / ``test_labels`` (the label column), and
        ``train_data`` / ``test_data`` (all columns including the label).

        The training portion is the leading rows and the test portion is the
        trailing ``test_size`` rows, so the two never share a row. The method
        may be called repeatedly; every call uses the same fraction.

        Args:
            label_column: Name of the column holding the target labels.
        """
        self.label_name = label_column
        self.classes = self.data[label_column].value_counts().keys()

        # ``test_size`` is a fraction before the first split (or if a caller
        # re-assigned one); afterwards it is a row count. Only a value
        # strictly inside (0, 1) is treated as a fraction.
        if 0 < self.test_size < 1:
            self._test_fraction = self.test_size
        total_rows = self.data.shape[0]
        self.test_size = int(self._test_fraction * total_rows)
        split_at = total_rows - self.test_size

        features = self.data.drop(columns=[label_column])
        labels = self.data[label_column]

        self.train_features = features.iloc[:split_at, :]
        self.test_features = features.iloc[split_at:, :]
        self.train_labels = labels.iloc[:split_at]
        self.test_labels = labels.iloc[split_at:]
        self.train_data = self.data.iloc[:split_at, :]
        self.test_data = self.data.iloc[split_at:, :]

    def setFeatureMap(self, numerical_data: list, categorical_data: list) -> None:
        """Record which columns are numerical (True) and categorical (False).

        A name present in both lists ends up marked as categorical because
        the categorical entries are applied last.
        """
        self.feature_map = dict.fromkeys(numerical_data, True)
        self.feature_map.update(dict.fromkeys(categorical_data, False))

    # NOTE(review): the four getters below return only ``DataFrame.head()`` /
    # ``Series.head()`` (the first five rows), not the full split. This looks
    # like a preview API - confirm against callers before relying on it for
    # training. Requires ``split_data`` to have been called first.

    def getTrainFeatures(self):
        """Return the first rows (``head()``) of the training features."""
        return self.train_features.head()

    def getTrainLabels(self):
        """Return the first rows (``head()``) of the training labels."""
        return self.train_labels.head()

    def getTestFeatures(self):
        """Return the first rows (``head()``) of the test features."""
        return self.test_features.head()

    def getTestLabels(self):
        """Return the first rows (``head()``) of the test labels."""
        return self.test_labels.head()

    def sort_features(self, column_name: str):
        """Return a copy of the data sorted by ``column_name``.

        The DataFrame stored on the handler is left unchanged.
        """
        return self.data.sort_values(by=column_name)