data_partition.py
"""Utilities for partitioning a dataset into IID, class-balanced client
shards for federated-learning experiments."""

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


class DistributedDataSet:
    def __init__(self, data, seed, batch_size, clients_num):
        self.data = data
        self.seed = seed
        self.batch_size = batch_size
        self.clients_num = clients_num
        self.data_size = None
        self.n_classes = None
        self.client_datasets = {}

    def data_split(self, test_size, y_name):
        # Stratified train/test split so every class keeps its proportion.
        train, test = train_test_split(
            self.data,
            random_state=self.seed,
            test_size=test_size,
            stratify=self.data[y_name],
        )
        train = train.reset_index(drop=True)
        test = test.reset_index(drop=True)
        return train, test

    def split_and_shuffle_labels(self, y_data, seed, amount, n_classes):
        # Build one shuffled index frame per class, truncated to `amount`
        # rows, so every class contributes equally to the shards.
        y_data = pd.DataFrame(y_data, columns=["labels"])
        y_data["i"] = np.arange(len(y_data))
        label_dict = {}
        for i in range(n_classes):
            var_name = "label" + str(i)
            label_info = y_data[y_data["labels"] == i]
            np.random.seed(seed)
            label_info = np.random.permutation(label_info)
            label_info = label_info[0:amount]
            label_info = pd.DataFrame(label_info, columns=["labels", "i"])
            label_dict[var_name] = label_info
        return label_dict

    def get_iid_subsamples_indices(self, label_dict, number_of_samples, batch_size, n_classes):
        # Each sample (batch) draws batch_size / n_classes rows from every
        # class, giving class-balanced batches.
        sample_dict = {}
        batch_size = int(batch_size / n_classes)
        for i in range(number_of_samples):
            sample_name = "sample" + str(i)
            batch = pd.DataFrame()
            for j in range(n_classes):
                label_name = "label" + str(j)
                a = label_dict[label_name][i * batch_size:(i + 1) * batch_size]
                batch = pd.concat([batch, a], axis=0)
            batch.reset_index(drop=True, inplace=True)
            sample_dict[sample_name] = batch
        return sample_dict

    def create_iid_subsamples(self, sample_dict, data):
        # Materialise each batch of indices as a slice of the data frame.
        data_li = []
        for i in range(len(sample_dict)):  # len(sample_dict) == number of samples
            sample_name = "sample" + str(i)
            indices = np.sort(np.array(sample_dict[sample_name]["i"]))
            data_li.append(data.loc[indices])
        return data_li

    def get_distributed_dataset(self, test_size, y_name):
        # Note: renames the label column of self.data in place.
        self.data.rename(columns={y_name: "y"}, inplace=True)
        y_name = "y"
        train, test = self.data_split(test_size, y_name)
        # Cap every class at the size of the rarest class so shards stay balanced.
        min_class_num = min(train[y_name].value_counts().values)
        self.n_classes = self.data[y_name].nunique()
        self.data_size = len(self.data)
        label_dict = self.split_and_shuffle_labels(train[y_name].values, self.seed, min_class_num, self.n_classes)
        # Distribute every per-class index frame across the clients.
        clients_labels_dict = {}
        for label, values in label_dict.items():
            parts = np.array_split(values, self.clients_num)
            for client_count, part in enumerate(parts):
                clients_labels_dict.setdefault(client_count, {})[label] = part
        # For each client, cut its per-class indices into class-balanced batches.
        for cl, cl_label_dict in clients_labels_dict.items():
            number_of_samples = int((self.n_classes * min_class_num) / (self.clients_num * self.batch_size))
            subsample_indices = self.get_iid_subsamples_indices(cl_label_dict, number_of_samples, self.batch_size, self.n_classes)
            iid_samples = self.create_iid_subsamples(subsample_indices, train)
            self.client_datasets[cl] = iid_samples
        return self.client_datasets, test
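

# ----------------------------------------------------------------------
# Minimal usage sketch, not part of the original file. The CSV path, the
# label column name "target", and the parameter values below are
# illustrative assumptions; substitute your own dataset and settings.
if __name__ == "__main__":
    df = pd.read_csv("dataset.csv")  # hypothetical input file
    dds = DistributedDataSet(data=df, seed=42, batch_size=32, clients_num=4)
    client_datasets, test = dds.get_distributed_dataset(test_size=0.2, y_name="target")
    # client_datasets maps client id -> list of class-balanced batch DataFrames.
    for cl, batches in client_datasets.items():
        print(f"client {cl}: {len(batches)} batches of ~{dds.batch_size} rows")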