|
| 1 | +#!/usr/bin/env python3 |
| 2 | + |
| 3 | +# importing required libraries |
| 4 | +from matplotlib import pyplot as plt |
| 5 | +from utils import mnist_reader |
| 6 | +from future.utils import iteritems |
| 7 | +from datetime import datetime |
| 8 | +from scipy.stats import norm |
| 9 | +from scipy.stats import multivariate_normal as mvn |
| 10 | +import numpy as np |
| 11 | +import matplotlib |
| 12 | +from sklearn.decomposition import PCA |
| 13 | +from sklearn.decomposition import IncrementalPCA |
| 14 | + |
| 15 | + |
class Dataset(object):
    """Helper for loading a data-set through ``mnist_reader`` and rescaling it."""

    def load(self, folder_path, data_type):
        """
        This function loads the data-set
        :param folder_path: path to data-set folder
        :param data_type: split name forwarded to ``mnist_reader.load_mnist``
                          as ``kind`` (the script uses 'train' and 't10k')
        :return: data and labels
        """
        data, labels = mnist_reader.load_mnist(folder_path, kind=data_type)
        return data, labels

    def normalize(self, data_vector):
        """
        This function normalizes the data by dividing every value by 255
        :param data_vector: data to be normalised (array-like; presumably
                            8-bit pixel intensities in 0..255 - confirm with
                            the loader)
        :return: normalised data as a new float32 array; the input is not
                 modified
        """
        # ``astype``/``asarray`` return a new array rather than converting in
        # place, so the converted array has to be the one that gets divided.
        as_float = np.asarray(data_vector, dtype=np.float32)
        normalised_data = as_float / 255.0
        return normalised_data
| 40 | + |
| 41 | + |
# Load the 'train' and 't10k' splits stored under data/fashion through one
# shared Dataset helper; each load yields a (data, labels) pair.
data_set = Dataset()
(x_train, y_train), (x_test, y_test) = (
    data_set.load('data/fashion', split) for split in ('train', 't10k')
)

# Rescale the feature arrays of both splits with Dataset.normalize.
x_train_norm, x_test_norm = (
    data_set.normalize(pixels) for pixels in (x_train, x_test)
)
| 48 | + |
| 49 | +# pca = PCA() |
| 50 | +# pca.fit(x_train) |
| 51 | +# cumsum = np.cumsum(pca.explained_variance_ratio_) |
| 52 | +# d = np.argmax(cumsum >= 0.95) + 1 |
| 53 | +# |
| 54 | +# pca = PCA(n_components = 187) |
| 55 | +# x_train_pca = pca.fit_transform(x_train) |
| 56 | +# x_test_pca = pca.fit_transform(x_test) |
| 57 | + |
n_batches = 50

# Learn the 187-component projection from the training split only, feeding it
# to IncrementalPCA in n_batches chunks.
inc_pca = IncrementalPCA(n_components=187)
for X_batch in np.array_split(x_train_norm, n_batches):
    inc_pca.partial_fit(X_batch)
x_train_pca_inc = inc_pca.transform(x_train_norm)

# Project the test split with the SAME fitted model. Calling partial_fit on
# the test batches here would update the components after the training data
# had already been transformed, so train and test features would live in
# different bases (and test data would leak into the fit).
x_test_pca_inc = inc_pca.transform(x_test_norm)
| 68 | +# |
| 69 | +# X_train, y_train = mnist_reader.load_mnist('data/fashion', kind='train') |
| 70 | +# X_test, y_test = mnist_reader.load_mnist('data/fashion', kind='t10k') |
| 71 | +# |
| 72 | +# X_train = X_train.astype('float32') #images loaded in as int64, 0 to 255 integers |
| 73 | +# X_test = X_test.astype('float32') |
| 74 | +# # Normalization |
| 75 | +# X_train /= 255 |
| 76 | +# X_test /= 255 |
| 77 | + |
| 78 | +# plt.figure(figsize=(12,10))# Showing the Input Data after Normalizing |
| 79 | +# x, y = 4, 4 |
| 80 | +# for i in range(15): |
| 81 | +# plt.subplot(y, x, i+1) |
| 82 | +# plt.imshow(X_train[i].reshape((28,28)),interpolation='nearest') |
| 83 | +# plt.show() |
| 84 | + |
| 85 | +# some_item = X_train[9000] |
| 86 | +# # some_item_image = some_item.reshape(28, 28) |
| 87 | +# # plt.imshow(some_item_image, cmap = matplotlib.cm.binary,interpolation="nearest") |
| 88 | +# # plt.axis("off") |
| 89 | +# # plt.show() |
| 90 | + |
| 91 | + |
class Bayes(object):
    """
    Gaussian Bayes classifier: one multivariate normal (mean + full
    covariance) and one prior probability per class label.
    """

    def __init__(self):
        # class label -> prior probability (float)
        self.priors = dict()
        # class label -> {'mean': per-feature mean, 'cov': covariance matrix}
        self.gaussian = dict()

    @staticmethod
    def mean(x):
        """
        returns mean of the data
        :param x: data, one sample per row
        :return: mean taken over axis 0 (one value per column)
        """
        return np.mean(x, axis=0)

    @staticmethod
    def covariance(x):
        """
        returns covariance of the data
        :param x: data, one sample per row
        :return: covariance between the columns of x, as computed by np.cov
        """
        # np.cov treats rows as variables, hence the transpose.
        return np.cov(x.T)

    def prior(self, labels):
        """
        this function calculates the priors for each category
        :param labels: category labels, one per sample
        :return: 0 (the result is stored in self.priors)
        """
        labels = np.asarray(labels)
        # Visit each distinct label once and store a plain float, so the value
        # can be passed straight to np.log in predict().
        for category in np.unique(labels):
            self.priors[category] = float(np.mean(labels == category))
        return 0

    def fit(self, data, y, smoothing_factor=1e-2):
        """
        calculates associated mean and covariance of each class in the data-set
        and the prior of each class
        :param data: data, shape (samples, features)
        :param y: data labels, one per row of data
        :param smoothing_factor: value added to the diagonal of every class
                                 covariance matrix
        :return: 0 (the results are stored in self.gaussian and self.priors)
        """
        data = np.asarray(data)
        y = np.asarray(y)
        samples, feature_length = data.shape
        diagonal_boost = np.eye(feature_length) * smoothing_factor

        for category in np.unique(y):
            members = y == category
            current_data = data[members]
            self.gaussian[category] = {
                'mean': self.mean(current_data),
                'cov': self.covariance(current_data) + diagonal_boost,
            }
            self.priors[category] = float(members.sum()) / len(y)
        return 0

    def predict(self, data):
        """
        this function predicts the class of unknown feature vectors
        :param data: feature vectors whose class has to be determined,
                     shape (samples, features)
        :return: array with the predicted class label of every row
        :raises ValueError: if no class model has been fitted yet
        """
        if not self.gaussian:
            raise ValueError("predict() called before fit(): no class models available")

        data = np.asarray(data)
        samples, _ = data.shape
        # Columns follow the sorted class labels instead of using the label
        # itself as an index, so labels need not be the integers 0..k-1.
        classes = sorted(self.gaussian)
        p = np.zeros((samples, len(classes)))

        for column, category in enumerate(classes):
            g = self.gaussian[category]
            # log-likelihood under the class Gaussian plus log-prior
            p[:, column] = (
                mvn.logpdf(data, mean=g['mean'], cov=g['cov'])
                + np.log(self.priors[category])
            )

        return np.asarray(classes)[np.argmax(p, axis=1)]

    def accuracy(self, data, labels):
        """
        returns the accuracy/ score of the prediction
        :param data: data, shape (samples, features)
        :param labels: labels for each feature vector
        :return: fraction of rows whose predicted label equals the given label
        """
        prediction = self.predict(data)
        return np.mean(prediction == np.asarray(labels))
| 173 | + |
| 174 | + |
if __name__ == '__main__':
    # Fit the classifier on the PCA-projected training split and time it.
    model = Bayes()
    t0 = datetime.now()
    model.fit(x_train_pca_inc, y_train)
    print("Training time:", (datetime.now() - t0))

    # Score the training split first, then the test split, reporting the
    # accuracy, the time the scoring took and the number of labels.
    evaluations = (
        ("Train accuracy:", "Time to compute train accuracy:", "Train size:",
         x_train_pca_inc, y_train),
        ("Test accuracy:", "Time to compute test accuracy:", "Test size:",
         x_test_pca_inc, y_test),
    )
    for score_label, time_label, size_label, features, targets in evaluations:
        t0 = datetime.now()
        print(score_label, model.accuracy(features, targets))
        print(time_label, (datetime.now() - t0), size_label, len(targets))
0 commit comments