From cc2c9394f585cae743a8939a467532323431edbd Mon Sep 17 00:00:00 2001
From: Apurva Narde <58493193+Steepspace@users.noreply.github.com>
Date: Thu, 12 Aug 2021 17:24:07 -0400
Subject: [PATCH] Feature/naive bayes (#3)

* Implemented Naive Bayes

- for digits
- for faces
- implemented trainAndTune, where we calculate the prior probabilities and the
  conditional probability table.
- determined the smoothing parameter k that gives the best accuracy on the
  validation data.
- implemented calculateLogJointProbabilities, where we store, for every label,
  the log probability of that label given all of the features.

* Basic Feature Extraction

- modified the enhanced feature extraction to account for dark pixels.
- conducted a time/accuracy analysis by training on 10%, 20%, ..., 100% of the
  training data, with 5 iterations per size to keep the results consistent.
- recorded the training execution time, the mean accuracy, and the standard
  deviation of the accuracy.
- stored the results in a dictionary which is then written to results.txt.

* Implemented division for Counter objects

- modified divideAll to accept a list of keys for the divisor to act on.

* Added commands to run the naive bayes analysis

- for digits
- for faces
---
 commands.txt      |   7 ++
 dataClassifier.py |  77 ++++++++++++++++++++--
 naiveBayes.py     | 158 ++++++++++++++++++++++++++++++++++++++++++++--
 util.py           |  16 ++++-
 4 files changed, 242 insertions(+), 16 deletions(-)

diff --git a/commands.txt b/commands.txt
index 18efc5a..bc30b5f 100644
--- a/commands.txt
+++ b/commands.txt
@@ -8,3 +8,10 @@ python dataClassifier.py -c mira --autotune
 python dataClassifier.py -d digits -c naiveBayes -f -a -t 1000
 python dataClassifier.py -d digits -c minicontest
 python runMinicontest.py
+
+-- my commands --
+# naive bayes digits
+python dataClassifier.py -c naiveBayes -t 1000 -s 1000 -k 0.05 > out.txt
+
+# naive bayes faces
+python dataClassifier.py -c naiveBayes -k 0.001 -d faces > out.txt
diff --git a/dataClassifier.py b/dataClassifier.py
index c440e03..9d7d871 100644
--- a/dataClassifier.py
+++ b/dataClassifier.py
@@ -67,12 +67,20 @@ def enhancedFeatureExtractorDigit(datum):
   ##
   """
-  features = basicFeatureExtractorDigit(datum)
+  # features = basicFeatureExtractorDigit(datum)
+  a = datum.getPixels()

-  "*** YOUR CODE HERE ***"
-
-  return features
+  features = util.Counter()
+  for x in range(DIGIT_DATUM_WIDTH):
+    for y in range(DIGIT_DATUM_HEIGHT):
+      if datum.getPixel(x, y) == 0:
+        features[(x,y)] = 0
+      elif datum.getPixel(x, y) == 1:
+        features[(x,y)] = 1
+      else:
+        features[(x,y)] = 2
+  return features

 def contestFeatureExtractorDigit(datum):
   """
@@ -314,9 +322,55 @@ def runClassifier(args, options):
   validationData = map(featureFunction, rawValidationData)
   testData = map(featureFunction, rawTestData)

+  #######################################################
+  # Conduct training and testing on 10%, 20%, ..., 100% #
+  #######################################################
+
+  import random
+  import timeit
+  import numpy as np
+
+  results = {}
+
+  for percent in np.arange(0.1, 1, 0.1):
+    print '--------------------------------------'
+    print("Percent:", percent)
+    n = int(round(len(trainingData)*percent))
+    print(type(n))
+    exec_time = np.zeros(5)
+    accuracy = np.zeros(5)
+    for epoch in range(5):
+      print("Iteration:", epoch)
+      #########
+      # Train #
+      #########
+      print "Training..."
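+      # Draw n distinct example indices so that each of the 5 epochs trains
+      # on a fresh without-replacement subset of the requested size.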
+      training_sample_index = random.sample(range(len(trainingData)), n)
+      training_sample = list(map(lambda x: trainingData[x], training_sample_index))
+      training_label = list(map(lambda x: trainingLabels[x], training_sample_index))
+      start_time = timeit.default_timer()
+      classifier.train(training_sample, training_label, validationData, validationLabels)
+      exec_time[epoch] = timeit.default_timer()-start_time
+
+      ########
+      # Test #
+      ########
+      print "Testing..."
+      guesses = classifier.classify(testData)
+      correct = [guesses[i] == testLabels[i] for i in range(len(testLabels))].count(True)
+      accuracy[epoch] = 100.0 * correct / len(testLabels)
+      print str(correct), ("correct out of " + str(len(testLabels)) + " (%.1f%%).") % accuracy[epoch]
+      analysis(classifier, guesses, testLabels, testData, rawTestData, printImage)
+
+    results[n] = (exec_time.mean(), accuracy.mean(), accuracy.std())
+    print("Results")
+    print(results[n])
+
   # Conduct training and testing
   print "Training..."
+  start_time = timeit.default_timer()
   classifier.train(trainingData, trainingLabels, validationData, validationLabels)
+  exec_time = timeit.default_timer()-start_time
   print "Validating..."
   guesses = classifier.classify(validationData)
   correct = [guesses[i] == validationLabels[i] for i in range(len(validationLabels))].count(True)
@@ -324,9 +378,20 @@ def runClassifier(args, options):
   print "Testing..."
   guesses = classifier.classify(testData)
   correct = [guesses[i] == testLabels[i] for i in range(len(testLabels))].count(True)
-  print str(correct), ("correct out of " + str(len(testLabels)) + " (%.1f%%).") % (100.0 * correct / len(testLabels))
+  accuracy = 100.0 * correct / len(testLabels)
+  print str(correct), ("correct out of " + str(len(testLabels)) + " (%.1f%%).") % accuracy
   analysis(classifier, guesses, testLabels, testData, rawTestData, printImage)
-
+
+  results[len(trainingData)] = (exec_time, accuracy, 0)
+
+  #########################
+  # Write Results to File #
+  #########################
+
+  with open('results.txt', 'w') as fp:
+    for n, result in results.items():
+      fp.write("{}, {}, {}, {}\n".format(n, result[0], result[1], result[2]))
+
   # do odds ratio computation if specified at command line
   if((options.odds) & (options.classifier == "naiveBayes" or (options.classifier == "nb")) ):
     label1, label2 = options.label1, options.label2
diff --git a/naiveBayes.py b/naiveBayes.py
index f233afb..b88bd0d 100644
--- a/naiveBayes.py
+++ b/naiveBayes.py
@@ -60,9 +60,150 @@ def trainAndTune(self, trainingData, trainingLabels, validationData, validationL
     self.legalLabels.
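+
+    The conditional probability table built below is Laplace smoothed:
+    P(F_i = f | Y = y) = (count(F_i = f, Y = y) + k) / (count(Y = y) + k * |feature values|),
+    where the implementation smooths over the feature values {0, 1, 2};
+    the k from kgrid with the highest validation accuracy is kept.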
""" - "*** YOUR CODE HERE ***" - util.raiseNotDefined() - + # print("Testing train tune") + # print(self.legalLabels) + # print(type(trainingLabels[0]), type(self.legalLabels[0])) + # print("Training labels", trainingLabels) + # print(len(self.features)) + # print('Training Data', trainingData[0]) + + # for i in range(100): + # highestValue = trainingData[i].sortedKeys()[0] + # print(trainingData[i][highestValue]) + + # for items in trainingData[0].items(): + # print(items) + # for label in trainingLabels: + # print(label) + + + ###################### + # Prior Distribution # + ###################### + prior = util.Counter() + + for label in trainingLabels: + prior[label] += 1 + + prior.normalize() + + self.prior = prior + + # self.calculateLogJointProbabilities(trainingData[0]) + + # print(prior) + + #################################################### + # Keep track of feature values per pixel per label # + #################################################### + count_dict = {} + feature_val_dict = {} + FEATURE_VALS = [0,1,2] + + feature_val_counter = util.Counter() + for feature in self.features: + feature_val_counter[feature] + + for label in self.legalLabels: + feature_val_dict = {} + for val in FEATURE_VALS: + feature_val_dict[val] = feature_val_counter.copy() + + count_dict[label] = feature_val_dict + + for i in range(len(trainingLabels)): + label = trainingLabels[i] + for items in trainingData[i].items(): + count_dict[label][items[1]][items[0]] += 1 + + # for label_item in count_dict.items(): + # print("#####") + # print("Label:", label_item[0]) + # print("#####") + # print() + + # for val_item in label_item[1].items(): + # print("#############") + # print("Feature Value", val_item[0]) + # print("#############") + # print() + + # print("#############") + # print("Counter", val_item[1]) + # print("#############") + # print() + + ###################################### + # Calculate Conditional Probabilites # + ###################################### + + validation_accuracy = 0 + self.k = kgrid[0] + self.table = None + + for k in kgrid: + prob_table = {} + + for label, table in count_dict.items(): + total_sum = util.Counter() + for counts in table.values(): + total_sum += counts + + total_sum.incrementAll(total_sum.keys(), len(FEATURE_VALS)*k) + + feature_dict = {} + for feature, counts in table.items(): + temp_counts = counts.copy() + temp_counts.incrementAll(temp_counts.keys(), k) + feature_dict[feature] = temp_counts/total_sum + + prob_table[label] = feature_dict + + ####################### + # Validation Accuracy # + ####################### + + # print("K Value: ", self.k) + # print("Validation Accuracy: ", validation_accuracy) + + current_accuracy = 0 + old_table = self.table + self.table = prob_table + + for datum, label in zip(validationData, validationLabels): + logJoint = self.calculateLogJointProbabilities(datum) + validation_label = logJoint.argMax() + + if(validation_label == label): + current_accuracy += 1 + + # print("Current K Value: ", k) + # print("Current Accuracy: ", current_accuracy) + # print "--------------------------------------" + if(current_accuracy > validation_accuracy): + self.k = k + validation_accuracy = current_accuracy + + else: + self.table = old_table + + # for label_item in self.table.items(): + # print("#####") + # print("Label:", label_item[0]) + # print("#####") + # print() + + # for val_item in label_item[1].items(): + # print("#############") + # print("Feature Value", val_item[0]) + # print("#############") + # print() + + # 
print("#############") + # print("Counter", val_item[1]) + # print("#############") + # print() + def classify(self, testData): """ Classify the data based on the posterior distribution over labels. @@ -87,10 +228,13 @@ def calculateLogJointProbabilities(self, datum): self.legalLabels. """ logJoint = util.Counter() - - "*** YOUR CODE HERE ***" - util.raiseNotDefined() - + + for label in self.legalLabels: + logJoint[label] += math.log(self.prior[label]) + + for datum_key, datum_value in datum.items(): + logJoint[label] += math.log(self.table[label][datum_value][datum_key]) + return logJoint def findHighOddsFeatures(self, label1, label2): diff --git a/util.py b/util.py index 91205f0..37c9284 100644 --- a/util.py +++ b/util.py @@ -207,12 +207,12 @@ def normalize(self): for key in self.keys(): self[key] = self[key] / total - def divideAll(self, divisor): + def divideAll(self, keys, divisor): """ - Divides all counts by divisor + Divides all counts of keys by divisor """ divisor = float(divisor) - for key in self: + for key in keys: self[key] /= divisor def copy(self): @@ -290,6 +290,16 @@ def __add__( self, y ): continue addend[key] = y[key] return addend + + def __div__( self, y ): + + quotient = Counter() + for key in self: + if key in y: + quotient[key] = self[key] / float(y[key]) + else: + quotient[key] = self[key] + return quotient def __sub__( self, y ): """