Feature/naive bayes (#3)
* Implemented Naive Bayes
- for digits
- for faces
- implemented trainAndTune, which computes the prior probabilities and the
  conditional probability table.
- determined the smoothing parameter k that gives the best accuracy on the
  validation data (see the sketch below).
- implemented calculateLogJointProbabilities, which stores, for every label,
  the log probability of that label given all of the features.

* Basic Feature Extraction
- modified the enhanced feature extraction to account for dark pixels.
- conducted a time/accuracy analysis by training on 10%, 20%, ..., 100% of the
  training data, 5 iterations each, to ensure consistent results.
- recorded the training execution time, the mean accuracy, and the standard
  deviation of the accuracy.
- stored the results in a dictionary, which is then written to results.txt.

* Implemented division for Counter objects
- modified divideAll to accept a list of keys for the divisor to act on
  (usage sketch below).
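
Hypothetical usage of the new divideAll signature (the actual diff is in
util.py below):

    import util
    c = util.Counter()
    c['a'] = 2.0; c['b'] = 4.0; c['c'] = 6.0
    c.divideAll(['a', 'b'], 2.0)   # only 'a' and 'b' are divided
    # c is now {'a': 1.0, 'b': 2.0, 'c': 6.0}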

* Added commands to run the Naive Bayes analysis
- for digits
- for faces
Steepspace authored Aug 12, 2021
1 parent 584e992 commit cc2c939
Showing 4 changed files with 242 additions and 16 deletions.
7 changes: 7 additions & 0 deletions commands.txt
@@ -8,3 +8,10 @@ python dataClassifier.py -c mira --autotune
python dataClassifier.py -d digits -c naiveBayes -f -a -t 1000
python dataClassifier.py -d digits -c minicontest
python runMinicontest.py

-- my commands --
# naive bayes digits
python dataClassifier.py -c naiveBayes -t 1000 -s 1000 -k 0.05 > out.txt

# naive bayes faces
python dataClassifier.py -c naiveBayes -k 0.001 -d faces > out.txt
77 changes: 71 additions & 6 deletions dataClassifier.py
@@ -67,12 +67,20 @@ def enhancedFeatureExtractorDigit(datum):
    ##
    """
    features = basicFeatureExtractorDigit(datum)
    # features = basicFeatureExtractorDigit(datum)
    a = datum.getPixels()

    "*** YOUR CODE HERE ***"

    return features
    features = util.Counter()
    for x in range(DIGIT_DATUM_WIDTH):
        for y in range(DIGIT_DATUM_HEIGHT):
            if datum.getPixel(x, y) == 0:
                features[(x,y)] = 0
            elif datum.getPixel(x, y) == 1:
                features[(x,y)] = 1
            else:
                features[(x,y)] = 2

    return features

def contestFeatureExtractorDigit(datum):
    """
@@ -314,19 +322,76 @@ def runClassifier(args, options):
    validationData = map(featureFunction, rawValidationData)
    testData = map(featureFunction, rawTestData)

    #######################################################
    # Conduct training and testing on 10%, 20%, ..., 100% #
    #######################################################

    import random
    import timeit
    import numpy as np

    results = {}

    for percent in np.arange(0.1, 1, 0.1):
        print '--------------------------------------'
        print("Percent:", percent)
        n = int(round(len(trainingData)*percent))
        print(type(n))
        exec_time = np.zeros(5)
        accuracy = np.zeros(5)
        for epoch in range(5):
            print("Iteration:", epoch)
            #########
            # Train #
            #########
            print "Training..."
            training_sample_index = random.sample(range(len(trainingData)), n)
            training_sample = list(map(lambda x: trainingData[x], training_sample_index))
            training_label = list(map(lambda x: trainingLabels[x], training_sample_index))
            start_time = timeit.default_timer()
            classifier.train(training_sample, training_label, validationData, validationLabels)
            exec_time[epoch] = timeit.default_timer()-start_time

            ########
            # Test #
            ########
            print "Testing..."
            guesses = classifier.classify(testData)
            correct = [guesses[i] == testLabels[i] for i in range(len(testLabels))].count(True)
            accuracy[epoch] = 100.0 * correct / len(testLabels)
            print str(correct), ("correct out of " + str(len(testLabels)) + " (%.1f%%).") % accuracy[epoch]
            analysis(classifier, guesses, testLabels, testData, rawTestData, printImage)

        results[n] = (exec_time.mean(), accuracy.mean(), accuracy.std())
        print("Results")
        print(results[n])

    # Conduct training and testing
    print "Training..."
    start_time = timeit.default_timer()
    classifier.train(trainingData, trainingLabels, validationData, validationLabels)
    exec_time = timeit.default_timer()-start_time
    print "Validating..."
    guesses = classifier.classify(validationData)
    correct = [guesses[i] == validationLabels[i] for i in range(len(validationLabels))].count(True)
    print str(correct), ("correct out of " + str(len(validationLabels)) + " (%.1f%%).") % (100.0 * correct / len(validationLabels))
    print "Testing..."
    guesses = classifier.classify(testData)
    correct = [guesses[i] == testLabels[i] for i in range(len(testLabels))].count(True)
    print str(correct), ("correct out of " + str(len(testLabels)) + " (%.1f%%).") % (100.0 * correct / len(testLabels))
    accuracy = 100.0 * correct / len(testLabels)
    print str(correct), ("correct out of " + str(len(testLabels)) + " (%.1f%%).") % accuracy
    analysis(classifier, guesses, testLabels, testData, rawTestData, printImage)


    results[len(trainingData)] = (exec_time, accuracy, 0)

    #########################
    # Write Results to File #
    #########################

    with open('results.txt', 'w') as fp:
        for n, result in results.items():
            fp.write("{}, {}, {}, {}\n".format(n, result[0], result[1], result[2]))

    # do odds ratio computation if specified at command line
    if((options.odds) & (options.classifier == "naiveBayes" or (options.classifier == "nb")) ):
        label1, label2 = options.label1, options.label2
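
A minimal sketch for reading results.txt back (e.g. for plotting), assuming
the "n, time, mean accuracy, std accuracy" line format written above:

    results = {}
    with open('results.txt') as fp:
        for line in fp:
            n, t, mean, std = line.strip().split(', ')
            results[int(n)] = (float(t), float(mean), float(std))
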
158 changes: 151 additions & 7 deletions naiveBayes.py
@@ -60,9 +60,150 @@ def trainAndTune(self, trainingData, trainingLabels, validationData, validationL
        self.legalLabels.
        """

        "*** YOUR CODE HERE ***"
        util.raiseNotDefined()

        # print("Testing train tune")
        # print(self.legalLabels)
        # print(type(trainingLabels[0]), type(self.legalLabels[0]))
        # print("Training labels", trainingLabels)
        # print(len(self.features))
        # print('Training Data', trainingData[0])

        # for i in range(100):
        #     highestValue = trainingData[i].sortedKeys()[0]
        #     print(trainingData[i][highestValue])

        # for items in trainingData[0].items():
        #     print(items)
        # for label in trainingLabels:
        #     print(label)


        ######################
        # Prior Distribution #
        ######################
        prior = util.Counter()

        for label in trainingLabels:
            prior[label] += 1

        prior.normalize()

        self.prior = prior

        # self.calculateLogJointProbabilities(trainingData[0])

        # print(prior)

        ####################################################
        # Keep track of feature values per pixel per label #
        ####################################################
        count_dict = {}
        feature_val_dict = {}
        FEATURE_VALS = [0,1,2]

        feature_val_counter = util.Counter()
        for feature in self.features:
            feature_val_counter[feature]  # touching the key initializes its count to 0

        for label in self.legalLabels:
            feature_val_dict = {}
            for val in FEATURE_VALS:
                feature_val_dict[val] = feature_val_counter.copy()

            count_dict[label] = feature_val_dict

        for i in range(len(trainingLabels)):
            label = trainingLabels[i]
            for items in trainingData[i].items():
                count_dict[label][items[1]][items[0]] += 1

        # for label_item in count_dict.items():
        #     print("#####")
        #     print("Label:", label_item[0])
        #     print("#####")
        #     print()

        #     for val_item in label_item[1].items():
        #         print("#############")
        #         print("Feature Value", val_item[0])
        #         print("#############")
        #         print()

        #         print("#############")
        #         print("Counter", val_item[1])
        #         print("#############")
        #         print()

        #######################################
        # Calculate Conditional Probabilities #
        #######################################

        validation_accuracy = 0
        self.k = kgrid[0]
        self.table = None

        for k in kgrid:
            prob_table = {}

            for label, table in count_dict.items():
                total_sum = util.Counter()
                for counts in table.values():
                    total_sum += counts

                total_sum.incrementAll(total_sum.keys(), len(FEATURE_VALS)*k)

                feature_dict = {}
                for feature, counts in table.items():
                    temp_counts = counts.copy()
                    temp_counts.incrementAll(temp_counts.keys(), k)
                    feature_dict[feature] = temp_counts/total_sum

                prob_table[label] = feature_dict

            #######################
            # Validation Accuracy #
            #######################

            # print("K Value: ", self.k)
            # print("Validation Accuracy: ", validation_accuracy)

            current_accuracy = 0
            old_table = self.table
            self.table = prob_table

            for datum, label in zip(validationData, validationLabels):
                logJoint = self.calculateLogJointProbabilities(datum)
                validation_label = logJoint.argMax()

                if(validation_label == label):
                    current_accuracy += 1

            # print("Current K Value: ", k)
            # print("Current Accuracy: ", current_accuracy)
            # print "--------------------------------------"
            if(current_accuracy > validation_accuracy):
                self.k = k
                validation_accuracy = current_accuracy

            else:
                self.table = old_table

        # for label_item in self.table.items():
        #     print("#####")
        #     print("Label:", label_item[0])
        #     print("#####")
        #     print()

        #     for val_item in label_item[1].items():
        #         print("#############")
        #         print("Feature Value", val_item[0])
        #         print("#############")
        #         print()

        #         print("#############")
        #         print("Counter", val_item[1])
        #         print("#############")
        #         print()

    def classify(self, testData):
        """
        Classify the data based on the posterior distribution over labels.
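
A toy, self-contained restatement of the k-tuning loop above (standard
library only; the data and kgrid are made up for illustration):

    import math
    from collections import defaultdict

    train = [({'f': 1}, 'A'), ({'f': 1}, 'A'), ({'f': 0}, 'B')]
    valid = [({'f': 1}, 'A'), ({'f': 0}, 'B')]
    labels, vals, kgrid = ['A', 'B'], [0, 1], [0.001, 0.05, 1.0]

    prior = defaultdict(int)
    counts = {y: defaultdict(int) for y in labels}
    for x, y in train:
        prior[y] += 1
        counts[y][x['f']] += 1

    best_k, best_acc = None, -1
    for k in kgrid:
        def log_joint(x, y, k=k):
            total = sum(counts[y].values()) + len(vals) * k
            # the unnormalized prior count is fine for an argmax
            return math.log(prior[y]) + math.log((counts[y][x['f']] + k) / total)
        acc = sum(max(labels, key=lambda y: log_joint(x, y)) == y_true
                  for x, y_true in valid)
        if acc > best_acc:
            best_k, best_acc = k, acc
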
@@ -87,10 +228,13 @@ def calculateLogJointProbabilities(self, datum):
        self.legalLabels.
        """
        logJoint = util.Counter()

        "*** YOUR CODE HERE ***"
        util.raiseNotDefined()


        for label in self.legalLabels:
            logJoint[label] += math.log(self.prior[label])

            for datum_key, datum_value in datum.items():
                logJoint[label] += math.log(self.table[label][datum_value][datum_key])

        return logJoint

    def findHighOddsFeatures(self, label1, label2):
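
For reference, the quantity computed above is the unnormalized log-posterior,

    \log P(y \mid f_1, \dots, f_m) \;\propto\; \log P(y) + \sum_{i} \log P(f_i \mid y)

so classification can take the argMax over labels without normalizing.
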
16 changes: 13 additions & 3 deletions util.py
@@ -207,12 +207,12 @@ def normalize(self):
        for key in self.keys():
            self[key] = self[key] / total

    def divideAll(self, divisor):
    def divideAll(self, keys, divisor):
        """
        Divides all counts by divisor
        Divides all counts of keys by divisor
        """
        divisor = float(divisor)
        for key in self:
        for key in keys:
            self[key] /= divisor

    def copy(self):
@@ -290,6 +290,16 @@ def __add__( self, y ):
                continue
            addend[key] = y[key]
        return addend

    def __div__( self, y ):

        quotient = Counter()
        for key in self:
            if key in y:
                quotient[key] = self[key] / float(y[key])
            else:
                quotient[key] = self[key]
        return quotient

    def __sub__( self, y ):
        """
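
Hypothetical usage of the elementwise division added above (Python 2, where
the / operator dispatches to __div__; Python 3 would need __truediv__):

    from util import Counter
    num, den = Counter(), Counter()
    num['a'] = 1.0; num['b'] = 3.0
    den['a'] = 2.0; den['b'] = 4.0
    q = num / den   # {'a': 0.5, 'b': 0.75}; keys missing from den pass through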
