From cc2c9394f585cae743a8939a467532323431edbd Mon Sep 17 00:00:00 2001
From: Apurva Narde <58493193+Steepspace@users.noreply.github.com>
Date: Thu, 12 Aug 2021 17:24:07 -0400
Subject: [PATCH] Feature/naive bayes (#3)

* Implemented Naive Bayes

- for digits
- for faces
- implemented trainAndTune, where we calculate the prior probabilities and the
  conditional probability table.
- determined the smoothing parameter k that gives the best accuracy on the
  validation data.
- implemented calculateLogJointProbabilities, where we store, for every label,
  the log probability of that label given all of the features.

* Basic Feature Extraction

- modified the enhanced feature extraction to account for dark pixels.
- conducted a time/accuracy analysis by training on 10%, 20%, ..., 100% of the
  training data, with 5 iterations per size to keep the results consistent.
- recorded the training execution time, the mean accuracy, and the standard
  deviation of the accuracy.
- stored the results in a dictionary which is then written to results.txt.

* Implemented division for Counter objects

- modified divideAll to accept a list of keys for the divisor to act on.

* Added commands to run the naive bayes analysis

- for digits
- for faces
---
 commands.txt      |   7 ++
 dataClassifier.py |  77 ++++++++++++++++++++--
 naiveBayes.py     | 158 ++++++++++++++++++++++++++++++++++++++++++++--
 util.py           |  16 ++++-
 4 files changed, 242 insertions(+), 16 deletions(-)

diff --git a/commands.txt b/commands.txt
index 18efc5a..bc30b5f 100644
--- a/commands.txt
+++ b/commands.txt
@@ -8,3 +8,10 @@ python dataClassifier.py -c mira --autotune
 python dataClassifier.py -d digits -c naiveBayes -f -a -t 1000
 python dataClassifier.py -d digits -c minicontest
 python runMinicontest.py
+
+-- my commands --
+# naive bayes digits
+python dataClassifier.py -c naiveBayes -t 1000 -s 1000 -k 0.05 > out.txt
+
+# naive bayes faces
+python dataClassifier.py -c naiveBayes -k 0.001 -d faces > out.txt
diff --git a/dataClassifier.py b/dataClassifier.py
index c440e03..9d7d871 100644
--- a/dataClassifier.py
+++ b/dataClassifier.py
@@ -67,12 +67,20 @@ def enhancedFeatureExtractorDigit(datum):
   ##
   """
-  features = basicFeatureExtractorDigit(datum)
+  # features = basicFeatureExtractorDigit(datum)
+  a = datum.getPixels()

-  "*** YOUR CODE HERE ***"
-
-  return features
+  features = util.Counter()
+  for x in range(DIGIT_DATUM_WIDTH):
+    for y in range(DIGIT_DATUM_HEIGHT):
+      if datum.getPixel(x, y) == 0:
+        features[(x,y)] = 0
+      elif datum.getPixel(x, y) == 1:
+        features[(x,y)] = 1
+      else:
+        features[(x,y)] = 2
+  return features

 def contestFeatureExtractorDigit(datum):
   """
@@ -314,9 +322,55 @@ def runClassifier(args, options):
   validationData = map(featureFunction, rawValidationData)
   testData = map(featureFunction, rawTestData)

+  #######################################################
+  # Conduct training and testing on 10%, 20%, ..., 100% #
+  #######################################################
+
+  import random
+  import timeit
+  import numpy as np
+
+  results = {}
+
+  for percent in np.arange(0.1, 1, 0.1):
+    print '--------------------------------------'
+    print("Percent:", percent)
+    n = int(round(len(trainingData)*percent))
+    print(type(n))
+    exec_time = np.zeros(5)
+    accuracy = np.zeros(5)
+    for epoch in range(5):
+      print("Iteration:", epoch)
+      #########
+      # Train #
+      #########
+      print "Training..."
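+      # Draw n distinct example indices so that each of the 5 epochs trains
+      # on a fresh without-replacement subset of the requested size.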
+      training_sample_index = random.sample(range(len(trainingData)), n)
+      training_sample = list(map(lambda x: trainingData[x], training_sample_index))
+      training_label = list(map(lambda x: trainingLabels[x], training_sample_index))
+      start_time = timeit.default_timer()
+      classifier.train(training_sample, training_label, validationData, validationLabels)
+      exec_time[epoch] = timeit.default_timer()-start_time
+
+      ########
+      # Test #
+      ########
+      print "Testing..."
+      guesses = classifier.classify(testData)
+      correct = [guesses[i] == testLabels[i] for i in range(len(testLabels))].count(True)
+      accuracy[epoch] = 100.0 * correct / len(testLabels)
+      print str(correct), ("correct out of " + str(len(testLabels)) + " (%.1f%%).") % accuracy[epoch]
+      analysis(classifier, guesses, testLabels, testData, rawTestData, printImage)
+
+    results[n] = (exec_time.mean(), accuracy.mean(), accuracy.std())
+    print("Results")
+    print(results[n])
+
   # Conduct training and testing
   print "Training..."
+  start_time = timeit.default_timer()
   classifier.train(trainingData, trainingLabels, validationData, validationLabels)
+  exec_time = timeit.default_timer()-start_time
   print "Validating..."
   guesses = classifier.classify(validationData)
   correct = [guesses[i] == validationLabels[i] for i in range(len(validationLabels))].count(True)
@@ -324,9 +378,20 @@ def runClassifier(args, options):
   print "Testing..."
   guesses = classifier.classify(testData)
   correct = [guesses[i] == testLabels[i] for i in range(len(testLabels))].count(True)
-  print str(correct), ("correct out of " + str(len(testLabels)) + " (%.1f%%).") % (100.0 * correct / len(testLabels))
+  accuracy = 100.0 * correct / len(testLabels)
+  print str(correct), ("correct out of " + str(len(testLabels)) + " (%.1f%%).") % accuracy
   analysis(classifier, guesses, testLabels, testData, rawTestData, printImage)
-
+
+  results[len(trainingData)] = (exec_time, accuracy, 0)
+
+  #########################
+  # Write Results to File #
+  #########################
+
+  with open('results.txt', 'w') as fp:
+    for n, result in results.items():
+      fp.write("{}, {}, {}, {}\n".format(n, result[0], result[1], result[2]))
+
   # do odds ratio computation if specified at command line
   if((options.odds) & (options.classifier == "naiveBayes" or (options.classifier == "nb")) ):
     label1, label2 = options.label1, options.label2
diff --git a/naiveBayes.py b/naiveBayes.py
index f233afb..b88bd0d 100644
--- a/naiveBayes.py
+++ b/naiveBayes.py
@@ -60,9 +60,150 @@ def trainAndTune(self, trainingData, trainingLabels, validationData, validationL
     self.legalLabels.
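+
+    The conditional probability table built below is Laplace smoothed:
+    P(F_i = f | Y = y) = (count(F_i = f, Y = y) + k) / (count(Y = y) + k * |feature values|),
+    where the implementation smooths over the feature values {0, 1, 2};
+    the k from kgrid with the highest validation accuracy is kept.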
""" - "*** YOUR CODE HERE ***" - util.raiseNotDefined() - + # print("Testing train tune") + # print(self.legalLabels) + # print(type(trainingLabels[0]), type(self.legalLabels[0])) + # print("Training labels", trainingLabels) + # print(len(self.features)) + # print('Training Data', trainingData[0]) + + # for i in range(100): + # highestValue = trainingData[i].sortedKeys()[0] + # print(trainingData[i][highestValue]) + + # for items in trainingData[0].items(): + # print(items) + # for label in trainingLabels: + # print(label) + + + ###################### + # Prior Distribution # + ###################### + prior = util.Counter() + + for label in trainingLabels: + prior[label] += 1 + + prior.normalize() + + self.prior = prior + + # self.calculateLogJointProbabilities(trainingData[0]) + + # print(prior) + + #################################################### + # Keep track of feature values per pixel per label # + #################################################### + count_dict = {} + feature_val_dict = {} + FEATURE_VALS = [0,1,2] + + feature_val_counter = util.Counter() + for feature in self.features: + feature_val_counter[feature] + + for label in self.legalLabels: + feature_val_dict = {} + for val in FEATURE_VALS: + feature_val_dict[val] = feature_val_counter.copy() + + count_dict[label] = feature_val_dict + + for i in range(len(trainingLabels)): + label = trainingLabels[i] + for items in trainingData[i].items(): + count_dict[label][items[1]][items[0]] += 1 + + # for label_item in count_dict.items(): + # print("#####") + # print("Label:", label_item[0]) + # print("#####") + # print() + + # for val_item in label_item[1].items(): + # print("#############") + # print("Feature Value", val_item[0]) + # print("#############") + # print() + + # print("#############") + # print("Counter", val_item[1]) + # print("#############") + # print() + + ###################################### + # Calculate Conditional Probabilites # + ###################################### + + validation_accuracy = 0 + self.k = kgrid[0] + self.table = None + + for k in kgrid: + prob_table = {} + + for label, table in count_dict.items(): + total_sum = util.Counter() + for counts in table.values(): + total_sum += counts + + total_sum.incrementAll(total_sum.keys(), len(FEATURE_VALS)*k) + + feature_dict = {} + for feature, counts in table.items(): + temp_counts = counts.copy() + temp_counts.incrementAll(temp_counts.keys(), k) + feature_dict[feature] = temp_counts/total_sum + + prob_table[label] = feature_dict + + ####################### + # Validation Accuracy # + ####################### + + # print("K Value: ", self.k) + # print("Validation Accuracy: ", validation_accuracy) + + current_accuracy = 0 + old_table = self.table + self.table = prob_table + + for datum, label in zip(validationData, validationLabels): + logJoint = self.calculateLogJointProbabilities(datum) + validation_label = logJoint.argMax() + + if(validation_label == label): + current_accuracy += 1 + + # print("Current K Value: ", k) + # print("Current Accuracy: ", current_accuracy) + # print "--------------------------------------" + if(current_accuracy > validation_accuracy): + self.k = k + validation_accuracy = current_accuracy + + else: + self.table = old_table + + # for label_item in self.table.items(): + # print("#####") + # print("Label:", label_item[0]) + # print("#####") + # print() + + # for val_item in label_item[1].items(): + # print("#############") + # print("Feature Value", val_item[0]) + # print("#############") + # print() + + # 
print("#############") + # print("Counter", val_item[1]) + # print("#############") + # print() + def classify(self, testData): """ Classify the data based on the posterior distribution over labels. @@ -87,10 +228,13 @@ def calculateLogJointProbabilities(self, datum): self.legalLabels. """ logJoint = util.Counter() - - "*** YOUR CODE HERE ***" - util.raiseNotDefined() - + + for label in self.legalLabels: + logJoint[label] += math.log(self.prior[label]) + + for datum_key, datum_value in datum.items(): + logJoint[label] += math.log(self.table[label][datum_value][datum_key]) + return logJoint def findHighOddsFeatures(self, label1, label2): diff --git a/util.py b/util.py index 91205f0..37c9284 100644 --- a/util.py +++ b/util.py @@ -207,12 +207,12 @@ def normalize(self): for key in self.keys(): self[key] = self[key] / total - def divideAll(self, divisor): + def divideAll(self, keys, divisor): """ - Divides all counts by divisor + Divides all counts of keys by divisor """ divisor = float(divisor) - for key in self: + for key in keys: self[key] /= divisor def copy(self): @@ -290,6 +290,16 @@ def __add__( self, y ): continue addend[key] = y[key] return addend + + def __div__( self, y ): + + quotient = Counter() + for key in self: + if key in y: + quotient[key] = self[key] / float(y[key]) + else: + quotient[key] = self[key] + return quotient def __sub__( self, y ): """