2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
build/

11 changes: 11 additions & 0 deletions README
@@ -14,6 +14,17 @@ Dependencies

hackMatch uses NLTK (nltk.org) and hcluster (for distance metrics).

To install the dependencies, run 'pip install -r requirements.txt'.

After installing NLTK, you need to download the stopwords corpus. Run download_stopwords.py from the command line:

$ python download_stopwords.py
[nltk_data] Downloading package 'stopwords' to
[nltk_data] /home/hbrown/nltk_data...
[nltk_data] Unzipping corpora/stopwords.zip.
True
$

=======
License
=======
2 changes: 2 additions & 0 deletions download_stopwords.py
@@ -0,0 +1,2 @@
from nltk import download
download('stopwords')
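
A quick, illustrative sanity check after running the script, assuming the download succeeded:

# Hypothetical check -- run in an interpreter after download_stopwords.py.
from nltk.corpus import stopwords
print stopwords.words('english')[:3]   # should print something like ['i', 'me', 'my']
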
179 changes: 90 additions & 89 deletions hackmatch.py
@@ -7,136 +7,137 @@
Created by Hilary Mason, Chris Wiggins, and Evan Korth.
Copyright (c) 2010 hackNY. All rights reserved.
"""
# pylint: disable=W0614
# pylint: disable=C0301

import sys, os
import csv
import string
from collections import defaultdict
from optparse import OptionParser
from nltk.tokenize import *
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from hcluster import jaccard
from operator import itemgetter
from csv import DictReader

# startups: Name,E-mail,Company,In NYC,Funding,Site,Blog,Twitter,Num Employees,Environment,Project,Skills,Misc
# students: Student Name,e-mail,University,Major,Degree,Graduation Date,Site,Blog,Twitter,Facebook,Project,Skills,Misc

# Hack
# I'd like to write this:
#     return reduce(list.extend, list_of_lists)
# but list.extend returns None, so reduce loses the accumulator after the first step.
def list_reducer(list_iter):
result = []
for l in list_iter:
result.extend(l)
return result
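
An aside: the same one-level flattening can be done with itertools. This is a hypothetical alternative to list_reducer, shown only for comparison:

# Hypothetical equivalent using itertools.chain.from_iterable.
from itertools import chain

def flatten(list_of_lists):
    # Flatten one level of nesting: [[1, 2], [3]] -> [1, 2, 3]
    return list(chain.from_iterable(list_of_lists))

assert flatten([[1, 2], [3]]) == [1, 2, 3]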

def get_stopwords():
"""
get_stopwords: generate a list of stop words
"""
return stopwords.words('english') + [',', '\xe2', '.', ')', '(', ':', "'s", "'nt", '\x99', '\x86', '\xae', '\x92']

def parse_csv(filename):
"""
parse_csv: parse the CSV file into a list of row dicts
"""
csv_reader = DictReader(open(filename))
return [r for r in csv_reader]
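
For reference, a hypothetical usage sketch of parse_csv against the sample startups.txt at the bottom of this diff; it yields one dict per row, keyed by the CSV header:

# Illustrative only.
rows = parse_csv('startups.txt')
print rows[0]['Company']   # Foobar Corp
print rows[0]['Skills']    # python java C#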

def print_matches(matches, num_matches):
"""
print_matches: print the top 'num_matches' matches
"""
for key, value_dict in matches.items():
print key
all_matches = sorted(value_dict.items(), key=itemgetter(1))
top_matches = all_matches[-num_matches:]
for item, score in top_matches:
print "\t%(item)s :: %(score)s" % locals()
# print "'%s' '%s' %s" % (n.translate(string.maketrans("",""), string.punctuation), item.translate(string.maketrans("",""), string.punctuation), score)
print '\n'

class HackMatch(object):
"""
HackMatch: class to encapsulate matching students against startups on selected fields
"""
DEBUG = False
BOW_FIELDS = ['Environment', 'Project', 'Skills', 'Misc']
COMPLETENESS_THRESHOLD = 4 # num of words necessary to match

def __init__(self, student_file, startup_file, num_matches=3, distance=jaccard):
self.stopwords = self.get_stopwords()
self.stopwords = set(get_stopwords())
self.distance = distance

student_data = self.parseCSV(student_file)
startup_data = self.parseCSV(startup_file)
student_data = parse_csv(student_file)
startup_data = parse_csv(startup_file)

doc_words = self.defineFeatures([student_data, startup_data], self.BOW_FIELDS)
doc_words = self.define_features([student_data, startup_data], self.BOW_FIELDS)

# matches = self.doRanking(student_data, startup_data, doc_words, self.BOW_FIELDS, base_name_field='Student Name', match_name_field='Company')
matches = self.doRanking(startup_data, student_data, doc_words, self.BOW_FIELDS)
# matches = self.do_ranking(student_data, startup_data, doc_words, self.BOW_FIELDS, base_name_field='Student Name', match_name_field='Company')
matches = self.do_ranking(startup_data, student_data, doc_words, self.BOW_FIELDS)

self.printMatches(matches, num_matches)

def printMatches(self, matches, num_matches):
for n, m in matches.items():
print n
for item, score in sorted(m.items(), key=lambda(i,c):(-c, i))[:num_matches]:
print "\t%s :: %s" % (item, score)
# print "'%s' '%s' %s" % (n.translate(string.maketrans("",""), string.punctuation), item.translate(string.maketrans("",""), string.punctuation), score)
print '\n'


def doRanking(self, base_data, match_data, doc_words, fields=[], base_name_field='Company', match_name_field='Student Name'):
print_matches(matches, num_matches)

def do_ranking(self, base_data, match_data, doc_words, fields=None, base_name_field='Company', match_name_field='Student Name'):
"""
do_ranking: score every item in match_data against every item in base_data using the distance metric
"""
base = {}
for item in base_data:
base[item[base_name_field]] = self.extractFeatures(item, doc_words, fields)

fields = fields or []
base = dict((item[base_name_field], self.extract_features(item, doc_words, fields)) for item in base_data)

matches = defaultdict(dict)
for match_item in match_data:
match_features = self.extractFeatures(match_item, doc_words, fields)

match_features = self.extract_features(match_item, doc_words, fields)
temp_dict = matches[match_item[match_name_field]]
for base_item, base_item_features in base.items(): # actually do the comparison
if not base_item_features or not match_features:
matches[match_item[match_name_field]][base_item] = 0.0
temp_dict[base_item] = 0.0
else:
matches[match_item[match_name_field]][base_item] = self.distance(base_item_features, match_features)
temp_dict[base_item] = self.distance(base_item_features, match_features)
if self.DEBUG:
print "%s :: %s = %s " % (match_item[match_name_field], base_item, self.distance(base_item_features, match_features))

return matches

def extractFeatures(self, item, doc_words, fields=[]):
s_tokens = []
for f in fields:
tokens = None
try:
tokens = word_tokenize(item[f])
except (KeyError, TypeError):
pass

if tokens:
s_tokens.extend(tokens)

s_features = []
for token in doc_words:
if token in s_tokens:
s_features.append(1)
else:
s_features.append(0)

if sum(s_features) <= self.COMPLETENESS_THRESHOLD:
return None

return s_features
def extract_features(self, item_dict, doc_words, fields=None):
"""
extract_features: build a boolean feature vector over doc_words, or return None if too few words match
"""
fields = fields or []
tokeniter = (word_tokenize(item_dict[f]) for f in fields if item_dict.get(f))
s_tokens = list_reducer(tokeniter)
s_features = [token in s_tokens for token in doc_words]
return s_features if sum(s_features) > self.COMPLETENESS_THRESHOLD else None
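
For intuition: extract_features returns one boolean per vocabulary word, and do_ranking hands two such vectors to the distance function. A minimal sketch with made-up vectors, assuming hcluster's jaccard as imported above:

# Illustrative only -- tiny hand-made feature vectors, not real program output.
from hcluster import jaccard
startup_vec = [True, True, False, True]   # which vocabulary words the startup text contains
student_vec = [True, False, False, True]  # which vocabulary words the student text contains
print jaccard(startup_vec, student_vec)   # Jaccard dissimilarity, roughly 0.33 here; 0.0 would mean identical word sets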

def defineFeatures(self, data, fields=[]):
def define_features(self, data, fields=None):
"""
define the global bag of words features
"""
ngram_freq = {}
fields = fields or []
ngram_freq = defaultdict(int)

for d in data:
for r in d:
for f in fields:
tokens = None
try:
tokens = word_tokenize(r[f])
except (KeyError, TypeError):
pass

if tokens:
for t in [t.lower() for t in tokens if t.lower() not in self.stopwords]:
t = t.strip('.')
ngram_freq[t] = ngram_freq.get(t, 0) + 1

ngram_freq = dict([(w,c) for w,c in ngram_freq.items() if c > 1])
featureiter = (
r[f]
for d in data
for r in d
for f in fields
if r.get(f)
)
for field in featureiter:
tokeniter = (word.lower() for word in word_tokenize(field))
legaliter = (word.strip('.') for word in tokeniter if word not in self.stopwords)
for legal_word in legaliter:
ngram_freq[legal_word] += 1
ngram_freq = dict((word, word_count) for word, word_count in ngram_freq.items() if word_count > 1)
if self.DEBUG:
print "Global vocabulary: %s" % len(ngram_freq)
return ngram_freq
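
define_features keeps only words seen more than once across all documents; a small sketch of that pruning step, with invented counts:

# Illustrative counts only.
ngram_freq = {'python': 3, 'linux': 2, 'haskell': 1}
vocabulary = dict((w, c) for w, c in ngram_freq.items() if c > 1)
print sorted(vocabulary)   # ['linux', 'python']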

def get_stopwords(self):
sw = stopwords.words('english')
sw.extend([',', '\xe2', '.', ')', '(', ':', "'s", "'nt", '\x99', '\x86', '\xae', '\x92'])
return sw

def parseCSV(self, filename):
"""
parseCSV: parses the CSV file to a dict
"""
csv_reader = csv.DictReader(open(filename))
return [r for r in csv_reader]



if __name__ == '__main__':
parser = OptionParser()
parser.add_option("-n","--number", action="store", type="int", dest="num_matches",default=10,help="number of results to return")
parser.add_option("-s","--student", action="store", type="string", dest="student_file",default="unmatched_students.csv",help="csv of student data")
parser.add_option("-t","--startup", action="store", type="string", dest="startup_file",default="unmatched_top_startups.csv",help="csv of startup data")
parser.add_option("-n", "--number", action="store", type="int", dest="num_matches", default=10, help="number of results to return")
parser.add_option("-s", "--student", action="store", type="string", dest="student_file", default="unmatched_students.csv", help="csv of student data")
parser.add_option("-t", "--startup", action="store", type="string", dest="startup_file", default="unmatched_top_startups.csv", help="csv of startup data")
(options, args) = parser.parse_args()

h = HackMatch(num_matches=options.num_matches, student_file=options.student_file, startup_file=options.startup_file)
hackmatch = HackMatch(num_matches=options.num_matches, student_file=options.student_file, startup_file=options.startup_file)
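
A hypothetical invocation using the sample files below instead of the default CSV names, e.g. python hackmatch.py -s students.txt -t startups.txt -n 3, or directly from Python:

# Illustrative only.
HackMatch(student_file='students.txt', startup_file='startups.txt', num_matches=3)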
2 changes: 2 additions & 0 deletions requirements.txt
@@ -0,0 +1,2 @@
hcluster>=0.2.0
nltk>=2.0b9
2 changes: 2 additions & 0 deletions startups.txt
@@ -0,0 +1,2 @@
"Name","E-mail","Company","In NYC","Funding","Site","Blog","Twitter","Num Employees","Environment","Project","Skills","Misc"
"Foobar Corp","[email protected]","Foobar Corp","Y","Y",http://www.foo.com,"","",100,"linux windows oracle","risk-management finance","python java C#","linux windows python facebook"
2 changes: 2 additions & 0 deletions students.txt
@@ -0,0 +1,2 @@
Student Name,e-mail,University,Major,Degree,Graduation Date,Site,Blog,Twitter,Facebook,Project,Skills,Misc
Hugh,[email protected],Toronto,AI,AI,1990,iwebthereforeiam.com,"","","","risk-management windows","python oracle","finance"