diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..dc84959
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+build/
+
diff --git a/README b/README
index b2ab13d..501be1e 100644
--- a/README
+++ b/README
@@ -14,6 +14,17 @@ Dependencies
 
 hackMatch uses NLTK (nltk.org) and hcluster (for distance metrics).
 
+To install the dependencies, run 'pip install -r requirements.txt'.
+
+After installing NLTK, you also need the stopwords corpus. Run download_stopwords.py at the command line:
+
+    $ python download_stopwords.py
+    [nltk_data] Downloading package 'stopwords' to
+    [nltk_data]     /home/hbrown/nltk_data...
+    [nltk_data]   Unzipping corpora/stopwords.zip.
+    True
+    $
+
 =======
 License
 =======
diff --git a/download_stopwords.py b/download_stopwords.py
new file mode 100644
index 0000000..8bafa70
--- /dev/null
+++ b/download_stopwords.py
@@ -0,0 +1,2 @@
+from nltk import download
+download('stopwords')
diff --git a/hackmatch.py b/hackmatch.py
index 6490f02..8efa583 100644
--- a/hackmatch.py
+++ b/hackmatch.py
@@ -7,136 +7,137 @@ Created by Hilary Mason, Chris Wiggins, and Evan Korth.
 Copyright (c) 2010 hackNY. All rights reserved.
 """
+# pylint: disable=W0614
+# pylint: disable=C0301
 
-import sys, os
-import csv
-import string
 from collections import defaultdict
 from optparse import OptionParser
 
-from nltk.tokenize import *
 from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
 from hcluster import jaccard
+from operator import itemgetter
+from csv import DictReader
 
 # startups: Name,E-mail,Company,In NYC,Funding,Site,Blog,Twitter,Num Employees,Environment,Project,Skills,Misc
 # students: Student Name,e-mail,University,Major,Degree,Graduation Date,Site,Blog,Twitter,Facebook,Project,Skills,Misc
 
+# Hack: flatten an iterable of lists into a single list.
+# reduce(list.extend, list_of_lists) would be neater, but list.extend
+# returns None, so the second reduction step would operate on None
+# and fail.
+def list_reducer(list_iter):
+    result = []
+    for l in list_iter:
+        result.extend(l)
+    return result
+
+def get_stopwords():
+    """
+    get_stopwords: generate a list of stop words
+    """
+    return stopwords.words('english') + [',', '\xe2', '.', ')', '(', ':', "'s", "'nt", '\x99', '\x86', '\xae', '\x92']
+
+def parse_csv(filename):
+    """
+    parse_csv: parses the CSV file to a dict
+    """
+    csv_reader = DictReader(open(filename))
+    return [r for r in csv_reader]
+
+def print_matches(matches, num_matches):
+    """
+    print_matches: print the top 'num_matches' matches
+    """
+    for key, value_dict in matches.items():
+        print key
+        all_matches = sorted(value_dict.items(), key=itemgetter(1))
+        top_matches = all_matches[-num_matches:]
+        for item, score in top_matches:
+            print "\t%(item)s :: %(score)s" % locals()
+            # print "'%s' '%s' %s" % (n.translate(string.maketrans("",""), string.punctuation), item.translate(string.maketrans("",""), string.punctuation), score)
+        print '\n'
+
 class HackMatch(object):
+    """
+    HackMatch: match students against startups by comparing bag-of-words features on selected fields
+    """
     DEBUG = False
     BOW_FIELDS = ['Environment', 'Project', 'Skills', 'Misc']
     COMPLETENESS_THRESHOLD = 4 # num of words necessary to match
 
     def __init__(self, student_file, startup_file, num_matches=3, distance=jaccard):
-        self.stopwords = self.get_stopwords()
+        self.stopwords = set(get_stopwords())
         self.distance = distance
 
-        student_data = self.parseCSV(student_file)
-        startup_data = self.parseCSV(startup_file)
+        student_data = parse_csv(student_file)
+        startup_data = parse_csv(startup_file)
 
-        doc_words = self.defineFeatures([student_data, startup_data], self.BOW_FIELDS)
+        doc_words = self.define_features([student_data, startup_data], self.BOW_FIELDS)
 
-        # matches = self.doRanking(student_data, startup_data, doc_words, self.BOW_FIELDS, base_name_field='Student Name', match_name_field='Company')
-        matches = self.doRanking(startup_data, student_data, doc_words, self.BOW_FIELDS)
+        # matches = self.do_ranking(student_data, startup_data, doc_words, self.BOW_FIELDS, base_name_field='Student Name', match_name_field='Company')
+        matches = self.do_ranking(startup_data, student_data, doc_words, self.BOW_FIELDS)
 
-        self.printMatches(matches, num_matches)
-
-    def printMatches(self, matches, num_matches):
-        for n, m in matches.items():
-            print n
-            for item, score in sorted(m.items(), key=lambda(i,c):(-c, i))[:num_matches]:
-                print "\t%s :: %s" % (item, score)
-                # print "'%s' '%s' %s" % (n.translate(string.maketrans("",""), string.punctuation), item.translate(string.maketrans("",""), string.punctuation), score)
-            print '\n'
-
-
-    def doRanking(self, base_data, match_data, doc_words, fields=[], base_name_field='Company', match_name_field='Student Name'):
+        print_matches(matches, num_matches)
+
+    def do_ranking(self, base_data, match_data, doc_words, fields=None, base_name_field='Company', match_name_field='Student Name'):
         """
         do ranking
         """
-        base = {}
-        for item in base_data:
-            base[item[base_name_field]] = self.extractFeatures(item, doc_words, fields)
-
+        fields = fields or []
+        base = dict((item[base_name_field], self.extract_features(item, doc_words, fields)) for item in base_data)
+
         matches = defaultdict(dict)
         for match_item in match_data:
-            match_features = self.extractFeatures(match_item, doc_words, fields)
-
+            match_features = self.extract_features(match_item, doc_words, fields)
+            temp_dict = matches[match_item[match_name_field]]
             for base_item, base_item_features in base.items(): # actually do the comparison
                 if not base_item_features or not match_features:
-                    matches[match_item[match_name_field]][base_item] = 0.0
+                    temp_dict[base_item] = 0.0
                 else:
-                    matches[match_item[match_name_field]][base_item] = self.distance(base_item_features, match_features)
+                    temp_dict[base_item] = self.distance(base_item_features, match_features)
                 if self.DEBUG:
                     print "%s :: %s = %s " % (match_item[match_name_field], base_item, self.distance(base_item_features, match_features))
-
         return matches
 
-    def extractFeatures(self, item, doc_words, fields=[]):
-        s_tokens = []
-        for f in fields:
-            tokens = None
-            try:
-                tokens = word_tokenize(item[f])
-            except (KeyError, TypeError):
-                pass
-
-            if tokens:
-                s_tokens.extend(tokens)
-
-        s_features = []
-        for token in doc_words:
-            if token in s_tokens:
-                s_features.append(1)
-            else:
-                s_features.append(0)
-
-        if sum(s_features) <= self.COMPLETENESS_THRESHOLD:
-            return None
-
-        return s_features
+    def extract_features(self, item_dict, doc_words, fields=None):
+        """
+        extract_features: build the bag-of-words feature vector for one record, or None if too few words match
+        """
+        fields = fields or []
+        tokeniter = (word_tokenize(item_dict[f]) for f in fields if item_dict.get(f))
+        s_tokens = list_reducer(tokeniter)
+        s_features = [token in s_tokens for token in doc_words]
+        return s_features if sum(s_features) > self.COMPLETENESS_THRESHOLD else None
 
-    def defineFeatures(self, data, fields=[]):
+    def define_features(self, data, fields=None):
         """
         define the global bag of words features
         """
-        ngram_freq = {}
+        fields = fields or []
+        ngram_freq = defaultdict(int)
 
-        for d in data:
-            for r in d:
-                for f in fields:
-                    tokens = None
-                    try:
-                        tokens = word_tokenize(r[f])
-                    except (KeyError, TypeError):
-                        pass
-
-                    if tokens:
-                        for t in [t.lower() for t in tokens if t.lower() not in self.stopwords]:
-                            t = t.strip('.')
-                            ngram_freq[t] = ngram_freq.get(t, 0) + 1
-
-        ngram_freq = dict([(w,c) for w,c in ngram_freq.items() if c > 1])
+        featureiter = (
+            r[f]
+            for d in data
+            for r in d
+            for f in fields
+            if r.get(f)
+        )
+        for field in featureiter:
+            tokeniter = (word.lower() for word in word_tokenize(field))
+            legaliter = (word.strip('.') for word in tokeniter if word not in self.stopwords)
+            for legal_word in legaliter:
+                ngram_freq[legal_word] += 1
+        ngram_freq = dict((word, word_count) for word, word_count in ngram_freq.items() if word_count > 1)
 
         if self.DEBUG:
             print "Global vocabulary: %s" % len(ngram_freq)
 
         return ngram_freq
-
-    def get_stopwords(self):
-        sw = stopwords.words('english')
-        sw.extend([',', '\xe2', '.', ')', '(', ':', "'s", "'nt", '\x99', '\x86', '\xae', '\x92'])
-        return sw
-
-    def parseCSV(self, filename):
-        """
-        parseCSV: parses the CSV file to a dict
-        """
-        csv_reader = csv.DictReader(open(filename))
-        return [r for r in csv_reader]
-
-
+
 if __name__ == '__main__':
     parser = OptionParser()
-    parser.add_option("-n","--number", action="store", type="int", dest="num_matches",default=10,help="number of results to return")
-    parser.add_option("-s","--student", action="store", type="string", dest="student_file",default="unmatched_students.csv",help="csv of student data")
-    parser.add_option("-t","--startup", action="store", type="string", dest="startup_file",default="unmatched_top_startups.csv",help="csv of startup data")
+    parser.add_option("-n", "--number", action="store", type="int", dest="num_matches", default=10, help="number of results to return")
+    parser.add_option("-s", "--student", action="store", type="string", dest="student_file", default="unmatched_students.csv", help="csv of student data")
+    parser.add_option("-t", "--startup", action="store", type="string", dest="startup_file", default="unmatched_top_startups.csv", help="csv of startup data")
     (options, args) = parser.parse_args()
-    h = HackMatch(num_matches=options.num_matches, student_file=options.student_file, startup_file=options.startup_file)
\ No newline at end of file
+    hackmatch = HackMatch(num_matches=options.num_matches, student_file=options.student_file, startup_file=options.startup_file)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..fe8566b
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+hcluster>=0.2.0
+nltk>=2.0b9
diff --git a/startups.txt b/startups.txt
new file mode 100644
index 0000000..6662658
--- /dev/null
+++ b/startups.txt
@@ -0,0 +1,2 @@
+"Name","E-mail","Company","In NYC","Funding","Site","Blog","Twitter","Num Employees","Environment","Project","Skills","Misc"
+"Foobar Corp","x@foo.com","Foobar Corp","Y","Y",http://www.foo.com,"","",100,"linux windows oracle","risk-management finance","python java C#","linux windows python facebook"
diff --git a/students.txt b/students.txt
new file mode 100644
index 0000000..45fa9ce
--- /dev/null
+++ b/students.txt
@@ -0,0 +1,2 @@
+Student Name,e-mail,University,Major,Degree,Graduation Date,Site,Blog,Twitter,Facebook,Project,Skills,Misc
+Hugh,hughdbrown@yahoo.com,Toronto,AI,AI,1990,iwebthereforeiam.com,"","","","risk-management windows","python oracle","finance"
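
Usage sketch (the option flags and file names are those defined in the diff above): with the sample data files students.txt and startups.txt added here, the matcher can be run as

    $ python hackmatch.py --student students.txt --startup startups.txt --number 1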