2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
build/

11 changes: 11 additions & 0 deletions README
@@ -14,6 +14,17 @@ Dependencies

hackMatch uses NLTK (nltk.org) and hcluster (for distance metrics).

To install the dependencies, run 'pip install -r requirements.txt'.

After installing NLTK, you need to download the stopwords corpus. Run download_stopwords.py from the command line:

$ python download_stopwords.py
[nltk_data] Downloading package 'stopwords' to
[nltk_data] /home/hbrown/nltk_data...
[nltk_data] Unzipping corpora/stopwords.zip.
True
$

=======
License
=======
2 changes: 2 additions & 0 deletions download_stopwords.py
@@ -0,0 +1,2 @@
from nltk import download
download('stopwords')
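
A quick, illustrative sanity check after running the script, assuming the download succeeded:

# Hypothetical check -- run in an interpreter after download_stopwords.py.
from nltk.corpus import stopwords
print stopwords.words('english')[:3]   # should print something like ['i', 'me', 'my']
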
179 changes: 90 additions & 89 deletions hackmatch.py
@@ -7,136 +7,137 @@
Created by Hilary Mason, Chris Wiggins, and Evan Korth.
Copyright (c) 2010 hackNY. All rights reserved.
"""
# pylint: disable=W0614
# pylint: disable=C0301

import sys, os
import csv
import string
from collections import defaultdict
from optparse import OptionParser
from nltk.tokenize import *
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from hcluster import jaccard
from operator import itemgetter
from csv import DictReader

# startups: Name,E-mail,Company,In NYC,Funding,Site,Blog,Twitter,Num Employees,Environment,Project,Skills,Misc
# students: Student Name,e-mail,University,Major,Degree,Graduation Date,Site,Blog,Twitter,Facebook,Project,Skills,Misc

# Hack
# I'd like to write this:
#     return reduce(list.extend, list_of_lists)
# but list.extend returns None, so reduce loses the accumulator after the first step.
def list_reducer(list_iter):
result = []
for l in list_iter:
result.extend(l)
return result
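
An aside: the same one-level flattening can be done with itertools. This is a hypothetical alternative to list_reducer, shown only for comparison:

# Hypothetical equivalent using itertools.chain.from_iterable.
from itertools import chain

def flatten(list_of_lists):
    # Flatten one level of nesting: [[1, 2], [3]] -> [1, 2, 3]
    return list(chain.from_iterable(list_of_lists))

assert flatten([[1, 2], [3]]) == [1, 2, 3]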

def get_stopwords():
"""
get_stopwords: generate a list of stop words
"""
return stopwords.words('english') + [',', '\xe2', '.', ')', '(', ':', "'s", "'nt", '\x99', '\x86', '\xae', '\x92']

def parse_csv(filename):
"""
parse_csv: parse the CSV file into a list of row dicts
"""
csv_reader = DictReader(open(filename))
return [r for r in csv_reader]
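
For reference, a hypothetical usage sketch of parse_csv against the sample startups.txt at the bottom of this diff; it yields one dict per row, keyed by the CSV header:

# Illustrative only.
rows = parse_csv('startups.txt')
print rows[0]['Company']   # Foobar Corp
print rows[0]['Skills']    # python java C#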

def print_matches(matches, num_matches):
"""
print_matches: print the top 'num_matches' matches
"""
for key, value_dict in matches.items():
print key
all_matches = sorted(value_dict.items(), key=itemgetter(1))
top_matches = all_matches[-num_matches:]
for item, score in top_matches:
print "\t%(item)s :: %(score)s" % locals()
# print "'%s' '%s' %s" % (n.translate(string.maketrans("",""), string.punctuation), item.translate(string.maketrans("",""), string.punctuation), score)
print '\n'

class HackMatch(object):
"""
HackMatch: class to encapsulate matching students against startups on selected fields
"""
DEBUG = False
BOW_FIELDS = ['Environment', 'Project', 'Skills', 'Misc']
COMPLETENESS_THRESHOLD = 4 # num of words necessary to match

def __init__(self, student_file, startup_file, num_matches=3, distance=jaccard):
self.stopwords = self.get_stopwords()
self.stopwords = set(get_stopwords())
self.distance = distance

student_data = self.parseCSV(student_file)
startup_data = self.parseCSV(startup_file)
student_data = parse_csv(student_file)
startup_data = parse_csv(startup_file)

doc_words = self.defineFeatures([student_data, startup_data], self.BOW_FIELDS)
doc_words = self.define_features([student_data, startup_data], self.BOW_FIELDS)

# matches = self.doRanking(student_data, startup_data, doc_words, self.BOW_FIELDS, base_name_field='Student Name', match_name_field='Company')
matches = self.doRanking(startup_data, student_data, doc_words, self.BOW_FIELDS)
# matches = self.do_ranking(student_data, startup_data, doc_words, self.BOW_FIELDS, base_name_field='Student Name', match_name_field='Company')
matches = self.do_ranking(startup_data, student_data, doc_words, self.BOW_FIELDS)

self.printMatches(matches, num_matches)

def printMatches(self, matches, num_matches):
for n, m in matches.items():
print n
for item, score in sorted(m.items(), key=lambda(i,c):(-c, i))[:num_matches]:
print "\t%s :: %s" % (item, score)
# print "'%s' '%s' %s" % (n.translate(string.maketrans("",""), string.punctuation), item.translate(string.maketrans("",""), string.punctuation), score)
print '\n'


def doRanking(self, base_data, match_data, doc_words, fields=[], base_name_field='Company', match_name_field='Student Name'):
print_matches(matches, num_matches)

def do_ranking(self, base_data, match_data, doc_words, fields=None, base_name_field='Company', match_name_field='Student Name'):
"""
do_ranking: score every item in match_data against every item in base_data using the distance metric
"""
base = {}
for item in base_data:
base[item[base_name_field]] = self.extractFeatures(item, doc_words, fields)

fields = fields or []
base = dict((item[base_name_field], self.extract_features(item, doc_words, fields)) for item in base_data)

matches = defaultdict(dict)
for match_item in match_data:
match_features = self.extractFeatures(match_item, doc_words, fields)

match_features = self.extract_features(match_item, doc_words, fields)
temp_dict = matches[match_item[match_name_field]]
for base_item, base_item_features in base.items(): # actually do the comparison
if not base_item_features or not match_features:
matches[match_item[match_name_field]][base_item] = 0.0
temp_dict[base_item] = 0.0
else:
matches[match_item[match_name_field]][base_item] = self.distance(base_item_features, match_features)
temp_dict[base_item] = self.distance(base_item_features, match_features)
if self.DEBUG:
print "%s :: %s = %s " % (match_item[match_name_field], base_item, self.distance(base_item_features, match_features))

return matches

def extractFeatures(self, item, doc_words, fields=[]):
s_tokens = []
for f in fields:
tokens = None
try:
tokens = word_tokenize(item[f])
except (KeyError, TypeError):
pass

if tokens:
s_tokens.extend(tokens)

s_features = []
for token in doc_words:
if token in s_tokens:
s_features.append(1)
else:
s_features.append(0)

if sum(s_features) <= self.COMPLETENESS_THRESHOLD:
return None

return s_features
def extract_features(self, item_dict, doc_words, fields=None):
"""
extract_features: build a boolean feature vector over doc_words, or return None if too few words match
"""
fields = fields or []
tokeniter = (word_tokenize(item_dict[f]) for f in fields if item_dict.get(f))
s_tokens = list_reducer(tokeniter)
s_features = [token in s_tokens for token in doc_words]
return s_features if sum(s_features) > self.COMPLETENESS_THRESHOLD else None
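
For intuition: extract_features returns one boolean per vocabulary word, and do_ranking hands two such vectors to the distance function. A minimal sketch with made-up vectors, assuming hcluster's jaccard as imported above:

# Illustrative only -- tiny hand-made feature vectors, not real program output.
from hcluster import jaccard
startup_vec = [True, True, False, True]   # which vocabulary words the startup text contains
student_vec = [True, False, False, True]  # which vocabulary words the student text contains
print jaccard(startup_vec, student_vec)   # Jaccard dissimilarity, roughly 0.33 here; 0.0 would mean identical word sets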

def defineFeatures(self, data, fields=[]):
def define_features(self, data, fields=None):
"""
define the global bag of words features
"""
ngram_freq = {}
fields = fields or []
ngram_freq = defaultdict(int)

for d in data:
for r in d:
for f in fields:
tokens = None
try:
tokens = word_tokenize(r[f])
except (KeyError, TypeError):
pass

if tokens:
for t in [t.lower() for t in tokens if t.lower() not in self.stopwords]:
t = t.strip('.')
ngram_freq[t] = ngram_freq.get(t, 0) + 1

ngram_freq = dict([(w,c) for w,c in ngram_freq.items() if c > 1])
featureiter = (
r[f]
for d in data
for r in d
for f in fields
if r.get(f)
)
for field in featureiter:
tokeniter = (word.lower() for word in word_tokenize(field))
legaliter = (word.strip('.') for word in tokeniter if word not in self.stopwords)
for legal_word in legaliter:
ngram_freq[legal_word] += 1
ngram_freq = dict((word, word_count) for word, word_count in ngram_freq.items() if word_count > 1)
if self.DEBUG:
print "Global vocabulary: %s" % len(ngram_freq)
return ngram_freq
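
define_features keeps only words seen more than once across all documents; a small sketch of that pruning step, with invented counts:

# Illustrative counts only.
ngram_freq = {'python': 3, 'linux': 2, 'haskell': 1}
vocabulary = dict((w, c) for w, c in ngram_freq.items() if c > 1)
print sorted(vocabulary)   # ['linux', 'python']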

def get_stopwords(self):
sw = stopwords.words('english')
sw.extend([',', '\xe2', '.', ')', '(', ':', "'s", "'nt", '\x99', '\x86', '\xae', '\x92'])
return sw

def parseCSV(self, filename):
"""
parseCSV: parses the CSV file to a dict
"""
csv_reader = csv.DictReader(open(filename))
return [r for r in csv_reader]



if __name__ == '__main__':
parser = OptionParser()
parser.add_option("-n","--number", action="store", type="int", dest="num_matches",default=10,help="number of results to return")
parser.add_option("-s","--student", action="store", type="string", dest="student_file",default="unmatched_students.csv",help="csv of student data")
parser.add_option("-t","--startup", action="store", type="string", dest="startup_file",default="unmatched_top_startups.csv",help="csv of startup data")
parser.add_option("-n", "--number", action="store", type="int", dest="num_matches", default=10, help="number of results to return")
parser.add_option("-s", "--student", action="store", type="string", dest="student_file", default="unmatched_students.csv", help="csv of student data")
parser.add_option("-t", "--startup", action="store", type="string", dest="startup_file", default="unmatched_top_startups.csv", help="csv of startup data")
(options, args) = parser.parse_args()

h = HackMatch(num_matches=options.num_matches, student_file=options.student_file, startup_file=options.startup_file)
hackmatch = HackMatch(num_matches=options.num_matches, student_file=options.student_file, startup_file=options.startup_file)
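
A hypothetical invocation using the sample files below instead of the default CSV names, e.g. python hackmatch.py -s students.txt -t startups.txt -n 3, or directly from Python:

# Illustrative only.
HackMatch(student_file='students.txt', startup_file='startups.txt', num_matches=3)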
2 changes: 2 additions & 0 deletions requirements.txt
@@ -0,0 +1,2 @@
hcluster>=0.2.0
nltk>=2.0b9
2 changes: 2 additions & 0 deletions startups.txt
@@ -0,0 +1,2 @@
"Name","E-mail","Company","In NYC","Funding","Site","Blog","Twitter","Num Employees","Environment","Project","Skills","Misc"
"Foobar Corp","[email protected]","Foobar Corp","Y","Y",http://www.foo.com,"","",100,"linux windows oracle","risk-management finance","python java C#","linux windows python facebook"
2 changes: 2 additions & 0 deletions students.txt
@@ -0,0 +1,2 @@
Student Name,e-mail,University,Major,Degree,Graduation Date,Site,Blog,Twitter,Facebook,Project,Skills,Misc
Hugh,[email protected],Toronto,AI,AI,1990,iwebthereforeiam.com,"","","","risk-management windows","python oracle","finance"