
Commit

Updated to fix data
hrishiballal committed Mar 7, 2018
1 parent 4e59ce1 commit 6a3e1a3
Showing 5 changed files with 258 additions and 138 deletions.
app.py: 145 changes (8 additions, 137 deletions)
@@ -12,144 +12,18 @@
from shapely import speedups
import ShapelyHelper
import random
# import averaged_perceptron_tagger
from sklearn.feature_extraction.text import CountVectorizer
import utils
import redis
import os
redis_url = os.getenv('REDIS_URL', 'redis://localhost:6379')
redis = redis.from_url(redis_url)

# Imports
import difflib
import nltk
import string
from collections import OrderedDict
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet as wn

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# Source: http://nlpforhackers.io/wordnet-sentence-similarity/
class SentenceSimilarity():
def __init__(self):
self.sentences = []
self.corpusDict = {}

self.matches = {}
self.matchingDict = {}

def generateSentences(self, corpusDict):
self.corpusDict = corpusDict
for diagramid, desc in corpusDict.items():
self.sentences.append(desc)

def penn_to_wn(self, tag):
""" Convert between a Penn Treebank tag to a simplified Wordnet tag """
if tag.startswith('N'):
return 'n'
if tag.startswith('V'):
return 'v'
if tag.startswith('J'):
return 'a'
if tag.startswith('R'):
return 'r'
return None

def tagged_to_synset(self, word, tag):
wn_tag = self.penn_to_wn(tag)
if wn_tag is None:
return None
try:
return wn.synsets(word, wn_tag)[0]
except:
return None

def sentence_similarity(self, sentence1, sentence2):

sentence1 = pos_tag(word_tokenize(sentence1))
sentence2 = pos_tag(word_tokenize(sentence2))

synsets1 = [self.tagged_to_synset(*tagged_word) for tagged_word in sentence1]
synsets2 = [self.tagged_to_synset(*tagged_word) for tagged_word in sentence2]
from rq import Queue
from worker import conn

synsets1 = [ss for ss in synsets1 if ss]
synsets2 = [ss for ss in synsets2 if ss]

score, count = 0.0, 0
best_score = [0.0]
for ss1 in synsets1:
for ss2 in synsets2:
best1_score=ss1.path_similarity(ss2)
if best1_score is not None:
best_score.append(best1_score)
max1=max(best_score)
if best_score is not None:
score += max1
if max1 is not 0.0:
count += 1
best_score=[0.0]

try:
score /= count
except ZeroDivisionError as ze:
score = 0
return score
q = Queue(connection=conn)


def doSentenceSimilarity(self):

for idx, t in enumerate(self.sentences):
cDictList = list(self.corpusDict.items())
matchlist = []
sourcediagramid = cDictList[idx][0]
target_sentence = t

for sid, sentence in enumerate(self.sentences):
# print("Similarity(\"%s\", \"%s\") = %s" % (target_sentence, sentence, sentence_similarity(target_sentence, sentence)))
# print("Similarity(\"%s\", \"%s\") = %s" % (sentence, target_sentence, sentence_similarity(sentence, target_sentence)))
if ((self.sentence_similarity(target_sentence, sentence) > 0.4 and self.sentence_similarity(sentence, target_sentence) > 0.4)):
targetdiagramid = cDictList[sid][0]
matchlist.append(targetdiagramid)
else:
matchlist.append(0)

# if (self.is_ci_partial_seq_token_stopword_lemma_match(target_sentence, sentence)):
# targetdiagramid = cDictList[sid][0]
# matchlist.append(targetdiagramid)
# else:
# matchlist.append(0)
self.matchingDict[sourcediagramid] = matchlist

return self.matchingDict
# return 0


class BagofWordsGenerator():
def __init__(self):
self.corpus = []
self.corpusDict = {}

def addtoCorpus(self, diagramdescirption):
self.corpus.append(diagramdescirption)

def addtoCorpusDict(self,diagramid, diagramdescirption):
self.corpusDict[diagramid] = diagramdescirption

def getOrderedCorpus(self):
return self.corpusDict

def generateBagofWords(self):
words = []
vectorizer = CountVectorizer()
features = vectorizer.fit_transform(self.corpus).todense()
vocab = vectorizer.vocabulary_

for key, value in vocab.items():
words.append([key, int(value)])
return words
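
# A minimal sketch (not part of this commit) showing how the two removed
# classes were wired together before the move into utils; it assumes the
# class definitions above and that the nltk.download() calls have run.
# The toy corpus below is hypothetical.
from collections import OrderedDict

toy_corpus = {  # hypothetical diagram descriptions keyed by diagram id
    "d1": "a park along the river",
    "d2": "riverside park with trails",
    "d3": "an underground parking garage",
}

bow = BagofWordsGenerator()
for diagramid, desc in toy_corpus.items():
    bow.addtoCorpus(desc)
    bow.addtoCorpusDict(diagramid, desc)

ordered = OrderedDict(sorted(bow.getOrderedCorpus().items(), key=lambda t: t[0]))
ss = SentenceSimilarity()
ss.generateSentences(ordered)
# Maps each diagram id to a list holding, per compared sentence, either the
# matching diagram id (similarity > 0.4 in both directions) or 0.
print(ss.doSentenceSimilarity())
print(bow.generateBagofWords())  # [[word, vocabulary_index], ...]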
# Imports


app = Flask(__name__)
@@ -204,7 +78,7 @@ def api_root():
else:
diagrams = json.loads(d.text)

myBagofWordsGenerator = BagofWordsGenerator()
myBagofWordsGenerator = utils.BagofWordsGenerator()
formattedfinalsynthesis = {"type":"FeatureCollection","features":[]}
for f in finalsynthesis['features']:
diagramid = f['properties']['diagramid']
@@ -232,11 +106,8 @@ def api_root():
sentenceSimilarity = json.loads(ss)
else:
tmpCorpusDict = myBagofWordsGenerator.getOrderedCorpus()
orderedCorpusDict = OrderedDict(sorted(tmpCorpusDict.items(), key=lambda t: t[0]))
mySS = SentenceSimilarity()
mySS.generateSentences(orderedCorpusDict)
sentenceSimilarity = mySS.doSentenceSimilarity()
redis.set(key, json.dumps(sentenceSimilarity))
result = q.enqueue(utils.createSenteceSimilarity,{'data':tmpCorpusDict,'key':key})
sentenceSimilarity = {}
# sentenceSimilarity ={}
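
The inline similarity computation is replaced by a background job: utils.py and worker.py are among the 5 changed files but are not shown in this excerpt. A hypothetical sketch of the enqueued job, reconstructed from the inline code removed above (the real utils.createSenteceSimilarity may differ):

# Hypothetical sketch of utils.createSenteceSimilarity (utils.py is not shown
# here). Based on the removed inline code, the job presumably sorts the
# corpus, runs SentenceSimilarity, and caches the result in Redis under the
# supplied key.
import json
import os
from collections import OrderedDict

import redis

redis_conn = redis.from_url(os.getenv('REDIS_URL', 'redis://localhost:6379'))

def createSenteceSimilarity(payload):
    corpus = OrderedDict(sorted(payload['data'].items(), key=lambda t: t[0]))
    ss = SentenceSimilarity()  # assumed to live in utils alongside this job
    ss.generateSentences(corpus)
    result = ss.doSentenceSimilarity()
    redis_conn.set(payload['key'], json.dumps(result))
    return result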


requirements.txt: 3 changes (2 additions, 1 deletion)
@@ -12,4 +12,5 @@ scikit-learn
nltk
numpy
scipy
redis
redis
rq
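
The new rq dependency implies a worker process running alongside the web app. worker.py is referenced by 'from worker import conn' in app.py but is not shown in this excerpt; a conventional RQ worker module for this setup would look roughly like:

# Hypothetical worker.py (not shown in this commit excerpt); the standard
# RQ worker pattern for a Redis-backed queue.
import os

import redis
from rq import Worker, Queue, Connection

listen = ['default']
redis_url = os.getenv('REDIS_URL', 'redis://localhost:6379')
conn = redis.from_url(redis_url)

if __name__ == '__main__':
    with Connection(conn):
        worker = Worker(map(Queue, listen))
        worker.work()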
sandbox/fuzzymatching.py: 85 changes (85 additions, 0 deletions)
@@ -0,0 +1,85 @@

# import nltk.corpus
# from nltk.corpus import wordnet
# import nltk.tokenize.punkt
# import nltk.stem.snowball

# Source: http://nbviewer.jupyter.org/urls/gist.github.com/mjbommar/e2a019e346b879c13d3d/raw/74a206c2629d6e661645e18369f05f6c79d15b65/fuzzy-sentence-matching-python.ipynb
# class FuzzyMatcher():
# def __init__(self):
# self.stopwords = nltk.corpus.stopwords.words('english')
# self.stopwords.extend(string.punctuation)
# self.stopwords.append('')

# # Create tokenizer and stemmer
# self.tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
# self.lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
# self.sentences = []
# self.corpusDict = {}

# self.matches = {}
# self.matchingDict = {}

# def generateSentences(self, corpusDict):
# self.corpusDict = corpusDict
# for diagramid, desc in corpusDict.items():
# self.sentences.append(desc)


# def get_wordnet_pos(self, pos_tag):
# if pos_tag[1].startswith('J'):
# return (pos_tag[0], wordnet.ADJ)
# elif pos_tag[1].startswith('V'):
# return (pos_tag[0], wordnet.VERB)
# elif pos_tag[1].startswith('N'):
# return (pos_tag[0], wordnet.NOUN)
# elif pos_tag[1].startswith('R'):
# return (pos_tag[0], wordnet.ADV)
# else:
# return (pos_tag[0], wordnet.NOUN)

# def is_ci_partial_seq_token_stopword_lemma_match(self,a, b):
# """Check if a and b are matches."""
# pos_a = map(self.get_wordnet_pos, nltk.pos_tag(self.tokenizer.tokenize(a)))
# pos_b = map(self.get_wordnet_pos, nltk.pos_tag(self.tokenizer.tokenize(b)))
# lemmae_a = [self.lemmatizer.lemmatize(token.lower().strip(string.punctuation), pos) for token, pos in pos_a \
# if token.lower().strip(string.punctuation) not in self.stopwords]
# lemmae_b = [self.lemmatizer.lemmatize(token.lower().strip(string.punctuation), pos) for token, pos in pos_b \
# if token.lower().strip(string.punctuation) not in self.stopwords]
# s = difflib.SequenceMatcher(None, lemmae_a, lemmae_b)

# return (s.ratio() > 0.66)

# # def is_ci_partial_noun_set_token_stopword_lemma_match(self,a, b):
# # """Check if a and b are matches."""
# # pos_a = map(self.get_wordnet_pos, nltk.pos_tag(self.tokenizer.tokenize(a)))
# # pos_b = map(self.get_wordnet_pos, nltk.pos_tag(self.tokenizer.tokenize(b)))
# # lemmae_a = [self.lemmatizer.lemmatize(token.lower().strip(string.punctuation), pos) for token, pos in pos_a \
# # if pos == wordnet.NOUN and token.lower().strip(string.punctuation) not in self.stopwords]
# # lemmae_b = [self.lemmatizer.lemmatize(token.lower().strip(string.punctuation), pos) for token, pos in pos_b \
# # if pos == wordnet.NOUN and token.lower().strip(string.punctuation) not in self.stopwords]
# # try:
# # ratio = len(set(lemmae_a).intersection(lemmae_b)) / float(len(set(lemmae_a).union(lemmae_b)))
# # except ZeroDivisionError as ze:
# # ratio = 0
# # return (ratio > 0.66)

# def doFuzzyMatching(self):
# print (self.sentences)
# for idx, t in enumerate(self.sentences):
# cDictList = list(self.corpusDict.items())
# matchlist = []
# sourcediagramid = cDictList[idx][0]
# target_sentence = t

# for sid, sentence in enumerate(self.sentences):

# if (self.is_ci_partial_seq_token_stopword_lemma_match(target_sentence, sentence)):
# targetdiagramid = cDictList[sid][0]
# matchlist.append(targetdiagramid)
# else:
# matchlist.append(0)
# self.matchingDict[sourcediagramid] = matchlist

# return self.matchingDict
# # return 0
