-
Notifications
You must be signed in to change notification settings - Fork 164
/
Copy pathengines.py
76 lines (54 loc) · 2.81 KB
/
engines.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import pandas as pd
import time
import redis
from flask import current_app
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
def info(msg):
    """Write `msg` to the active Flask application's logger at INFO level."""
    logger = current_app.logger
    logger.info(msg)
class ContentEngine(object):
    """
    Content-based similarity engine backed by redis.

    `train` builds a TF-IDF model over item descriptions and stores, for every item, a redis Sorted Set
    holding its most similar items. `predict` reads those sets back.
    """

    # Redis key template for one item's Sorted Set; '%s' is replaced by the item id.
    SIMKEY = 'p:smlr:%s'

    # How many similar items are stored per item.
    NUM_SIMILAR = 100

    def __init__(self):
        """Create the redis client from the Flask app's REDIS_URL config entry."""
        self._r = redis.StrictRedis.from_url(current_app.config['REDIS_URL'])

    def train(self, data_source):
        """
        Read the training data, clear redis and rebuild every item's similarity set.

        :param data_source: CSV source handed to pandas.read_csv; it must provide the 'id' and 'description'
            columns used by _train.
        :return: None
        """
        start = time.time()
        ds = pd.read_csv(data_source)
        info("Training data ingested in %s seconds." % (time.time() - start))

        # Flush the stale training data from redis.
        # NOTE: flushdb() empties the whole selected redis database, not only this engine's keys.
        self._r.flushdb()

        start = time.time()
        self._train(ds)
        info("Engine trained in %s seconds." % (time.time() - start))

    @staticmethod
    def _most_similar(scores, position, limit):
        """
        Pick the positions of the highest-scoring items for one row of the similarity matrix.

        :param scores: 1-D numpy array of similarity scores for the item at `position`
        :param position: row position of the item itself, which is always excluded from the result
        :param limit: maximum number of positions to return
        :return: A list of at most `limit` int positions, ordered by score, descending.
        """
        # Take one extra candidate so that `limit` entries remain once the item itself is dropped.
        ranked = scores.argsort()[::-1][:limit + 1]
        # Exclude the item by position rather than assuming it sorts first: items with identical
        # descriptions tie with it on score.
        return [int(i) for i in ranked if i != position][:limit]

    def _train(self, ds):
        """
        Train the engine.

        Create a TF-IDF matrix of unigrams, bigrams, and trigrams for each product. The 'stop_words' param
        tells the TF-IDF module to ignore common english words like 'the', etc.

        Then we compute similarity between all products using SciKit Learn's linear_kernel (which in this case
        is equivalent to cosine similarity).

        Iterate through each item's similar items and store the NUM_SIMILAR most similar ones, excluding the
        item itself.

        Similarities and their scores are stored in redis as a Sorted Set, with one set for each item.

        :param ds: A pandas dataset containing two fields: description & id
        :return: None
        """
        # min_df=1 (the scikit-learn default) keeps every term, exactly like the former min_df=0, but is
        # also accepted by scikit-learn releases that validate the parameter and reject an integer 0.
        tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=1, stop_words='english')
        tfidf_matrix = tf.fit_transform(ds['description'])
        cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

        # The similarity matrix is addressed by row position, so ids are looked up by position as well
        # instead of by index label. tolist() also converts numpy scalars to native Python values.
        ids = ds['id'].tolist()

        for position, item_id in enumerate(ids):
            scores = cosine_similarities[position]
            neighbours = self._most_similar(scores, position, self.NUM_SIMILAR)
            if not neighbours:
                # Nothing to compare against (single-row dataset); ZADD needs at least one member.
                continue

            # ZADD takes its arguments as score1, member1, score2, member2, ...
            zadd_args = []
            for i in neighbours:
                zadd_args.extend((float(scores[i]), ids[i]))

            # Sent as a raw command because the signature of the client's zadd() helper differs between
            # redis-py 2.x (positional score/member pairs) and 3.x+ (mapping).
            self._r.execute_command('ZADD', self.SIMKEY % item_id, *zadd_args)

    def predict(self, item_id, num):
        """
        Retrieve the similar items and their 'score' from redis.

        :param item_id: string
        :param num: number of similar items to return; zero or a negative number yields an empty list
        :return: A list of (item id, similarity score) pairs, sorted by similarity score, descending.
            The exact element types are whatever the redis client returns.
        """
        if num <= 0:
            # An end index of -1 means "through the last element" to redis and would return every item.
            return []
        return self._r.zrange(self.SIMKEY % item_id, 0, num - 1, withscores=True, desc=True)
# Shared module-level engine instance, created when this module is imported.
# ContentEngine.__init__ reads current_app.config, so a Flask application context is presumably
# required at import time -- TODO confirm against the application setup.
content_engine = ContentEngine()