Create Conceptualizer.py
xinzhu-cai authored Aug 28, 2019
1 parent 5b2c0c6 commit ded1532
Showing 1 changed file with 93 additions and 0 deletions.
ConceptGenerator/Conceptualizer.py: 93 additions, 0 deletions
@@ -0,0 +1,93 @@
from LDA import tokenize
from gensim.utils import simple_preprocess
from utilities import get_concepts_of_instance_by_probase
from utilities import split_text_in_words
import collections
import numpy as np
import operator

class Conceptualizer:
    def __init__(self, lda):
        self.lda = lda
        self.ldamodel = lda.ldamodel

    def conceptualize(self, sentence, instance, debug=False, eval=False):
        """
        Conceptualize the given instance in the given context (sentence).
        :param sentence: a sentence serving as context
        :param instance: the instance to be conceptualized in the given context
        :param debug: if True, print intermediate results
        :param eval: if True, return all (concept, probability) pairs sorted by descending probability
        :return: the most likely concept for the instance in the given context (or the full sorted list if eval is True)
        """
        concepts = get_concepts_of_instance_by_probase(instance, eval=False)
        if len(concepts) == 0:  # TODO
            return None
        if debug:
            print("Probase, ")
            print(sorted(concepts.items(), key=operator.itemgetter(1)))
        probabilities_of_concepts = self.__calculate_probs_of_concepts(concepts, sentence, debug)
        if probabilities_of_concepts is None or len(probabilities_of_concepts) == 0:
            return None
        if debug:
            print("All concepts: ")
            print(sorted(probabilities_of_concepts, key=lambda x: -x[1]))
        if eval:
            probabilities_of_concepts = sorted(probabilities_of_concepts, key=lambda x: -x[1])
            return probabilities_of_concepts
        most_likely_concept = max(probabilities_of_concepts, key=lambda item: item[1])[0]
        return most_likely_concept

    def __calculate_probs_of_concepts(self, concepts, sentence, debug):
        """
        Calculates, for each concept, the probability of that concept given the sentence.
        :param concepts: the concepts and their Probase probabilities p(concept | instance)
        :param sentence: the given sentence (context)
        :param debug: if True, print intermediate results
        :return: a list of (concept, probability) tuples
        """
        probabilities_of_concepts = []
        # word1 = "Apple Company"
        # word2 = "Apple"
        # word3 = "Company"
        # bag_of_words = self.ldamodel.id2word.doc2bow(simple_preprocess(word2))
        # print("words,", bag_of_words)
        # topics_of_text = self.ldamodel.get_term_topics(bag_of_words[0][0], minimum_probability=0.0)
        # print("topics, ", topics_of_text)
        # topics_of_text = self.ldamodel.get_document_topics(bag_of_words, minimum_probability=0.0)
        # print("topics, ", topics_of_text)
        # bag_of_words = self.ldamodel.id2word.doc2bow(simple_preprocess(word2))
        # print("words,", bag_of_words)
        # topics_of_text = self.ldamodel.get_document_topics(bag_of_words, minimum_probability=0.0)
        # print("topics, ", topics_of_text)

        # bag_of_words = self.ldamodel.id2word.doc2bow(simple_preprocess(word3))
        # print("words,", bag_of_words)
        # topics_of_text = self.ldamodel.get_document_topics(bag_of_words, minimum_probability=0.0)
        # print("topics, ", topics_of_text)
        flag = False
        # from sentence to (token_id, count) pairs
        bag_of_words = self.ldamodel.id2word.doc2bow(simple_preprocess(sentence))
        # topic distribution for the given bag of words
        topics_of_text = self.ldamodel.get_document_topics(bag_of_words)  # probability of topics given the sentence

        for concept in concepts:
            prob_c_given_w = concepts[concept]  # p(concept | instance) from Probase
            if concept not in self.ldamodel.id2word.token2id.keys():
                # simple_preprocess: convert a document into a list of lowercase tokens,
                # ignoring tokens that are too short or too long.
                bag_of_words = self.ldamodel.id2word.doc2bow(simple_preprocess(concept))
                # minimum_probability=0.0 keeps every topic, so the list position matches the topic id
                probs_of_topics_for_given_concept = [x[1] for x in self.ldamodel.get_document_topics(bag_of_words, minimum_probability=0.0)]
            else:
                topic_terms_ = self.ldamodel.state.get_lambda()
                topics_terms_proba_ = np.apply_along_axis(lambda x: x / x.sum(), 1, topic_terms_)
                # per-topic probability of the concept term, p(concept | topic)
                probs_of_topics_for_given_concept = topics_terms_proba_[:, self.ldamodel.id2word.token2id[concept]]
            if not flag and debug:
                print("bag of words:")
                print(bag_of_words)
                print("topic distribution:")
                print(sorted(topics_of_text, key=lambda x: -x[1]))
                flag = True
            weighted_sum = 0
            for topic_id, prob_of_topic in topics_of_text:  # prob_of_topic = p(topic | sentence)
                weighted_sum += probs_of_topics_for_given_concept[topic_id] * prob_of_topic
            # score(concept) = p(concept | instance) * sum over topics of p(topic | sentence) * topic affinity of the concept
            prob_c_given_w_z = prob_c_given_w * weighted_sum

            probabilities_of_concepts.append((concept, prob_c_given_w_z))
        return probabilities_of_concepts
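
A minimal usage sketch, assuming a trained `lda` wrapper (e.g. built by this repo's LDA module) that exposes a gensim `LdaModel` as its `ldamodel` attribute; the import path and example strings are hypothetical:

    from ConceptGenerator.Conceptualizer import Conceptualizer

    conceptualizer = Conceptualizer(lda)  # lda: trained wrapper with a .ldamodel attribute
    # single best concept for "apple" in a food context
    concept = conceptualizer.conceptualize("I ate a fresh apple for breakfast", "apple")
    # full (concept, probability) ranking instead of only the best concept
    ranking = conceptualizer.conceptualize("I ate a fresh apple for breakfast", "apple", eval=True)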
