-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
5b2c0c6
commit ded1532
Showing
1 changed file
with
93 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
from LDA import tokenize | ||
from gensim.utils import simple_preprocess | ||
from utilities import get_concepts_of_instance_by_probase | ||
from utilities import split_text_in_words | ||
import collections | ||
import numpy as np | ||
import operator | ||
|
||
class Conceptualizer():
    """Map an instance (word/phrase) appearing in a sentence to its most
    likely Probase concept, weighting each candidate concept by how well it
    fits the sentence's LDA topic distribution.
    """

    def __init__(self, lda):
        # Keep both the project-level LDA wrapper and the underlying gensim model.
        self.lda = lda
        self.ldamodel = lda.ldamodel

    def conceptualize(self, sentence, instance, debug=False, eval=False):
        """
        Conceptualize the given instance in the given context (sentence).

        :param sentence: a sentence serving as context
        :param instance: the instance to be conceptualized in the given context
        :param debug: if True, print Probase candidates and intermediate LDA data
        :param eval: if True, return the full list of (concept, probability)
            pairs sorted best-first instead of only the single best concept
        :return: the most likely concept for the instance in the given context,
            or None when Probase knows no concepts or no probabilities result;
            in eval mode, the sorted (concept, probability) list
        """
        # NOTE(review): `eval=False` is hard-coded here even though this method
        # takes its own `eval` parameter — confirm whether `eval=eval` was
        # intended before changing it.  (`eval` also shadows the builtin, but
        # renaming it would break keyword callers.)
        concepts = get_concepts_of_instance_by_probase(instance, eval=False)
        if not concepts:  # instance unknown to Probase
            return None
        if debug:
            print("Probase, ")
            print(sorted(concepts.items(), key=operator.itemgetter(1)))
        probabilities_of_concepts = self.__calculate_probs_of_concepts(concepts, sentence, debug)
        if not probabilities_of_concepts:
            return None
        if debug:
            print("All concepts: ")
            print(sorted(probabilities_of_concepts, key=lambda x: -x[1]))
        if eval:
            # Evaluation mode: hand back every candidate, best first.
            return sorted(probabilities_of_concepts, key=lambda x: -x[1])
        most_likely_concept = max(probabilities_of_concepts, key=lambda item: item[1])[0]
        return most_likely_concept

    def __calculate_probs_of_concepts(self, concepts, sentence, debug):
        """
        Calculate, for each concept, the probability of the concept given the sentence.

        Implements p(c | w, s) = p(c | w) * sum_z p(z | s) * p(z | c), where
        p(c | w) comes from Probase and the topic terms from the LDA model.

        :param concepts: mapping of concept -> p(concept | instance) from Probase
        :param sentence: the context sentence
        :param debug: if True, print the sentence bag-of-words and topic
            distribution once
        :return: list of (concept, probability) tuples
        """
        probabilities_of_concepts = []
        printed_debug = False
        # Sentence -> sparse (token_id, count) representation.
        sentence_bow = self.ldamodel.id2word.doc2bow(simple_preprocess(sentence))
        # Topic distribution of the sentence: p(z | s).
        topics_of_text = self.ldamodel.get_document_topics(sentence_bow)

        for concept in concepts:
            prob_c_given_w = concepts[concept]  # p(concept | instance) from Probase
            if concept not in self.ldamodel.id2word.token2id:
                # Out-of-vocabulary (typically multi-word) concept: approximate
                # p(z | c) via the document-topic distribution of its own tokens.
                # simple_preprocess lowercases and drops too-short/too-long tokens.
                concept_bow = self.ldamodel.id2word.doc2bow(simple_preprocess(concept))
                probs_of_topics_for_given_concept = [
                    prob for _, prob in self.ldamodel.get_document_topics(concept_bow)
                ]
            else:
                # In-vocabulary concept: row-normalise the topic-term matrix and
                # take the concept's column, giving p(z | c) for every topic z.
                topic_terms = self.ldamodel.state.get_lambda()
                topics_terms_proba = np.apply_along_axis(lambda row: row / row.sum(), 1, topic_terms)
                probs_of_topics_for_given_concept = topics_terms_proba[:, self.ldamodel.id2word.token2id[concept]]
            if debug and not printed_debug:
                # One-shot debug dump of the sentence representation.  (The
                # original printed a variable that the OOV branch had clobbered
                # with the concept's bow; this always shows the sentence's.)
                print("bag of words:")
                print(sentence_bow)
                print("topic distribution:")
                print(sorted(topics_of_text, key=lambda x: -x[1]))
                printed_debug = True
            # sum_z p(z | s) * p(z | c)  ("weighted_sum" avoids shadowing builtin sum)
            weighted_sum = 0
            for topic_id, prob_of_topic in topics_of_text:
                weighted_sum += probs_of_topics_for_given_concept[topic_id] * prob_of_topic
            # p(c | w, s) = p(c | w) * sum_z p(z | s) * p(z | c)
            probabilities_of_concepts.append((concept, prob_c_given_w * weighted_sum))
        return probabilities_of_concepts