from LDA import tokenize
from gensim.utils import simple_preprocess
from utilities import get_concepts_of_instance_by_probase
from utilities import split_text_in_words
import collections
import numpy as np
import operator


class Conceptualizer():
    """Maps an instance word to its most likely concept by combining Probase
    concept probabilities P(concept | instance) with an LDA topic model of
    the context sentence."""

    def __init__(self, lda):
        # Keep both the project wrapper and the underlying gensim LDA model.
        self.lda = lda
        self.ldamodel = lda.ldamodel

    def conceptualize(self, sentence, instance, debug=False, eval=False):
        """
        Conceptualize the given instance in the given context (sentence).

        :param sentence: a sentence used as context
        :param instance: the instance to be conceptualized in the given context
        :param debug: if True, print intermediate distributions
        :param eval: if True, return the full list of (concept, probability)
                     pairs sorted by descending probability instead of only
                     the single best concept
        :return: the most likely concept for the instance in the given context,
                 the sorted (concept, probability) list when ``eval`` is True,
                 or None if no concept could be determined
        """
        # NOTE(review): `eval` is hardcoded to False in this call even though
        # this method takes its own `eval` parameter — confirm whether
        # `eval=eval` was intended before changing it.
        concepts = get_concepts_of_instance_by_probase(instance, eval=False)
        if not concepts:  # TODO: no Probase concepts found for this instance
            return None
        if debug:
            print("Probase, ")
            print(sorted(concepts.items(), key=operator.itemgetter(1)))
        probabilities_of_concepts = self.__calculate_probs_of_concepts(concepts, sentence, debug)
        if not probabilities_of_concepts:
            return None
        if debug:
            print("All concepts: ")
            print(sorted(probabilities_of_concepts, key=lambda x: -x[1]))
        if eval:
            # Evaluation mode: hand back the whole ranked distribution.
            return sorted(probabilities_of_concepts, key=lambda x: -x[1])
        most_likely_concept = max(probabilities_of_concepts, key=lambda item: item[1])[0]
        return most_likely_concept

    def __calculate_probs_of_concepts(self, concepts, sentence, debug):
        """
        Calculate, for each concept, its probability given the sentence.

        Implements p(c | w, s) = p(c | w) * sum_z p(z | s) * p(z | c), where
        p(c | w) comes from Probase and the topic terms from the LDA model.

        :param concepts: mapping concept -> P(concept | instance) from Probase
        :param sentence: the context sentence
        :param debug: if True, print the sentence bow and topic distribution once
        :return: list of (concept, probability) tuples
        """
        probabilities_of_concepts = []
        printed_debug = False
        # Sentence -> (token_id, count) pairs -> topic distribution p(z | s).
        sentence_bow = self.ldamodel.id2word.doc2bow(simple_preprocess(sentence))
        topics_of_text = self.ldamodel.get_document_topics(sentence_bow)
        # Hoist the loop-invariant topic-term matrix out of the concept loop:
        # normalize each topic row once (vectorized) instead of recomputing
        # np.apply_along_axis for every in-vocabulary concept.
        topic_terms = self.ldamodel.state.get_lambda()
        topic_term_probs = topic_terms / topic_terms.sum(axis=1, keepdims=True)
        token2id = self.ldamodel.id2word.token2id

        for concept, prob_c_given_w in concepts.items():
            if concept not in token2id:
                # Out-of-vocabulary concept: fall back to the topic
                # distribution of the (preprocessed) concept string itself.
                # minimum_probability=0.0 is required so that every topic is
                # returned; otherwise indexing by topic_id below would be
                # misaligned with the filtered result list.
                concept_bow = self.ldamodel.id2word.doc2bow(simple_preprocess(concept))
                concept_topic_probs = dict(
                    self.ldamodel.get_document_topics(concept_bow, minimum_probability=0.0)
                )
                prob_z_given_c = lambda topic_id: concept_topic_probs.get(topic_id, 0.0)
            else:
                # In-vocabulary concept: take its column of the normalized
                # topic-term matrix as p(z | c).
                concept_column = topic_term_probs[:, token2id[concept]]
                prob_z_given_c = lambda topic_id: concept_column[topic_id]

            if debug and not printed_debug:
                # Print the *sentence* bow (the original code could print a
                # concept's bow here because it reused the same variable).
                print("bag of words:")
                print(sentence_bow)
                print("topic distribution:")
                print(sorted(topics_of_text, key=lambda x: -x[1]))
                printed_debug = True

            # sum_z p(z | s) * p(z | c)  (avoid shadowing builtin `sum`)
            total = 0
            for topic_id, prob_of_topic in topics_of_text:
                total += prob_z_given_c(topic_id) * prob_of_topic
            # p(c | w, s) = p(c | w) * sum_z p(z | s) * p(z | c)
            probabilities_of_concepts.append((concept, prob_c_given_w * total))
        return probabilities_of_concepts