-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
5b2c0c6
commit ded1532
Showing
1 changed file
with
93 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
from LDA import tokenize | ||
from gensim.utils import simple_preprocess | ||
from utilities import get_concepts_of_instance_by_probase | ||
from utilities import split_text_in_words | ||
import collections | ||
import numpy as np | ||
import operator | ||
|
||
class Conceptualizer():
    """Map an instance (word/phrase) appearing in a sentence to its most
    likely Probase concept, weighting each candidate concept by how well it
    fits the sentence's LDA topic distribution.
    """

    def __init__(self, lda):
        # Keep both the project-level LDA wrapper and the underlying gensim model.
        self.lda = lda
        self.ldamodel = lda.ldamodel

    def conceptualize(self, sentence, instance, debug=False, eval=False):
        """
        Conceptualize the given instance in the given context (sentence).

        :param sentence: a sentence serving as context
        :param instance: the instance to be conceptualized in the given context
        :param debug: if True, print Probase candidates and intermediate LDA data
        :param eval: if True, return the full list of (concept, probability)
            pairs sorted best-first instead of only the single best concept
        :return: the most likely concept for the instance in the given context,
            or None when Probase knows no concepts or no probabilities result;
            in eval mode, the sorted (concept, probability) list
        """
        # NOTE(review): `eval=False` is hard-coded here even though this method
        # takes its own `eval` parameter — confirm whether `eval=eval` was
        # intended before changing it.  (`eval` also shadows the builtin, but
        # renaming it would break keyword callers.)
        concepts = get_concepts_of_instance_by_probase(instance, eval=False)
        if not concepts:  # instance unknown to Probase
            return None
        if debug:
            print("Probase, ")
            print(sorted(concepts.items(), key=operator.itemgetter(1)))
        probabilities_of_concepts = self.__calculate_probs_of_concepts(concepts, sentence, debug)
        if not probabilities_of_concepts:
            return None
        if debug:
            print("All concepts: ")
            print(sorted(probabilities_of_concepts, key=lambda x: -x[1]))
        if eval:
            # Evaluation mode: hand back every candidate, best first.
            return sorted(probabilities_of_concepts, key=lambda x: -x[1])
        most_likely_concept = max(probabilities_of_concepts, key=lambda item: item[1])[0]
        return most_likely_concept

    def __calculate_probs_of_concepts(self, concepts, sentence, debug):
        """
        Calculate, for each concept, the probability of the concept given the sentence.

        Implements p(c | w, s) = p(c | w) * sum_z p(z | s) * p(z | c), where
        p(c | w) comes from Probase and the topic terms from the LDA model.

        :param concepts: mapping of concept -> p(concept | instance) from Probase
        :param sentence: the context sentence
        :param debug: if True, print the sentence bag-of-words and topic
            distribution once
        :return: list of (concept, probability) tuples
        """
        probabilities_of_concepts = []
        printed_debug = False
        # Sentence -> sparse (token_id, count) representation.
        sentence_bow = self.ldamodel.id2word.doc2bow(simple_preprocess(sentence))
        # Topic distribution of the sentence: p(z | s).
        topics_of_text = self.ldamodel.get_document_topics(sentence_bow)

        for concept in concepts:
            prob_c_given_w = concepts[concept]  # p(concept | instance) from Probase
            if concept not in self.ldamodel.id2word.token2id:
                # Out-of-vocabulary (typically multi-word) concept: approximate
                # p(z | c) via the document-topic distribution of its own tokens.
                # simple_preprocess lowercases and drops too-short/too-long tokens.
                concept_bow = self.ldamodel.id2word.doc2bow(simple_preprocess(concept))
                probs_of_topics_for_given_concept = [
                    prob for _, prob in self.ldamodel.get_document_topics(concept_bow)
                ]
            else:
                # In-vocabulary concept: row-normalise the topic-term matrix and
                # take the concept's column, giving p(z | c) for every topic z.
                topic_terms = self.ldamodel.state.get_lambda()
                topics_terms_proba = np.apply_along_axis(lambda row: row / row.sum(), 1, topic_terms)
                probs_of_topics_for_given_concept = topics_terms_proba[:, self.ldamodel.id2word.token2id[concept]]
            if debug and not printed_debug:
                # One-shot debug dump of the sentence representation.  (The
                # original printed a variable that the OOV branch had clobbered
                # with the concept's bow; this always shows the sentence's.)
                print("bag of words:")
                print(sentence_bow)
                print("topic distribution:")
                print(sorted(topics_of_text, key=lambda x: -x[1]))
                printed_debug = True
            # sum_z p(z | s) * p(z | c)  ("weighted_sum" avoids shadowing builtin sum)
            weighted_sum = 0
            for topic_id, prob_of_topic in topics_of_text:
                weighted_sum += probs_of_topics_for_given_concept[topic_id] * prob_of_topic
            # p(c | w, s) = p(c | w) * sum_z p(z | s) * p(z | c)
            probabilities_of_concepts.append((concept, prob_c_given_w * weighted_sum))
        return probabilities_of_concepts