diff --git a/ConceptGenerator/LDA.py b/ConceptGenerator/LDA.py new file mode 100644 index 0000000..21d368e --- /dev/null +++ b/ConceptGenerator/LDA.py @@ -0,0 +1,124 @@ +import itertools +import os + +import gensim +from gensim import corpora +from gensim.corpora.wikicorpus import _extract_pages, filter_wiki +from gensim.parsing.preprocessing import STOPWORDS +from gensim.utils import smart_open, simple_preprocess +from stop_words import get_stop_words + + +def tokenize(text): + """ + Preprocess and then tokenize a given text + :param text: the text which should be tokenized. + :return: the token of the given text, after preprocess the text + """ + return [token for token in simple_preprocess(text) if token not in STOPWORDS] + + +def iter_over_dump_file(dump_file, min_length_of_article=50, ignore_namespaces=None): + """ + Iterator over wiki_dump_file. + Returns title and tokens for next article in dump file. + Ignores short articles. + Ignores meta articles, throug given namespaces. + Default namespaces are 'Wikipedia', 'Category', 'File', 'Portal', 'Template', 'MediaWiki', 'User', 'Help', 'Book', 'Draft' + :param dump_file: the dump file + :param min_length_of_article: the min number of words in the next article. Default = 50 + :param ignore_namespaces: list of namespaces which should be ignored. + :return: title, tokens + """ + if ignore_namespaces is None: + ignore_namespaces = 'Wikipedia Category File Portal Template MediaWiki User Help Book Draft'.split() + for title, text, pageid in _extract_pages(smart_open(dump_file)): + text = filter_wiki(text) + tokens = tokenize(text) + if len(tokens) < min_length_of_article or any( + title.startswith(namespace + ':') for namespace in ignore_namespaces): + continue # ignore short articles and various meta-articles + yield title, tokens + + +class LDA(): + def __init__(self): + self.stop_words = get_stop_words('en') + + def load(self, model_file): + """ + Loads a LDA model from a given file + :param model_file: the file which contains the model, which should be loaded + """ + from gensim.models.ldamodel import LdaModel + # self.ldamodel = LdaModel.load(model_file) + self.ldamodel = gensim.models.ldamulticore.LdaMulticore.load(model_file) + # print(self.ldamodel.print_topics(num_topics=100)) + + # self.ldamodel = gensim.models.wrappers.LdaMallet.load(model_file) + # from gensim.models.wrappers.ldamallet import malletmodel2ldamodel + # self.ldamodel.show_topics(num_topics=5, num_words=10) + # self.ldamodel = malletmodel2ldamodel(self.ldamodel) + # print(self.ldamodel.__dict__) + + def generate_bow_of_dump_file(self, dump_file, bow_output_file, dict_output_file): + doc_stream = (tokens for _, tokens in iter_over_dump_file(dump_file)) + id2word_dict = gensim.corpora.Dictionary(doc_stream) #obtain: (word_id:word) + print(id2word_dict) + id2word_dict.filter_extremes(no_below=20, no_above=0.1, keep_n=250000) # word must appear >10 times, and no more than 20% documents + print(id2word_dict) + dump_corpus = DumpCorpus(dump_file, id2word_dict) #from dictionary to bag of words + print("save bow...") + #Iterate through the document stream corpus, saving the documents to fname and recording byte offset of each document. + gensim.corpora.MmCorpus.serialize(bow_output_file, dump_corpus) + print("save dict") + id2word_dict.save(dict_output_file) + + def train_on_dump_file(self, num_topics, bow_path, dict_path, model_outputfile, training_iterations=20, + max_docs=None): + """ + Trains a new LDA model based on a wikipedia dump or any other dump in the same format. 
+ The dump could be zipped. + :param num_topics: the number of topics, which should be generated + :param bow_path: the path inclusive filename, where the bag of words should be saved + :param dict_path: the path incl. filename, where the dictionary should be saved + :param model_outputfile: the file in which the trained model should be stored + :param training_iterations: the number of LDA training iterations + :param max_docs: the number of how many docs should be used for training, if None all docs are used + """ + print("load bow...") + mm_corpus = gensim.corpora.MmCorpus(bow_path) + print("load dict...") + id2word_dict = gensim.corpora.Dictionary.load(dict_path) + clipped_corpus = gensim.utils.ClippedCorpus(mm_corpus, max_docs) + print("start training") + #train LDA on bag of word corpus + self.ldamodel = gensim.models.ldamulticore.LdaMulticore(clipped_corpus, num_topics=num_topics, + id2word=id2word_dict, passes=training_iterations, + minimum_probability=0) + print("save model") + self.ldamodel.save(model_outputfile) + + +class DumpCorpus(object): + def __init__(self, dump_file, dictionary, clip_docs=None): + """ + Parse the first `clip_docs` documents from file `dump_file`. + Yield each document in turn, as a list of tokens (unicode strings). + """ + self.dump_file = dump_file + self.dictionary = dictionary + self.clip_docs = clip_docs + + def __iter__(self): + """ + Iterator over wiki corpus + :return: bag-of-words format = list of `(token_id, token_count)` 2-tuples + """ + self.titles = [] + for title, tokens in itertools.islice(iter_over_dump_file(self.dump_file), self.clip_docs): + self.titles.append(title) + yield self.dictionary.doc2bow(tokens) # tokens to (token_id, token_count) tuples + + def __len__(self): + return self.clip_docs diff --git a/ConceptGenerator/layer1_calculate_features.py b/ConceptGenerator/layer1_calculate_features.py new file mode 100644 index 0000000..1a20e7f --- /dev/null +++ b/ConceptGenerator/layer1_calculate_features.py @@ -0,0 +1,266 @@ +import nltk +import csv +import re +import numpy as np +import inflect +import pickle +from difflib import SequenceMatcher +from gensim.models import Word2Vec + +p = inflect.engine() +model = Word2Vec.load("/home/xinzhu/Dataset/Word2Vec-on-Wikipedia-Corpus/model/word2vec_gensim") +print("Prepare Word2Vec model done!") + +prefix = '/home/xinzhu/Code/model/feature/' +infile = open(prefix+'unigram_freq.csv', mode='r') +reader = csv.reader(infile) +freq_dict = {row[0]:row[1] for row in reader} + +fin = open('/home/xinzhu/Code/Mydata/data/vocab_python2.pkl', 'rb') +vocab = pickle.load(fin) +print('loading saved vocab...') +fin.close() + +fin = open('/home/xinzhu/Code/Mydata/data/embd_python2.pkl', 'rb') +embd = pickle.load(fin) +print('loading saved embd...') +fin.close() + +cnt = 0 + +def emb_sim(a,d): + avec = np.array([0.0]*300) + dvec = np.array([0.0]*300) + try: + aL = a.split(' ') + dL = d.split(' ') + for word in aL: + try: + emres = [float(x) for x in embd[vocab[word]]] + avec += emres + except: + pass + for word in dL: + try: + emres = [float(x) for x in embd[vocab[word]]] + dvec += emres + except: + pass + avec /= len(aL) + dvec /= len(dL) + except: + try: + avec = [float(x) for x in embd[vocab[a]]] + dvec = [float(x) for x in embd[vocab[d]]] + except: + pass + upnum = 0 + downnum = 0 + try: + for i in range(len(avec)): + upnum += avec[i]*dvec[i] + downnum += avec[i]*avec[i] + downnum += dvec[i]*dvec[i] + if downnum == 0: + return 0 + return upnum/downnum + except: + return 0 + +def pos_sim(a,d): +#"""POS 
similarity a is answer, d is distractor""" + try: + apos = nltk.pos_tag(nltk.word_tokenize(a)) + dpos = nltk.pos_tag(nltk.word_tokenize(d)) + aset = set() + dset = set() + for tag in apos: + aset.add(tag[1]) + for tag in dpos: + dset.add(tag[1]) + M11 = len(aset & dset) + M10 = len(aset - dset) + M01 = len(dset - aset) + similarity = M11/(M11+M10+M01) if (M11+M10+M01)>0 else 0 + #print("POS_sim, ",similarity) + return similarity + except: + return 0 + +def edit_distance(s1, s2): +#"""levenshteinDistance""" + try: + return nltk.edit_distance(s1,s2) + except: + return 0 + +def token_sim(s1,s2): +#""" jaccard similarity between two strings""" + try: + aset = set(nltk.word_tokenize(s1)) + dset = set(nltk.word_tokenize(s2)) + return nltk.jaccard_distance(aset,dset) + except: + return 0 + +def length_sim(a,d): +#"""calculate a and d's character and token lengths and the difference of lengths""" + try: + acharlen = len(a) + dcharlen = len(d) + atokenlen = len(nltk.word_tokenize(a)) + dtokenlen = len(nltk.word_tokenize(d)) + diffcharlen = abs(acharlen-dcharlen) + difftokenlen = abs(atokenlen-dtokenlen) + return [acharlen,dcharlen,atokenlen,dtokenlen,diffcharlen,difftokenlen] + except: + return [acharlen,dcharlen,1,1,diffcharlen,difftokenlen] + +# Function to find Longest Common Sub-string +def suffix(str1,str2): + try: + # initialize SequenceMatcher object with + # input string + seqMatch = SequenceMatcher(None,str1,str2) + # find match of longest sub-string + # output will be like Match(a=0, b=0, size=5) + match = seqMatch.find_longest_match(0, len(str1), 0, len(str2)) + # print longest substring + if (match.size!=0): + res = str1[match.a: match.a + match.size] + abs_len = len(res) + return [abs_len,float(abs_len)/len(str1) if len(str1)>0 else 0.0,float(abs_len)/len(str2) if len(str2)>0 else 0.0] + else: + return [0,0.0,0.0] + except: + return [0,0.0,0.0] + +def freq(a,d): +#"""average word frequency in a and d""" + try: + aL = a.split() + dL = d.split() + afreqs = [] + dfreqs = [] + for word in aL: + afreqs.append(freq_dict.get(word,0)) + for word in dL: + dfreqs.append(freq_dict.get(word,0)) + return [sum(afreqs)/len(afreqs) if len(afreqs)>0 else 0,sum(dfreqs)/len(dfreqs) if len(dfreqs)>0 else 0] + except: + return [0.0,0.0] + +def is_plural( noun): + try: + return p.singular_noun(noun) is not False + except: + return False + +def singlar_or_plural(a,d): + try: + a = nltk.word_tokenize(a) + d = nltk.word_tokenize(d) + aflag = False + dflag = False + for x in a: + if is_plural(x): + aflag = True + for x in d: + if is_plural(x): + dflag = True + if aflag == dflag: + return 1 + else: + return 0 + except: + return 0 + +def num(s): +# whether numbers appear in a and d + if re.search(r'\d', s): + return True + _known = { + 'zero': 0, + 'one': 1, + 'two': 2, + 'three': 3, + 'four': 4, + 'five': 5, + 'six': 6, + 'seven': 7, + 'eight': 8, + 'nine': 9, + 'ten': 10, + 'eleven': 11, + 'twelve': 12, + 'thirteen': 13, + 'fourteen': 14, + 'fifteen': 15, + 'sixteen': 16, + 'seventeen': 17, + 'eighteen': 18, + 'nineteen': 19, + 'twenty': 20, + 'thirty': 30, + 'forty': 40, + 'fifty': 50, + 'sixty': 60, + 'seventy': 70, + 'eighty': 80, + 'ninety': 90 + } + for n in _known.keys(): + if n in s: + return True + return False + +def wiki_sim(a,d): + res = 0 + try: + res = model.similarity(a,d) + except: + pass + return res + +def cal_10_feature_vec(params): + q = params[0].replace('_',' ') + a = params[1].replace('_',' ') + d = params[2].replace('_',' ') + y = params[3] + features = [] + 
features.extend([emb_sim(q,d),emb_sim(a,d)]) + features.append(pos_sim(a,d)) + features.append(edit_distance(a,d)) + features.extend([token_sim(q,d),token_sim(a,d),token_sim(q,a)]) + features.extend(length_sim(a,d)) + features.extend(suffix(a,d)) + features.extend(freq(a,d)) + global cnt + cnt += 1 + if cnt%10000 == 0: + print(cnt) + return [features,y,q,a,d] + +def cal_26_feature_vec(params): +#"""26-dimensional feature vector""" + q = params[0] + a = params[1] + d = params[2] + features = [] + features.extend([emb_sim(q,d),emb_sim(a,d)]) #2 + features.append(pos_sim(a,d)) #1 + features.append(edit_distance(a,d)) #1 + features.extend([token_sim(q,d),token_sim(a,d),token_sim(q,a)]) #3 + features.extend(length_sim(a,d)) #6 + features.extend(suffix(a,d)) #3 + features.extend(freq(a,d)) #2 + features.append(singlar_or_plural(a,d)) #1 + features.extend([int(num(a)),int(num(d))]) #2 + features.append(wiki_sim(a,d)) #1 + # print("total features, ",features) + global cnt + cnt += 1 + if cnt%10000 == 0: + print(cnt) + # print(features) + return features \ No newline at end of file diff --git a/ConceptGenerator/layer1_candidate_generator.py b/ConceptGenerator/layer1_candidate_generator.py new file mode 100644 index 0000000..818052c --- /dev/null +++ b/ConceptGenerator/layer1_candidate_generator.py @@ -0,0 +1,289 @@ +from sklearn.externals import joblib +import numpy as np +import json +import sys +sys.path.append("/home/xinzhu/Code/CG/Layer1") +from search_candidates_from_e import search_candidates_from_e +from utilities import normalize_instance +from layer1_calculate_features2 import cal_26_feature_vec +from wordnet_candidate_generation import wordnet_predict +# from layer1_word2vec_predict import word2vec_predict + +# class Word2vecGenerator(object): +# """docstring for Word2vecGenerator""" +# def __init__(self, model_path): +# self.ranker = joblib.load(model_path) +# print("init word2vec candidate generator") + +# def top_k_candidates(self, sentence, answer, k=10): +# try: +# candidates = word2vec_predict(sentence) +# print("candidate length,",len(candidates)) +# except: +# print("search_candidates_from_e error!") +# return {} + +# if candidates is None: +# print("candidates is None") +# return None +# try: +# features = [] # concept probability + embedding features for each candidate +# Y = [] # label, whether is a distractor or not +# scores = [] # score from Probase +# candidate = [] + +# for c,v in candidates.items(): +# features.append(cal_26_feature_vec([sentence,answer,c])) # need to recalculate features if add LM score +# scores.append([v]) +# candidate.append(c) + +# print("Calculate feature done! 
candidate count:", len(features)) +# features = np.array(features,dtype=np.float32) +# scores = np.array(scores,dtype=np.float64) +# scores_normed = scores / scores.max(axis=0) +# features = np.hstack((features, scores_normed)) + +# where_are_NaNs = np.isnan(features) +# features[where_are_NaNs] = 0 + +# print("feature shape: ",features.shape) +# predicts = self.ranker.predict_proba(features)[:,1] # get probability +# index = np.argsort(-predicts) +# print("index:", index) + +# topk = {} +# if k= 10: + break + print(c) + result.append(c) + i += 1 + return result + + + def hit_rate(self, topk, distractors, k=10): + """ + Calculate hit rate in topk for an item in dataset + """ + total = len(distractors) + valid = 0 + # different forms of distractors to be tested + for i in range(len(distractors)): + d = normalize_instance(distractors[i]) + for k in [distractors[i], d, d.capitalize(), ' '.join([x.capitalize() for x in d.split()]), ''.join([x.capitalize() for x in d.split()])]: + if k in topk: + valid += 1 + break + return float(valid)/total, topk + + +def total_generate_candidates(model_path, source): + if source == 'Probase': + generator = ProbaseCandidateGenerator(model_path) + elif source == 'WordNet': + generator = WordNetCandidateGenerator(model_path) + f = "/home/xinzhu/Code/CG/Layer1/dataset/total_new.json" + index_list = [int(line) for line in open("/home/xinzhu/Code/CG/Layer1/test_index.txt",'r').readlines()] + dataset = json.load(open(f, 'r')) + resf = open('/home/xinzhu/Code/CG/CGresult/Wordnet_layer1.txt','w') + for index in index_list: + item = dataset[index-1] + topk = generator.top_k_candidates(item['sentence'], item['answer']) + topk = topk.keys() + resf.write("sentence: "+item['sentence']) + resf.write('\n') + resf.write("answer: "+item['answer']) + resf.write('\n') + for candidate in topk: + resf.write(candidate.encode('utf-8')) + resf.write('\n') + resf.write("*"*50) + resf.write('\n') + resf.close() + + +def domain_generate_candidates(model_path, domain, source): + if source == 'Probase': + generator = ProbaseCandidateGenerator(model_path) + elif source == 'WordNet': + generator = WordNetCandidateGenerator(model_path) + f = "/home/xinzhu/Code/CG/Layer1/dataset/"+domain+".json" + index_list = [int(line) for line in open("/home/xinzhu/Code/CG/Layer1/"+domain+"_test_index.txt",'r').readlines()] + dataset = json.load(open(f, 'r')) + resf = open('/home/xinzhu/Code/CG/CGresult/'+domain+'_layer1.txt','w') + for index in index_list: + item = dataset[index-1] + topk = generator.top_k_candidates(item['sentence'], item['answer']) + topk = topk.keys() + resf.write("sentence: "+item['sentence'].encode('utf-8')) + # try: + # resf.write("sentence: "+item['sentence']) + # except: + # try: + + # except: + # pass + resf.write('\n') + try: + resf.write("answer: "+item['answer']) + except: + resf.write("answer: "+item['answer'].encode('utf-8')) + resf.write('\n') + for candidate in topk: + resf.write(candidate.encode('utf-8')) + resf.write('\n') + resf.write("*"*50) + resf.write('\n') + resf.close() + + +if __name__=="__main__": + # total + wordNet + # model_path = "/home/xinzhu/Code/CG/Layer1/wordnet_model/total_new_adaboost_new.joblib.dat" + # generate_candidates(model_path,'WordNet') + + # science + Probase + for domain in ['science','vocabulary','common','trivia']: + model_path = "/home/xinzhu/Code/CG/Layer1/model2/"+domain+"_adaboost_new.joblib.dat" + domain_generate_candidates(model_path, domain,'Probase') \ No newline at end of file diff --git 
a/ConceptGenerator/layer1_prepare_training_data.py b/ConceptGenerator/layer1_prepare_training_data.py new file mode 100644 index 0000000..13e1f66 --- /dev/null +++ b/ConceptGenerator/layer1_prepare_training_data.py @@ -0,0 +1,456 @@ +import json +from os import listdir +from os.path import isfile, join +import re +import urllib +import requests +from search_candidates_from_e import search_candidates_from_e +from utilities import normalize_instance +import multiprocessing +from multiprocessing import Pool +import matplotlib +matplotlib.use('Agg') +import matplotlib.pyplot as plt +import numpy as np +# from layer1_word2vec_predict import word2vec_predict +from layer1_calculate_features2 import cal_26_feature_vec +from wordnet_candidate_generation import wordnet_predict +# from Calculate_features import cal_26_feature_vec + +data_path = "/home/xinzhu/Code/CG/Layer1/dataset/" +feature_path = "/home/xinzhu/Code/CG/Layer1/feature2/" +word2vec_feature_path = "/home/xinzhu/Code/CG/Layer1/word2vec_feature/" +model_path = "/home/xinzhu/Code/CG/Layer1/model2/" + +cache = {} + +def get_concepts_of_instance_by_probase(instance, use_cache=True): + """ + Fetches the concept and the probabilities for a given instance by probase. + :param instance: the instance, for which the concepts should be requested + :param use_cache: if true a cache for instances and corresponding concepts is used, to avoid unnecessary requests + :return: the concepts and their probability + """ + from urlparse import urlparse + if use_cache == True and instance in cache: + return cache[instance] + try: + requestUrl = 'https://concept.research.microsoft.com/api/Concept/ScoreByProb?instance={}&topK=10&api_key=eT5luCbmII34ZvpPVs7HxtbUU1cFcE12'.format(urllib.pathname2url(instance)) + try: + response = requests.get(requestUrl) + except requests.exceptions.ConnectionError as e: + print(e) + print("\n\ntry one last time...") + response = requests.get(requestUrl) + except: + print("error, ", instance) + response = None + if response is None: + return None + concepts = response.json() + cache[instance] = concepts + return concepts + +def calculate_features(item, mode=1, can_num=100, source="Probase"): + """ + Given an item, generate can_num candidates and test hit rate. 
+ param: can_num: number of candidates to be generated + param: mode: 1 means single-processing, 2 means multi-processing have to return an additional len(distractors) + return: features: probabilty from Probase + embedding similarities for each candidate + """ + + # select a candidate generation source + print("calculating features...") + if source == 'Probase': + candidates = search_candidates_from_e(item['sentence'],item['answer'],can_num) + elif source == "WordNet": + candidates = wordnet_predict(item['sentence'],item['answer'],can_num) + else: + # candidates = word2vec_predict(item['sentence'],can_num) + pass + # print(candidates) + # if answer is not in Probase + if candidates is None: + return np.array([]), np.array([]) + + cnt = 1 + rankings = {} # from candidate to its ranking + features = [] # concept probability + embedding features for each candidate + Y = [] # label, whether is a distractor or not + res = [] + dic = {} + visit = [0]*len(item['distractors']) + + # different forms of distractors to be tested + distractors = [] + for i in range(len(item['distractors'])): + d = normalize_instance(item['distractors'][i]) + for k in [item['distractors'][i], d, d.capitalize(), ' '.join([x.capitalize() for x in d.split()]), ''.join([x.capitalize() for x in d.split()])]: + distractors.append(k) + dic[k] = i + + item['answer'] = normalize_instance(item['answer']) + item['sentence'] = " ".join([normalize_instance(x) for x in item['sentence'].split()]) + scores = [] + LMProb = [] + pairs = [] + + for c,v in sorted(candidates.items(), key=lambda d: -d[1]): + # print("feature for ,", c) + y = 0 + rankings[c] = cnt + if c in distractors: + if visit[dic[c]] == 1: + cnt += 1 + continue + res.append(rankings[c]) + visit[dic[c]] = 1 + y = 1 + cnt += 1 + try: + features.append(cal_26_feature_vec([item['sentence'],item['answer'],c])) + Y.append(y) + scores.append([v]) + pairs.append([item['sentence'],c]) + except: + print("error") + pass + + for i in range(len(item['distractors'])): + if visit[i] == 0: + try: + features.append(cal_26_feature_vec([item['sentence'],item['answer'],item['distractors'][i]])) + Y.append(1) + scores.append([0]) + + except: + print('error') + pass + + features = np.array(features,dtype=np.float32) + scores = np.array(scores,dtype=np.float64) + scores_normed = scores / scores.max(axis=0) + features = np.hstack((features, scores_normed)) + print(features.shape) + if mode == 1: + return features, Y, res + else: + return features, Y, res, len(item['distractors']) + + +def prepare_train_data(): + """ + Calculate features of 100 candidates for each item in file. 
+ """ + feature_path = "/home/xinzhu/Code/CDC/data/features/" + files = ['Regents_new.json','AI2-ScienceQuestions_new.json'] + for fname in files: + X = [] + Y = [] + f = data_path + fname + xf = feature_path + fname[:-5] + "_X.npy" + yf = feature_path + fname[:-5] + "_y.npy" + of = open(data_path + fname[:-5] + ".txt", 'w') + with open(f, 'r') as content: + dataset = json.load(content) + cnt = 1 + for item in dataset: + if isinstance(item['answer'],int): + continue + try: + tmpX, tmpY, res = calculate_features(item) + if len(X) == 0: + X = tmpX + Y = tmpY + else: + X = np.vstack((X, tmpX)) + Y = np.append(Y, tmpY) + print("Xshape: ",X.shape) + print("yshape: ",Y.shape) + of.write(str(cnt) + '\t' + str(len(item['distractors']))) + for r in res: + of.write('\t'+str(r)) + of.write('\n') + cnt += 1 + except: + pass + np.save(xf, X) + np.save(yf, Y) + + +def prepare_training_data_wordnet(fname, feature_path): + data_path = "/home/xinzhu/Code/CG/Layer1/dataset/" + print("calculate features for WordNet generated candidates...") + f = data_path + fname + ".json" + xf = feature_path + fname + "_X.npy" + yf = feature_path + fname + "_y.npy" + dataset = json.load(open(f, 'r')) + X = [] + Y = [] + cnt = 1 + index = [] + for item in dataset: + cnt += 1 + if isinstance(item['answer'],int): + continue + try: + tmpX, tmpY, res = calculate_features(item, mode=1, can_num=100, source="WordNet") + X.append(tmpX) + Y.append(tmpY) + index.append(cnt) + # print(tmpX) + # print(tmpY) + except: + pass + X = np.asarray(X) + Y = np.asarray(Y) + index = np.asarray(index) + np.save(xf, X) + np.save(yf, Y) + np.save(indexf, index) + + +def prepare_training_data_word2vec(fname,feature_path): + data_path = "/home/xinzhu/Code/CG/Layer1/dataset/" + print("calculate features for word2vec generated candidates...") + f = data_path + fname + ".json" + xf = feature_path + fname + "_X.npy" + yf = feature_path + fname + "_y.npy" + indexf = feature_path + fname + "_index.npy" + dataset = json.load(open(f, 'r')) + X = [] + Y = [] + cnt = 1 + for item in dataset: + if isinstance(item['answer'],int): + continue + try: + tmpX, tmpY, res = calculate_features(item, mode=1, can_num=100, source="Word2vec") + X.append(tmpX) + Y.append(tmpY) + # print(tmpX) + # print(tmpY) + except: + pass + X = np.asarray(X) + Y = np.asarray(Y) + np.save(xf, X) + np.save(yf, Y) + + +def prepare_train_data_new(dataset, fname, index, feature_path=feature_path, source="Probase"): + print("start prepare training data !") + print(len(dataset)) + xf = feature_path + fname + "_train_X.npy" + yf = feature_path + fname + "_train_y.npy" + X = [] + Y = [] + for i in range(len(dataset)): + if i+1 in index: + continue + try: + item = dataset[i] + tmpX, tmpY, res = calculate_features(item, 1, 100, source) + X.append(tmpX) + Y.append(tmpY) + print("test item done!") + except: + pass + X = np.asarray(X) + Y = np.asarray(Y) + np.save(xf, X) + np.save(yf, Y) + + +def prepare_test_data_new(dataset, fname, index, feature_path=feature_path, source="Probase"): + # data_path = "/home/xinzhu/Code/CG/Layer1/dataset/" + xf = feature_path + fname + "_test_X.npy" + yf = feature_path + fname + "_test_y.npy" + X = [] + Y = [] + print("start prepare testing data !") + print(index) + for i in index: + print(i) + item = dataset[i] + try: + tmpX, tmpY, res = test_calculate_features(item, 1, 100, source) + X.append(tmpX) + Y.append(tmpY) + print("test item done!") + except: + pass + X = np.asarray(X) + Y = np.asarray(Y) + np.save(xf, X) + np.save(yf, Y) + + +def 
test_calculate_features(item, mode=1, can_num=100, source="Probase"): + """ + Given an item, generate can_num candidates and test hit rate. + param: can_num: number of candidates to be generated + param: mode: 1 means single-processing, 2 means multi-processing have to return an additional len(distractors) + return: features: probabilty from Probase + embedding similarities for each candidate + """ + + # select a candidate generation source + # print("calculating features...") + if source == 'Probase': + candidates = search_candidates_from_e(item['sentence'],item['answer'],can_num) + print("Probase candidate done!") + elif source == "WordNet": + candidates = wordnet_predict(item['sentence'],item['answer'],can_num) + else: + pass + # candidates = word2vec_predict(item['sentence'],can_num) + # print(candidates) + # if answer is not in Probase + if candidates is None: + print("candidate is None!") + return np.array([]), np.array([]) + + cnt = 1 + rankings = {} # from candidate to its ranking + features = [] # concept probability + embedding features for each candidate + Y = [] # label, whether is a distractor or not + res = [] + dic = {} + visit = [0]*len(item['distractors']) + + # different forms of distractors to be tested + distractors = [] + for i in range(len(item['distractors'])): + d = normalize_instance(item['distractors'][i]) + for k in [item['distractors'][i], d, d.capitalize(), ' '.join([x.capitalize() for x in d.split()]), ''.join([x.capitalize() for x in d.split()])]: + distractors.append(k) + dic[k] = i + + item['answer'] = normalize_instance(item['answer']) + item['sentence'] = " ".join([normalize_instance(x) for x in item['sentence'].split()]) + scores = [] + LMProb = [] + pairs = [] + + for c,v in sorted(candidates.items(), key=lambda d: -d[1]): + # print("feature for ,", c) + y = 0 + rankings[c] = cnt + if c in distractors: + if visit[dic[c]] == 1: + continue + res.append(rankings[c]) + visit[dic[c]] = 1 + y = 1 + cnt += 1 + print("hit: ",cnt) + try: + features.append(cal_26_feature_vec([item['sentence'],item['answer'],c])) + Y.append(y) + scores.append([v]) + pairs.append([item['sentence'],c]) + except: + print("error") + pass + + features = np.array(features,dtype=np.float32) + scores = np.array(scores,dtype=np.float64) + scores_normed = scores / scores.max(axis=0) + features = np.hstack((features, scores_normed)) + print(features.shape) + if mode == 1: + return features, Y, res + else: + return features, Y, res, len(item['distractors']) + + +def prepare_train_data_by_item(fname,feature_path=feature_path): + """ + Calculate features of 100 candidates for each item in file. 
+ Save a numpy array for each item + """ + print("calculate features...") + f = data_path + fname + ".json" + xf = feature_path + fname + "_X.npy" + yf = feature_path + fname + "_y.npy" + # of = open('/home/xinzhu/Code/CDC/data/LM_features/feature.txt', 'w') + dataset = json.load(open(f, 'r')) + X = [] + Y = [] + cnt = 1 + for item in dataset: + if isinstance(item['answer'],int): + continue + try: + tmpX, tmpY, res = calculate_features(item) + X.append(tmpX) + Y.append(tmpY) + # for x in tmpX: + # of.write(str(x)) + # of.write(' ') + # of.write(tmpY) + # of.write('\n') + # of.flush() + cnt += 1 + except: + pass + X = np.asarray(X) + Y = np.asarray(Y) + np.save(xf, X) + np.save(yf, Y) + # of.close() + + +def multi_prepare_train_data(fname,feature_path=feature_path): + """ + Multiprocessing version of prepare_train_data_by_item + """ + print("multiprocessing calculate features...") + f = data_path + fname + ".json" + xf = feature_path + fname + "_X.npy" + yf = feature_path + fname + "_y.npy" + indexf = feature_path + fname + "_index.npy" + # of = open(data_path + fname + ".txt", 'w') + dataset = json.load(open(f, 'r')) + X = [] + Y = [] + cnt = 1 + results = [] + pool = Pool(multiprocessing.cpu_count()) + index = [] + cur = 0 + for item in dataset: + cur += 1 + if isinstance(item['answer'],int): + continue + try: + results.append(pool.apply_async(calculate_features, args=(item,))) + index.append(cur) + except: + pass + pool.close() + pool.join() + final_index = [] + i = 0 + for result in results: + try: + tmpX, tmpY, res = result.get() + X.append(tmpX) + Y.append(tmpY) + final_index.append(index[i]) + except: + print("get result error") + i += 1 + + X = np.asarray(X) + Y = np.asarray(Y) + final_index = np.asarray(final_index) + np.save(xf, X) + np.save(yf, Y) + np.save(indexf, final_index) + + +# if __name__ == '__main__': + #AI2-ScienceQuestions_new_feature.npy + # prepare_train_data() + # draw_recall_graph("/mnt/e/Course/NLP/Code/CandidateGeneration/ContextDependentConceptualization/src/result/AI2-ScienceQuestions_new.txt") \ No newline at end of file diff --git a/ConceptGenerator/layer1_ranker.py b/ConceptGenerator/layer1_ranker.py new file mode 100644 index 0000000..51c1603 --- /dev/null +++ b/ConceptGenerator/layer1_ranker.py @@ -0,0 +1,385 @@ +import numpy as np +from sklearn.ensemble import AdaBoostClassifier +from sklearn.model_selection import train_test_split +from sklearn import metrics +import xgboost as xgb +from xgboost import plot_importance +from sklearn.metrics import mean_squared_error +from sklearn.externals import joblib +from sklearn import preprocessing +import matplotlib +matplotlib.use('Agg') +import matplotlib.pyplot as plt +import os +from layer1_prepare_training_data import prepare_train_data_new, prepare_test_data_new, prepare_training_data_wordnet, prepare_training_data_word2vec, prepare_train_data_by_item, calculate_features, multi_prepare_train_data +import json +from sklearn.model_selection import GridSearchCV +from sklearn.svm import SVC +import argparse +import random + +parser = argparse.ArgumentParser() +parser.add_argument('--source', type=str, choices=['Probase', 'Word2vec', 'WordNet'], default="Probase", help='Probase, Word2vec, WordNet') +parser.add_argument('--fname', type=str, default="total_new", help='filename') +parser.add_argument('--feature_path', type=str, default="/home/xinzhu/Code/CG/Layer1/feature2/", help='feature path') +parser.add_argument('--model_path', type=str, default="/home/xinzhu/Code/CG/Layer1/model2/", help="model path") 
+parser.add_argument('--model_type', type=str, default="adaboost",help='model type') +args = parser.parse_args() + +data_path = "/home/xinzhu/Code/CG/Layer1/dataset/" +feature_path = "/home/xinzhu/Code/CG/Layer1/feature2/" +model_path = "/home/xinzhu/Code/CG/Layer1/model2/" + +class Ranker(): + """used to rank and evaluate candidates""" + def __init__(self, fname, model_name): # fname = Regents_new, model_name = "name" + self.fname = fname + self.model_name = model_name + self.train_size = 0 # size of training items + self.train_set = [] + self.train_X = [] + self.train_y = [] + self.test_size = 0 # size of testing items + self.test_set = [] + self.test_X = [] + self.test_y = [] + self.train_test_split = 0.2 + + def load(self, fname, split_rate=0.2): + model = train(fname, model_name) + jsonf = open(data_path + fname + ".json", 'r') + dataset = json.load(jsonf) + total_length = len(dataset) + self.test_size = int(total_length * 0.2) + self.train_size = total_length - self.test_size + self.test_set = dataset[:self.test_size] + self.train_set = dataset[self.test_size:] + self.train_X = calculate_features() + + +def transform(arr, mode): + trans = [] + if mode == 'X': + for X in arr: + if len(trans) == 0: + trans = X + else: + trans = np.vstack((trans,X)) + else: + for y in arr: + if len(trans) == 0: + trans = y + else: + trans = np.append(trans, y) + return np.asarray(trans) + + +def dcg_at_k(r, k, method=0): + """Score is discounted cumulative gain (dcg) + Relevance is positive real values. Can use binary + as the previous methods. + Example from + http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf + >>> r = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0] + >>> dcg_at_k(r, 1) + 3.0 + >>> dcg_at_k(r, 1, method=1) + 3.0 + >>> dcg_at_k(r, 2) + 5.0 + >>> dcg_at_k(r, 2, method=1) + 4.2618595071429155 + >>> dcg_at_k(r, 10) + 9.6051177391888114 + >>> dcg_at_k(r, 11) + 9.6051177391888114 + Args: + r: Relevance scores (list or numpy) in rank order + (first element is the first item) + k: Number of results to consider + method: If 0 then weights are [1.0, 1.0, 0.6309, 0.5, 0.4307, ...] + If 1 then weights are [1.0, 0.6309, 0.5, 0.4307, ...] + Returns: + Discounted cumulative gain + """ + r = np.asfarray(r)[:k] + if r.size: + if method == 0: + return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1))) + elif method == 1: + return np.sum(r / np.log2(np.arange(2, r.size + 2))) + else: + raise ValueError('method must be 0 or 1.') + return 0. + + +def ndcg_at_k(r, k, method=0): + """Score is normalized discounted cumulative gain (ndcg) + Relevance is positive real values. Can use binary + as the previous methods. + Example from + http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf + >>> r = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0] + >>> ndcg_at_k(r, 1) + 1.0 + >>> r = [2, 1, 2, 0] + >>> ndcg_at_k(r, 4) + 0.9203032077642922 + >>> ndcg_at_k(r, 4, method=1) + 0.96519546960144276 + >>> ndcg_at_k([0], 1) + 0.0 + >>> ndcg_at_k([1], 2) + 1.0 + Args: + r: Relevance scores (list or numpy) in rank order + (first element is the first item) + k: Number of results to consider + method: If 0 then weights are [1.0, 1.0, 0.6309, 0.5, 0.4307, ...] + If 1 then weights are [1.0, 0.6309, 0.5, 0.4307, ...] + Returns: + Normalized discounted cumulative gain + """ + dcg_max = dcg_at_k(sorted(r, reverse=True), k, method) + if not dcg_max: + return 0. 
+ return dcg_at_k(r, k, method) / dcg_max + + +def train_from_file(fname, model_name, feature_path=feature_path,model_path=model_path,source="Probase"): + """ + Train models for a given json file and model_name + param: fname: only the file name (no path needed) + param: model_name: choose from 'adaboost', 'xgboost', 'k-fold' + """ + print("start ",fname) + trainXf = feature_path + fname + "_train_X.npy" + trainyf = feature_path + fname + "_train_y.npy" + testXf = feature_path + fname + "_test_X.npy" + testyf = feature_path + fname + "_test_y.npy" + if os.path.exists(trainXf) and os.path.exists(testXf): + pass + else: + data_path = "/home/xinzhu/Code/CG/Layer1/dataset/" + f = data_path + fname + ".json" + dataset = json.load(open(f, 'r')) + index_list = [int(line) for line in open("/home/xinzhu/Code/CG/Layer1/test_index.txt",'r').readlines()] + if os.path.exists(trainXf): + prepare_test_data_new(dataset, fname, index_list, feature_path, source) + elif os.path.exists(testXf): + prepare_train_data_new(dataset, fname, index_list, feature_path, source) + else: + prepare_test_data_new(dataset, fname, index_list, feature_path, source) + prepare_train_data_new(dataset, fname, index_list, feature_path, source) + # if source == "Probase": + # #prepare_train_data_by_item(fname,feature_path) # multi_prepare_train_data, prepare_train_data_by_item + # index = multi_prepare_train_data(fname,feature_path) + # elif source == "WordNet": + # prepare_training_data_wordnet(fname,feature_path) + # else: + # prepare_training_data_word2vec(fname,feature_path) + X_train = np.load(trainXf) + y_train = np.load(trainyf) + X_test = np.load(testXf) + y_test = np.load(testyf) + # index = np.load(feature_path + fname + "_index.npy") + # X_train, X_test, y_train, y_test, = train_test_split(X, y, test_size=0.2, random_state=123) + # X_train, X_test, y_train, y_test, train_index, test_index = train_test_split(X, y, index, test_size=0.2, random_state=123) + y_test_origin = y_test + X_test_origin = X_test + X_train = transform(X_train, 'X') + X_test = transform(X_test, 'X') + y_train = transform(y_train, 'y') + y_test = transform(y_test, 'y') + + where_are_NaNs = np.isnan(X_train) + X_train[where_are_NaNs] = 0 + where_are_NaNs = np.isnan(X_test) + X_test[where_are_NaNs] = 0 + + # add preprocessing + # min_max_scaler = preprocessing.MinMaxScaler() + # X_train = min_max_scaler.fit_transform(X_train) + # X_test = min_max_scaler.transform(X_test) + + modelp = model_path + fname + "_" + model_name + "_new.joblib.dat" # store path of final model + if os.path.exists(modelp): + print("load model...") + model = joblib.load(modelp) + else: + print("training...") + if model_name == 'adaboost': + abc = AdaBoostClassifier(n_estimators=50,learning_rate=1) + model = abc.fit(X_train, y_train) + + elif model_name == 'svm': + model = SVC(gamma='auto') + model.fit(X_train, y_train) + + elif model_name == 'adaboost_optimize': + abc = AdaBoostClassifier(DecisionTreeClassifier(min_samples_split=20,min_samples_leaf=5),algorithm='SAMME.R') + param_test1 = {'n_estimators': range(50,200,10),"learning_rate":[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]} + gsearch1 = GridSearchCV(abc,param_test1,cv=10) + gsearch1.fit(X_train,y_train) + learning_rate = gsearch1.best_params_["learning_rate"] + n_estimators = gsearch1.best_params_['n_estimators'] + abc = AdaBoostClassifier(DecisionTreeClassifier(min_samples_split=20, min_samples_leaf=5), + algorithm="SAMME", + n_estimators=n_estimators, learning_rate=learning_rate) + model = abc.fit(X_train, y_train) + + elif 
model_name == 'xgboost': + # implementation based on Scikit-learn + dtrain = xgb.DMatrix(X_train, label=y_train) + model = xgb.XGBClassifier(objective ='binary:logistic', colsample_bytree = 0.3, learning_rate = 0.1, + max_depth = 5, alpha = 10, n_estimators = 20, random_state=123) + # n_estimators: number of trees you want to build. + # colsample_bytree: percentage of features used per tree. High value can lead to overfitting. + # max_depth: determines how deeply each tree is allowed to grow during any boosting round. + # learning_rate: step size shrinkage used to prevent overfitting. Range is [0,1] + # objective: determines the loss function to be used like reg:linear for regression problems, reg:logistic for classification problems with only decision, binary:logistic for classification problems with probability. + model.fit(X_train,y_train) + preds = model.predict(X_test) + plot_importance(model) # counting the number of times each feature is split on across all boosting rounds (trees) in the model + plt.savefig('feature_importance.png') + rmse = np.sqrt(mean_squared_error(y_test, preds)) + print("RMSE: %f" % (rmse)) + # xgb.plot_tree(model,num_trees=0) + # plt.rcParams['figure.figsize'] = [50, 10] + # plt.savefig('tree.png') + + elif model_name == 'k-fold': + params = {"objective":"binary:logistic",'colsample_bytree': 0.3,'learning_rate': 0.1, + 'max_depth': 5, 'alpha': 10} + data_dmatrix = xgb.DMatrix(data=X,label=y) # an optimized data structure that XGBoost supports + cv_results = xgb.cv(dtrain=data_dmatrix, params=params, nfold=3, + num_boost_round=50,early_stopping_rounds=10,metrics="rmse", as_pandas=True, seed=123) + print((cv_results["test-rmse-mean"]).tail(1)) + + joblib.dump(model, modelp) + + y_pred = model.predict(X_test) + total, valid = 0, 0 + for i in range(len(y_pred)): + if y_test[i] == 1: + total += 1 + if y_pred[i] == 1: + valid += 1 + print("Testing...") + print("Accuracy:",metrics.accuracy_score(y_test, y_pred)) + print("hit rate:",float(valid)/total) + + res = {"Recall@3": 0.0,"Recall@10": 0.0,"Recall@50": 0.0, "Recall@100": 0.0,"P@1": 0.0, "P@3": 0.0, "P@10": 0.0, "F1@3":0.0, "F1@10":0.0,\ + "MRR": 0.0, "MAP@10": 0.0, "NDCG@10": 0.0} + + for tmpX, tmpY in zip(X_test_origin, y_test_origin): + where_are_NaNs = np.isnan(tmpX) + tmpX[where_are_NaNs] = 0 + # tmpX = min_max_scaler.transform(tmpX) + + scores = model.predict_proba(tmpX)[:,1] # calculate probabilities for each candidate + index = np.argsort(-scores) # rank according to probabilities, greatest to smallest + length_index = len(index) + # tmpY = tmpY.tolist() + tmpres = {"Recall@3": 0.0,"Recall@10": 0.0,"Recall@50": 0.0, "Recall@100": 0.0,"P@1": 0.0, "P@3": 0.0, "P@10": 0.0, "F1@3":0.0, "F1@10":0.0,\ + "MRR": 0.0, "MAP@10": 0.0, "NDCG@10": 0.0} + # P@1 P@3 P@10 + for precisionk in [1,3,10]: + precision = 0 + for i in range(precisionk): + try: + if tmpY[index[i]] == 1: + precision += 1 + except: + pass + res["P@"+str(precisionk)] += float(precision)/precisionk + tmpres["P@"+str(precisionk)] = float(precision)/precisionk + + # Recall@10, Recall@50, Recall@100 + length = tmpY.count(1) # total valid distractors + for recallk in [3, 10,50,100]: + recall = 0 + for i in range(recallk): + if i >= length_index: + break + try: + if tmpY[index[i]] == 1: + recall += 1 + except: + pass + tmpres["Recall@"+str(recallk)] = float(recall)/(length+1) + res["Recall@"+str(recallk)] += float(recall)/(length+1) + + # MRR + for i in range(length_index): + if tmpY[index[i]] == 1: + tmpres["MRR"] += 1.0/(i+1) + + # MAP@10 + num_correct 
= 0.0 + for i in range(10): + try: + if tmpY[index[i]] == 1: + num_correct += 1.0 + tmpres["MAP@10"] += num_correct / (i + 1) + except: + pass + try: + tmpres["MAP@10"] /= num_correct + except: + pass + + # NDCG@10 + scores = [] + for i in range(10): + try: + scores.append(tmpY[index[i]]) + except: + scores.append(0) + res["NDCG@10"] += ndcg_at_k(scores,10) + + res['MAP@10'] += tmpres["MAP@10"] + res['MRR'] += tmpres['MRR'] + try: + res["F1@3"] += 2*tmpres["Recall@3"]*tmpres["P@3"] / (tmpres["Recall@3"]+tmpres["P@3"]) + except: + pass + try: + res["F1@10"] += 2*tmpres["Recall@10"]*tmpres["P@10"] / (tmpres["Recall@10"]+tmpres["P@10"]) + except: + pass + total += 1 + # print("done, ",total) + for k in res.keys(): + res[k] /= total + + # indexf = open("test_index.txt",'w') + # for i in test_index: + # indexf.write(str(i)) + # indexf.write('\n') + # indexf.close() + + print("metrics: ") + for k, v in res.items(): + print(k+": "+str(v)) + +if __name__ == '__main__': +# # train("Regents_new",'adaboost') +# # evaluate("Regents_new",'adaboost') +# train_from_file("Regents_new",'adaboost') +# train_from_file("Regents_new",'xgboost') +# train_from_file("AI2-ScienceQuestions_new",'adaboost') + # train_from_file("mcq_new",'adaboost') + # train_from_file("mcql_new",'adaboost') + # train_from_file("trivia_new",'adaboost') + # train_from_file("total_new",'adaboost') + + # Probase + rerank + train_from_file(args.fname, args.model_type, args.feature_path, args.model_path, args.source) + + # WordNet + rerank + #train_from_file("total_new",'adaboost',feature_path="/home/xinzhu/Code/CG/Layer1/wordnet_feature/",model_path="/home/xinzhu/Code/CG/Layer1/wordnet_model/", source="WordNet") + + # train_from_file("total_new",'adaboost',feature_path="/home/xinzhu/Code/CG/Layer1/model_word2vec/",model_path="/home/xinzhu/Code/CG/Layer1/feature_word2vec/", source="Word2vec") + # try to optimize model + # train_from_file("total_new",'adaboost',feature_path="/home/xinzhu/Code/CG/Layer1/hyper_feature/",model_path="/home/xinzhu/Code/CG/Layer1/hyper_model/") \ No newline at end of file diff --git a/ConceptGenerator/search_candidates_from_e.py b/ConceptGenerator/search_candidates_from_e.py new file mode 100644 index 0000000..9d3456b --- /dev/null +++ b/ConceptGenerator/search_candidates_from_e.py @@ -0,0 +1,96 @@ +import time +from utilities import normalize_instance +from Conceptualizer import Conceptualizer +from LDA import LDA +import sqlite3 +import gensim +from gensim import corpora +from collections import defaultdict + +conn = sqlite3.connect('/home/xinzhu/Dataset/Probase/probase.db') +c = conn.cursor() +dump_file = '/home/xinzhu/Dataset/enwiki-latest-pages-articles.xml.bz2' #for generate_input_files +bow_path = '/home/xinzhu/Code/CDC/data/full_wiki_bow.mm' # doc to [(word_id, count)..] 
mapping +dict_path = '/home/xinzhu/Code/CDC/data/full_wiki.dictionary' # word_id to word mapping +model_file = '/home/xinzhu/Code/CDC/models/ldamodel_topics100_trainiter20_full_en.gensim' +num_topics = 100 + +id2word_dict = gensim.corpora.Dictionary.load(dict_path) +# print(id2word_dict.token2id.items()[:100]) + +lda = LDA() +debug = False +lda.load(model_file) +print("Load LDA model") +conceptualizer = Conceptualizer(lda) + +def search_e_from_c(c,concept,k): + """ + Find all entities under concept + :param c: the database cursor + :param concept: concept to be searched + :param k: maximum number of entities to be generated + :return: a sorted list containing (entity_name, frequency) pairs + """ + cursor = c.execute('select entity, frequency from isa_core where concept=?',(concept,)) + entities = [] + for row in cursor: + entities.append([row[0],int(row[1])]) + entities = sorted(entities,key=lambda x:-x[1]) + return entities[:k] if len(entities) > k else entities + +def candidate_prob(candidates): + """ + Merge all condidates and calculate their probabilities + :param candidates: a list containing the candidate, frequency pairs for each concept ([ ['candidate_name', frequency] ... ], concept_probability) + :return : a dict containing + """ + cd = defaultdict(lambda: 0) + for candidateL, probC in candidates: + total_freq = sum(freq for candidate,freq in candidateL) + for candidate, freq in candidateL: + value = float(freq)/total_freq*probC + cd[candidate] += value + return cd + +def search_candidates_from_e(sentence, key, can_num=10): + """ + Given a sentence and key, conceptulize it and find candidates for the key + :param sentence: a complete sentence with key filled into the originial gap + :param key: entity to be searched in Probase + :param can_num: maximum number of candidates to be generated + :return: a list containing the candidate, frequency pairs for each concept ([ ['candidate_name', frequency] ... 
], concept_probability) + a dict {'candidate_name':frequency...} + """ + sentence = sentence.replace('**blank**', key) + print("Probase sentence: ", sentence) + probabilities_of_concepts = conceptualizer.conceptualize(sentence, key, debug, eval=True) + print("Probability of concepts done!") + if probabilities_of_concepts is None: + return None + cnt = 0 + candidates = [] + syn_key = normalize_instance(key,mode=1) + for concept, prob in probabilities_of_concepts: + # add original candidates if its normalized form is not in syn_key + tmp = [x for x in search_e_from_c(c, concept, can_num) if normalize_instance(x[0]) not in syn_key] + cnt += len(tmp) + candidates.append((tmp, prob)) + if cnt > can_num: + candidates = candidate_prob(candidates) + return candidates + candidates = candidate_prob(candidates) + return candidates + +# debug = True +# search_candidates_from_e("apple and iPad are useful products", "apple") +# search_candidates_from_e("He likes to eat apple", "apple") + +# search_candidates_from_e("Earth's core is primarily composed of magma of the following materials", "magma") +# search_candidates_from_e("the ba4ic unit of life is cell",'cell') +#print(search_candidates_from_e("human have been on the earth for the shortest amount of time",'human',100)) # "Insects","Fish","Reptiles" + +#The following shows 100 candidates +# candidates = search_candidates_from_e("the most basic unit of living things is Cells",'Cells',100) # "Bones","Tissues","Organs" +# cd = candidate_prob(candidates) +# print(sorted(cd.items(), key=lambda d: -d[1])) \ No newline at end of file diff --git a/ConceptGenerator/utilities.py b/ConceptGenerator/utilities.py new file mode 100644 index 0000000..5a5921c --- /dev/null +++ b/ConceptGenerator/utilities.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- +import re +import urllib +from nltk.stem import WordNetLemmatizer +from nltk.corpus import wordnet +import requests +from nltk.stem import PorterStemmer + +cache = {} +lemmatizer = WordNetLemmatizer() +stemmer = PorterStemmer() +def get_concepts_of_instance_by_probase(instance, eval, use_cache=True): + """ + Fetches the concept and the probabilities for a given instance by probase. + :param instance: the instance, for which the concepts should be requested + :param use_cache: if true a cache for instances and corresponding concepts is used, to avoid unnecessary requests + :return: the concepts and their probability + """ + from urlparse import urlparse + if use_cache == True and instance in cache: + return cache[instance] + if eval: + probase_url = 'https://concept.research.microsoft.com/api/Concept/ScoreByProb?instance={' \ + '}&topK=100&api_key=eT5luCbmII34ZvpPVs7HxtbUU1cFcE12' + else: + probase_url = 'https://concept.research.microsoft.com/api/Concept/ScoreByProb?instance={}&topK=20&api_key=eT5luCbmII34ZvpPVs7HxtbUU1cFcE12' + try: + requestUrl = probase_url.format(urllib.pathname2url(instance)) + except: + print("request error!") + requestUrl = probase_url.format(urllib.request.pathname2url(instance)) + try: + response = requests.get(requestUrl) + except requests.exceptions.ConnectionError as e: + print(e) + print("\n\ntry one last time...") + response = requests.get(requestUrl) + + if response is None: + print("microsoft api error!") + return None + concepts = response.json() + return concepts + + +def appendIfNotEmpty(list, item): + """ + Append item to list, if item is not None. 
+    in place.
+    :param list: the list to which the item should be appended
+    :param item: the item which should be appended to the list
+    """
+    if item:
+        list.append(item)
+
+
+def split_text_in_words(text):
+    """
+    Splits a given text into words
+    :param text: the text which should be split into words
+    :return: a list containing the split words
+    """
+    real_words = []
+
+    words = re.findall(r'\'|’|"|”|“|»|«|\(|\)|\[|\]|\{|\}:;|[^\'’"”“»«\(\)\[\]\{\}\s:;]+', text)
+    for word in words:
+        word = word.strip()
+        if word.startswith("..."):
+            real_words.append(word[:3])
+            appendIfNotEmpty(real_words, word[3:])
+        if word.startswith(("\"", "(", "[", "{", "<", "«", "…", "“")):
+            real_words.append(word[:1])
+            word = word[1:]
+        if word.endswith("..."):
+            appendIfNotEmpty(real_words, word[:-3])
+            real_words.append(word[-3:])
+        elif word.endswith((".", ",", ":", ";", "]", ")", "}", "!", "?", "\"", ">", "»", "…", "”")):
+            appendIfNotEmpty(real_words, word[:-1])
+            real_words.append(word[-1:])
+        else:
+            appendIfNotEmpty(real_words, word)
+    return real_words
+
+
+def normalize_instance(s, mode=2):
+    """
+    Normalize to a lowercase lemma string
+    :param s: the string to be processed
+    :param mode: 1 means return the set of WordNet synonym lemmas, 2 means only return the normalized string itself
+    """
+    try:
+        s = s.lower()
+        s = lemmatizer.lemmatize(s)
+        # s = stemmer.stem(s)
+    except:
+        return s
+    if mode == 1:
+        synset = set()
+        for syn in wordnet.synsets(s):
+            for l in syn.lemmas():
+                synset.add(l.name().replace('_', ' '))
+        return synset
+    else:
+        return s
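
For reference, a minimal end-to-end sketch of how the pieces of ConceptGenerator/LDA.py fit together; the file names below are placeholders, not the paths used elsewhere in this patch:

from LDA import LDA

lda = LDA()
# 1) stream the Wikipedia dump once to build the dictionary and serialize the bag-of-words corpus
lda.generate_bow_of_dump_file('enwiki-latest-pages-articles.xml.bz2',
                              'wiki_bow.mm', 'wiki.dictionary')
# 2) train an LdaMulticore model on the serialized corpus
lda.train_on_dump_file(num_topics=100,
                       bow_path='wiki_bow.mm',
                       dict_path='wiki.dictionary',
                       model_outputfile='ldamodel_topics100.gensim',
                       training_iterations=20)
# 3) reload the trained model later, e.g. from the Conceptualizer
lda.load('ldamodel_topics100.gensim')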
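
A note on emb_sim in ConceptGenerator/layer1_calculate_features.py: its denominator accumulates the two squared norms (‖a‖² + ‖d‖²) rather than their product, so the value is not a standard cosine. If plain cosine similarity over the averaged word vectors is intended, a reference helper would look like this (a sketch; cosine_sim is a hypothetical name, not part of the patch):

import numpy as np

def cosine_sim(avec, dvec):
    # standard cosine similarity: dot product divided by the product of the norms
    norm_a = np.linalg.norm(avec)
    norm_d = np.linalg.norm(dvec)
    if norm_a == 0 or norm_d == 0:
        return 0.0
    return float(np.dot(avec, dvec) / (norm_a * norm_d))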
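
Two smaller details in the same feature module, shown as hedged reference helpers (names are illustrative): nltk.jaccard_distance returns a distance, i.e. 1 minus the Jaccard similarity that the token_sim docstring describes, and the counts in unigram_freq.csv are loaded as strings, so the averages in freq() only behave numerically if the values are cast:

import csv
import nltk

def token_jaccard_similarity(s1, s2):
    # Jaccard similarity between the two token sets = 1 - Jaccard distance
    aset = set(nltk.word_tokenize(s1))
    dset = set(nltk.word_tokenize(s2))
    return 1.0 - nltk.jaccard_distance(aset, dset)

def load_freq_dict(path):
    # cast the counts to int; skip a header row here if the CSV has one
    with open(path) as infile:
        return {row[0]: int(row[1]) for row in csv.reader(infile)}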
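
candidate_prob in ConceptGenerator/search_candidates_from_e.py normalizes the entity frequencies within each concept, weights them by the concept probability, and sums the results per candidate. A small worked example of that computation (the concepts, entities and numbers are invented purely for illustration; merge_candidates mirrors candidate_prob so the snippet runs standalone):

from collections import defaultdict

def merge_candidates(candidates):
    # same computation as candidate_prob(): per-concept normalization,
    # weighting by concept probability, summed per candidate
    cd = defaultdict(float)
    for candidate_list, prob_c in candidates:
        total_freq = sum(freq for _, freq in candidate_list)
        for cand, freq in candidate_list:
            cd[cand] += float(freq) / total_freq * prob_c
    return cd

example = [
    ([['dog', 30], ['cat', 10]], 0.6),    # concept 1 with probability 0.6
    ([['cat', 20], ['horse', 20]], 0.4),  # concept 2 with probability 0.4
]
print(sorted(merge_candidates(example).items(), key=lambda kv: -kv[1]))
# dog:   30/40 * 0.6               = 0.45
# cat:   10/40 * 0.6 + 20/40 * 0.4 = 0.35
# horse: 20/40 * 0.4               = 0.20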
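
get_concepts_of_instance_by_probase (in both layer1_prepare_training_data.py and utilities.py) builds its request with the Python 2 urllib.pathname2url / urlparse APIs. Under Python 3 the same request could be built with urllib.parse.quote; the sketch below reuses the endpoint, topK and api_key that appear in the patch, and the function name is illustrative:

from urllib.parse import quote
import requests

def probase_concepts(instance, top_k=10, api_key='eT5luCbmII34ZvpPVs7HxtbUU1cFcE12'):
    # Python 3 counterpart of the urllib.pathname2url-based URL construction above
    url = ('https://concept.research.microsoft.com/api/Concept/ScoreByProb'
           '?instance={}&topK={}&api_key={}').format(quote(instance), top_k, api_key)
    return requests.get(url).json()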
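
layer1_ranker.py is driven by its argparse flags; spelling the defaults out explicitly, a typical training/evaluation run looks like this (assuming the dataset, feature and model directories from the patch exist on the target machine):

python layer1_ranker.py --fname total_new --model_type adaboost --source Probase \
    --feature_path /home/xinzhu/Code/CG/Layer1/feature2/ \
    --model_path /home/xinzhu/Code/CG/Layer1/model2/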