diff --git a/ConceptGenerator/LDA.py b/ConceptGenerator/LDA.py new file mode 100644 index 0000000..21d368e --- /dev/null +++ b/ConceptGenerator/LDA.py @@ -0,0 +1,124 @@ +import itertools +import os + +import gensim +from gensim import corpora +from gensim.corpora.wikicorpus import _extract_pages, filter_wiki +from gensim.parsing.preprocessing import STOPWORDS +from gensim.utils import smart_open, simple_preprocess +from stop_words import get_stop_words + + +def tokenize(text): + """ + Preprocess and then tokenize a given text + :param text: the text which should be tokenized. + :return: the token of the given text, after preprocess the text + """ + return [token for token in simple_preprocess(text) if token not in STOPWORDS] + + +def iter_over_dump_file(dump_file, min_length_of_article=50, ignore_namespaces=None): + """ + Iterator over wiki_dump_file. + Returns title and tokens for next article in dump file. + Ignores short articles. + Ignores meta articles, throug given namespaces. + Default namespaces are 'Wikipedia', 'Category', 'File', 'Portal', 'Template', 'MediaWiki', 'User', 'Help', 'Book', 'Draft' + :param dump_file: the dump file + :param min_length_of_article: the min number of words in the next article. Default = 50 + :param ignore_namespaces: list of namespaces which should be ignored. + :return: title, tokens + """ + if ignore_namespaces is None: + ignore_namespaces = 'Wikipedia Category File Portal Template MediaWiki User Help Book Draft'.split() + for title, text, pageid in _extract_pages(smart_open(dump_file)): + text = filter_wiki(text) + tokens = tokenize(text) + if len(tokens) < min_length_of_article or any( + title.startswith(namespace + ':') for namespace in ignore_namespaces): + continue # ignore short articles and various meta-articles + yield title, tokens + + +class LDA(): + def __init__(self): + self.stop_words = get_stop_words('en') + + def load(self, model_file): + """ + Loads a LDA model from a given file + :param model_file: the file which contains the model, which should be loaded + """ + from gensim.models.ldamodel import LdaModel + # self.ldamodel = LdaModel.load(model_file) + self.ldamodel = gensim.models.ldamulticore.LdaMulticore.load(model_file) + # print(self.ldamodel.print_topics(num_topics=100)) + + # self.ldamodel = gensim.models.wrappers.LdaMallet.load(model_file) + # from gensim.models.wrappers.ldamallet import malletmodel2ldamodel + # self.ldamodel.show_topics(num_topics=5, num_words=10) + # self.ldamodel = malletmodel2ldamodel(self.ldamodel) + # print(self.ldamodel.__dict__) + + def generate_bow_of_dump_file(self, dump_file, bow_output_file, dict_output_file): + doc_stream = (tokens for _, tokens in iter_over_dump_file(dump_file)) + id2word_dict = gensim.corpora.Dictionary(doc_stream) #obtain: (word_id:word) + print(id2word_dict) + id2word_dict.filter_extremes(no_below=20, no_above=0.1, keep_n=250000) # word must appear >10 times, and no more than 20% documents + print(id2word_dict) + dump_corpus = DumpCorpus(dump_file, id2word_dict) #from dictionary to bag of words + print("save bow...") + #Iterate through the document stream corpus, saving the documents to fname and recording byte offset of each document. + gensim.corpora.MmCorpus.serialize(bow_output_file, dump_corpus) + print("save dict") + id2word_dict.save(dict_output_file) + + def train_on_dump_file(self, num_topics, bow_path, dict_path, model_outputfile, training_iterations=20, + max_docs=None): + """ + Trains a new LDA model based on a wikipedia dump or any other dump in the same format. 
+ The dump could be zipped. + :param num_topics: the number of topics, which should be generated + :param bow_path: the path inclusive filename, where the bag of words should be saved + :param dict_path: the path incl. filename, where the dictionary should be saved + :param model_outputfile: the file in which the trained model should be stored + :param training_iterations: the number of LDA training iterations + :param max_docs: the number of how many docs should be used for training, if None all docs are used + """ + print("load bow...") + mm_corpus = gensim.corpora.MmCorpus(bow_path) + print("load dict...") + id2word_dict = gensim.corpora.Dictionary.load(dict_path) + clipped_corpus = gensim.utils.ClippedCorpus(mm_corpus, max_docs) + print("start training") + #train LDA on bag of word corpus + self.ldamodel = gensim.models.ldamulticore.LdaMulticore(clipped_corpus, num_topics=num_topics, + id2word=id2word_dict, passes=training_iterations, + minimum_probability=0) + print("save model") + self.ldamodel.save(model_outputfile) + + +class DumpCorpus(object): + def __init__(self, dump_file, dictionary, clip_docs=None): + """ + Parse the first `clip_docs` documents from file `dump_file`. + Yield each document in turn, as a list of tokens (unicode strings). + """ + self.dump_file = dump_file + self.dictionary = dictionary + self.clip_docs = clip_docs + + def __iter__(self): + """ + Iterator over wiki corpus + :return: bag-of-words format = list of `(token_id, token_count)` 2-tuples + """ + self.titles = [] + for title, tokens in itertools.islice(iter_over_dump_file(self.dump_file), self.clip_docs): + self.titles.append(title) + yield self.dictionary.doc2bow(tokens) # tokens to (token_id, token_count) tuples + + def __len__(self): + return self.clip_docs diff --git a/ConceptGenerator/layer1_calculate_features.py b/ConceptGenerator/layer1_calculate_features.py new file mode 100644 index 0000000..1a20e7f --- /dev/null +++ b/ConceptGenerator/layer1_calculate_features.py @@ -0,0 +1,266 @@ +import nltk +import csv +import re +import numpy as np +import inflect +import pickle +from difflib import SequenceMatcher +from gensim.models import Word2Vec + +p = inflect.engine() +model = Word2Vec.load("/home/xinzhu/Dataset/Word2Vec-on-Wikipedia-Corpus/model/word2vec_gensim") +print("Prepare Word2Vec model done!") + +prefix = '/home/xinzhu/Code/model/feature/' +infile = open(prefix+'unigram_freq.csv', mode='r') +reader = csv.reader(infile) +freq_dict = {row[0]:row[1] for row in reader} + +fin = open('/home/xinzhu/Code/Mydata/data/vocab_python2.pkl', 'rb') +vocab = pickle.load(fin) +print('loading saved vocab...') +fin.close() + +fin = open('/home/xinzhu/Code/Mydata/data/embd_python2.pkl', 'rb') +embd = pickle.load(fin) +print('loading saved embd...') +fin.close() + +cnt = 0 + +def emb_sim(a,d): + avec = np.array([0.0]*300) + dvec = np.array([0.0]*300) + try: + aL = a.split(' ') + dL = d.split(' ') + for word in aL: + try: + emres = [float(x) for x in embd[vocab[word]]] + avec += emres + except: + pass + for word in dL: + try: + emres = [float(x) for x in embd[vocab[word]]] + dvec += emres + except: + pass + avec /= len(aL) + dvec /= len(dL) + except: + try: + avec = [float(x) for x in embd[vocab[a]]] + dvec = [float(x) for x in embd[vocab[d]]] + except: + pass + upnum = 0 + downnum = 0 + try: + for i in range(len(avec)): + upnum += avec[i]*dvec[i] + downnum += avec[i]*avec[i] + downnum += dvec[i]*dvec[i] + if downnum == 0: + return 0 + return upnum/downnum + except: + return 0 + +def pos_sim(a,d): +#"""POS 
similarity a is answer, d is distractor""" + try: + apos = nltk.pos_tag(nltk.word_tokenize(a)) + dpos = nltk.pos_tag(nltk.word_tokenize(d)) + aset = set() + dset = set() + for tag in apos: + aset.add(tag[1]) + for tag in dpos: + dset.add(tag[1]) + M11 = len(aset & dset) + M10 = len(aset - dset) + M01 = len(dset - aset) + similarity = M11/(M11+M10+M01) if (M11+M10+M01)>0 else 0 + #print("POS_sim, ",similarity) + return similarity + except: + return 0 + +def edit_distance(s1, s2): +#"""levenshteinDistance""" + try: + return nltk.edit_distance(s1,s2) + except: + return 0 + +def token_sim(s1,s2): +#""" jaccard similarity between two strings""" + try: + aset = set(nltk.word_tokenize(s1)) + dset = set(nltk.word_tokenize(s2)) + return nltk.jaccard_distance(aset,dset) + except: + return 0 + +def length_sim(a,d): +#"""calculate a and d's character and token lengths and the difference of lengths""" + try: + acharlen = len(a) + dcharlen = len(d) + atokenlen = len(nltk.word_tokenize(a)) + dtokenlen = len(nltk.word_tokenize(d)) + diffcharlen = abs(acharlen-dcharlen) + difftokenlen = abs(atokenlen-dtokenlen) + return [acharlen,dcharlen,atokenlen,dtokenlen,diffcharlen,difftokenlen] + except: + return [acharlen,dcharlen,1,1,diffcharlen,difftokenlen] + +# Function to find Longest Common Sub-string +def suffix(str1,str2): + try: + # initialize SequenceMatcher object with + # input string + seqMatch = SequenceMatcher(None,str1,str2) + # find match of longest sub-string + # output will be like Match(a=0, b=0, size=5) + match = seqMatch.find_longest_match(0, len(str1), 0, len(str2)) + # print longest substring + if (match.size!=0): + res = str1[match.a: match.a + match.size] + abs_len = len(res) + return [abs_len,float(abs_len)/len(str1) if len(str1)>0 else 0.0,float(abs_len)/len(str2) if len(str2)>0 else 0.0] + else: + return [0,0.0,0.0] + except: + return [0,0.0,0.0] + +def freq(a,d): +#"""average word frequency in a and d""" + try: + aL = a.split() + dL = d.split() + afreqs = [] + dfreqs = [] + for word in aL: + afreqs.append(freq_dict.get(word,0)) + for word in dL: + dfreqs.append(freq_dict.get(word,0)) + return [sum(afreqs)/len(afreqs) if len(afreqs)>0 else 0,sum(dfreqs)/len(dfreqs) if len(dfreqs)>0 else 0] + except: + return [0.0,0.0] + +def is_plural( noun): + try: + return p.singular_noun(noun) is not False + except: + return False + +def singlar_or_plural(a,d): + try: + a = nltk.word_tokenize(a) + d = nltk.word_tokenize(d) + aflag = False + dflag = False + for x in a: + if is_plural(x): + aflag = True + for x in d: + if is_plural(x): + dflag = True + if aflag == dflag: + return 1 + else: + return 0 + except: + return 0 + +def num(s): +# whether numbers appear in a and d + if re.search(r'\d', s): + return True + _known = { + 'zero': 0, + 'one': 1, + 'two': 2, + 'three': 3, + 'four': 4, + 'five': 5, + 'six': 6, + 'seven': 7, + 'eight': 8, + 'nine': 9, + 'ten': 10, + 'eleven': 11, + 'twelve': 12, + 'thirteen': 13, + 'fourteen': 14, + 'fifteen': 15, + 'sixteen': 16, + 'seventeen': 17, + 'eighteen': 18, + 'nineteen': 19, + 'twenty': 20, + 'thirty': 30, + 'forty': 40, + 'fifty': 50, + 'sixty': 60, + 'seventy': 70, + 'eighty': 80, + 'ninety': 90 + } + for n in _known.keys(): + if n in s: + return True + return False + +def wiki_sim(a,d): + res = 0 + try: + res = model.similarity(a,d) + except: + pass + return res + +def cal_10_feature_vec(params): + q = params[0].replace('_',' ') + a = params[1].replace('_',' ') + d = params[2].replace('_',' ') + y = params[3] + features = [] + 
features.extend([emb_sim(q,d),emb_sim(a,d)]) + features.append(pos_sim(a,d)) + features.append(edit_distance(a,d)) + features.extend([token_sim(q,d),token_sim(a,d),token_sim(q,a)]) + features.extend(length_sim(a,d)) + features.extend(suffix(a,d)) + features.extend(freq(a,d)) + global cnt + cnt += 1 + if cnt%10000 == 0: + print(cnt) + return [features,y,q,a,d] + +def cal_26_feature_vec(params): +#"""26-dimensional feature vector""" + q = params[0] + a = params[1] + d = params[2] + features = [] + features.extend([emb_sim(q,d),emb_sim(a,d)]) #2 + features.append(pos_sim(a,d)) #1 + features.append(edit_distance(a,d)) #1 + features.extend([token_sim(q,d),token_sim(a,d),token_sim(q,a)]) #3 + features.extend(length_sim(a,d)) #6 + features.extend(suffix(a,d)) #3 + features.extend(freq(a,d)) #2 + features.append(singlar_or_plural(a,d)) #1 + features.extend([int(num(a)),int(num(d))]) #2 + features.append(wiki_sim(a,d)) #1 + # print("total features, ",features) + global cnt + cnt += 1 + if cnt%10000 == 0: + print(cnt) + # print(features) + return features \ No newline at end of file diff --git a/ConceptGenerator/layer1_candidate_generator.py b/ConceptGenerator/layer1_candidate_generator.py new file mode 100644 index 0000000..818052c --- /dev/null +++ b/ConceptGenerator/layer1_candidate_generator.py @@ -0,0 +1,289 @@ +from sklearn.externals import joblib +import numpy as np +import json +import sys +sys.path.append("/home/xinzhu/Code/CG/Layer1") +from search_candidates_from_e import search_candidates_from_e +from utilities import normalize_instance +from layer1_calculate_features2 import cal_26_feature_vec +from wordnet_candidate_generation import wordnet_predict +# from layer1_word2vec_predict import word2vec_predict + +# class Word2vecGenerator(object): +# """docstring for Word2vecGenerator""" +# def __init__(self, model_path): +# self.ranker = joblib.load(model_path) +# print("init word2vec candidate generator") + +# def top_k_candidates(self, sentence, answer, k=10): +# try: +# candidates = word2vec_predict(sentence) +# print("candidate length,",len(candidates)) +# except: +# print("search_candidates_from_e error!") +# return {} + +# if candidates is None: +# print("candidates is None") +# return None +# try: +# features = [] # concept probability + embedding features for each candidate +# Y = [] # label, whether is a distractor or not +# scores = [] # score from Probase +# candidate = [] + +# for c,v in candidates.items(): +# features.append(cal_26_feature_vec([sentence,answer,c])) # need to recalculate features if add LM score +# scores.append([v]) +# candidate.append(c) + +# print("Calculate feature done! 
candidate count:", len(features)) +# features = np.array(features,dtype=np.float32) +# scores = np.array(scores,dtype=np.float64) +# scores_normed = scores / scores.max(axis=0) +# features = np.hstack((features, scores_normed)) + +# where_are_NaNs = np.isnan(features) +# features[where_are_NaNs] = 0 + +# print("feature shape: ",features.shape) +# predicts = self.ranker.predict_proba(features)[:,1] # get probability +# index = np.argsort(-predicts) +# print("index:", index) + +# topk = {} +# if k= 10: + break + print(c) + result.append(c) + i += 1 + return result + + + def hit_rate(self, topk, distractors, k=10): + """ + Calculate hit rate in topk for an item in dataset + """ + total = len(distractors) + valid = 0 + # different forms of distractors to be tested + for i in range(len(distractors)): + d = normalize_instance(distractors[i]) + for k in [distractors[i], d, d.capitalize(), ' '.join([x.capitalize() for x in d.split()]), ''.join([x.capitalize() for x in d.split()])]: + if k in topk: + valid += 1 + break + return float(valid)/total, topk + + +def total_generate_candidates(model_path, source): + if source == 'Probase': + generator = ProbaseCandidateGenerator(model_path) + elif source == 'WordNet': + generator = WordNetCandidateGenerator(model_path) + f = "/home/xinzhu/Code/CG/Layer1/dataset/total_new.json" + index_list = [int(line) for line in open("/home/xinzhu/Code/CG/Layer1/test_index.txt",'r').readlines()] + dataset = json.load(open(f, 'r')) + resf = open('/home/xinzhu/Code/CG/CGresult/Wordnet_layer1.txt','w') + for index in index_list: + item = dataset[index-1] + topk = generator.top_k_candidates(item['sentence'], item['answer']) + topk = topk.keys() + resf.write("sentence: "+item['sentence']) + resf.write('\n') + resf.write("answer: "+item['answer']) + resf.write('\n') + for candidate in topk: + resf.write(candidate.encode('utf-8')) + resf.write('\n') + resf.write("*"*50) + resf.write('\n') + resf.close() + + +def domain_generate_candidates(model_path, domain, source): + if source == 'Probase': + generator = ProbaseCandidateGenerator(model_path) + elif source == 'WordNet': + generator = WordNetCandidateGenerator(model_path) + f = "/home/xinzhu/Code/CG/Layer1/dataset/"+domain+".json" + index_list = [int(line) for line in open("/home/xinzhu/Code/CG/Layer1/"+domain+"_test_index.txt",'r').readlines()] + dataset = json.load(open(f, 'r')) + resf = open('/home/xinzhu/Code/CG/CGresult/'+domain+'_layer1.txt','w') + for index in index_list: + item = dataset[index-1] + topk = generator.top_k_candidates(item['sentence'], item['answer']) + topk = topk.keys() + resf.write("sentence: "+item['sentence'].encode('utf-8')) + # try: + # resf.write("sentence: "+item['sentence']) + # except: + # try: + + # except: + # pass + resf.write('\n') + try: + resf.write("answer: "+item['answer']) + except: + resf.write("answer: "+item['answer'].encode('utf-8')) + resf.write('\n') + for candidate in topk: + resf.write(candidate.encode('utf-8')) + resf.write('\n') + resf.write("*"*50) + resf.write('\n') + resf.close() + + +if __name__=="__main__": + # total + wordNet + # model_path = "/home/xinzhu/Code/CG/Layer1/wordnet_model/total_new_adaboost_new.joblib.dat" + # generate_candidates(model_path,'WordNet') + + # science + Probase + for domain in ['science','vocabulary','common','trivia']: + model_path = "/home/xinzhu/Code/CG/Layer1/model2/"+domain+"_adaboost_new.joblib.dat" + domain_generate_candidates(model_path, domain,'Probase') \ No newline at end of file diff --git 
a/ConceptGenerator/layer1_prepare_training_data.py b/ConceptGenerator/layer1_prepare_training_data.py new file mode 100644 index 0000000..13e1f66 --- /dev/null +++ b/ConceptGenerator/layer1_prepare_training_data.py @@ -0,0 +1,456 @@ +import json +from os import listdir +from os.path import isfile, join +import re +import urllib +import requests +from search_candidates_from_e import search_candidates_from_e +from utilities import normalize_instance +import multiprocessing +from multiprocessing import Pool +import matplotlib +matplotlib.use('Agg') +import matplotlib.pyplot as plt +import numpy as np +# from layer1_word2vec_predict import word2vec_predict +from layer1_calculate_features2 import cal_26_feature_vec +from wordnet_candidate_generation import wordnet_predict +# from Calculate_features import cal_26_feature_vec + +data_path = "/home/xinzhu/Code/CG/Layer1/dataset/" +feature_path = "/home/xinzhu/Code/CG/Layer1/feature2/" +word2vec_feature_path = "/home/xinzhu/Code/CG/Layer1/word2vec_feature/" +model_path = "/home/xinzhu/Code/CG/Layer1/model2/" + +cache = {} + +def get_concepts_of_instance_by_probase(instance, use_cache=True): + """ + Fetches the concept and the probabilities for a given instance by probase. + :param instance: the instance, for which the concepts should be requested + :param use_cache: if true a cache for instances and corresponding concepts is used, to avoid unnecessary requests + :return: the concepts and their probability + """ + from urlparse import urlparse + if use_cache == True and instance in cache: + return cache[instance] + try: + requestUrl = 'https://concept.research.microsoft.com/api/Concept/ScoreByProb?instance={}&topK=10&api_key=eT5luCbmII34ZvpPVs7HxtbUU1cFcE12'.format(urllib.pathname2url(instance)) + try: + response = requests.get(requestUrl) + except requests.exceptions.ConnectionError as e: + print(e) + print("\n\ntry one last time...") + response = requests.get(requestUrl) + except: + print("error, ", instance) + response = None + if response is None: + return None + concepts = response.json() + cache[instance] = concepts + return concepts + +def calculate_features(item, mode=1, can_num=100, source="Probase"): + """ + Given an item, generate can_num candidates and test hit rate. 
+ param: can_num: number of candidates to be generated + param: mode: 1 means single-processing, 2 means multi-processing have to return an additional len(distractors) + return: features: probabilty from Probase + embedding similarities for each candidate + """ + + # select a candidate generation source + print("calculating features...") + if source == 'Probase': + candidates = search_candidates_from_e(item['sentence'],item['answer'],can_num) + elif source == "WordNet": + candidates = wordnet_predict(item['sentence'],item['answer'],can_num) + else: + # candidates = word2vec_predict(item['sentence'],can_num) + pass + # print(candidates) + # if answer is not in Probase + if candidates is None: + return np.array([]), np.array([]) + + cnt = 1 + rankings = {} # from candidate to its ranking + features = [] # concept probability + embedding features for each candidate + Y = [] # label, whether is a distractor or not + res = [] + dic = {} + visit = [0]*len(item['distractors']) + + # different forms of distractors to be tested + distractors = [] + for i in range(len(item['distractors'])): + d = normalize_instance(item['distractors'][i]) + for k in [item['distractors'][i], d, d.capitalize(), ' '.join([x.capitalize() for x in d.split()]), ''.join([x.capitalize() for x in d.split()])]: + distractors.append(k) + dic[k] = i + + item['answer'] = normalize_instance(item['answer']) + item['sentence'] = " ".join([normalize_instance(x) for x in item['sentence'].split()]) + scores = [] + LMProb = [] + pairs = [] + + for c,v in sorted(candidates.items(), key=lambda d: -d[1]): + # print("feature for ,", c) + y = 0 + rankings[c] = cnt + if c in distractors: + if visit[dic[c]] == 1: + cnt += 1 + continue + res.append(rankings[c]) + visit[dic[c]] = 1 + y = 1 + cnt += 1 + try: + features.append(cal_26_feature_vec([item['sentence'],item['answer'],c])) + Y.append(y) + scores.append([v]) + pairs.append([item['sentence'],c]) + except: + print("error") + pass + + for i in range(len(item['distractors'])): + if visit[i] == 0: + try: + features.append(cal_26_feature_vec([item['sentence'],item['answer'],item['distractors'][i]])) + Y.append(1) + scores.append([0]) + + except: + print('error') + pass + + features = np.array(features,dtype=np.float32) + scores = np.array(scores,dtype=np.float64) + scores_normed = scores / scores.max(axis=0) + features = np.hstack((features, scores_normed)) + print(features.shape) + if mode == 1: + return features, Y, res + else: + return features, Y, res, len(item['distractors']) + + +def prepare_train_data(): + """ + Calculate features of 100 candidates for each item in file. 
+ """ + feature_path = "/home/xinzhu/Code/CDC/data/features/" + files = ['Regents_new.json','AI2-ScienceQuestions_new.json'] + for fname in files: + X = [] + Y = [] + f = data_path + fname + xf = feature_path + fname[:-5] + "_X.npy" + yf = feature_path + fname[:-5] + "_y.npy" + of = open(data_path + fname[:-5] + ".txt", 'w') + with open(f, 'r') as content: + dataset = json.load(content) + cnt = 1 + for item in dataset: + if isinstance(item['answer'],int): + continue + try: + tmpX, tmpY, res = calculate_features(item) + if len(X) == 0: + X = tmpX + Y = tmpY + else: + X = np.vstack((X, tmpX)) + Y = np.append(Y, tmpY) + print("Xshape: ",X.shape) + print("yshape: ",Y.shape) + of.write(str(cnt) + '\t' + str(len(item['distractors']))) + for r in res: + of.write('\t'+str(r)) + of.write('\n') + cnt += 1 + except: + pass + np.save(xf, X) + np.save(yf, Y) + + +def prepare_training_data_wordnet(fname, feature_path): + data_path = "/home/xinzhu/Code/CG/Layer1/dataset/" + print("calculate features for WordNet generated candidates...") + f = data_path + fname + ".json" + xf = feature_path + fname + "_X.npy" + yf = feature_path + fname + "_y.npy" + dataset = json.load(open(f, 'r')) + X = [] + Y = [] + cnt = 1 + index = [] + for item in dataset: + cnt += 1 + if isinstance(item['answer'],int): + continue + try: + tmpX, tmpY, res = calculate_features(item, mode=1, can_num=100, source="WordNet") + X.append(tmpX) + Y.append(tmpY) + index.append(cnt) + # print(tmpX) + # print(tmpY) + except: + pass + X = np.asarray(X) + Y = np.asarray(Y) + index = np.asarray(index) + np.save(xf, X) + np.save(yf, Y) + np.save(indexf, index) + + +def prepare_training_data_word2vec(fname,feature_path): + data_path = "/home/xinzhu/Code/CG/Layer1/dataset/" + print("calculate features for word2vec generated candidates...") + f = data_path + fname + ".json" + xf = feature_path + fname + "_X.npy" + yf = feature_path + fname + "_y.npy" + indexf = feature_path + fname + "_index.npy" + dataset = json.load(open(f, 'r')) + X = [] + Y = [] + cnt = 1 + for item in dataset: + if isinstance(item['answer'],int): + continue + try: + tmpX, tmpY, res = calculate_features(item, mode=1, can_num=100, source="Word2vec") + X.append(tmpX) + Y.append(tmpY) + # print(tmpX) + # print(tmpY) + except: + pass + X = np.asarray(X) + Y = np.asarray(Y) + np.save(xf, X) + np.save(yf, Y) + + +def prepare_train_data_new(dataset, fname, index, feature_path=feature_path, source="Probase"): + print("start prepare training data !") + print(len(dataset)) + xf = feature_path + fname + "_train_X.npy" + yf = feature_path + fname + "_train_y.npy" + X = [] + Y = [] + for i in range(len(dataset)): + if i+1 in index: + continue + try: + item = dataset[i] + tmpX, tmpY, res = calculate_features(item, 1, 100, source) + X.append(tmpX) + Y.append(tmpY) + print("test item done!") + except: + pass + X = np.asarray(X) + Y = np.asarray(Y) + np.save(xf, X) + np.save(yf, Y) + + +def prepare_test_data_new(dataset, fname, index, feature_path=feature_path, source="Probase"): + # data_path = "/home/xinzhu/Code/CG/Layer1/dataset/" + xf = feature_path + fname + "_test_X.npy" + yf = feature_path + fname + "_test_y.npy" + X = [] + Y = [] + print("start prepare testing data !") + print(index) + for i in index: + print(i) + item = dataset[i] + try: + tmpX, tmpY, res = test_calculate_features(item, 1, 100, source) + X.append(tmpX) + Y.append(tmpY) + print("test item done!") + except: + pass + X = np.asarray(X) + Y = np.asarray(Y) + np.save(xf, X) + np.save(yf, Y) + + +def 
test_calculate_features(item, mode=1, can_num=100, source="Probase"): + """ + Given an item, generate can_num candidates and test hit rate. + param: can_num: number of candidates to be generated + param: mode: 1 means single-processing, 2 means multi-processing have to return an additional len(distractors) + return: features: probabilty from Probase + embedding similarities for each candidate + """ + + # select a candidate generation source + # print("calculating features...") + if source == 'Probase': + candidates = search_candidates_from_e(item['sentence'],item['answer'],can_num) + print("Probase candidate done!") + elif source == "WordNet": + candidates = wordnet_predict(item['sentence'],item['answer'],can_num) + else: + pass + # candidates = word2vec_predict(item['sentence'],can_num) + # print(candidates) + # if answer is not in Probase + if candidates is None: + print("candidate is None!") + return np.array([]), np.array([]) + + cnt = 1 + rankings = {} # from candidate to its ranking + features = [] # concept probability + embedding features for each candidate + Y = [] # label, whether is a distractor or not + res = [] + dic = {} + visit = [0]*len(item['distractors']) + + # different forms of distractors to be tested + distractors = [] + for i in range(len(item['distractors'])): + d = normalize_instance(item['distractors'][i]) + for k in [item['distractors'][i], d, d.capitalize(), ' '.join([x.capitalize() for x in d.split()]), ''.join([x.capitalize() for x in d.split()])]: + distractors.append(k) + dic[k] = i + + item['answer'] = normalize_instance(item['answer']) + item['sentence'] = " ".join([normalize_instance(x) for x in item['sentence'].split()]) + scores = [] + LMProb = [] + pairs = [] + + for c,v in sorted(candidates.items(), key=lambda d: -d[1]): + # print("feature for ,", c) + y = 0 + rankings[c] = cnt + if c in distractors: + if visit[dic[c]] == 1: + continue + res.append(rankings[c]) + visit[dic[c]] = 1 + y = 1 + cnt += 1 + print("hit: ",cnt) + try: + features.append(cal_26_feature_vec([item['sentence'],item['answer'],c])) + Y.append(y) + scores.append([v]) + pairs.append([item['sentence'],c]) + except: + print("error") + pass + + features = np.array(features,dtype=np.float32) + scores = np.array(scores,dtype=np.float64) + scores_normed = scores / scores.max(axis=0) + features = np.hstack((features, scores_normed)) + print(features.shape) + if mode == 1: + return features, Y, res + else: + return features, Y, res, len(item['distractors']) + + +def prepare_train_data_by_item(fname,feature_path=feature_path): + """ + Calculate features of 100 candidates for each item in file. 
+ Save a numpy array for each item + """ + print("calculate features...") + f = data_path + fname + ".json" + xf = feature_path + fname + "_X.npy" + yf = feature_path + fname + "_y.npy" + # of = open('/home/xinzhu/Code/CDC/data/LM_features/feature.txt', 'w') + dataset = json.load(open(f, 'r')) + X = [] + Y = [] + cnt = 1 + for item in dataset: + if isinstance(item['answer'],int): + continue + try: + tmpX, tmpY, res = calculate_features(item) + X.append(tmpX) + Y.append(tmpY) + # for x in tmpX: + # of.write(str(x)) + # of.write(' ') + # of.write(tmpY) + # of.write('\n') + # of.flush() + cnt += 1 + except: + pass + X = np.asarray(X) + Y = np.asarray(Y) + np.save(xf, X) + np.save(yf, Y) + # of.close() + + +def multi_prepare_train_data(fname,feature_path=feature_path): + """ + Multiprocessing version of prepare_train_data_by_item + """ + print("multiprocessing calculate features...") + f = data_path + fname + ".json" + xf = feature_path + fname + "_X.npy" + yf = feature_path + fname + "_y.npy" + indexf = feature_path + fname + "_index.npy" + # of = open(data_path + fname + ".txt", 'w') + dataset = json.load(open(f, 'r')) + X = [] + Y = [] + cnt = 1 + results = [] + pool = Pool(multiprocessing.cpu_count()) + index = [] + cur = 0 + for item in dataset: + cur += 1 + if isinstance(item['answer'],int): + continue + try: + results.append(pool.apply_async(calculate_features, args=(item,))) + index.append(cur) + except: + pass + pool.close() + pool.join() + final_index = [] + i = 0 + for result in results: + try: + tmpX, tmpY, res = result.get() + X.append(tmpX) + Y.append(tmpY) + final_index.append(index[i]) + except: + print("get result error") + i += 1 + + X = np.asarray(X) + Y = np.asarray(Y) + final_index = np.asarray(final_index) + np.save(xf, X) + np.save(yf, Y) + np.save(indexf, final_index) + + +# if __name__ == '__main__': + #AI2-ScienceQuestions_new_feature.npy + # prepare_train_data() + # draw_recall_graph("/mnt/e/Course/NLP/Code/CandidateGeneration/ContextDependentConceptualization/src/result/AI2-ScienceQuestions_new.txt") \ No newline at end of file diff --git a/ConceptGenerator/layer1_ranker.py b/ConceptGenerator/layer1_ranker.py new file mode 100644 index 0000000..51c1603 --- /dev/null +++ b/ConceptGenerator/layer1_ranker.py @@ -0,0 +1,385 @@ +import numpy as np +from sklearn.ensemble import AdaBoostClassifier +from sklearn.model_selection import train_test_split +from sklearn import metrics +import xgboost as xgb +from xgboost import plot_importance +from sklearn.metrics import mean_squared_error +from sklearn.externals import joblib +from sklearn import preprocessing +import matplotlib +matplotlib.use('Agg') +import matplotlib.pyplot as plt +import os +from layer1_prepare_training_data import prepare_train_data_new, prepare_test_data_new, prepare_training_data_wordnet, prepare_training_data_word2vec, prepare_train_data_by_item, calculate_features, multi_prepare_train_data +import json +from sklearn.model_selection import GridSearchCV +from sklearn.svm import SVC +import argparse +import random + +parser = argparse.ArgumentParser() +parser.add_argument('--source', type=str, choices=['Probase', 'Word2vec', 'WordNet'], default="Probase", help='Probase, Word2vec, WordNet') +parser.add_argument('--fname', type=str, default="total_new", help='filename') +parser.add_argument('--feature_path', type=str, default="/home/xinzhu/Code/CG/Layer1/feature2/", help='feature path') +parser.add_argument('--model_path', type=str, default="/home/xinzhu/Code/CG/Layer1/model2/", help="model path") 
+parser.add_argument('--model_type', type=str, default="adaboost",help='model type') +args = parser.parse_args() + +data_path = "/home/xinzhu/Code/CG/Layer1/dataset/" +feature_path = "/home/xinzhu/Code/CG/Layer1/feature2/" +model_path = "/home/xinzhu/Code/CG/Layer1/model2/" + +class Ranker(): + """used to rank and evaluate candidates""" + def __init__(self, fname, model_name): # fname = Regents_new, model_name = "name" + self.fname = fname + self.model_name = model_name + self.train_size = 0 # size of training items + self.train_set = [] + self.train_X = [] + self.train_y = [] + self.test_size = 0 # size of testing items + self.test_set = [] + self.test_X = [] + self.test_y = [] + self.train_test_split = 0.2 + + def load(self, fname, split_rate=0.2): + model = train(fname, model_name) + jsonf = open(data_path + fname + ".json", 'r') + dataset = json.load(jsonf) + total_length = len(dataset) + self.test_size = int(total_length * 0.2) + self.train_size = total_length - self.test_size + self.test_set = dataset[:self.test_size] + self.train_set = dataset[self.test_size:] + self.train_X = calculate_features() + + +def transform(arr, mode): + trans = [] + if mode == 'X': + for X in arr: + if len(trans) == 0: + trans = X + else: + trans = np.vstack((trans,X)) + else: + for y in arr: + if len(trans) == 0: + trans = y + else: + trans = np.append(trans, y) + return np.asarray(trans) + + +def dcg_at_k(r, k, method=0): + """Score is discounted cumulative gain (dcg) + Relevance is positive real values. Can use binary + as the previous methods. + Example from + http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf + >>> r = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0] + >>> dcg_at_k(r, 1) + 3.0 + >>> dcg_at_k(r, 1, method=1) + 3.0 + >>> dcg_at_k(r, 2) + 5.0 + >>> dcg_at_k(r, 2, method=1) + 4.2618595071429155 + >>> dcg_at_k(r, 10) + 9.6051177391888114 + >>> dcg_at_k(r, 11) + 9.6051177391888114 + Args: + r: Relevance scores (list or numpy) in rank order + (first element is the first item) + k: Number of results to consider + method: If 0 then weights are [1.0, 1.0, 0.6309, 0.5, 0.4307, ...] + If 1 then weights are [1.0, 0.6309, 0.5, 0.4307, ...] + Returns: + Discounted cumulative gain + """ + r = np.asfarray(r)[:k] + if r.size: + if method == 0: + return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1))) + elif method == 1: + return np.sum(r / np.log2(np.arange(2, r.size + 2))) + else: + raise ValueError('method must be 0 or 1.') + return 0. + + +def ndcg_at_k(r, k, method=0): + """Score is normalized discounted cumulative gain (ndcg) + Relevance is positive real values. Can use binary + as the previous methods. + Example from + http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf + >>> r = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0] + >>> ndcg_at_k(r, 1) + 1.0 + >>> r = [2, 1, 2, 0] + >>> ndcg_at_k(r, 4) + 0.9203032077642922 + >>> ndcg_at_k(r, 4, method=1) + 0.96519546960144276 + >>> ndcg_at_k([0], 1) + 0.0 + >>> ndcg_at_k([1], 2) + 1.0 + Args: + r: Relevance scores (list or numpy) in rank order + (first element is the first item) + k: Number of results to consider + method: If 0 then weights are [1.0, 1.0, 0.6309, 0.5, 0.4307, ...] + If 1 then weights are [1.0, 0.6309, 0.5, 0.4307, ...] + Returns: + Normalized discounted cumulative gain + """ + dcg_max = dcg_at_k(sorted(r, reverse=True), k, method) + if not dcg_max: + return 0. 
+ return dcg_at_k(r, k, method) / dcg_max + + +def train_from_file(fname, model_name, feature_path=feature_path,model_path=model_path,source="Probase"): + """ + Train models for a given json file and model_name + param: fname: only the file name (no path needed) + param: model_name: choose from 'adaboost', 'xgboost', 'k-fold' + """ + print("start ",fname) + trainXf = feature_path + fname + "_train_X.npy" + trainyf = feature_path + fname + "_train_y.npy" + testXf = feature_path + fname + "_test_X.npy" + testyf = feature_path + fname + "_test_y.npy" + if os.path.exists(trainXf) and os.path.exists(testXf): + pass + else: + data_path = "/home/xinzhu/Code/CG/Layer1/dataset/" + f = data_path + fname + ".json" + dataset = json.load(open(f, 'r')) + index_list = [int(line) for line in open("/home/xinzhu/Code/CG/Layer1/test_index.txt",'r').readlines()] + if os.path.exists(trainXf): + prepare_test_data_new(dataset, fname, index_list, feature_path, source) + elif os.path.exists(testXf): + prepare_train_data_new(dataset, fname, index_list, feature_path, source) + else: + prepare_test_data_new(dataset, fname, index_list, feature_path, source) + prepare_train_data_new(dataset, fname, index_list, feature_path, source) + # if source == "Probase": + # #prepare_train_data_by_item(fname,feature_path) # multi_prepare_train_data, prepare_train_data_by_item + # index = multi_prepare_train_data(fname,feature_path) + # elif source == "WordNet": + # prepare_training_data_wordnet(fname,feature_path) + # else: + # prepare_training_data_word2vec(fname,feature_path) + X_train = np.load(trainXf) + y_train = np.load(trainyf) + X_test = np.load(testXf) + y_test = np.load(testyf) + # index = np.load(feature_path + fname + "_index.npy") + # X_train, X_test, y_train, y_test, = train_test_split(X, y, test_size=0.2, random_state=123) + # X_train, X_test, y_train, y_test, train_index, test_index = train_test_split(X, y, index, test_size=0.2, random_state=123) + y_test_origin = y_test + X_test_origin = X_test + X_train = transform(X_train, 'X') + X_test = transform(X_test, 'X') + y_train = transform(y_train, 'y') + y_test = transform(y_test, 'y') + + where_are_NaNs = np.isnan(X_train) + X_train[where_are_NaNs] = 0 + where_are_NaNs = np.isnan(X_test) + X_test[where_are_NaNs] = 0 + + # add preprocessing + # min_max_scaler = preprocessing.MinMaxScaler() + # X_train = min_max_scaler.fit_transform(X_train) + # X_test = min_max_scaler.transform(X_test) + + modelp = model_path + fname + "_" + model_name + "_new.joblib.dat" # store path of final model + if os.path.exists(modelp): + print("load model...") + model = joblib.load(modelp) + else: + print("training...") + if model_name == 'adaboost': + abc = AdaBoostClassifier(n_estimators=50,learning_rate=1) + model = abc.fit(X_train, y_train) + + elif model_name == 'svm': + model = SVC(gamma='auto') + model.fit(X_train, y_train) + + elif model_name == 'adaboost_optimize': + abc = AdaBoostClassifier(DecisionTreeClassifier(min_samples_split=20,min_samples_leaf=5),algorithm='SAMME.R') + param_test1 = {'n_estimators': range(50,200,10),"learning_rate":[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]} + gsearch1 = GridSearchCV(abc,param_test1,cv=10) + gsearch1.fit(X_train,y_train) + learning_rate = gsearch1.best_params_["learning_rate"] + n_estimators = gsearch1.best_params_['n_estimators'] + abc = AdaBoostClassifier(DecisionTreeClassifier(min_samples_split=20, min_samples_leaf=5), + algorithm="SAMME", + n_estimators=n_estimators, learning_rate=learning_rate) + model = abc.fit(X_train, y_train) + + elif 
model_name == 'xgboost': + # implementation based on Scikit-learn + dtrain = xgb.DMatrix(X_train, label=y_train) + model = xgb.XGBClassifier(objective ='binary:logistic', colsample_bytree = 0.3, learning_rate = 0.1, + max_depth = 5, alpha = 10, n_estimators = 20, random_state=123) + # n_estimators: number of trees you want to build. + # colsample_bytree: percentage of features used per tree. High value can lead to overfitting. + # max_depth: determines how deeply each tree is allowed to grow during any boosting round. + # learning_rate: step size shrinkage used to prevent overfitting. Range is [0,1] + # objective: determines the loss function to be used like reg:linear for regression problems, reg:logistic for classification problems with only decision, binary:logistic for classification problems with probability. + model.fit(X_train,y_train) + preds = model.predict(X_test) + plot_importance(model) # counting the number of times each feature is split on across all boosting rounds (trees) in the model + plt.savefig('feature_importance.png') + rmse = np.sqrt(mean_squared_error(y_test, preds)) + print("RMSE: %f" % (rmse)) + # xgb.plot_tree(model,num_trees=0) + # plt.rcParams['figure.figsize'] = [50, 10] + # plt.savefig('tree.png') + + elif model_name == 'k-fold': + params = {"objective":"binary:logistic",'colsample_bytree': 0.3,'learning_rate': 0.1, + 'max_depth': 5, 'alpha': 10} + data_dmatrix = xgb.DMatrix(data=X,label=y) # an optimized data structure that XGBoost supports + cv_results = xgb.cv(dtrain=data_dmatrix, params=params, nfold=3, + num_boost_round=50,early_stopping_rounds=10,metrics="rmse", as_pandas=True, seed=123) + print((cv_results["test-rmse-mean"]).tail(1)) + + joblib.dump(model, modelp) + + y_pred = model.predict(X_test) + total, valid = 0, 0 + for i in range(len(y_pred)): + if y_test[i] == 1: + total += 1 + if y_pred[i] == 1: + valid += 1 + print("Testing...") + print("Accuracy:",metrics.accuracy_score(y_test, y_pred)) + print("hit rate:",float(valid)/total) + + res = {"Recall@3": 0.0,"Recall@10": 0.0,"Recall@50": 0.0, "Recall@100": 0.0,"P@1": 0.0, "P@3": 0.0, "P@10": 0.0, "F1@3":0.0, "F1@10":0.0,\ + "MRR": 0.0, "MAP@10": 0.0, "NDCG@10": 0.0} + + for tmpX, tmpY in zip(X_test_origin, y_test_origin): + where_are_NaNs = np.isnan(tmpX) + tmpX[where_are_NaNs] = 0 + # tmpX = min_max_scaler.transform(tmpX) + + scores = model.predict_proba(tmpX)[:,1] # calculate probabilities for each candidate + index = np.argsort(-scores) # rank according to probabilities, greatest to smallest + length_index = len(index) + # tmpY = tmpY.tolist() + tmpres = {"Recall@3": 0.0,"Recall@10": 0.0,"Recall@50": 0.0, "Recall@100": 0.0,"P@1": 0.0, "P@3": 0.0, "P@10": 0.0, "F1@3":0.0, "F1@10":0.0,\ + "MRR": 0.0, "MAP@10": 0.0, "NDCG@10": 0.0} + # P@1 P@3 P@10 + for precisionk in [1,3,10]: + precision = 0 + for i in range(precisionk): + try: + if tmpY[index[i]] == 1: + precision += 1 + except: + pass + res["P@"+str(precisionk)] += float(precision)/precisionk + tmpres["P@"+str(precisionk)] = float(precision)/precisionk + + # Recall@10, Recall@50, Recall@100 + length = tmpY.count(1) # total valid distractors + for recallk in [3, 10,50,100]: + recall = 0 + for i in range(recallk): + if i >= length_index: + break + try: + if tmpY[index[i]] == 1: + recall += 1 + except: + pass + tmpres["Recall@"+str(recallk)] = float(recall)/(length+1) + res["Recall@"+str(recallk)] += float(recall)/(length+1) + + # MRR + for i in range(length_index): + if tmpY[index[i]] == 1: + tmpres["MRR"] += 1.0/(i+1) + + # MAP@10 + num_correct 
= 0.0 + for i in range(10): + try: + if tmpY[index[i]] == 1: + num_correct += 1.0 + tmpres["MAP@10"] += num_correct / (i + 1) + except: + pass + try: + tmpres["MAP@10"] /= num_correct + except: + pass + + # NDCG@10 + scores = [] + for i in range(10): + try: + scores.append(tmpY[index[i]]) + except: + scores.append(0) + res["NDCG@10"] += ndcg_at_k(scores,10) + + res['MAP@10'] += tmpres["MAP@10"] + res['MRR'] += tmpres['MRR'] + try: + res["F1@3"] += 2*tmpres["Recall@3"]*tmpres["P@3"] / (tmpres["Recall@3"]+tmpres["P@3"]) + except: + pass + try: + res["F1@10"] += 2*tmpres["Recall@10"]*tmpres["P@10"] / (tmpres["Recall@10"]+tmpres["P@10"]) + except: + pass + total += 1 + # print("done, ",total) + for k in res.keys(): + res[k] /= total + + # indexf = open("test_index.txt",'w') + # for i in test_index: + # indexf.write(str(i)) + # indexf.write('\n') + # indexf.close() + + print("metrics: ") + for k, v in res.items(): + print(k+": "+str(v)) + +if __name__ == '__main__': +# # train("Regents_new",'adaboost') +# # evaluate("Regents_new",'adaboost') +# train_from_file("Regents_new",'adaboost') +# train_from_file("Regents_new",'xgboost') +# train_from_file("AI2-ScienceQuestions_new",'adaboost') + # train_from_file("mcq_new",'adaboost') + # train_from_file("mcql_new",'adaboost') + # train_from_file("trivia_new",'adaboost') + # train_from_file("total_new",'adaboost') + + # Probase + rerank + train_from_file(args.fname, args.model_type, args.feature_path, args.model_path, args.source) + + # WordNet + rerank + #train_from_file("total_new",'adaboost',feature_path="/home/xinzhu/Code/CG/Layer1/wordnet_feature/",model_path="/home/xinzhu/Code/CG/Layer1/wordnet_model/", source="WordNet") + + # train_from_file("total_new",'adaboost',feature_path="/home/xinzhu/Code/CG/Layer1/model_word2vec/",model_path="/home/xinzhu/Code/CG/Layer1/feature_word2vec/", source="Word2vec") + # try to optimize model + # train_from_file("total_new",'adaboost',feature_path="/home/xinzhu/Code/CG/Layer1/hyper_feature/",model_path="/home/xinzhu/Code/CG/Layer1/hyper_model/") \ No newline at end of file diff --git a/ConceptGenerator/search_candidates_from_e.py b/ConceptGenerator/search_candidates_from_e.py new file mode 100644 index 0000000..9d3456b --- /dev/null +++ b/ConceptGenerator/search_candidates_from_e.py @@ -0,0 +1,96 @@ +import time +from utilities import normalize_instance +from Conceptualizer import Conceptualizer +from LDA import LDA +import sqlite3 +import gensim +from gensim import corpora +from collections import defaultdict + +conn = sqlite3.connect('/home/xinzhu/Dataset/Probase/probase.db') +c = conn.cursor() +dump_file = '/home/xinzhu/Dataset/enwiki-latest-pages-articles.xml.bz2' #for generate_input_files +bow_path = '/home/xinzhu/Code/CDC/data/full_wiki_bow.mm' # doc to [(word_id, count)..] 
mapping +dict_path = '/home/xinzhu/Code/CDC/data/full_wiki.dictionary' # word_id to word mapping +model_file = '/home/xinzhu/Code/CDC/models/ldamodel_topics100_trainiter20_full_en.gensim' +num_topics = 100 + +id2word_dict = gensim.corpora.Dictionary.load(dict_path) +# print(id2word_dict.token2id.items()[:100]) + +lda = LDA() +debug = False +lda.load(model_file) +print("Load LDA model") +conceptualizer = Conceptualizer(lda) + +def search_e_from_c(c,concept,k): + """ + Find all entities under concept + :param c: the database cursor + :param concept: concept to be searched + :param k: maximum number of entities to be generated + :return: a sorted list containing (entity_name, frequency) pairs + """ + cursor = c.execute('select entity, frequency from isa_core where concept=?',(concept,)) + entities = [] + for row in cursor: + entities.append([row[0],int(row[1])]) + entities = sorted(entities,key=lambda x:-x[1]) + return entities[:k] if len(entities) > k else entities + +def candidate_prob(candidates): + """ + Merge all condidates and calculate their probabilities + :param candidates: a list containing the candidate, frequency pairs for each concept ([ ['candidate_name', frequency] ... ], concept_probability) + :return : a dict containing + """ + cd = defaultdict(lambda: 0) + for candidateL, probC in candidates: + total_freq = sum(freq for candidate,freq in candidateL) + for candidate, freq in candidateL: + value = float(freq)/total_freq*probC + cd[candidate] += value + return cd + +def search_candidates_from_e(sentence, key, can_num=10): + """ + Given a sentence and key, conceptulize it and find candidates for the key + :param sentence: a complete sentence with key filled into the originial gap + :param key: entity to be searched in Probase + :param can_num: maximum number of candidates to be generated + :return: a list containing the candidate, frequency pairs for each concept ([ ['candidate_name', frequency] ... 
], concept_probability) + a dict {'candidate_name':frequency...} + """ + sentence = sentence.replace('**blank**', key) + print("Probase sentence: ", sentence) + probabilities_of_concepts = conceptualizer.conceptualize(sentence, key, debug, eval=True) + print("Probability of concepts done!") + if probabilities_of_concepts is None: + return None + cnt = 0 + candidates = [] + syn_key = normalize_instance(key,mode=1) + for concept, prob in probabilities_of_concepts: + # add original candidates if its normalized form is not in syn_key + tmp = [x for x in search_e_from_c(c, concept, can_num) if normalize_instance(x[0]) not in syn_key] + cnt += len(tmp) + candidates.append((tmp, prob)) + if cnt > can_num: + candidates = candidate_prob(candidates) + return candidates + candidates = candidate_prob(candidates) + return candidates + +# debug = True +# search_candidates_from_e("apple and iPad are useful products", "apple") +# search_candidates_from_e("He likes to eat apple", "apple") + +# search_candidates_from_e("Earth's core is primarily composed of magma of the following materials", "magma") +# search_candidates_from_e("the ba4ic unit of life is cell",'cell') +#print(search_candidates_from_e("human have been on the earth for the shortest amount of time",'human',100)) # "Insects","Fish","Reptiles" + +#The following shows 100 candidates +# candidates = search_candidates_from_e("the most basic unit of living things is Cells",'Cells',100) # "Bones","Tissues","Organs" +# cd = candidate_prob(candidates) +# print(sorted(cd.items(), key=lambda d: -d[1])) \ No newline at end of file diff --git a/ConceptGenerator/utilities.py b/ConceptGenerator/utilities.py new file mode 100644 index 0000000..5a5921c --- /dev/null +++ b/ConceptGenerator/utilities.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- +import re +import urllib +from nltk.stem import WordNetLemmatizer +from nltk.corpus import wordnet +import requests +from nltk.stem import PorterStemmer + +cache = {} +lemmatizer = WordNetLemmatizer() +stemmer = PorterStemmer() +def get_concepts_of_instance_by_probase(instance, eval, use_cache=True): + """ + Fetches the concept and the probabilities for a given instance by probase. + :param instance: the instance, for which the concepts should be requested + :param use_cache: if true a cache for instances and corresponding concepts is used, to avoid unnecessary requests + :return: the concepts and their probability + """ + from urlparse import urlparse + if use_cache == True and instance in cache: + return cache[instance] + if eval: + probase_url = 'https://concept.research.microsoft.com/api/Concept/ScoreByProb?instance={' \ + '}&topK=100&api_key=eT5luCbmII34ZvpPVs7HxtbUU1cFcE12' + else: + probase_url = 'https://concept.research.microsoft.com/api/Concept/ScoreByProb?instance={}&topK=20&api_key=eT5luCbmII34ZvpPVs7HxtbUU1cFcE12' + try: + requestUrl = probase_url.format(urllib.pathname2url(instance)) + except: + print("request error!") + requestUrl = probase_url.format(urllib.request.pathname2url(instance)) + try: + response = requests.get(requestUrl) + except requests.exceptions.ConnectionError as e: + print(e) + print("\n\ntry one last time...") + response = requests.get(requestUrl) + + if response is None: + print("microsoft api error!") + return None + concepts = response.json() + return concepts + + +def appendIfNotEmpty(list, item): + """ + Append item to list, if item is not None. 
+    in place.
+    :param list: the list to which the item should be appended
+    :param item: the item which should be appended to the list
+    """
+    if item:
+        list.append(item)
+
+
+def split_text_in_words(text):
+    """
+    Splits a given text into words
+    :param text: the text which should be split into words
+    :return: a list containing the split words
+    """
+    real_words = []
+
+    words = re.findall(r'\'|’|"|”|“|»|«|\(|\)|\[|\]|\{|\}:;|[^\'’"”“»«\(\)\[\]\{\}\s:;]+', text)
+    for word in words:
+        word = word.strip()
+        if word.startswith("..."):
+            real_words.append(word[:3])
+            appendIfNotEmpty(real_words, word[3:])
+        if word.startswith(("\"", "(", "[", "{", "<", "«", "…", "“")):
+            real_words.append(word[:1])
+            word = word[1:]
+        if word.endswith("..."):
+            appendIfNotEmpty(real_words, word[:-3])
+            real_words.append(word[-3:])
+        elif word.endswith((".", ",", ":", ";", "]", ")", "}", "!", "?", "\"", ">", "»", "…", "”")):
+            appendIfNotEmpty(real_words, word[:-1])
+            real_words.append(word[-1:])
+        else:
+            appendIfNotEmpty(real_words, word)
+    return real_words
+
+
+def normalize_instance(s, mode=2):
+    """
+    Normalize to a lowercase lemma string
+    :param s: the string to be processed
+    :param mode: 1 means return the set of WordNet synonym lemmas, 2 means only return the normalized string itself
+    """
+    try:
+        s = s.lower()
+        s = lemmatizer.lemmatize(s)
+        # s = stemmer.stem(s)
+    except:
+        return s
+    if mode == 1:
+        synset = set()
+        for syn in wordnet.synsets(s):
+            for l in syn.lemmas():
+                synset.add(l.name().replace('_', ' '))
+        return synset
+    else:
+        return s
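
For reference, a minimal end-to-end sketch of how the pieces of ConceptGenerator/LDA.py fit together; the file names below are placeholders, not the paths used elsewhere in this patch:

from LDA import LDA

lda = LDA()
# 1) stream the Wikipedia dump once to build the dictionary and serialize the bag-of-words corpus
lda.generate_bow_of_dump_file('enwiki-latest-pages-articles.xml.bz2',
                              'wiki_bow.mm', 'wiki.dictionary')
# 2) train an LdaMulticore model on the serialized corpus
lda.train_on_dump_file(num_topics=100,
                       bow_path='wiki_bow.mm',
                       dict_path='wiki.dictionary',
                       model_outputfile='ldamodel_topics100.gensim',
                       training_iterations=20)
# 3) reload the trained model later, e.g. from the Conceptualizer
lda.load('ldamodel_topics100.gensim')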
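
A note on emb_sim in ConceptGenerator/layer1_calculate_features.py: its denominator accumulates the two squared norms (‖a‖² + ‖d‖²) rather than their product, so the value is not a standard cosine. If plain cosine similarity over the averaged word vectors is intended, a reference helper would look like this (a sketch; cosine_sim is a hypothetical name, not part of the patch):

import numpy as np

def cosine_sim(avec, dvec):
    # standard cosine similarity: dot product divided by the product of the norms
    norm_a = np.linalg.norm(avec)
    norm_d = np.linalg.norm(dvec)
    if norm_a == 0 or norm_d == 0:
        return 0.0
    return float(np.dot(avec, dvec) / (norm_a * norm_d))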
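
Two smaller details in the same feature module, shown as hedged reference helpers (names are illustrative): nltk.jaccard_distance returns a distance, i.e. 1 minus the Jaccard similarity that the token_sim docstring describes, and the counts in unigram_freq.csv are loaded as strings, so the averages in freq() only behave numerically if the values are cast:

import csv
import nltk

def token_jaccard_similarity(s1, s2):
    # Jaccard similarity between the two token sets = 1 - Jaccard distance
    aset = set(nltk.word_tokenize(s1))
    dset = set(nltk.word_tokenize(s2))
    return 1.0 - nltk.jaccard_distance(aset, dset)

def load_freq_dict(path):
    # cast the counts to int; skip a header row here if the CSV has one
    with open(path) as infile:
        return {row[0]: int(row[1]) for row in csv.reader(infile)}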
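
candidate_prob in ConceptGenerator/search_candidates_from_e.py normalizes the entity frequencies within each concept, weights them by the concept probability, and sums the results per candidate. A small worked example of that computation (the concepts, entities and numbers are invented purely for illustration; merge_candidates mirrors candidate_prob so the snippet runs standalone):

from collections import defaultdict

def merge_candidates(candidates):
    # same computation as candidate_prob(): per-concept normalization,
    # weighting by concept probability, summed per candidate
    cd = defaultdict(float)
    for candidate_list, prob_c in candidates:
        total_freq = sum(freq for _, freq in candidate_list)
        for cand, freq in candidate_list:
            cd[cand] += float(freq) / total_freq * prob_c
    return cd

example = [
    ([['dog', 30], ['cat', 10]], 0.6),    # concept 1 with probability 0.6
    ([['cat', 20], ['horse', 20]], 0.4),  # concept 2 with probability 0.4
]
print(sorted(merge_candidates(example).items(), key=lambda kv: -kv[1]))
# dog:   30/40 * 0.6               = 0.45
# cat:   10/40 * 0.6 + 20/40 * 0.4 = 0.35
# horse: 20/40 * 0.4               = 0.20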
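
get_concepts_of_instance_by_probase (in both layer1_prepare_training_data.py and utilities.py) builds its request with the Python 2 urllib.pathname2url / urlparse APIs. Under Python 3 the same request could be built with urllib.parse.quote; the sketch below reuses the endpoint, topK and api_key that appear in the patch, and the function name is illustrative:

from urllib.parse import quote
import requests

def probase_concepts(instance, top_k=10, api_key='eT5luCbmII34ZvpPVs7HxtbUU1cFcE12'):
    # Python 3 counterpart of the urllib.pathname2url-based URL construction above
    url = ('https://concept.research.microsoft.com/api/Concept/ScoreByProb'
           '?instance={}&topK={}&api_key={}').format(quote(instance), top_k, api_key)
    return requests.get(url).json()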
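
layer1_ranker.py is driven by its argparse flags; spelling the defaults out explicitly, a typical training/evaluation run looks like this (assuming the dataset, feature and model directories from the patch exist on the target machine):

python layer1_ranker.py --fname total_new --model_type adaboost --source Probase \
    --feature_path /home/xinzhu/Code/CG/Layer1/feature2/ \
    --model_path /home/xinzhu/Code/CG/Layer1/model2/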