diff --git a/ask b/ask
index 44166a5..6e00ad0 100755
--- a/ask
+++ b/ask
@@ -27,7 +27,7 @@ if __name__ == '__main__':
     # Decide how many candidates we want to generate
     # im thinking we should always generate as many questions as possible
     # and just pick the n best
-    num_cand = num_questions*10
+    num_cand = num_questions*20

     # Fetch sentence candidates that can be converted into questions.
     selected_content = questionContentSelector.process(article_content, num_cand)
diff --git a/modules/questionContentSelector.py b/modules/questionContentSelector.py
index 7e93be5..0abb294 100644
--- a/modules/questionContentSelector.py
+++ b/modules/questionContentSelector.py
@@ -8,11 +8,22 @@ import re
 import nltk

-# Use part-of-speech tagging and entity chunking to
+# Use part-of-speech tagging to
 # score the usefulness of a sentence.
 def entity_score(sentence):
-    # tokens = nltk.word_tokenize(sentence)
-    # tagged = nltk.pos_tag(tokens)
+    tokens = nltk.word_tokenize(sentence)
+    tokensU = [t.upper() for t in tokens]  # uppercase copy for a case-insensitive verb check
+    if (2 < len(tokens) and len(tokens) < 12):
+        if ("IS" in tokensU or "WAS" in tokensU or
+            "WERE" in tokensU or "BEING" in tokensU or
+            "ARE" in tokensU):
+
+            if (nltk.pos_tag([tokens[0]])[0][1] == "PRP"):  # pos_tag returns (word, tag) pairs
+                return 1.0
+            else:
+                return 0.5
+
+    #tagged = nltk.pos_tag(tokens)
     # entities = nltk.chunk.ne_chunk(tagged)
     score = 0
     return score
diff --git a/modules/questionFromSentence.py b/modules/questionFromSentence.py
index 2a0836d..09f0a5b 100644
--- a/modules/questionFromSentence.py
+++ b/modules/questionFromSentence.py
@@ -23,6 +23,7 @@ def add_questionmark(sentence):
 # GIVEN string representing a declarative sentence,
 # RETURNS string representing a question.
 def transform(sentence):
+    sentence = add_questionmark(sentence)  # '.' -> '?'

     (question, success) = transform_IT_IS(sentence)
@@ -34,9 +35,29 @@
     posTag = nltk.pos_tag([tokens[0]])[0]

     #if (tokens[1].upper() in BEING and posTag == 'PRP'):
-    if (tokens[1].upper() in BEING):
+    if (len(tokens) > 1 and tokens[1].upper() in BEING):
         tokens = [tokens[1].capitalize(), tokens[0].lower()] + tokens[2:]
-        return (" ".join(tokens), True)
+
+        question = " ".join(tokens)
+        if ("," in question):
+            question = question.split(",")[0] + "?"  # keep only the first clause
+        return (question, True)
+
+    if (len(tokens) > 2 and tokens[2].upper() in BEING):
+        tokens = [tokens[2].capitalize(), tokens[0].lower(), tokens[1].lower()] + tokens[3:]
+        #return (" ".join(tokens), True)
+        question = " ".join(tokens)
+        if ("," in question):
+            question = question.split(",")[0] + "?"
+        return (question, True)
+
+    if (tokens[0].upper() == "IT"):
+        tokens = ["What"] + tokens[1:]
+        #return (" ".join(tokens), True)
+        question = " ".join(tokens)
+        if ("," in question):
+            question = question.split(",")[0] + "?"
+        return (question, True)

     """
     tagged = nltk.pos_tag(tokens)
@@ -48,6 +69,7 @@
         tokens = [word1.capitalize(), word0.lower()] + tokens[2:]
         return (" ".join(tokens), True)
     """
+    #print("FAIL: " + sentence)
     return (sentence, False)
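
Reviewer note: below is a minimal standalone sketch of the subject-verb inversion this patch adds to transform(), for trying the behavior outside the repo. It is a sketch under assumptions, not the module itself: a plain str.split() stands in for nltk.word_tokenize, the module-level BEING constant is re-declared locally, the sample sentences are illustrative rather than project fixtures, and on failure it returns the input unchanged instead of running add_questionmark first.

# Standalone sketch of the inversion logic in this patch (assumptions noted above).
BEING = {"IS", "ARE", "WAS", "WERE", "BEING"}

def transform_sketch(sentence):
    # Crude tokenization stand-in for nltk.word_tokenize.
    tokens = sentence.rstrip(".!?").split()

    def finish(toks):
        question = " ".join(toks) + "?"
        # Mirror the patch: truncate at the first comma so trailing clauses are dropped.
        if "," in question:
            question = question.split(",")[0] + "?"
        return (question, True)

    # "Water is wet." -> "Is water wet?"
    if len(tokens) > 1 and tokens[1].upper() in BEING:
        return finish([tokens[1].capitalize(), tokens[0].lower()] + tokens[2:])
    # "The sky is blue." -> "Is the sky blue?"
    if len(tokens) > 2 and tokens[2].upper() in BEING:
        return finish([tokens[2].capitalize(), tokens[0].lower(), tokens[1].lower()] + tokens[3:])
    # "It ..." -> "What ...?"
    if tokens and tokens[0].upper() == "IT":
        return finish(["What"] + tokens[1:])
    return (sentence, False)

print(transform_sketch("Water is wet."))                # ('Is water wet?', True)
print(transform_sketch("The sky is blue, most days."))  # ('Is the sky blue?', True)
print(transform_sketch("Dogs bark."))                   # ('Dogs bark.', False)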