Working on Coref

ryhan · Mar 28, 2013 · 741672d · 741672d
1 parent 9bff7dd
commit 741672d
Show file tree

Hide file tree

Showing 983 changed files with 5,701,611 additions and 1 deletion.
diff --git a/answer b/answer
@@ -9,13 +9,13 @@ import re
 import itertools
 import nltk
 from nltk.stem import PorterStemmer
-#import nltk_contrib
 
 # Import our modules from /modules
 sys.path.append("modules")
 
 import questionClassifier
 import sourceContentSelector
+from nltk_contrib.coref.resolve import BaselineCorefResolver
 
 #entity_names = []
 #
@@ -30,6 +30,8 @@ def contains_negative(sent):
   return "no" in sent or "not" in sent or \
   "didn't" in sent or "did not" in sent
 
+resolver = BaselineCorefResolver()
+
 # picks the sentence that has the most keywords in common with the question
 def answer(question, article):
     question = question.strip()
@@ -55,6 +57,7 @@ if __name__ == '__main__':
   # Pre-process article content.
   article = open(path_to_article).read()
 
+
   # Open the question file and start answering questions.
   for question in open(path_to_questions):
     print question

diff --git a/modules/nltk_contrib/README.txt b/modules/nltk_contrib/README.txt
@@ -0,0 +1,40 @@
+Status of NLTK-Contrib Projects
+-------------------------------
+
+nltk.demo/app/projects = new home for mature packages that aren't libraries
+               installed in user space?
+
+agreement
+bioreader			MIGRATE into nltk.corpus
+ccg					MIGRATE [merge into nltk.parse, or a new package?]					
+classifier*			investigate
+classify			REMOVE? [outdated by nltk.classify]
+combined.py
+concord.py			MIGRATE into a new nltk.concordance package
+coref				MIGRATE to nltk.corpus [Sep?]
+dependency			FOLD into depparser
+depparser			MIGRATE [Sep?]
+featuredemo.py		MIGRATE into nltk.draw?
+fst					MIGRATE?
+fuf					MIGRATE [Sep?]
+gluesemantics		REMOVE [once migration into nltk.sem is complete]
+hadoop				investigate
+hole.py				MIGRATE
+lambek				nltk.project?
+lpath				nltk.project?
+mit					MIGRATE [rspeer to advise]
+rdf.py
+readability			nltk.project?
+referring.py
+rte
+sem					REMOVE? [dhgarrette to advise]
+seqclass.py
+speer.cfg			move to mit? [rspeer to advise]
+stringcomp.py
+tag
+test2.cfg
+test2.out
+timex.py			MIGRATE into a new normalize package?
+tnt.py				MIGRATE into nltk.tag?
+toolbox				nltk.project?
+wordnet				REMOVE? [pbone to advise]
diff --git a/modules/nltk_contrib/__init__.py b/modules/nltk_contrib/__init__.py
@@ -0,0 +1,8 @@
+# Natural Language Toolkit (NLTK) Contrib Area
+#
+# Copyright (C) 2001-2011 NLTK Project
+# Authors: Steven Bird <[email protected]>
+#          Edward Loper <[email protected]>
+# URL: http://www.nltk.org/
+# For license information, see LICENSE.TXT
+
diff --git a/modules/nltk_contrib/align/README.txt b/modules/nltk_contrib/align/README.txt
@@ -0,0 +1,80 @@
+
+5/5/10
+
+This directory contains 2 implementations of the Gale-Church alignment algorithm:
+
+1. gale_church.py
+2. align.py 
+        api.py
+        align_util.py
+        distance_measures.py
+
+This README concerns the second implementation, of which I am the author
+([email protected]).
+
+########################################################
+TESTING
+
+align.py can be tested using:
+
+python test2.py chapter1_madame_bovary_fr.txt chapter1_madame_bovary_en.txt fr en
+(using data/ versions causes decode problems in the plaintext reader)
+
+This will print output from various demo alignments (see test.py code) as well as
+an alignment of the first chapter of Madame Bovary in the original French to an English
+translation (non-copyrighted!)
+
+The ground truth for the demo alignments is in the data folder. The demo_eval() routine
+in test.py could be activated to do an evaluation (currently not sure if it works)
+
+The ground truth for the Madame Bovary alignment is not in data (I will make it available).
+
+The test.py program runs the Gale-Church alignment on Madame Bovary using an 
+'extended' option which unlike the 'original' option of the Gale-Church algorithm
+considers 3-1, 3-2, 1-3, 2-3, 3-3 alignments. There is a 1-3 and a 3-1 alignment in the
+ground truth of Madame Bovary. The program did not correctly handle these (all other
+alignments output were correct). It may have erred because of the probabilities
+(penalties) in the distance_measures.three_side_distance() routine. It may have 
+erred because of a faulty implementation. Finally, Gale-Church may simply fail on
+these cases. If someone discovers the real reason PLEASE let me know!
+
+##########################################################
+CHANGES
+
+The following are the major changes since the last version:
+
+1. align_util.covert_bead_to_tuples() now works!!
+2. print_alignments now works!!
+
+Both of these routines are for output - the base algorithm produced correct
+results (though they may not have looked that way in the output!)
+
+3. the api.py includes a recursive_align call which can do alignments from chapters,
+to paragraphs, to sentences, to words, to ...
+
+4. the test.py program shows how a plain text file can be used as input using the
+nltk PlaintextCorpusReader (using punkt for sentence breaking)
+
+5. As mentioned in the testing section above there is a new 'extended' alignment
+option which considers 3-1, 3-2, 1-3, 2-3, 3-3 alignments
+
+6. other minor changes 
+
+###########################################################
+TO DO:
+Unicode problems. The printed alignments are not UTF-8. This may best be handled 
+at input (which also has problems, which may be as simple as BOM or file formats). 
+Haven't worked on it yet (if anyone has a fix, please pass it on)
+
+The program should have an option to print out an ARCADE or TEI style alignment file.
+
+Eventually, I hope to make available a Madame Bovary corpus with 6 English translations
+of Madame Bovary along with German, Spanish, Italian and Russian translations.
+All these translations are written by humans - I also have translations produced using
+Google Translate toolkit (any preferred languages let me know)
+
+Again, eventually all these translations will be easily worked with using TEI formats
+Why not NOW?? Well, "data management/integration" is a pain and I hope to use 
+alignment algorithms of my own devising to produce ground truth (first pass at least) 
+for the translations (leaving something undone gives me motivation on this ;-)
+
diff --git a/modules/nltk_contrib/align/__init__.py b/modules/nltk_contrib/align/__init__.py
@@ -0,0 +1,16 @@
+# Natural Language Toolkit: Aligners
+#
+# Copyright (C) 2001-2011 NLTK Project
+# Author: 
+# URL: <http://www.nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Classes and interfaces for aligning text.
+"""
+
+from api import *
+from gale_church import *
+
+__all__ = []
+