Commit
Committing, then going to clean up code and comment/document
Stephen Bly committed Apr 8, 2013
1 parent a0d5906 commit 89749a2
Showing 257 changed files with 160,558 additions and 2,049 deletions.
77 changes: 77 additions & 0 deletions LICENSE.txt
@@ -0,0 +1,77 @@
License notices
===============

The full ARKref distribution is GPL'd.

Our original code in ARKref is licensed under the MIT license.

However, the full system heavily relies on components that are distributed
under stricter licenses -- in particular, the GPL, and the Alias-i license
that forces the overall distribution to be freely licensed -- so if you use
ARKref as-is, with those components included, you have to use the stricter
licenses.

In particular, we believe this means the full system cannot be included in
commercial software. It should only be used for research.

For ARKref without the components:

(C) Copyright 2009-2010, Brendan O'Connor and Michael Heilman

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.


--------------

Various included components:

=========================================================
Stanford Parser, NER, and Tregex are under the GPLv2.
http://nlp.stanford.edu/software/lex-parser.shtml
http://nlp.stanford.edu/software/CRF-NER.shtml
http://nlp.stanford.edu/software/tregex.shtml

=========================================================
LingPipe is under the Alias-i Royalty Free License, Version 1.
(Please note LingPipe is used in a fairly limited manner.)
http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt

=========================================================
Apache Commons (lang) is under the Apache license version 2.0:
http://commons.apache.org/license.html

=========================================================
arkref.ext.fig.basic is adapted from Percy Liang's Unsupervised NLP Modeling
Toolkit (and used in an extremely limited manner), which is MIT-licensed:

(C) Copyright 2009, Percy Liang

http://www.cs.berkeley.edu/~pliang

Permission is granted for anyone to copy, use, or modify these programs and
accompanying documents for purposes of research or education, provided this
copyright notice is retained, and note is made of any changes that have been
made.

These programs and documents are distributed without any warranty, express or
implied. As the programs were written for research purposes only, they have
not been tested to the degree that would be advisable in any important
application. All use of these programs is entirely at the user's own risk.

=========================================================
150 changes: 150 additions & 0 deletions Question_Answer_Dataset_v1.1/S08/data/set1/a1.osent


150 changes: 150 additions & 0 deletions Question_Answer_Dataset_v1.1/S08/data/set1/a1.parse


150 changes: 150 additions & 0 deletions Question_Answer_Dataset_v1.1/S08/data/set1/a1.sst


150 changes: 150 additions & 0 deletions Question_Answer_Dataset_v1.1/S08/data/set1/a1.tagged


Empty file added README.log
Empty file.
9 changes: 7 additions & 2 deletions answer
@@ -5,6 +5,7 @@

 # Useful tools which should be pre-installed
 import os, sys, errno
+import subprocess
 import re
 import itertools
 import nltk
@@ -17,6 +18,10 @@ import sourceContentSelector
 def contains_negative(sent):
     return "no" in sent or "not" in sent or "n't" in sent

+def coref(path_to_article):
+    #subprocess.call(["./arkref.sh", "-input", path_to_article])
+    return open(path_to_article).read()
+
 # picks the sentence that has the most keywords in common with the question
 def answer(question, article):
     question = question.strip()
@@ -47,7 +52,7 @@ if __name__ == '__main__':
 if end == -1: continue
 question = line[:end+1].strip()
 line = line[end+1:].split()
-path_to_article = line.pop()
+path_to_article = prefix+line.pop()+".txt"
 difficulty_answerer = line.pop()
 difficulty_questioner = line.pop()
 correct_answer = " ".join(line)
@@ -57,7 +62,7 @@ if __name__ == '__main__':
 print "Difficulty from questioner:", difficulty_questioner

 # Open the question file and start answering questions.
-article = open(prefix+path_to_article+".txt").read()
+article = coref(path_to_article)
 print "Our answer:", answer(question, article)
 print "Correct answer:", correct_answer

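The hunk above feeds the (eventually coreference-resolved) article text into the keyword-overlap answerer. A standalone sketch of that heuristic, under stated assumptions: the real scoring lives in `sourceContentSelector` (not shown in this commit), so the helper names, the naive whitespace tokenizer, and the stopword list below are all illustrative stand-ins:

```python
# Rough sketch of the keyword-overlap heuristic behind answer():
# score each article sentence by how many question keywords it
# shares, and return the highest-scoring one. A naive tokenizer
# and stopword list stand in for the real sourceContentSelector.
STOPWORDS = {"the", "a", "an", "is", "are", "was", "were", "of",
             "in", "on", "to", "what", "who", "when", "where",
             "why", "how", "did", "does", "do"}

def keywords(text):
    # lowercase, strip trailing punctuation, drop stopwords
    return {w.strip(".,?!").lower() for w in text.split()} - STOPWORDS

def pick_sentence(question, article):
    q = keywords(question)
    # crude sentence split on periods, good enough for a sketch
    sentences = [s.strip() for s in article.split(".") if s.strip()]
    return max(sentences, key=lambda s: len(q & keywords(s)))
```

Ties go to the earliest sentence, since `max` keeps the first of equally scored candidates.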
Binary file added arkref.jar
Binary file not shown.
3 changes: 3 additions & 0 deletions arkref.sh
@@ -0,0 +1,3 @@
#!/usr/bin/env zsh
h=$(dirname $0)
java -mx1g -ea -cp $h/bin:$h/arkref.jar:$(print $h/lib/**/*.jar | tr ' ' :) arkref.analysis.ARKref "$@"
24 changes: 24 additions & 0 deletions arkref/.classpath
@@ -0,0 +1,24 @@
<?xml version="1.0" encoding="UTF-8"?>
<classpath>
  <classpathentry kind="src" path="src"/>
  <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
  <classpathentry kind="con" path="org.eclipse.jdt.junit.JUNIT_CONTAINER/3"/>
  <classpathentry kind="lib" path="lib/simple-xml-2.1.6.jar"/>
  <classpathentry kind="lib" path="lib/commons-lang-2.4.jar" sourcepath="lib/commons-lang-2.4-sources.jar">
    <attributes>
      <attribute name="javadoc_location" value="http://commons.apache.org/lang/api/"/>
    </attributes>
  </classpathentry>
  <classpathentry kind="lib" path="lib/stanford-parser-2008-10-26.jar" sourcepath="/stanford-parser-2008-10-26/src">
    <attributes>
      <attribute name="javadoc_location" value="file:/Users/brendano/sw/stanfordnlp/stanford-parser-2008-10-26/javadoc/"/>
    </attributes>
  </classpathentry>
  <classpathentry kind="lib" path="lib/lingpipe-3.8.2.jar" sourcepath="/lingpipe-3.8.2">
    <attributes>
      <attribute name="javadoc_location" value="file:/Users/brendano/sw/lingpipe-3.8.2/docs/api/"/>
    </attributes>
  </classpathentry>
  <classpathentry kind="lib" path="lib/supersense-tagger.jar"/>
  <classpathentry kind="output" path="bin"/>
</classpath>
135 changes: 135 additions & 0 deletions arkref/README.md
@@ -0,0 +1,135 @@
ARKref
======
* Website: http://www.ark.cs.cmu.edu/ARKref/
* Mike Heilman (http://www.cs.cmu.edu/~mheilman/)
* Brendan O'Connor (http://anyall.org/)

ARKref is a basic implementation of a syntactically rich, rule-based
coreference system very similar to (the syntactic components of) Haghighi and
Klein (2009). We find it is useful as a starting point to be adapted into
larger information extraction and natural language processing systems. For
example, by tweaking the gazetteers, customizing mention extraction, turning
the syntactic rules into log-linear features, etc., it can be made useful for
a variety of applications.


Technical requirements
----------------------

Only Java is required, probably version 1.6. Various libraries (e.g. GraphViz,
Hpricot) are necessary only for the development support scripts.


How to run
----------

To get started, the following command runs ARKref on a demo document included
with the code. We start with just one file, the document text:

$ ls demo/
lee_example.txt

$ cat demo/lee_example.txt
This film should be brilliant. It sounds like a great plot, the actors are
first grade, and the supporting cast is good as well, and Stallone is
attempting to deliver a good performance. However, it can't hold up.

Run ARKref like so, creating intermediate files and output:

$ ./arkref.sh -input demo/lee_example.txt
...

$ ls demo/
lee_example.ner
lee_example.osent
lee_example.parse
lee_example.tagged
lee_example.txt

The `.tagged` file is the final output, in a mention/entity-tagged pseudo-XML format.

$ cat demo/lee_example.tagged
<mention mentionid="1" entityid="1_2_9">This film</mention> should be brilliant .
<mention mentionid="2" entityid="1_2_9">It</mention> sounds like <mention mentionid="3" entityid="3">a great plot</mention> , <mention mentionid="4" entityid="4_5">the actors</mention> are <mention mentionid="5" entityid="4_5">first grade</mention> , and <mention mentionid="6" entityid="6">the supporting cast</mention> is good as well , and <mention mentionid="7" entityid="7">Stallone</mention> is attempting to deliver <mention mentionid="8" entityid="8">a good performance</mention> .
However , <mention mentionid="9" entityid="1_2_9">it</mention> ca n't hold up .
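The mention tags above can be pulled out with a few lines of Python (a sketch: a regex is used because the pseudo-XML output is not guaranteed to be well-formed XML, and nested mentions would need a real parser):

```python
import re

# Extract (mentionid, entityid, text) triples from ARKref's
# pseudo-XML .tagged output. A regex over non-nested tags covers
# the common case shown in the example above.
MENTION = re.compile(
    r'<mention mentionid="([^"]+)" entityid="([^"]+)">(.*?)</mention>')

def mentions(tagged_text):
    return MENTION.findall(tagged_text)
```

Mentions sharing an `entityid` (such as `1_2_9` above) belong to the same coreference cluster.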

During development, since it takes a while to load the parser and supersense tagger,
it can be convenient to run them as background servers. If they're running, ARKref will
automatically use them. Start them in a new terminal window with:

$ ./servers.sh

Please see `./arkref.sh -help` for more options.


Seeing what's going on
----------------------

The debug output is designed to make it as easy as possible to understand why
the algorithm is making its decisions. This is possible since the approach is
strongly procedural and rule-oriented. See it with:

$ ./arkref.sh -debug -input demo/lee_example.txt

Various development utility scripts are included. (They may require libraries
to be installed; see their comments.) For example, streamlined tagging view:

$ cat demo/lee_example.tagged | ./tagviz.rb

*This film*_1 should be brilliant .

*It*_1 sounds like *a great plot* , *the actors*_4 are *first grade*_4 ,
and *the supporting cast* is good as well , and *Stallone* is attempting
to deliver *a good performance* .

However , *it*_1 ca n't hold up .

This makes obvious the false positive "4" cluster, resulting from a
predicate-nominative construction. It's often useful to check for parsing
errors by looking at the (raw, pre-surgery) trees as PDF or PNG images:

$ cat demo/lee_example.parse | ./treeviz.py

<center><iframe src="http://docs.google.com/viewer?url=http%3A%2F%2Fwww.ark.cs.cmu.edu%2FARKref%2Flee_example.parse.pdf&embedded=true" width="500" height="350" style="border: none;"></iframe></center>

[[PDF]](http://www.ark.cs.cmu.edu/ARKref/lee_example.parse.pdf)

Evaluation: there is code that loads ACE Phase 2 datasets and evaluates on
them. Unfortunately, this data cannot be freely redistributed. If you can get
a copy of it, evaluation can be run something like this:

$ ./arkref.sh -ace -input ace_rothdev/*.txt | tee log | ./score-micro-average.sh
....................................................................
PRECISION: 0.657617
RECALL: 0.552433
F1: 0.600454
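Micro-averaging means the per-document counts are pooled before the ratios are computed, so larger documents carry more weight. A minimal sketch, assuming per-document (correct, predicted, gold) link counts as input; the actual input format of `score-micro-average.sh` is not shown here:

```python
def micro_f1(doc_counts):
    """doc_counts: iterable of (correct, predicted, gold) counts,
    one tuple per document. Counts are summed across documents
    before dividing, which is what makes the average 'micro'."""
    correct = predicted = gold = 0
    for c, p, g in doc_counts:
        correct += c
        predicted += p
        gold += g
    precision = correct / predicted
    recall = correct / gold
    f1 = 2 * precision * recall / (precision + recall)
    return precision, recall, f1
```

By contrast, a macro-average would compute P/R/F1 per document and then average the scores, weighting every document equally.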


More information
----------------

We are working on a real tech report for this, but in the meantime, a
preliminary class project report is available with the code:
`notes/class_paper/coref_final_for_rtw.pdf`. Please first read:

* Aria Haghighi and Dan Klein. _Simple Coreference Resolution with Rich
Syntactic and Semantic Features_. EMNLP 2009.
http://www.aclweb.org/anthology/D/D09/D09-1120.pdf

Out of the box, ARKref is roughly equivalent to H&K's system. On
the dev data set, its F-score is about the same, though the precision/recall
tradeoff is different.

This approach depends on having a supersense tagger and a syntactic
constituency parser. ARKref is written to use a reimplementation of the system
described by Ciaramita and Altun (EMNLP 2006) and the Stanford
Parser, which are included in this download. ARKref also makes heavy use of
the Stanford Tregex library for implementation of syntactic rules. Please see
the file LICENSE.txt for information on implications for redistribution.


References:
M. Ciaramita and Y. Altun. 2006. _Broad-coverage sense disambiguation and
information extraction with a supersense sequence tagger_. In Proc. EMNLP.


23 changes: 23 additions & 0 deletions build.sh
@@ -0,0 +1,23 @@
#!/usr/bin/env zsh

# quite obviously this is not a proper build system. but if it compiles
# correctly inside eclipse, it should work here.

# REV=$( (svn info; git svn info) | perl -ne 'print $1 if /^Revision: (\d+)/')
# if [[ "$REV" == "" ]]; then
# exit -1
# fi
# TARGET=arkref-r${REV}.jar

TARGET=arkref.jar

set -eux

rm -rf build
mkdir -p build

javac -cp $(print $(dirname $0)/lib/**/*.jar | tr ' ' :) -d build src/**/*.java

(cd build && jar cf $TARGET arkref)

mv build/$TARGET arkref.jar
Binary file added config/MORPH_CACHE.gz
Binary file not shown.
Binary file added config/NOUNS_WS_SS_P.gz
Binary file not shown.
Binary file added config/VERBS_WS_SS.gz
Binary file not shown.
23 changes: 23 additions & 0 deletions config/arkref.properties
@@ -0,0 +1,23 @@


parserServerPort = 5556
parserGrammarFile = lib/englishPCFG.ser.gz
parserMaxLength = 150
parserMinLength = 5



########################
#supersense tagger properties

supersenseServerPort = 5557
supersenseModelFile = config/superSenseModelAllSemcor.ser.gz

propertiesFilePath=config/arkref.properties

useOldDataFormat=true
nounFile=config/NOUNS_WS_SS_P.gz
verbFile=config/VERBS_WS_SS.gz
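If a wrapper script ever needs these same settings, simple `key = value` lines like the above can be parsed from Python in a few lines (a sketch; ARKref itself reads this file from Java, and this minimal parser ignores escapes and line continuations that full Java properties files allow):

```python
def load_properties(text):
    """Parse simple 'key = value' lines, skipping blanks and
    # comments. Handles both 'key = value' and 'key=value'
    spacing, as used in arkref.properties above."""
    props = {}
    for line in text.splitlines():
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        key, _, value = line.partition("=")
        props[key.strip()] = value.strip()
    return props
```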


