diff --git a/big-data-utils/pom.xml b/big-data-utils/pom.xml
index 5a5cfb129..96a703e17 100644
--- a/big-data-utils/pom.xml
+++ b/big-data-utils/pom.xml
@@ -3,7 +3,7 @@
illinois-cogcomp-nlp
edu.illinois.cs.cogcomp
- 4.0.15
+ 4.0.19
4.0.0
@@ -23,7 +23,7 @@
edu.illinois.cs.cogcomp
illinois-core-utilities
- 4.0.15
+ 4.0.19
org.xeustechnologies.google-api
diff --git a/chunker/pom.xml b/chunker/pom.xml
index 65fa16ffb..6e72ebc4d 100644
--- a/chunker/pom.xml
+++ b/chunker/pom.xml
@@ -2,7 +2,7 @@
illinois-cogcomp-nlp
edu.illinois.cs.cogcomp
- 4.0.15
+ 4.0.19
4.0.0
@@ -13,23 +13,23 @@
edu.illinois.cs.cogcomp
illinois-core-utilities
- 4.0.15
+ 4.0.19
edu.illinois.cs.cogcomp
LBJava
- 1.3.1
+ 1.3.3
edu.illinois.cs.cogcomp
LBJava-NLP-tools
- 4.0.15
+ 4.0.19
edu.illinois.cs.cogcomp
illinois-pos
- 4.0.15
+ 4.0.19
edu.illinois.cs.cogcomp
diff --git a/commasrl/pom.xml b/commasrl/pom.xml
index 46ecefd7b..7f07500a6 100644
--- a/commasrl/pom.xml
+++ b/commasrl/pom.xml
@@ -4,7 +4,7 @@
illinois-cogcomp-nlp
edu.illinois.cs.cogcomp
- 4.0.15
+ 4.0.19
4.0.0
@@ -13,7 +13,7 @@
UTF-8
UTF-8
- 1.2.26
+ 1.3.3
@@ -35,48 +35,48 @@
edu.illinois.cs.cogcomp
illinois-core-utilities
- 4.0.15
+ 4.0.19
true
edu.illinois.cs.cogcomp
illinois-curator
- 4.0.15
+ 4.0.19
edu.illinois.cs.cogcomp
illinois-tokenizer
- 4.0.15
+ 4.0.19
edu.illinois.cs.cogcomp
illinois-corpusreaders
- 4.0.15
+ 4.0.19
edu.illinois.cs.cogcomp
illinois-inference
- 4.0.15
+ 4.0.19
edu.illinois.cs.cogcomp
stanford_3.3.1
- 4.0.15
+ 4.0.19
edu.illinois.cs.cogcomp
illinois-pos
- 4.0.15
+ 4.0.19
edu.illinois.cs.cogcomp
illinois-ner
- 4.0.15
+ 4.0.19
edu.illinois.cs.cogcomp
illinois-chunker
- 4.0.15
+ 4.0.19
edu.illinois.cs.cogcomp
diff --git a/core-utilities/pom.xml b/core-utilities/pom.xml
index 59a39f359..fee1df17e 100644
--- a/core-utilities/pom.xml
+++ b/core-utilities/pom.xml
@@ -6,7 +6,7 @@
illinois-cogcomp-nlp
edu.illinois.cs.cogcomp
- 4.0.15
+ 4.0.19
illinois-core-utilities
diff --git a/core-utilities/src/main/java/edu/illinois/cs/cogcomp/core/datastructures/textannotation/View.java b/core-utilities/src/main/java/edu/illinois/cs/cogcomp/core/datastructures/textannotation/View.java
index f5bded81a..3e8e6cb08 100644
--- a/core-utilities/src/main/java/edu/illinois/cs/cogcomp/core/datastructures/textannotation/View.java
+++ b/core-utilities/src/main/java/edu/illinois/cs/cogcomp/core/datastructures/textannotation/View.java
@@ -137,7 +137,8 @@ private void removeAllTokenFromConstituentMapping(Constituent c) {
/**
* Convenience method for addConstituent(constituent, false)
- * @param constituent
+ *
+ * @param constituent The new constituent to be added.
*/
public void addConstituent(Constituent constituent){
this.addConstituent(constituent, false);
@@ -148,9 +149,10 @@ public void addConstituent(Constituent constituent){
* Otherwise, we return the new constituent.
*
* @param constituent The new constituent to be added.
+ * @param force if true, add constituent even if it is a duplicate
*/
public void addConstituent(Constituent constituent, boolean force) {
- if(!constituents.contains(constituent) || force) {
+ if(force || this.tokensToConstituents[constituent.getStartSpan()] == null || !constituents.contains(constituent)) {
constituents.add(constituent);
startSpan = Math.min(this.startSpan, constituent.getStartSpan());
@@ -161,7 +163,7 @@ public void addConstituent(Constituent constituent, boolean force) {
this.addTokenToConstituentMapping(token, constituent);
}
}
- }else {
+ } else {
System.err.println("Warning (View.java): not adding duplicate Constituent: " + constituent + ", use addConstituent(c, true) to force add.");
}
}
diff --git a/core-utilities/src/main/java/edu/illinois/cs/cogcomp/core/datastructures/vectors/ExceptionlessInputStream.java b/core-utilities/src/main/java/edu/illinois/cs/cogcomp/core/datastructures/vectors/ExceptionlessInputStream.java
index e2e810ce8..2a34e7e60 100644
--- a/core-utilities/src/main/java/edu/illinois/cs/cogcomp/core/datastructures/vectors/ExceptionlessInputStream.java
+++ b/core-utilities/src/main/java/edu/illinois/cs/cogcomp/core/datastructures/vectors/ExceptionlessInputStream.java
@@ -34,8 +34,8 @@ public class ExceptionlessInputStream extends FilterInputStream {
private char[] chars = null;
/** The underlying data input stream. */
private DataInputStream dis;
-
-
+ /** If there is a zip stream, we must close it; closing the resulting stream does not close the file. */
+ private ZipFile zipfile = null;
/**
* Opens a buffered (and uncompressed) stream for reading from the specified file.
*
@@ -70,9 +70,9 @@ public static ExceptionlessInputStream openCompressedStream(String filename) {
try {
ZipFile zip = new ZipFile(filename);
- eis =
- new ExceptionlessInputStream(new BufferedInputStream(zip.getInputStream(zip
+ eis = new ExceptionlessInputStream(new BufferedInputStream(zip.getInputStream(zip
.getEntry(zipEntryName))));
+ eis.zipfile = zip;
} catch (Exception e) {
System.err.println("Can't open '" + filename + "' for input:");
e.printStackTrace();
@@ -160,7 +160,11 @@ private void handleException(Exception e) {
**/
public void close() {
try {
- dis.close();
+ dis.close();
+ if (zipfile != null) {
+ zipfile.close();
+ zipfile = null;
+ }
} catch (Exception e) {
System.err.println("Can't close input stream:");
e.printStackTrace();
diff --git a/corpusreaders/pom.xml b/corpusreaders/pom.xml
index b78860c78..516dc30d7 100644
--- a/corpusreaders/pom.xml
+++ b/corpusreaders/pom.xml
@@ -6,7 +6,7 @@
illinois-cogcomp-nlp
edu.illinois.cs.cogcomp
- 4.0.15
+ 4.0.19
illinois-corpusreaders
@@ -15,12 +15,12 @@
edu.illinois.cs.cogcomp
illinois-core-utilities
- 4.0.15
+ 4.0.19
edu.illinois.cs.cogcomp
illinois-tokenizer
- 4.0.15
+ 4.0.19
org.slf4j
diff --git a/curator/pom.xml b/curator/pom.xml
index 8263a3901..3a5f9c84e 100644
--- a/curator/pom.xml
+++ b/curator/pom.xml
@@ -7,7 +7,7 @@
illinois-cogcomp-nlp
edu.illinois.cs.cogcomp
- 4.0.15
+ 4.0.19
illinois-curator
@@ -16,7 +16,7 @@
edu.illinois.cs.cogcomp
illinois-core-utilities
- 4.0.15
+ 4.0.19
diff --git a/dataless-classifier/pom.xml b/dataless-classifier/pom.xml
index e647ccd21..5c11a9dbd 100644
--- a/dataless-classifier/pom.xml
+++ b/dataless-classifier/pom.xml
@@ -3,7 +3,7 @@
illinois-cogcomp-nlp
edu.illinois.cs.cogcomp
- 4.0.15
+ 4.0.19
4.0.0
@@ -21,12 +21,12 @@
edu.illinois.cs.cogcomp
illinois-core-utilities
- 4.0.15
+ 4.0.19
edu.illinois.cs.cogcomp
illinois-tokenizer
- 4.0.15
+ 4.0.19
org.slf4j
diff --git a/depparse/pom.xml b/depparse/pom.xml
index cc573accf..7694a46e0 100644
--- a/depparse/pom.xml
+++ b/depparse/pom.xml
@@ -7,7 +7,7 @@
illinois-cogcomp-nlp
edu.illinois.cs.cogcomp
- 4.0.15
+ 4.0.19
illinois-depparse
@@ -16,27 +16,27 @@
edu.illinois.cs.cogcomp
illinois-core-utilities
- 4.0.15
+ 4.0.19
edu.illinois.cs.cogcomp
illinois-edison
- 4.0.15
+ 4.0.19
edu.illinois.cs.cogcomp
illinois-lemmatizer
- 4.0.15
+ 4.0.19
edu.illinois.cs.cogcomp
illinois-pos
- 4.0.15
+ 4.0.19
edu.illinois.cs.cogcomp
illinois-chunker
- 4.0.15
+ 4.0.19
diff --git a/edison/pom.xml b/edison/pom.xml
index 59b33a20e..6ad78978b 100644
--- a/edison/pom.xml
+++ b/edison/pom.xml
@@ -7,7 +7,7 @@
illinois-cogcomp-nlp
edu.illinois.cs.cogcomp
- 4.0.15
+ 4.0.19
illinois-edison
@@ -16,7 +16,7 @@
edu.illinois.cs.cogcomp
illinois-core-utilities
- 4.0.15
+ 4.0.19
@@ -80,13 +80,13 @@
edu.illinois.cs.cogcomp
illinois-corpusreaders
- 4.0.15
+ 4.0.19
edu.illinois.cs.cogcomp
illinois-curator
- 4.0.15
+ 4.0.19
test
@@ -98,7 +98,7 @@
edu.illinois.cs.cogcomp
LBJava
- 1.3.0
+ 1.3.3
diff --git a/external/clausie/pom.xml b/external/clausie/pom.xml
index b0b1fbc72..8e5bcf73c 100644
--- a/external/clausie/pom.xml
+++ b/external/clausie/pom.xml
@@ -5,7 +5,7 @@
illinois-cogcomp-nlp
edu.illinois.cs.cogcomp
- 4.0.15
+ 4.0.19
../../pom.xml
@@ -24,7 +24,7 @@
edu.illinois.cs.cogcomp
external-commons
- 4.0.15
+ 4.0.19
org.slf4j
diff --git a/external/external-commons/pom.xml b/external/external-commons/pom.xml
index bc9f5b1d2..2aef92289 100644
--- a/external/external-commons/pom.xml
+++ b/external/external-commons/pom.xml
@@ -2,7 +2,7 @@
illinois-cogcomp-nlp
edu.illinois.cs.cogcomp
- 4.0.15
+ 4.0.19
../../pom.xml
@@ -16,12 +16,12 @@
edu.illinois.cs.cogcomp
illinois-core-utilities
- 4.0.15
+ 4.0.19
edu.illinois.cs.cogcomp
illinois-tokenizer
- 4.0.15
+ 4.0.19
org.cogcomp
diff --git a/external/path-lstm/pom.xml b/external/path-lstm/pom.xml
index 661175121..cc112a91b 100644
--- a/external/path-lstm/pom.xml
+++ b/external/path-lstm/pom.xml
@@ -2,7 +2,7 @@
illinois-cogcomp-nlp
edu.illinois.cs.cogcomp
- 4.0.15
+ 4.0.19
../../pom.xml
@@ -16,12 +16,12 @@
edu.illinois.cs.cogcomp
external-commons
- 4.0.15
+ 4.0.19
edu.illinois.cs.cogcomp
illinois-edison
- 4.0.15
+ 4.0.19
org.cogcomp
diff --git a/external/stanford_3.3.1/pom.xml b/external/stanford_3.3.1/pom.xml
index 30aef7555..b89706d0f 100644
--- a/external/stanford_3.3.1/pom.xml
+++ b/external/stanford_3.3.1/pom.xml
@@ -5,7 +5,7 @@
illinois-cogcomp-nlp
edu.illinois.cs.cogcomp
- 4.0.15
+ 4.0.19
../../pom.xml
@@ -19,7 +19,7 @@
edu.illinois.cs.cogcomp
illinois-core-utilities
- 4.0.15
+ 4.0.19
@@ -36,7 +36,7 @@
edu.illinois.cs.cogcomp
illinois-corpusreaders
- 4.0.15
+ 4.0.19
diff --git a/external/stanford_3.8.0/pom.xml b/external/stanford_3.8.0/pom.xml
index d3ebcd45f..fc029821b 100644
--- a/external/stanford_3.8.0/pom.xml
+++ b/external/stanford_3.8.0/pom.xml
@@ -2,7 +2,7 @@
illinois-cogcomp-nlp
edu.illinois.cs.cogcomp
- 4.0.15
+ 4.0.19
../../pom.xml
@@ -16,12 +16,12 @@
edu.illinois.cs.cogcomp
illinois-corpusreaders
- 4.0.15
+ 4.0.19
edu.illinois.cs.cogcomp
external-commons
- 4.0.15
+ 4.0.19
org.slf4j
diff --git a/inference/pom.xml b/inference/pom.xml
index 4aed0a26c..14894c039 100644
--- a/inference/pom.xml
+++ b/inference/pom.xml
@@ -6,7 +6,7 @@
illinois-cogcomp-nlp
edu.illinois.cs.cogcomp
- 4.0.15
+ 4.0.19
jar
@@ -22,7 +22,7 @@
edu.illinois.cs.cogcomp
illinois-core-utilities
- 4.0.15
+ 4.0.19
edu.illinois.cs.cogcomp
diff --git a/lbjava-nlp-tools/pom.xml b/lbjava-nlp-tools/pom.xml
index 712d456c8..219e68c11 100644
--- a/lbjava-nlp-tools/pom.xml
+++ b/lbjava-nlp-tools/pom.xml
@@ -4,7 +4,7 @@
illinois-cogcomp-nlp
edu.illinois.cs.cogcomp
- 4.0.15
+ 4.0.19
LBJava-NLP-tools
@@ -25,12 +25,12 @@
edu.illinois.cs.cogcomp
LBJava
- 1.3.0
+ 1.3.3
edu.illinois.cs.cogcomp
illinois-core-utilities
- 4.0.15
+ 4.0.19
org.slf4j
diff --git a/lbjava-nlp-tools/src/main/java/edu/illinois/cs/cogcomp/lbjava/nlp/Word.java b/lbjava-nlp-tools/src/main/java/edu/illinois/cs/cogcomp/lbjava/nlp/Word.java
index c8aa1337f..71714755e 100644
--- a/lbjava-nlp-tools/src/main/java/edu/illinois/cs/cogcomp/lbjava/nlp/Word.java
+++ b/lbjava-nlp-tools/src/main/java/edu/illinois/cs/cogcomp/lbjava/nlp/Word.java
@@ -184,7 +184,10 @@ public Word(String f, String pos, String l, String sense, Word p, int start,
capitalized = f != null && f.length() > 0
&& Character.isUpperCase(f.charAt(0));
partOfSpeech = pos;
- if (partOfSpeech != null) POS.fromToken(partOfSpeech);
+
+            // if assertions are enabled, this fails, so I see no
+            // reason to leave it in - redman
+ //if (partOfSpeech != null) POS.fromToken(partOfSpeech);
lemma = l;
wordSense = sense;
}
diff --git a/lemmatizer/pom.xml b/lemmatizer/pom.xml
index be3fee001..90322e575 100644
--- a/lemmatizer/pom.xml
+++ b/lemmatizer/pom.xml
@@ -7,7 +7,7 @@
illinois-cogcomp-nlp
edu.illinois.cs.cogcomp
- 4.0.15
+ 4.0.19
illinois-lemmatizer
@@ -16,12 +16,12 @@
edu.illinois.cs.cogcomp
illinois-core-utilities
- 4.0.15
+ 4.0.19
edu.illinois.cs.cogcomp
illinois-edison
- 4.0.15
+ 4.0.19
edu.stanford.nlp
diff --git a/md/pom.xml b/md/pom.xml
index 93df57aab..990e6d88f 100644
--- a/md/pom.xml
+++ b/md/pom.xml
@@ -3,7 +3,7 @@
illinois-cogcomp-nlp
edu.illinois.cs.cogcomp
- 4.0.15
+ 4.0.19
4.0.0
@@ -20,37 +20,37 @@
edu.illinois.cs.cogcomp
LBJava
- 1.2.26
+ 1.3.3
edu.illinois.cs.cogcomp
illinois-corpusreaders
- 4.0.15
+ 4.0.19
edu.illinois.cs.cogcomp
illinois-core-utilities
- 4.0.15
+ 4.0.19
edu.illinois.cs.cogcomp
illinois-pos
- 4.0.15
+ 4.0.19
edu.illinois.cs.cogcomp
illinois-edison
- 4.0.15
+ 4.0.19
edu.illinois.cs.cogcomp
illinois-ner
- 4.0.15
+ 4.0.19
edu.illinois.cs.cogcomp
stanford_3.3.1
- 4.0.15
+ 4.0.19
org.slf4j
@@ -70,7 +70,7 @@
edu.illinois.cs.cogcomp
lbjava-maven-plugin
- 1.3.0
+ 1.3.3
${project.basedir}/src/lbj/md.lbj
diff --git a/ner/benchmark/CoNLL/config/reuters.config b/ner/benchmark/CoNLL/config/reuters.config
index 61c541fc0..eb0f73779 100644
--- a/ner/benchmark/CoNLL/config/reuters.config
+++ b/ner/benchmark/CoNLL/config/reuters.config
@@ -1,6 +1,6 @@
# Required fields
-modelName CoNLL
-pathToModelFile models/CoNLL
+modelName reuters
+pathToModelFile models/reuters
# Optional fields
labelTypes PER ORG LOC MISC
@@ -10,9 +10,8 @@ randomNoiseLevel 0.0
omissionRate 0.0
# parameter sweep reveals these to be the best params, L2 model is best.
-# These were identified as part of the L1 L2 split parameter sweep of Oct '17
-learningRatePredictionsLevel1 .04
-thicknessPredictionsLevel1 40
-learningRatePredictionsLevel2 .04
-thicknessPredictionsLevel2 40
+learningRatePredictionsLevel1 .05
+thicknessPredictionsLevel1 30
+learningRatePredictionsLevel2 .05
+thicknessPredictionsLevel2 30
diff --git a/ner/benchmark/EnronCoNLL/config/EnronCoNLL.config b/ner/benchmark/EnronCoNLL/config/EnronCoNLL.config
index f3a68fbc6..5f7212890 100644
--- a/ner/benchmark/EnronCoNLL/config/EnronCoNLL.config
+++ b/ner/benchmark/EnronCoNLL/config/EnronCoNLL.config
@@ -2,7 +2,7 @@
# Required fields
modelName EnronCoNLL
-pathToModelFile ner/models/EnronCoNLL
+pathToModelFile models/EnronCoNLL
randomNoiseLevel 0.0
omissionRate 0.0
diff --git a/ner/benchmark/MUC7/config/muc7.config b/ner/benchmark/MUC7/config/muc7.config
index f260e0507..4e9176238 100644
--- a/ner/benchmark/MUC7/config/muc7.config
+++ b/ner/benchmark/MUC7/config/muc7.config
@@ -1,16 +1,15 @@
# Required fields
-modelName MUC7
-pathToModelFile ner/models/MUC7
+modelName muc7
+pathToModelFile models/muc7
# Optional fields
labelTypes PER ORG LOC MISC
# there are no misc tags in the MUC data.
labelsToIgnoreInEvaluation MISC
-FeaturePruningThreshold 0.0
# parameter sweep reveals these to be the best params, L1 model is best.
-learningRatePredictionsLevel1 .1
-thicknessPredictionsLevel1 20
+learningRatePredictionsLevel1 .08
+thicknessPredictionsLevel1 5
learningRatePredictionsLevel2 .08
-thicknessPredictionsLevel2 10
+thicknessPredictionsLevel2 5
diff --git a/ner/benchmark/Ontonotes/config/ontonotes.config b/ner/benchmark/Ontonotes/config/ontonotes.config
index d86a7a7c3..9571bf0f0 100644
--- a/ner/benchmark/Ontonotes/config/ontonotes.config
+++ b/ner/benchmark/Ontonotes/config/ontonotes.config
@@ -1,10 +1,9 @@
# Required fields
-modelName OntoNotes
-pathToModelFile ner/models/OntoNotes
+modelName ontonotes
+pathToModelFile models/ontonotes
# Optional fields
labelTypes TIME LAW GPE NORP LANGUAGE PERCENT FAC PRODUCT ORDINAL LOC PERSON WORK_OF_ART MONEY DATE EVENT QUANTITY ORG CARDINAL
-FeaturePruningThreshold 0.0
# parameter sweep reveals these to be the best params, L1 model is best.
learningRatePredictionsLevel1 .03
diff --git a/ner/benchmark/Web/config/web.config b/ner/benchmark/Web/config/web.config
index 56e2c46f1..0784cd4f1 100644
--- a/ner/benchmark/Web/config/web.config
+++ b/ner/benchmark/Web/config/web.config
@@ -1,6 +1,6 @@
# Required fields, web data is only tested, against the reuters model
-modelName EnronCoNLL_testCoNLL
-pathToModelFile models/EnronCoNLL_testCoNLL
+modelName web
+pathToModelFile models/Web
# Optional fields
labelTypes PER ORG LOC MISC
diff --git a/ner/pom.xml b/ner/pom.xml
index 0dbd48a92..d00bccc36 100644
--- a/ner/pom.xml
+++ b/ner/pom.xml
@@ -6,7 +6,7 @@
illinois-cogcomp-nlp
edu.illinois.cs.cogcomp
- 4.0.15
+ 4.0.19
illinois-ner
@@ -23,12 +23,12 @@
edu.illinois.cs.cogcomp
illinois-tokenizer
- 4.0.15
+ 4.0.19
edu.illinois.cs.cogcomp
illinois-core-utilities
- 4.0.15
+ 4.0.19
org.cogcomp
@@ -39,12 +39,12 @@
edu.illinois.cs.cogcomp
LBJava
- 1.3.1
+ 1.3.3
edu.illinois.cs.cogcomp
LBJava-NLP-tools
- 4.0.15
+ 4.0.19
org.slf4j
@@ -90,7 +90,7 @@
edu.illinois.cs.cogcomp
lbjava-maven-plugin
- 1.3.1
+ 1.3.2
${project.basedir}/src/main/lbj/LbjTagger.lbj
diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/BrownClusters.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/BrownClusters.java
index df0020a5e..49335c001 100644
--- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/BrownClusters.java
+++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/BrownClusters.java
@@ -9,11 +9,14 @@
import org.cogcomp.Datastore;
import org.cogcomp.DatastoreException;
+
+import edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder;
import edu.illinois.cs.cogcomp.core.resources.ResourceConfigurator;
import edu.illinois.cs.cogcomp.ner.IO.InFile;
import edu.illinois.cs.cogcomp.ner.LbjTagger.Data;
import edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord;
-import edu.illinois.cs.cogcomp.ner.LbjTagger.ParametersForLbjCode;
+import edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer;
+import edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder;
import edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector;
import gnu.trove.map.hash.THashMap;
import io.minio.errors.InvalidEndpointException;
@@ -48,6 +51,9 @@ private BrownClusters() {
/** clusters store, keyed on catenated paths. */
static private HashMap clusters = new HashMap<>();
+    /** this is just to test that the tokenizer produces the same output as the splitter. */
+ static private TextAnnotationBuilder tokenizer = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
+
/**
* Makes a unique key based on the paths, for storage in a hashmap.
@@ -116,7 +122,7 @@ public static BrownClusters get(Vector pathsToClusterFiles, Vector= thresholds.elementAt(i)) {
- h.put(word, path);
+ h.put(word, path);
}
line = in.readLine();
}
@@ -181,21 +187,25 @@ final public ArrayList getResources() {
final public String[] getPrefixes(NEWord w) {
return getPrefixes(w.form);
}
-
+
final public String[] getPrefixes(String word) {
+
+ // not cached.
ArrayList v = new ArrayList<>(wordToPathByResource.size());
for (int j = 0; j < wordToPathByResource.size(); j++) {
if (isLowercaseBrownClustersByResource[j])
word = word.toLowerCase();
THashMap wordToPath = wordToPathByResource.get(j);
- final String prefix = "resource" + j + ":";
- if (wordToPath != null && wordToPath.containsKey(word)) {
- String path = wordToPath.get(word);
- int pathlength = path.length();
- v.add(prefix + path.substring(0, Math.min(pathlength, prefixLengths[0])));
- for (int i = 1; i < prefixLengths.length; i++)
- if (prefixLengths[i - 1] < pathlength)
- v.add(prefix + path.substring(0, Math.min(pathlength, prefixLengths[i])));
+ if (wordToPath != null) {
+ String path = wordToPath.get(word);
+ final String prefix = "resource"+j+":";
+ if (path != null) {
+ int pathlength = path.length();
+ v.add(prefix + path.substring(0, Math.min(pathlength, prefixLengths[0])));
+ for (int i = 1; i < prefixLengths.length; i++)
+ if (prefixLengths[i - 1] < pathlength)
+ v.add(prefix + path.substring(0, Math.min(pathlength, prefixLengths[i])));
+ }
}
}
String[] res = new String[v.size()];
@@ -212,12 +222,6 @@ final public String getPrefixesCombined(String word) {
return ret;
}
- private static void printArr(String[] arr) {
- for (String anArr : arr)
- logger.info(" " + anArr);
- logger.info("");
- }
-
final public void printOovData(Data data) {
HashMap tokensHash = new HashMap<>();
HashMap tokensHashIC = new HashMap<>();
@@ -246,6 +250,24 @@ final public void printOovData(Data data) {
}
}
}
-
}
+
+ /**
+ * Purge all brown cluster data, clearing memory.
+ */
+ static public void reset() {
+ clusters = new HashMap<>();
+ }
+
+ /**
+ * Purge all brown cluster data, clearing memory.
+ */
+ static public void purge(Vector pathsToClusterFiles) {
+ synchronized (INIT_SYNC) {
+ // first check for a cluster already loaded for this data.
+ String key = getKey(pathsToClusterFiles);
+ clusters.remove(key);
+ }
+ }
+
}
diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/ContextAggregation.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/ContextAggregation.java
index 1350f01a6..c3efbaffc 100644
--- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/ContextAggregation.java
+++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/ContextAggregation.java
@@ -9,6 +9,7 @@
import edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord;
import edu.illinois.cs.cogcomp.ner.LbjTagger.ParametersForLbjCode;
+import edu.illinois.cs.cogcomp.ner.config.NerBaseConfigurator;
import java.util.HashMap;
import java.util.Hashtable;
@@ -20,9 +21,9 @@ public class ContextAggregation {
* that the data was annotated with dictionaries etc.
*/
public static void annotate(NEWord word) {
- if (word.params.featuresToUse.containsKey("aggregateContext")
+ if (word.params.featuresToUse.containsKey(NerBaseConfigurator.AGGREGATE_CONTEXT)
|| word.params.featuresToUse
- .containsKey("aggregateGazetteerMatches")) {
+ .containsKey(NerBaseConfigurator.AGGREGATE_GAZETTEER)) {
int i = 0;
NEWord w = word, last = word.nextIgnoreSentenceBoundary;
diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/GazetteersFactory.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/GazetteersFactory.java
index 565741c15..25a502a2b 100644
--- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/GazetteersFactory.java
+++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/ExpressiveFeatures/GazetteersFactory.java
@@ -10,6 +10,7 @@
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
+import java.util.Vector;
import edu.illinois.cs.cogcomp.core.constants.Language;
@@ -53,4 +54,20 @@ static public Gazetteers get(int maxPhraseLength, String path, boolean flatgazet
return gazetteers_map.get(path);
}
}
+
+ /**
+ * Purge all gaz data, clearing memory.
+ */
+ static public void reset() {
+ gazetteers_map = new HashMap<>();
+ }
+
+ /**
+ * Purge a single gazetteer entry for the gaz at that path.
+ */
+ static public void purge(String path) {
+ synchronized (GAZ_INIT_LOCK) {
+ gazetteers_map.remove(path);
+ }
+ }
}
diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/InferenceMethods/PredictionsAndEntitiesConfidenceScores.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/InferenceMethods/PredictionsAndEntitiesConfidenceScores.java
index 20adf058f..779e72077 100644
--- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/InferenceMethods/PredictionsAndEntitiesConfidenceScores.java
+++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/InferenceMethods/PredictionsAndEntitiesConfidenceScores.java
@@ -7,17 +7,16 @@
*/
package edu.illinois.cs.cogcomp.ner.InferenceMethods;
-import edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.TwoLayerPredictionAggregationFeatures;
-import edu.illinois.cs.cogcomp.ner.LbjFeatures.NETaggerLevel1;
-import edu.illinois.cs.cogcomp.ner.LbjTagger.*;
-import edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord.RealFeature;
-import edu.illinois.cs.cogcomp.ner.StringStatisticsUtils.CharacteristicWords;
-import edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector;
-
import java.util.ArrayList;
import java.util.Vector;
-/*
+import edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector;
+import edu.illinois.cs.cogcomp.ner.LbjTagger.Data;
+import edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord;
+import edu.illinois.cs.cogcomp.ner.LbjTagger.NamedEntity;
+import edu.illinois.cs.cogcomp.ner.StringStatisticsUtils.CharacteristicWords;
+
+/**
* This class is responsible for handling prediction scores of the entities. That is, this class can
* prune the entities/predictions on which we're not confident at
*
diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/InferenceMethods/PredictionsToProbabilities.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/InferenceMethods/PredictionsToProbabilities.java
index 35effff2a..89c87d73e 100644
--- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/InferenceMethods/PredictionsToProbabilities.java
+++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/InferenceMethods/PredictionsToProbabilities.java
@@ -31,15 +31,13 @@ public static CharacteristicWords getAndSetPredictionConfidences(SparseNetworkLe
}
double[] correctedScores = new double[scores.length];
double min = scores[0].score;
- int maxScoreIdx = 0;
- double maxScore = scores[maxScoreIdx].score;
- String maxLabel = scores[maxScoreIdx].value;
+ double max = scores[0].score;
+ String maxLabel = scores[0].value;
for (int i = 0; i < scores.length; i++) {
if (min > scores[i].score)
min = scores[i].score;
- if (maxScore < scores[i].score) {
- maxScore = scores[i].score;
- maxScoreIdx = i;
+ if (max < scores[i].score) {
+ max = scores[i].score;
maxLabel = scores[i].value;
}
}
@@ -55,19 +53,20 @@ public static CharacteristicWords getAndSetPredictionConfidences(SparseNetworkLe
correctedScores[i] /= sum;
}
+ /* this doesn't seem necessary
for (int i = 0; i < correctedScores.length; i++)
- correctedScores[i] = correctedScores[i];
+ correctedScores[i] = correctedScores[i];*/
CharacteristicWords res = new CharacteristicWords(scores.length);
for (int i = 0; i < scores.length; i++)
res.addElement(scores[i].value, correctedScores[i]);
+ w.setRawScore((float)max);
if (predictionType.equals(NEWord.LabelToLookAt.PredictionLevel1Tagger)) {
w.neTypeLevel1 = maxLabel;
w.predictionConfidencesLevel1Classifier = res;
- }
- if (predictionType.equals(NEWord.LabelToLookAt.PredictionLevel2Tagger)) {
+ } else if (predictionType.equals(NEWord.LabelToLookAt.PredictionLevel2Tagger)) {
w.neTypeLevel2 = maxLabel;
w.predictionConfidencesLevel2Classifier = res;
}
diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/LearningCurveMultiDataset.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/LearningCurveMultiDataset.java
index c38f9c9c7..fac8e91cd 100644
--- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/LearningCurveMultiDataset.java
+++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/LearningCurveMultiDataset.java
@@ -12,6 +12,7 @@
import edu.illinois.cs.cogcomp.lbjava.learn.BatchTrainer;
import edu.illinois.cs.cogcomp.lbjava.learn.SparseAveragedPerceptron;
import edu.illinois.cs.cogcomp.lbjava.learn.SparseNetworkLearner;
+import edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector;
import edu.illinois.cs.cogcomp.lbjava.parse.Parser;
import edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.ExpressiveFeaturesAnnotator;
import edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.TwoLayerPredictionAggregationFeatures;
@@ -23,6 +24,7 @@
import java.io.File;
import java.io.IOException;
+import java.util.ArrayList;
import java.util.Vector;
import static java.lang.Float.NaN;
@@ -123,7 +125,7 @@ public static void getLearningCurve(int fixedNumIterations, String dataFormat, S
*/
public static void getLearningCurve(Vector trainDataSet, Vector testDataSet,
int fixedNumIterations, boolean incremental, ParametersForLbjCode params) throws Exception {
- double bestF1Level1 = -1;
+ double bestF1Level1 = -2;
int bestRoundLevel1 = 0;
// Get the directory name (.model is appended in LbjTagger/Parameters.java:139)
String modelPath = params.pathToModelFile;
@@ -178,14 +180,15 @@ public static void getLearningCurve(Vector trainDataSet, Vector test
deleteme = new File(testPathL1);
if (deleteme.exists())
deleteme.delete();
- logger.info("Pre-extracting the training data for Level 1 classifier, saving to "+trainPathL1);
- BatchTrainer bt1train = prefetchAndGetBatchTrainer(tagger1, trainDataSet, trainPathL1, params);
- logger.info("Pre-extracting the testing data for Level 1 classifier, saving to "+testPathL1);
- BatchTrainer bt1test = prefetchAndGetBatchTrainer(tagger1, testDataSet, testPathL1, params);
- Parser testParser1 = bt1test.getParser();
// create the best model possible.
{
+ logger.info("Pre-extracting the training data for Level 1 classifier, saving to "+trainPathL1);
+ BatchTrainer bt1train = prefetchAndGetBatchTrainer(tagger1, trainDataSet, trainPathL1, params);
+ logger.info("Pre-extracting the testing data for Level 1 classifier, saving to "+testPathL1);
+ BatchTrainer bt1test = prefetchAndGetBatchTrainer(tagger1, testDataSet, testPathL1, params);
+ Parser testParser1 = bt1test.getParser();
+
NETaggerLevel1 saveme = null;
for (int i = 0; (fixedNumIterations == -1 && i < 200 && i - bestRoundLevel1 < 10)
|| (fixedNumIterations > 0 && i <= fixedNumIterations); ++i) {
@@ -202,22 +205,29 @@ public static void getLearningCurve(Vector trainDataSet, Vector test
bestRoundLevel1 = i;
saveme = (NETaggerLevel1) tagger1.clone();
saveme.beginTraining();
-
- System.out.println(saveme);
- System.out.println(bestF1Level1);
- System.out.println(f1Level1);
-
- }
- logger.info(i + " rounds. Best so far for Level1 : (" + bestRoundLevel1 + ")="
+ logger.info(i + " rounds. New best for Level1 : (" + bestRoundLevel1 + ")="
+ + bestF1Level1);
+ } else {
+ logger.info(i + " rounds. Best so far for Level1 : (" + bestRoundLevel1 + ")="
+ bestF1Level1);
+ }
}
-
saveme.getBaseLTU().featurePruningThreshold = params.featurePruningThreshold;
saveme.doneTraining();
saveme.save();
+ bt1train.getParser().close();
+ bt1test.getParser().close();
logger.info("Level 1; best round : " + bestRoundLevel1 + "\tbest F1 : " + bestF1Level1);
}
+ // dispose of the L1 caching files
+ deleteme = new File(trainPathL1);
+ if (deleteme.exists())
+ deleteme.delete();
+ deleteme = new File(testPathL1);
+ if (deleteme.exists())
+ deleteme.delete();
+
// Read the best model back in, optimize by pruning useless features, then write it agains
tagger1 = new NETaggerLevel1(paramLevel1, modelPath + ".level1", modelPath + ".level1.lex");
@@ -252,18 +262,18 @@ public static void getLearningCurve(Vector trainDataSet, Vector test
if (params.featuresToUse.containsKey("PredictionsLevel1")) {
logger.info("Level 2 classifier learning rate = "+params.learningRatePredictionsLevel2+
", thickness = "+params.thicknessPredictionsLevel2);
- double bestF1Level2 = -1;
+ double bestF1Level2 = -2;
int bestRoundLevel2 = 0;
- logger.info("Pre-extracting the training data for Level 2 classifier, saving to "+trainPathL2);
- BatchTrainer bt2train =
- prefetchAndGetBatchTrainer(tagger2, trainDataSet, trainPathL2, params);
- logger.info("Pre-extracting the testing data for Level 2 classifier, saving to "+testPathL2);
- BatchTrainer bt2test =
- prefetchAndGetBatchTrainer(tagger2, testDataSet, testPathL2, params);
- Parser testParser2 = bt2test.getParser();
// create the best model possible.
{
+ logger.info("Pre-extracting the training data for Level 2 classifier, saving to "+trainPathL2);
+ BatchTrainer bt2train =
+ prefetchAndGetBatchTrainer(tagger2, trainDataSet, trainPathL2, params);
+ logger.info("Pre-extracting the testing data for Level 2 classifier, saving to "+testPathL2);
+ BatchTrainer bt2test =
+ prefetchAndGetBatchTrainer(tagger2, testDataSet, testPathL2, params);
+ Parser testParser2 = bt2test.getParser();
NETaggerLevel2 saveme = null;
for (int i = 0; (fixedNumIterations == -1 && i < 200 && i - bestRoundLevel2 < 10)
|| (fixedNumIterations > 0 && i <= fixedNumIterations); ++i) {
@@ -276,25 +286,32 @@ public static void getLearningCurve(Vector trainDataSet, Vector test
TestDiscrete.testDiscrete(simpleTest, tagger2, null, testParser2, true, 0);
double f1Level2 = simpleTest.getOverallStats()[2];
- if (f1Level2 >= bestF1Level2) {
+ if(Double.isNaN(f1Level2))
+ f1Level2 = 0;
+ if (f1Level2 > bestF1Level2) {
bestF1Level2 = f1Level2;
bestRoundLevel2 = i;
saveme = (NETaggerLevel2) tagger2.clone();
saveme.beginTraining();
- }
- logger.info(i + " rounds. Best so far for Level2 : (" + bestRoundLevel2 + ") "
+ logger.info(i + " rounds. New best for Level2 : (" + bestRoundLevel2 + ") "
+ bestF1Level2);
+ } else {
+ logger.info(i + " rounds. Best so far for Level2 : (" + bestRoundLevel2 + ") "
+ + bestF1Level2);
+ }
}
saveme.getBaseLTU().featurePruningThreshold = params.featurePruningThreshold;
saveme.doneTraining();
saveme.save();
+ bt2train.getParser().close();
+ bt2test.getParser().close();
}
// trash the l2 prefetch data
deleteme = new File(trainPathL2);
if (deleteme.exists())
deleteme.delete();
- deleteme = new File(testPathL1);
+ deleteme = new File(testPathL2);
if (deleteme.exists())
deleteme.delete();
@@ -362,22 +379,37 @@ public void close() {
}
public Object next() {
- if (datasetId >= dataset.size())
- return null;
- // logger.debug("token = "+tokenId+"; sentence = "+sentenceId+"; dataset = "+datasetId+" --- datasets="+dataset.size()+" now sentences= "+dataset.elementAt(datasetId).sentences.size()+"; now tokens = "+dataset.elementAt(datasetId).sentences.elementAt(sentenceId).size());
- Object res =
+ if (datasetId >= dataset.size()) {
+ return null; // expected, we are just done with the dataset.
+ }
+ Data nerdata = dataset.elementAt(datasetId);
+ if (nerdata.documents.size() <= docid) {
+ logger.info("Encountered a dataset with no documents in it.");
+ return null; // a dataset with no documents in it is odd.
+ }
+ NERDocument nerdoc = nerdata.documents.get(docid);
+ if (nerdoc.sentences.size() <= sentenceId) {
+ logger.info("Encountered a document with no sentences in it : "+nerdoc.docname);
+ return null;
+ }
+ LinkedVector nersentence = nerdoc.sentences.get(sentenceId);
+ if (nersentence.size() <= tokenId) {
+ logger.info("Encountered a sentnce with no tokens in it : "+nerdoc.docname);
+ return null;
+ }
+ Object res = nersentence.get(tokenId);
+ /*Object res =
dataset.elementAt(datasetId).documents.get(docid).sentences.get(sentenceId)
- .get(tokenId);
- if (tokenId < dataset.elementAt(datasetId).documents.get(docid).sentences.get(
- sentenceId).size() - 1)
+ .get(tokenId);*/
+ if (tokenId < nersentence.size() - 1)
tokenId++;
else {
tokenId = 0;
- if (sentenceId < dataset.elementAt(datasetId).documents.get(docid).sentences.size() - 1) {
+ if (sentenceId < nerdoc.sentences.size() - 1) {
sentenceId++;
} else {
sentenceId = 0;
- if (docid < dataset.elementAt(datasetId).documents.size() - 1) {
+ if (docid < nerdata.documents.size() - 1) {
docid++;
} else {
docid = 0;
diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/NEWord.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/NEWord.java
index 43c72c3af..0cefb75ac 100644
--- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/NEWord.java
+++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/NEWord.java
@@ -26,6 +26,10 @@ public class NEWord extends Word {
/** This field is used to store a computed named entity type tag. */
public String neTypeLevel1;
+
+ /** raw score as returned by the classifier without normalization. */
+ private float rawScore;
+
public String neTypeLevel2;
public NamedEntity predictedEntity = null;// if non-null it keeps the named entity the tagger
public ParametersForLbjCode params = null;
@@ -61,10 +65,9 @@ public class NEWord extends Word {
private HashMap nonLocalFeatures = null;
private String[] nonLocFeatArray = null;
- /*
- * This stuff was added for form normalization purposes.
- */
-
+ /** this feature is only populate if useFileTypes feature is enabled. */
+ private String fileType = null;
+
/**
* An NEWord
can be constructed from a Word
object representing the
* same word, an NEWord
representing the previous word in the sentence, and the
@@ -81,7 +84,7 @@ public NEWord(Word w, NEWord p, String type) {
neLabel = type;
neTypeLevel1 = null;
}
-
+
/**
* Add the provided token to the sentence, for also do any additional word spliting.
*
@@ -95,6 +98,22 @@ public static void addTokenToSentence(LinkedVector sentence, String token, Strin
addTokenToSentence(sentence, word);
}
+ /**
+ * Add the provided token to the sentence, also do any additional word splitting. Additional argument
+ * indicates the file type which must be provided. If there is no file type, the file type is null.
+ *
+ * @param sentence the sentence to add the word to.
+ * @param token the individual token.
+ * @param tag the tag to annotate the word with.
+ * @param fileType a string representing file type.
+ */
+ public static void addTokenToSentence(LinkedVector sentence, String token, String tag, ParametersForLbjCode prs, String fileType) {
+ NEWord word = new NEWord(new Word(token), null, tag);
+ word.params = prs;
+ word.setFileType(fileType);
+ addTokenToSentence(sentence, word);
+ }
+
public static void addTokenToSentence(LinkedVector sentence, NEWord word) {
Vector v = NEWord.splitWord(word);
if (word.params.tokenizationScheme
@@ -160,7 +179,7 @@ public ArrayList resetLevel1AggregationFeatures() {
* Produces a simple String
representation of this word in which the
* neLabel
field appears followed by the word's part of speech and finally the form
* (i.e., spelling) of the word all surrounded by parentheses.
- **/
+ */
public String toString() {
return "(" + neLabel + " " + partOfSpeech + " " + form + ")";
}
@@ -200,8 +219,51 @@ public void setPrediction(String label, LabelToLookAt labelType) {
this.neTypeLevel2 = label;
}
+ /**
+ * @return the file type of this term (same for entire document).
+ */
+ public String getFileType() {
+ return fileType;
+ }
+
+ /**
+ * @param fileType the file type of this term (same for entire document).
+ */
+ public void setFileType(String fileType) {
+ this.fileType = fileType;
+ }
+
+ /**
+ * This method will return the score of the chosen label.
+ * @return the score of the best label for this term.
+ */
+ public double getScore() {
+ if (predictionConfidencesLevel2Classifier == null || predictionConfidencesLevel2Classifier.topScores.size() == 0)
+ if (predictionConfidencesLevel1Classifier == null || predictionConfidencesLevel1Classifier.topScores.size() == 0)
+ throw new RuntimeException("Attempt to get label score before scores are set.");
+ else
+ return this.predictionConfidencesLevel1Classifier.topScores.elementAt(0);
+ else
+ return this.predictionConfidencesLevel2Classifier.topScores.elementAt(0);
+ }
+
+
+ /**
+ * This method will return the score of the chosen label.
+ * @return the unnormalized score as returned directly by classifier.
+ */
+ public float getRawScore() {
+ return rawScore;
+ }
+
+ /**
+ * @param rawScore the unnormalized score as returned directly by classifier.
+ */
+ public void setRawScore(float rawScore) {
+ this.rawScore = rawScore;
+ }
- public enum LabelToLookAt {
+ public enum LabelToLookAt {
PredictionLevel2Tagger, PredictionLevel1Tagger, GoldLabel
}
diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/Parameters.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/Parameters.java
index 491968c6b..2585bcba4 100644
--- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/Parameters.java
+++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/Parameters.java
@@ -118,8 +118,6 @@ public static ParametersForLbjCode readAndLoadConfig(ResourceManager rm, boolean
}
param.debug = rm.getDebug();
- // ParametersForLbjCode.currentParameters.debug = param.debug;
-
double randomNoiseLevel = rm.getDouble(NerBaseConfigurator.RANDOM_NOISE_LEVEL);
double omissionRate = rm.getDouble(NerBaseConfigurator.OMISSION_RATE);
@@ -136,13 +134,26 @@ public static ParametersForLbjCode readAndLoadConfig(ResourceManager rm, boolean
Language lang = Language.getLanguageByCode(rm.getString("language"));
param.language = lang;
}
-
if (rm.containsKey("labelsToAnonymizeInEvaluation")) {
String labelsToAnonymizeInEvaluation =
rm.getString("labelsToAnonymizeInEvaluation");
param.labelsToAnonymizeInEvaluation =
new Vector<>(Arrays.asList(labelsToAnonymizeInEvaluation.split(" ")));
}
+ if (rm.containsKey(NerBaseConfigurator.LABELS_TO_KEEP)) {
+ String labelsToKeep = rm.getString(NerBaseConfigurator.LABELS_TO_KEEP);
+ param.labelsToKeep = new ArrayList(Arrays.asList(labelsToKeep.split(" ")));
+ }
+
+ // this property can be either "1" or "true" to enable file type.
+ if (rm.containsKey(NerBaseConfigurator.USE_FILETYPE)) {
+ String usefiletype = rm.getString(NerBaseConfigurator.USE_FILETYPE);
+ if (usefiletype.equalsIgnoreCase("true") || usefiletype.equals("1")) {
+ logger.info("File Type information will be included in the feature set.");
+ param.useFileType = true;
+ }
+ }
+
if (rm.containsKey("labelsToIgnoreInEvaluation")) {
String labelsToIgnoreInEvaluation = rm.getString("labelsToIgnoreInEvaluation");
param.labelsToIgnoreInEvaluation =
diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/ParametersForLbjCode.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/ParametersForLbjCode.java
index 6446f0682..0e24af324 100644
--- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/ParametersForLbjCode.java
+++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/LbjTagger/ParametersForLbjCode.java
@@ -13,6 +13,7 @@
import edu.illinois.cs.cogcomp.ner.ExpressiveFeatures.Gazetteers;
import edu.illinois.cs.cogcomp.ner.config.NerBaseConfigurator;
+import java.util.ArrayList;
import java.util.HashMap;
import java.util.Vector;
@@ -35,6 +36,12 @@ public enum TokenizationScheme {
// will be initialized to something like {"PER","ORG","LOC","MISC"}; */
public String[] labelTypes = {"PER", "ORG", "LOC", "MISC"};
+ /** labels of interest if a subset of all labels, all other labels are ignored. */
+ public ArrayList labelsToKeep = null;
+
+ /** use filetype feature, greatly increasing the number of features. */
+ public boolean useFileType = false;
+
/** Labels to ignore when evaluating model performance, e.g. "MISC" for the MUC7 dataset. */
public Vector labelsToIgnoreInEvaluation = null;
diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/Main.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/Main.java
index f343b8741..0b7ee7233 100644
--- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/Main.java
+++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/Main.java
@@ -266,7 +266,7 @@ else if (modelName.toLowerCase().equals("ontonotes"))
} else {
this.nerAnnotator = new NERAnnotator(this.resourceManager, viewName);
}
- System.out.println("Completed loading resources, assuming a ");
+ System.out.println("Completed loading resources ");
}
// display the command prompt depending on the mode we are in.
diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/NERAnnotator.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/NERAnnotator.java
index 5b3e81498..55492d8ba 100644
--- a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/NERAnnotator.java
+++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/NERAnnotator.java
@@ -27,6 +27,7 @@
import edu.illinois.cs.cogcomp.annotation.Annotator;
import edu.illinois.cs.cogcomp.annotation.AnnotatorConfigurator;
import edu.illinois.cs.cogcomp.core.datastructures.ViewNames;
+import edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent;
import edu.illinois.cs.cogcomp.core.datastructures.textannotation.Sentence;
import edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView;
import edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation;
@@ -55,6 +56,10 @@
*/
public class NERAnnotator extends Annotator {
+ /** name of attribute containing the raw score value represented as a string. This value is
+ * not normalized in any way, it is the value produced by the perceptron. */
+ final static public String RAW_SCORE_ATTRIBUTE = "RawScore";
+
/** our specific logger. */
private final Logger logger = LoggerFactory.getLogger(NERAnnotator.class);
@@ -121,7 +126,13 @@ public void initialize(ResourceManager nerRm) {
// load the models.
synchronized (LOADING_MODELS) {
ModelLoader.load(nerRm, viewName, false, this.params);
- }
+ }
+ if (this.params.labelsToKeep != null) {
+ logger.info("Kept label : "+this.params.labelsToKeep);
+ this.params.taggerLevel1.pruneUnusedLabels(this.params.labelsToKeep);
+ if (this.params.taggerLevel2 != null)
+ this.params.taggerLevel2.pruneUnusedLabels(this.params.labelsToKeep);
+ }
}
/**
@@ -171,26 +182,24 @@ public void addView(TextAnnotation ta) {
// the data always has a single document
// each LinkedVector in data corresponds to a sentence.
int tokenoffset = 0;
- for (LinkedVector vector : nerSentences) {
+ for (LinkedVector nerWords : nerSentences) {
boolean open = false;
// there should be a 1:1 mapping btw sentence tokens in record and words/predictions
// from NER.
int startIndex = -1;
String label = null;
- for (int j = 0; j < vector.size(); j++, tokenoffset++) {
- NEWord neWord = (NEWord) (vector.get(j));
+ for (int j = 0; j < nerWords.size(); j++, tokenoffset++) {
+ NEWord neWord = (NEWord) (nerWords.get(j));
String prediction = neWord.neTypeLevel2;
- // LAM-tlr this is not a great way to ascertain the entity type, it's a bit
- // convoluted, and very
- // inefficient, use enums, or nominalized indexes for this sort of thing.
+ // identify the label.
if (prediction.startsWith("B-")) {
startIndex = tokenoffset;
label = prediction.substring(2);
open = true;
} else if (j > 0) {
- String previous_prediction = ((NEWord) vector.get(j - 1)).neTypeLevel2;
+ String previous_prediction = ((NEWord) nerWords.get(j - 1)).neTypeLevel2;
if (prediction.startsWith("I-")
&& (!previous_prediction.endsWith(prediction.substring(2)))) {
startIndex = tokenoffset;
@@ -201,10 +210,10 @@ public void addView(TextAnnotation ta) {
if (open) {
boolean close = false;
- if (j == vector.size() - 1) {
+ if (j == nerWords.size() - 1) {
close = true;
} else {
- String next_prediction = ((NEWord) vector.get(j + 1)).neTypeLevel2;
+ String next_prediction = ((NEWord) nerWords.get(j + 1)).neTypeLevel2;
if (next_prediction.startsWith("B-"))
close = true;
if (next_prediction.equals("O"))
@@ -226,8 +235,8 @@ public void addView(TextAnnotation ta) {
int e = tokenindices[endIndex];
if (e <= s)
e = s + 1;
-
- nerView.addSpanLabel(s, e, label, 1d);
+ Constituent tokenlabel = nerView.addSpanLabel(s, e, label, neWord.getScore());
+ tokenlabel.addAttribute(RAW_SCORE_ATTRIBUTE, Float.toString(neWord.getRawScore()));
open = false;
}
}
diff --git a/ner/src/main/java/edu/illinois/cs/cogcomp/ner/NERResourceManagerFactory.java b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/NERResourceManagerFactory.java
new file mode 100644
index 000000000..c8a95fd7d
--- /dev/null
+++ b/ner/src/main/java/edu/illinois/cs/cogcomp/ner/NERResourceManagerFactory.java
@@ -0,0 +1,182 @@
+/*
+ * To change this license header, choose License Headers in Project Properties.
+ * To change this template file, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package edu.illinois.cs.cogcomp.ner;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Map.Entry;
+import java.util.Properties;
+
+import edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager;
+import edu.illinois.cs.cogcomp.ner.config.NerBaseConfigurator;
+
+/**
+ * The factory, given a properties file reference will load the properties and
+ * "merge" them with the defaults. In the case of properties that reference a
+ * file or directory, the property in the properties file is assumed to be a
+ * relative path. The path will be prefixed with the directory path included in
+ * the static path variable before actually being saved in the resource manager.
+ * The resource manager is return after this refactoring is complete.
+ * @author redman
+ */
+public class NERResourceManagerFactory {
+
+ /**
+ * check first if file in in resource fork (a jar file), if not check if it's in a file. If the
+ * configuration file exists in either a jar file or on the file system return true.
+ * @param configFile the file to find.
+ * @throws IOException
+ */
+ static private Properties checkIfExists(String configFile) throws IOException {
+ InputStream is = NERResourceManagerFactory.class.getClassLoader().getResourceAsStream(configFile);
+ if (is == null) {
+ is = new FileInputStream(configFile);
+ }
+ try {
+ Properties properties = new Properties();
+ properties.load(is);
+ return properties;
+ } finally {
+ try {
+ is.close();
+ } catch (IOException ignored) {
+ }
+ }
+ }
+
+ /**
+ * This method will return a resource manager that can be used by the NER
+ * system, however, all paths are assumed to be relative, the
+ * resourcePath must be set to contain the folder on the system where all
+ * property files, models, gazetteers and brown cluster reside.
+ *
+ * @param propertiesFilename the name of the properties file.
+ * @param modelsReplacementPattern pattern to replace with the model directory, or null to disable.
+ * @param resourcesReplacementPattern pattern to replace with the resources directory, or null to disable.
+ * @param modelsPath path where models are found, or null to disable.
+ * @param resourcesPath path where resources are found, or null to disable.
+ * @return the modified resources.
+ * @throws FileNotFoundException if a required file was not found.
+ * @throws IOException if a file was found but could not be read or parsed.
+ */
+ static public ResourceManager get(String propertiesFilename, String modelsReplacementPattern,
+ String resourcesReplacementPattern, String modelsPath, String resourcesPath)
+ throws FileNotFoundException, IOException {
+
+ // check the models path.
+ if (resourcesPath != null && resourcesPath.length() > 0) {
+ if (!resourcesPath.endsWith(File.separator)) {
+ resourcesPath = resourcesPath + File.separator;
+ }
+ File resourcesDirectory = new File(resourcesPath);
+ if (!resourcesDirectory.exists()) {
+ throw new FileNotFoundException("The resources directory did not exist.");
+ }
+ if (!resourcesDirectory.isDirectory()) {
+ throw new FileNotFoundException("The resources directory existed, but is not a directory.");
+ }
+ }
+ Properties properties = null;
+ try {
+ properties = checkIfExists(propertiesFilename);
+ } catch (IOException e) {
+ if (resourcesPath == null)
+ throw e;
+ // did not exist as presented in the argument, add the resourcePath, see if it's there.
+ propertiesFilename = resourcesPath+propertiesFilename;
+ properties = checkIfExists(propertiesFilename);
+ }
+
+ // check the models path.
+ if (modelsPath != null && modelsPath.length() > 0) {
+ if (modelsPath.length() > 0 && !modelsPath.endsWith(File.separator)) {
+ modelsPath = modelsPath + File.separator;
+ }
+ File modelsDirectory = new File(modelsPath);
+ if (!modelsDirectory.exists()) {
+ throw new FileNotFoundException("The models directory did not exist.");
+ }
+ if (!modelsDirectory.isDirectory()) {
+ throw new FileNotFoundException("The models directory existed, but is not a directory.");
+ }
+ }
+
+ // we now have the new properties, and we have the base default props,
+ // merge them together modifying paths as necessary.
+ ResourceManager rm = new NerBaseConfigurator().getDefaultConfig();
+ for (Entry