Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
- `parentQuery` -> `metadataQuery` in REST API docs
- Added documentation about supported types of fields for metadata (and info warnings for unsupported types). ([#329](https://github.com/lum-ai/odinson/pull/329))
### Changed
- Fixed issue where the metadata stored fields caused Mention.populate to crash ([#333](https://github.com/lum-ai/odinson/pull/333))

## [0.5.0] - 2021-08-07
### Added
Expand Down
4 changes: 3 additions & 1 deletion core/src/main/resources/reference.conf
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,10 @@ odinson {

index {

metadataStoredFields = []

# list of document/sentence fields to store in index, **must** include the displayField
storedFields = [
sentenceStoredFields = [
${odinson.displayField}
]

Expand Down
2 changes: 1 addition & 1 deletion core/src/main/scala/ai/lum/odinson/DataGatherer.scala
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ class DataGatherer(

val analyzer = new WhitespaceAnalyzer()

val storedFields: Seq[String] = indexSettings.storedFields
val storedFields: Seq[String] = indexSettings.sentenceStoredFields

def getStringForSpan(docID: Int, m: OdinsonMatch): String = {
getTokensForSpan(docID, m).mkString(" ")
Expand Down
11 changes: 7 additions & 4 deletions core/src/main/scala/ai/lum/odinson/OdinsonIndexWriter.scala
Original file line number Diff line number Diff line change
Expand Up @@ -349,18 +349,21 @@ object OdinsonIndexWriter {
(dir, vocab)
}

val storedFields = config.apply[List[String]]("odinson.index.storedFields")
val sentenceStoredFields = config.apply[List[String]]("odinson.index.sentenceStoredFields")
val metadataStoredFields = config.apply[List[String]]("odinson.index.metadataStoredFields")
val displayField = config.apply[String]("odinson.displayField")
// Always store the display field, also store these additional fields
if (!storedFields.contains(displayField)) {
throw new OdinsonException("`odinson.index.storedFields` must contain `odinson.displayField`")
if (!sentenceStoredFields.contains(displayField)) {
throw new OdinsonException(
"`odinson.index.sentenceStoredFields` must contain `odinson.displayField`"
)
}

new OdinsonIndexWriter(
// format: off
directory = directory,
vocabulary = vocabulary,
settings = IndexSettings(storedFields),
settings = IndexSettings(sentenceStoredFields, metadataStoredFields),
normalizedTokenField = config.apply[String]("odinson.index.normalizedTokenField"),
addToNormalizedField = config.apply[List[String]]("odinson.index.addToNormalizedField").toSet,
incomingTokenField = config.apply[String]("odinson.index.incomingTokenField"),
Expand Down
18 changes: 12 additions & 6 deletions core/src/main/scala/ai/lum/odinson/utils/IndexSettings.scala
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,15 @@ import ujson.Value
*
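* @param sentenceStoredFields the names of the sentence fields that are stored fields in the lucene index
* @param metadataStoredFields the names of the metadata fields that are stored fields in the lucene index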
*/
class IndexSettings(val storedFields: Seq[String]) {
class IndexSettings(val sentenceStoredFields: Seq[String], val metadataStoredFields: Seq[String]) {

// All stored fields
val storedFields = sentenceStoredFields ++ metadataStoredFields

def asJsonValue: Value = {
ujson.Obj(
"storedFields" -> storedFields
"sentenceStoredFields" -> sentenceStoredFields,
"metadataStoredFields" -> metadataStoredFields
)
}

Expand All @@ -28,12 +32,14 @@ class IndexSettings(val storedFields: Seq[String]) {

object IndexSettings {

def apply(storedFields: Seq[String]): IndexSettings = new IndexSettings(storedFields)
def apply(sentenceStoredFields: Seq[String], metadataStoredFields: Seq[String]): IndexSettings =
new IndexSettings(sentenceStoredFields, metadataStoredFields)

def load(dump: String): IndexSettings = {
val json = ujson.read(dump)
val storedFields = json("storedFields").arr.map(_.str)
new IndexSettings(storedFields)
val sentenceStoredFields = json("sentenceStoredFields").arr.map(_.str)
val metadataStoredFields = json("metadataStoredFields").arr.map(_.str)
new IndexSettings(sentenceStoredFields, metadataStoredFields)
}

def fromDirectory(directory: Directory): IndexSettings =
Expand All @@ -43,7 +49,7 @@ object IndexSettings {
IndexSettings.load(stream.readString())
}
} catch {
case e: IOException => IndexSettings(Seq())
case e: IOException => IndexSettings(Seq(), Seq())
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,13 @@ class OdinsonTest extends FlatSpec with Matchers {
mkExtractorEngine(newConfig, doc)
}

/** Builds an `ExtractorEngine` for `doc` by delegating to
  * `extractorEngineWithConfigValue`, overriding the
  * `odinson.index.sentenceStoredFields` config key with `fields`.
  *
  * @param doc    the document handed to `extractorEngineWithConfigValue`
  * @param fields the value to use for `odinson.index.sentenceStoredFields`
  * @return the engine produced by `extractorEngineWithConfigValue`
  */
def extractorEngineWithSentenceStoredFields(
  doc: Document,
  fields: Seq[String]
): ExtractorEngine = {
  // Name the config key once so the override being applied is explicit.
  val sentenceStoredFieldsKey = "odinson.index.sentenceStoredFields"
  extractorEngineWithConfigValue(doc, sentenceStoredFieldsKey, fields)
}

/** Constructs an `ai.lum.odinson.ExtractorEngine` from a single-doc
* using an in-memory index (`org.apache.lucene.store.RAMDirectory`)
* @param docID the string key for the document from ai.lum.odinson.utils.TestUtils.ExampleDocs
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ class TestExtractorEngine extends OdinsonTest {
it should "getTokensFromSpan correctly from existing Field" in {
// Becky ate gummy bears.
val doc = getDocument("becky-gummy-bears-v2")
val ee = extractorEngineWithConfigValue(doc, "odinson.index.storedFields", Seq("raw", "lemma"))
val ee =
extractorEngineWithConfigValue(doc, "odinson.index.sentenceStoredFields", Seq("raw", "lemma"))
val rules = """
|rules:
| - name: testrule
Expand Down Expand Up @@ -52,7 +53,7 @@ class TestExtractorEngine extends OdinsonTest {
it should "getTokensFromSpan with OdinsonException from non-existing Field" in {
// Becky ate gummy bears.
val doc = getDocument("becky-gummy-bears-v2")
val ee = extractorEngineWithConfigValue(doc, "odinson.index.storedFields", Seq("raw", "lemma"))
val ee = extractorEngineWithSentenceStoredFields(doc, Seq("raw", "lemma"))
val rules = """
|rules:
| - name: testrule
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ class TestFields extends OdinsonTest {

val customConfig: Config = defaultConfig
.withValue(
"odinson.index.storedFields",
"odinson.index.sentenceStoredFields",
ConfigValueFactory.fromAnyRef(Seq("raw", "fizzbuzz").asJava)
)
.withValue(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ class TestOdinsonIndexWriter extends OdinsonTest {
// re-compute the index and docs paths
.withValue("odinson.indexDir", ConfigValueFactory.fromAnyRef(indexFile.getAbsolutePath))
.withValue(
"odinson.index.storedFields",
"odinson.index.sentenceStoredFields",
ConfigValueFactory.fromAnyRef(Seq("apple", "banana", "kiwi", "raw").asJava)
)
}
Expand All @@ -121,18 +121,13 @@ class TestOdinsonIndexWriter extends OdinsonTest {
it should "store stored fields and not others" in {

val doc = getDocument("rainbows")
val customConfig: Config = defaultConfig
.withValue(
"odinson.index.storedFields",
ConfigValueFactory.fromAnyRef(Seq("tag", "raw").asJava)
)
def ee = mkExtractorEngine(customConfig, doc)
def ee = extractorEngineWithSentenceStoredFields(doc, Seq("tag", "raw"))

// we asked it to store `tag` so the extractor engine should be able to access the content
ee.getTokensForSpan(0, "tag", 0, 1) should contain only "NNS"
ee.dataGatherer.getTokensForSpan(0, "tag", 0, 1) should contain only "NNS"
// though `entity` is a field in the Document, it wasn't stored, so the extractor engine shouldn't
// be able to retrieve the content
an[OdinsonException] should be thrownBy ee.getTokensForSpan(0, "entity", 0, 1)
an[OdinsonException] should be thrownBy ee.dataGatherer.getTokensForSpan(0, "entity", 0, 1)

}

Expand All @@ -143,7 +138,7 @@ class TestOdinsonIndexWriter extends OdinsonTest {
// re-compute the index and docs paths
.withValue("odinson.indexDir", ConfigValueFactory.fromAnyRef(indexFile.getAbsolutePath))
.withValue(
"odinson.index.storedFields",
"odinson.index.sentenceStoredFields",
ConfigValueFactory.fromAnyRef(Seq("apple", "banana", "kiwi").asJava)
)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ class TestMention extends OdinsonTest {

it should "be populated to a certain level when asked" in {
val doc = getDocument("becky-gummy-bears-v2")
val ee = extractorEngineWithConfigValue(doc, "odinson.index.storedFields", Seq("raw", "lemma"))
val ee = extractorEngineWithSentenceStoredFields(doc, Seq("raw", "lemma"))
val mentions = ee.extractMentions(ee.compileRuleString(rules)).toArray
mentions should have size (2) // the main mention and the untyped arg
val event = mentions.filter(_.label.isDefined).head
Expand All @@ -57,7 +57,7 @@ class TestMention extends OdinsonTest {

it should "populate arguments when populated" in {
val doc = getDocument("becky-gummy-bears-v2")
val ee = extractorEngineWithConfigValue(doc, "odinson.index.storedFields", Seq("raw", "lemma"))
val ee = extractorEngineWithSentenceStoredFields(doc, Seq("raw", "lemma"))
val mentions = ee.extractMentions(ee.compileRuleString(rules)).toArray
mentions should have size (2) // the main mention and the untyped arg
val event = mentions.filter(_.label.isDefined).head
Expand All @@ -74,7 +74,7 @@ class TestMention extends OdinsonTest {

it should "produce mention copies that are populated at the same level" in {
val doc = getDocument("becky-gummy-bears-v2")
val ee = extractorEngineWithConfigValue(doc, "odinson.index.storedFields", Seq("raw", "lemma"))
val ee = extractorEngineWithSentenceStoredFields(doc, Seq("raw", "lemma"))
val mentions = ee.extractMentions(ee.compileRuleString(rules)).toArray
mentions should have size (2) // the main mention and the untyped arg
val event = mentions.filter(_.label.isDefined).head
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ class TestOdinsonTest extends OdinsonTest {
.withValue("odinson.displayField", ConfigValueFactory.fromAnyRef("foobar"))
// The displayField is required to be in the sentenceStoredFields
.withValue(
"odinson.index.storedFields",
"odinson.index.sentenceStoredFields",
ConfigValueFactory.fromAnyRef(Seq("foobar").asJava)
),
doc
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ class TestTokenStreamUtils extends OdinsonTest {

it should "not get more fields than requested when accessing the Document" in {
val doc = getDocument("becky-gummy-bears-v2")
val ee = extractorEngineWithConfigValue(doc, "odinson.index.storedFields", Seq("raw", "lemma"))
val ee = extractorEngineWithSentenceStoredFields(doc, Seq("raw", "lemma"))

val tokens =
TokenStreamUtils.getTokensFromMultipleFields(0, Set("raw"), ee.indexReader, ee.analyzer)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,9 @@ class TestJsonSerialization extends OdinsonTest {

val doc = getDocument("rainbows")
val engine = mkExtractorEngine(doc)
val storedFields = util.Arrays.asList("raw", "lemma", "tag")

val verboseEngine = mkExtractorEngine(
defaultConfig.withValue(
"odinson.index.storedFields",
ConfigValueFactory.fromIterable(storedFields)
),
doc
)
val storedFields = Seq("raw", "lemma", "tag")

val verboseEngine = extractorEngineWithSentenceStoredFields(doc, storedFields)

val extractors = engine.compileRuleResource("/serialization.yml")

Expand Down
2 changes: 1 addition & 1 deletion extra/src/main/resources/application.conf
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ odinson.index {

parentDocFieldFileName = fileName

storedFields += ${odinson.index.parentDocFieldFileName}
metadataStoredFields += ${odinson.index.parentDocFieldFileName}

// When indexing make sure to add the documents in the order of the external document id, so that the
// results returned by queries will be ordered by external document id
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,11 @@ object IndexDocuments extends App with LazyLogging {
var config = ConfigFactory.load()

// Warn that the API requires parentDocFieldFileName
val storedFields = config.apply[List[String]]("odinson.index.storedFields")
val metadataStoredFields = config.apply[List[String]]("odinson.index.metadataStoredFields")
val fileNameField = config.apply[String]("odinson.index.parentDocFieldFileName")
if (!storedFields.contains(fileNameField)) {
if (!metadataStoredFields.contains(fileNameField)) {
logger.warn(
"`odinson.index.storedFields` must contain `odinson.index.parentDocFieldFileName` to enable the Odinson API"
"`odinson.index.metadataStoredFields` must contain `odinson.index.parentDocFieldFileName` to enable the Odinson API"
)
}

Expand Down