-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
added instance dictionary (for latimer)
- Loading branch information
Paul Cuddihy
committed
Aug 25, 2022
1 parent
e953ac3
commit 5931660
Showing
5 changed files
with
562 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
45 changes: 45 additions & 0 deletions
45
.../java/com/ge/research/semtk/services/ontologyinfo/requests/InstanceDictionaryRequest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
/** | ||
** Copyright 2018 General Electric Company | ||
** | ||
** | ||
** Licensed under the Apache License, Version 2.0 (the "License"); | ||
** you may not use this file except in compliance with the License. | ||
** You may obtain a copy of the License at | ||
** | ||
** http://www.apache.org/licenses/LICENSE-2.0 | ||
** | ||
** Unless required by applicable law or agreed to in writing, software | ||
** distributed under the License is distributed on an "AS IS" BASIS, | ||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
** See the License for the specific language governing permissions and | ||
** limitations under the License. | ||
*/ | ||
|
||
package com.ge.research.semtk.services.ontologyinfo.requests; | ||
|
||
import com.ge.research.semtk.springutilib.requests.SparqlConnectionRequest; | ||
|
||
import io.swagger.v3.oas.annotations.media.Schema; | ||
|
||
public class InstanceDictionaryRequest extends SparqlConnectionRequest { | ||
|
||
@Schema( | ||
description = "Labels assigned to URIs may not have more than this many words", | ||
required = false, | ||
example = "2") | ||
private int maxWords = 2; | ||
|
||
@Schema( | ||
description = "Only find labels that associate with at most this many URIs", | ||
required = false, | ||
example = "1") | ||
private int specificityLimit = 1; | ||
|
||
public int getMaxWords() { | ||
return this.maxWords; | ||
} | ||
public int getSpecificityLimit() { | ||
return this.specificityLimit; | ||
} | ||
|
||
} |
102 changes: 102 additions & 0 deletions
102
...GraphLibrary/src/main/java/com/ge/research/semtk/ontologyTools/InstanceDictGenerator.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,102 @@ | ||
package com.ge.research.semtk.ontologyTools; | ||
|
||
import java.util.ArrayList; | ||
import java.util.Collections; | ||
import java.util.HashSet; | ||
import java.util.Set; | ||
|
||
import com.ge.research.semtk.resultSet.Table; | ||
import com.ge.research.semtk.sparqlToXLib.SparqlToXLibUtil; | ||
import com.ge.research.semtk.sparqlX.SparqlConnection; | ||
|
||
/** | ||
* A utility class to generate data dictionaries from an ontology. | ||
*/ | ||
public class InstanceDictGenerator { | ||
SparqlConnection conn; | ||
OntologyInfo oInfo; | ||
int specificityLimit = 0; | ||
String wordRegex; | ||
|
||
/** | ||
* | ||
* @param conn | ||
* @param oInfo | ||
* @param maxWords - strings with more than this many words are not considered | ||
* @param specificityLimit - don't find a name/label if it identifies more than this many URI instances | ||
*/ | ||
public InstanceDictGenerator(SparqlConnection conn, OntologyInfo oInfo, int maxWords, int specificityLimit) { | ||
this.conn = conn; | ||
this.oInfo = oInfo; | ||
this.specificityLimit = specificityLimit; | ||
|
||
// convert maxWords into a regex that will disqualify | ||
this.wordRegex = "\\\\w+\\\\s+"; | ||
for (int i=1; i < maxWords; i++) { | ||
this.wordRegex += "\\\\w+\\\\s+"; | ||
} | ||
} | ||
|
||
/** | ||
* Generate a tabular report describing this ontology | ||
* instance_uri - | ||
* class_uris - instance belongs to one or more classes | ||
* label - label (or name) associated with the instance. NOT UNIQUE: see label_specificity | ||
* label_specificity - how many uris have this label | ||
* property - what prop was used to associate labgel with instance_uri | ||
* | ||
* @param oInfo the ontology info object | ||
* @param stripNamespace true to remove namespaces from classes and properties | ||
*/ | ||
public Table generate() throws Exception{ | ||
|
||
Table table = null; | ||
|
||
// only match strings that are unique ?o in the world of ?s ?p ?o | ||
|
||
ArrayList<String> propNames = this.oInfo.getPropertyNames(); | ||
for (String propUri : propNames) { | ||
OntologyProperty oProp = oInfo.getProperty(propUri); | ||
Set<String> domains = oProp.getRangeDomains(); | ||
for (String domainUri : domains) { | ||
OntologyRange oRange = oProp.getExactRange(domainUri); | ||
if (oRange.containsUri("http://www.w3.org/2001/XMLSchema#string")) { | ||
// found oProp which can have a range of String when domain is domainUri | ||
|
||
// selects: | ||
// - ?sub a subject URI | ||
// - concatenation of types | ||
// - ?str string that could be identifier | ||
// - ?str_count how many things this string might be an identifier for | ||
// - filters out ?str if it has more than two words: see twoOrFewerWordRegex | ||
String query = String.format( | ||
"select distinct (?sub as ?instance_uri) (GROUP_CONCAT(DISTINCT ?t) as ?class_uris) (?str as ?label) (COUNT(distinct ?sub2) as ?label_specificity) \n" | ||
+ " %s \n " | ||
+ " where {\n" | ||
+ " \n" | ||
+ " ?t <http://www.w3.org/2000/01/rdf-schema#subClassOf>* <%s> .\n" | ||
+ " ?sub a ?t .\n" | ||
+ " ?sub <%s> ?str.\n" | ||
+ " filter ( ! regex (?str, \"%s\")) .\n" | ||
+ " ?sub2 ?pred2 ?str ." | ||
+ "} \n" | ||
+ "GROUP BY ?sub ?str " | ||
+ "HAVING (COUNT(distinct ?sub2) < %d)", | ||
SparqlToXLibUtil.generateSparqlFromOrUsing("", "FROM", conn, this.oInfo), domainUri, propUri, this.wordRegex, this.specificityLimit + 1 ); | ||
|
||
Table tab = conn.getDefaultQueryInterface().executeToTable(query); | ||
tab.appendColumn("property", "literal", propUri); | ||
if (table == null) | ||
table = tab; | ||
else | ||
table.append(tab); | ||
|
||
} | ||
|
||
|
||
} | ||
} | ||
return table; | ||
} | ||
|
||
} |
97 changes: 97 additions & 0 deletions
97
.../src/test/java/com/ge/research/semtk/ontologyTools/test/InstanceDictGeneratorTest_IT.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
/** | ||
** Copyright 2020 General Electric Company | ||
** | ||
** | ||
** Licensed under the Apache License, Version 2.0 (the "License"); | ||
** you may not use this file except in compliance with the License. | ||
** You may obtain a copy of the License at | ||
** | ||
** http://www.apache.org/licenses/LICENSE-2.0 | ||
** | ||
** Unless required by applicable law or agreed to in writing, software | ||
** distributed under the License is distributed on an "AS IS" BASIS, | ||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
** See the License for the specific language governing permissions and | ||
** limitations under the License. | ||
*/ | ||
|
||
package com.ge.research.semtk.ontologyTools.test; | ||
|
||
import static org.junit.Assert.*; | ||
|
||
import java.util.ArrayList; | ||
|
||
import org.junit.AfterClass; | ||
import org.junit.BeforeClass; | ||
import org.junit.Test; | ||
|
||
import com.ge.research.semtk.belmont.Node; | ||
import com.ge.research.semtk.belmont.NodeGroup; | ||
import com.ge.research.semtk.ontologyTools.ClassInstance; | ||
import com.ge.research.semtk.ontologyTools.InstanceDictGenerator; | ||
import com.ge.research.semtk.ontologyTools.NodeGroupCache; | ||
import com.ge.research.semtk.ontologyTools.OntologyInfo; | ||
import com.ge.research.semtk.ontologyTools.PathExplorer; | ||
import com.ge.research.semtk.ontologyTools.PathItemRequest; | ||
import com.ge.research.semtk.ontologyTools.ReturnRequest; | ||
import com.ge.research.semtk.resultSet.Table; | ||
import com.ge.research.semtk.sparqlX.SparqlConnection; | ||
import com.ge.research.semtk.sparqlX.SparqlEndpointInterface; | ||
import com.ge.research.semtk.test.IntegrationTestUtility; | ||
import com.ge.research.semtk.test.TestGraph; | ||
|
||
public class InstanceDictGeneratorTest_IT { | ||
|
||
@BeforeClass | ||
public static void setup() throws Exception { | ||
|
||
} | ||
|
||
|
||
@Test | ||
public void testInstanceDict() throws Exception { | ||
String data = "Battery,Cell,color,birthday\n" | ||
+ "battA,cell200,red,1966-01-01T12:00:00\n" | ||
+ "battA,cell300,blue,1979-01-01T12:00:00-04:00\n" | ||
+ "battB,cell401,white,01/01/2000 00:00:01\n" | ||
+ "battB,cell402,white,07-04-2016\n" | ||
+ "both,both,red,\n" | ||
+ "batt 2word,cell 2words,white,\n" | ||
+ "batt three words,cell three words,blue,\n" | ||
+ "triplet,,,\n"; | ||
|
||
TestGraph.clearGraph(); | ||
TestGraph.uploadOwlResource(this, "sampleBattery.owl"); | ||
TestGraph.ingestCsvString(getClass(), "sampleBatteryGuids.json", data); | ||
SparqlConnection conn = TestGraph.getSparqlConn(); | ||
OntologyInfo oInfo = TestGraph.getOInfo(); | ||
|
||
InstanceDictGenerator generator = new InstanceDictGenerator(conn, oInfo, 2, 2); | ||
Table tab = generator.generate(); | ||
// almost a no-op test : make sure nothing crashes | ||
assertEquals("Wrong number of rows", 11, tab.getNumRows()); | ||
|
||
generator = new InstanceDictGenerator(conn, oInfo, 3, 2); | ||
tab = generator.generate(); | ||
// almost a no-op test : make sure nothing crashes | ||
assertEquals("Wrong number of rows after allowing three words", 13, tab.getNumRows()); | ||
|
||
generator = new InstanceDictGenerator(conn, oInfo, 2, 1); | ||
tab = generator.generate(); | ||
// almost a no-op test : make sure nothing crashes | ||
assertEquals("Wrong number of rows with specificity set to 1", 9, tab.getNumRows()); | ||
} | ||
|
||
@Test | ||
public void manualOverride() throws Exception { | ||
String connStr = | ||
"{\"name\":\"ML4M ipd leb1acdev\",\"domain\":\"\",\"enableOwlImports\":true,\"model\":[{\"type\":\"fuseki\",\"url\":\"http://leb1acdev.hpc.ge.com:3030/ML4M\",\"graph\":\"http://research.ge.com/ipd/model\"}],\"data\":[{\"type\":\"fuseki\",\"url\":\"http://leb1acdev.hpc.ge.com:3030/ML4M\",\"graph\":\"http://research.ge.com/ipd/data\"}]}\n"; | ||
//"{\"name\":\"RACK\",\"domain\":\"\",\"enableOwlImports\":false,\"model\":[{\"type\":\"fuseki\",\"url\":\"http://localhost:3030/RACK\",\"graph\":\"http://rack001/model\"}],\"data\":[{\"type\":\"fuseki\",\"url\":\"http://localhost:3030/RACK\",\"graph\":\"http://rack001/data\"}]}\n"; | ||
SparqlConnection conn = new SparqlConnection(connStr); | ||
OntologyInfo oInfo = new OntologyInfo(conn); | ||
InstanceDictGenerator generator = new InstanceDictGenerator(conn, oInfo, 2, 2); | ||
Table tab = generator.generate(); | ||
System.out.println(tab.toCSVString()); | ||
} | ||
|
||
} |
Oops, something went wrong.