From 5931660419456f369d4df9e8d5d6df41da43ef9f Mon Sep 17 00:00:00 2001 From: Paul Cuddihy Date: Thu, 25 Aug 2022 08:42:43 -0400 Subject: [PATCH] added instance dictionary (for latimer) --- .../OntologyInfoServiceRestController.java | 58 ++++ .../requests/InstanceDictionaryRequest.java | 45 +++ .../ontologyTools/InstanceDictGenerator.java | 102 +++++++ .../test/InstanceDictGeneratorTest_IT.java | 97 +++++++ .../test/resources/sampleBatteryGuids.json | 260 ++++++++++++++++++ 5 files changed, 562 insertions(+) create mode 100644 ontologyInfoService/src/main/java/com/ge/research/semtk/services/ontologyinfo/requests/InstanceDictionaryRequest.java create mode 100644 sparqlGraphLibrary/src/main/java/com/ge/research/semtk/ontologyTools/InstanceDictGenerator.java create mode 100644 sparqlGraphLibrary/src/test/java/com/ge/research/semtk/ontologyTools/test/InstanceDictGeneratorTest_IT.java create mode 100644 sparqlGraphLibrary/src/test/resources/sampleBatteryGuids.json diff --git a/ontologyInfoService/src/main/java/com/ge/research/semtk/services/ontologyinfo/OntologyInfoServiceRestController.java b/ontologyInfoService/src/main/java/com/ge/research/semtk/services/ontologyinfo/OntologyInfoServiceRestController.java index b046e5b83..379e839d4 100644 --- a/ontologyInfoService/src/main/java/com/ge/research/semtk/services/ontologyinfo/OntologyInfoServiceRestController.java +++ b/ontologyInfoService/src/main/java/com/ge/research/semtk/services/ontologyinfo/OntologyInfoServiceRestController.java @@ -41,6 +41,7 @@ import com.ge.research.semtk.edc.client.ResultsClient; import com.ge.research.semtk.edc.client.ResultsClientConfig; import com.ge.research.semtk.ontologyTools.DataDictionaryGenerator; +import com.ge.research.semtk.ontologyTools.InstanceDictGenerator; import com.ge.research.semtk.ontologyTools.RestrictionChecker; import com.ge.research.semtk.ontologyTools.OntologyClass; import com.ge.research.semtk.ontologyTools.OntologyInfo; @@ -53,6 +54,7 @@ import 
com.ge.research.semtk.resultSet.Table; import com.ge.research.semtk.resultSet.TableResultSet; import com.ge.research.semtk.services.ontologyinfo.requests.CardinalityReportRequest; +import com.ge.research.semtk.services.ontologyinfo.requests.InstanceDictionaryRequest; import com.ge.research.semtk.services.ontologyinfo.requests.OntologyInfoClassRequestBody; import com.ge.research.semtk.services.ontologyinfo.requests.OntologyInfoRequestBody; import com.ge.research.semtk.services.ontologyinfo.requests.SparqlConnectionRequestBody; @@ -536,4 +538,60 @@ public JSONObject getCachedPredicateStats(@RequestBody SparqlConnectionRequest r return retval.toJson(); } + + /** + * Build a dictionary of identifiers (URI, type(s), string) asynchronously: the table is generated and stored on a new thread, and the returned JSON carries the jobId. + */ + @Operation( + summary="Get table of URIs and string identifiers associated with them.", + description="Async. Returns a jobID." + ) + @CrossOrigin + @RequestMapping(value="/getInstanceDictionary", method= RequestMethod.POST) + public JSONObject getInstanceDictionary(@RequestBody InstanceDictionaryRequest requestBody, @RequestHeader HttpHeaders headers) { + HeadersManager.setHeaders(headers); + final String ENDPOINT_NAME = "getInstanceDictionary"; + SimpleResultSet res = new SimpleResultSet(false); + + try { + // setup job tracker and results client + String jobId = JobTracker.generateJobId(); + JobTracker tracker = new JobTracker(servicesgraph_props.buildSei()); + tracker.createJob(jobId); + ResultsClient rclient = new ResultsClient(new ResultsClientConfig(results_props.getProtocol(), results_props.getServer(), results_props.getPort())); + + // spin up an async thread; success or failure is reported through the job tracker + new Thread(() -> { + try { + HeadersManager.setHeaders(headers); + + SparqlConnection conn = requestBody.buildSparqlConnection(); + OntologyInfo oInfo = oInfoCache.get(conn); + InstanceDictGenerator generator = new InstanceDictGenerator(conn, oInfo, requestBody.getMaxWords(), requestBody.getSpecificityLimit()); + Table tab = generator.generate(); + + 
rclient.execStoreTableResults(jobId, tab); + tracker.setJobSuccess(jobId); + + } catch (Exception e) { + try { + tracker.setJobFailure(jobId, e.getMessage()); + } catch (Exception ee) { + LocalLogger.logToStdErr(ENDPOINT_NAME + " error accessing job tracker"); + LocalLogger.printStackTrace(ee); + } + } + }).start(); + + res.addJobId(jobId); + res.addResultType(SparqlResultTypes.TABLE); + res.setSuccess(true); + + } catch (Exception e) { + res.setSuccess(false); + res.addRationaleMessage(SERVICE_NAME, ENDPOINT_NAME, e); + LocalLogger.printStackTrace(e); + } + return res.toJson(); + } } diff --git a/ontologyInfoService/src/main/java/com/ge/research/semtk/services/ontologyinfo/requests/InstanceDictionaryRequest.java b/ontologyInfoService/src/main/java/com/ge/research/semtk/services/ontologyinfo/requests/InstanceDictionaryRequest.java new file mode 100644 index 000000000..44e0969be --- /dev/null +++ b/ontologyInfoService/src/main/java/com/ge/research/semtk/services/ontologyinfo/requests/InstanceDictionaryRequest.java @@ -0,0 +1,45 @@ +/** + ** Copyright 2018 General Electric Company + ** + ** + ** Licensed under the Apache License, Version 2.0 (the "License"); + ** you may not use this file except in compliance with the License. + ** You may obtain a copy of the License at + ** + ** http://www.apache.org/licenses/LICENSE-2.0 + ** + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. 
+ */ + +package com.ge.research.semtk.services.ontologyinfo.requests; + +import com.ge.research.semtk.springutilib.requests.SparqlConnectionRequest; + +import io.swagger.v3.oas.annotations.media.Schema; + +public class InstanceDictionaryRequest extends SparqlConnectionRequest { + + @Schema( + description = "Labels assigned to URIs may not have more than this many words", + required = false, + example = "2") + private int maxWords = 2; + + @Schema( + description = "Only find labels that associate with at most this many URIs", + required = false, + example = "1") + private int specificityLimit = 1; + + public int getMaxWords() { + return this.maxWords; + } + public int getSpecificityLimit() { + return this.specificityLimit; + } + +} diff --git a/sparqlGraphLibrary/src/main/java/com/ge/research/semtk/ontologyTools/InstanceDictGenerator.java b/sparqlGraphLibrary/src/main/java/com/ge/research/semtk/ontologyTools/InstanceDictGenerator.java new file mode 100644 index 000000000..536934140 --- /dev/null +++ b/sparqlGraphLibrary/src/main/java/com/ge/research/semtk/ontologyTools/InstanceDictGenerator.java @@ -0,0 +1,102 @@ +package com.ge.research.semtk.ontologyTools; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; + +import com.ge.research.semtk.resultSet.Table; +import com.ge.research.semtk.sparqlToXLib.SparqlToXLibUtil; +import com.ge.research.semtk.sparqlX.SparqlConnection; + +/** + * A utility class to generate an instance dictionary: a table of instance URIs, their class URIs, and the short string labels that identify them. 
+ */ +public class InstanceDictGenerator { + SparqlConnection conn; + OntologyInfo oInfo; + int specificityLimit = 0; + String wordRegex; + + /** + * + * @param conn - connection whose default query interface runs the generated queries + * @param oInfo - ontology info used to find properties whose range includes xsd:string + * @param maxWords - strings with more than this many words are not considered + * @param specificityLimit - don't find a name/label if it identifies more than this many URI instances + */ + public InstanceDictGenerator(SparqlConnection conn, OntologyInfo oInfo, int maxWords, int specificityLimit) { + this.conn = conn; + this.oInfo = oInfo; + this.specificityLimit = specificityLimit; + + // convert maxWords into a regex that will disqualify strings with more than maxWords words (maxWords repetitions of word-then-whitespace) + this.wordRegex = "\\\\w+\\\\s+"; + for (int i=1; i < maxWords; i++) { + this.wordRegex += "\\\\w+\\\\s+"; + } + } + + /** + * Generate a table of instance labels, running one query per (property, domain) pair whose range includes xsd:string + * instance_uri - URI of the instance + * class_uris - instance belongs to one or more classes + * label - label (or name) associated with the instance. NOT UNIQUE: see label_specificity + * label_specificity - how many uris have this label + * property - what prop was used to associate label with instance_uri + * + * @return the combined table, or null when no property has an xsd:string range + * @throws Exception propagated from the ontology lookups or query execution + */ + public Table generate() throws Exception{ + + Table table = null; + + // only match strings that are unique ?o in the world of ?s ?p ?o + + ArrayList<String> propNames = this.oInfo.getPropertyNames(); + for (String propUri : propNames) { + OntologyProperty oProp = oInfo.getProperty(propUri); + Set<String> domains = oProp.getRangeDomains(); + for (String domainUri : domains) { + OntologyRange oRange = oProp.getExactRange(domainUri); + if (oRange.containsUri("http://www.w3.org/2001/XMLSchema#string")) { + // found oProp which can have a range of String when domain is domainUri + + // selects: + // - ?sub a subject URI + // - concatenation of types + // - ?str string that could be identifier + // - COUNT(distinct ?sub2) how many things this string might be an identifier for + // - 
filters out ?str if it has more than maxWords words: see wordRegex + String query = String.format( + "select distinct (?sub as ?instance_uri) (GROUP_CONCAT(DISTINCT ?t) as ?class_uris) (?str as ?label) (COUNT(distinct ?sub2) as ?label_specificity) \n" + + " %s \n " + + " where {\n" + + " \n" + + " ?t <http://www.w3.org/2000/01/rdf-schema#subClassOf>* <%s> .\n" + + " ?sub a ?t .\n" + + " ?sub <%s> ?str.\n" + + " filter ( ! regex (?str, \"%s\")) .\n" + + " ?sub2 ?pred2 ?str ." + + "} \n" + + "GROUP BY ?sub ?str " + + "HAVING (COUNT(distinct ?sub2) < %d)", + SparqlToXLibUtil.generateSparqlFromOrUsing("", "FROM", conn, this.oInfo), domainUri, propUri, this.wordRegex, this.specificityLimit + 1 ); + + Table tab = conn.getDefaultQueryInterface().executeToTable(query); + tab.appendColumn("property", "literal", propUri); + if (table == null) + table = tab; + else + table.append(tab); + + } + + + } + } + return table; + } + +} diff --git a/sparqlGraphLibrary/src/test/java/com/ge/research/semtk/ontologyTools/test/InstanceDictGeneratorTest_IT.java b/sparqlGraphLibrary/src/test/java/com/ge/research/semtk/ontologyTools/test/InstanceDictGeneratorTest_IT.java new file mode 100644 index 000000000..e28f69132 --- /dev/null +++ b/sparqlGraphLibrary/src/test/java/com/ge/research/semtk/ontologyTools/test/InstanceDictGeneratorTest_IT.java @@ -0,0 +1,97 @@ +/** + ** Copyright 2020 General Electric Company + ** + ** + ** Licensed under the Apache License, Version 2.0 (the "License"); + ** you may not use this file except in compliance with the License. + ** You may obtain a copy of the License at + ** + ** http://www.apache.org/licenses/LICENSE-2.0 + ** + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. 
+ */ + +package com.ge.research.semtk.ontologyTools.test; + +import static org.junit.Assert.*; + +import java.util.ArrayList; + +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; + +import com.ge.research.semtk.belmont.Node; +import com.ge.research.semtk.belmont.NodeGroup; +import com.ge.research.semtk.ontologyTools.ClassInstance; +import com.ge.research.semtk.ontologyTools.InstanceDictGenerator; +import com.ge.research.semtk.ontologyTools.NodeGroupCache; +import com.ge.research.semtk.ontologyTools.OntologyInfo; +import com.ge.research.semtk.ontologyTools.PathExplorer; +import com.ge.research.semtk.ontologyTools.PathItemRequest; +import com.ge.research.semtk.ontologyTools.ReturnRequest; +import com.ge.research.semtk.resultSet.Table; +import com.ge.research.semtk.sparqlX.SparqlConnection; +import com.ge.research.semtk.sparqlX.SparqlEndpointInterface; +import com.ge.research.semtk.test.IntegrationTestUtility; +import com.ge.research.semtk.test.TestGraph; + +public class InstanceDictGeneratorTest_IT { + + @BeforeClass + public static void setup() throws Exception { + + } + + + @Test + public void testInstanceDict() throws Exception { + String data = "Battery,Cell,color,birthday\n" + + "battA,cell200,red,1966-01-01T12:00:00\n" + + "battA,cell300,blue,1979-01-01T12:00:00-04:00\n" + + "battB,cell401,white,01/01/2000 00:00:01\n" + + "battB,cell402,white,07-04-2016\n" + + "both,both,red,\n" + + "batt 2word,cell 2words,white,\n" + + "batt three words,cell three words,blue,\n" + + "triplet,,,\n"; + + TestGraph.clearGraph(); + TestGraph.uploadOwlResource(this, "sampleBattery.owl"); + TestGraph.ingestCsvString(getClass(), "sampleBatteryGuids.json", data); + SparqlConnection conn = TestGraph.getSparqlConn(); + OntologyInfo oInfo = TestGraph.getOInfo(); + + InstanceDictGenerator generator = new InstanceDictGenerator(conn, oInfo, 2, 2); + Table tab = generator.generate(); + // maxWords=2, specificityLimit=2 : expect 11 rows + assertEquals("Wrong 
number of rows", 11, tab.getNumRows()); + + generator = new InstanceDictGenerator(conn, oInfo, 3, 2); + tab = generator.generate(); + // maxWords=3, specificityLimit=2 : expect 13 rows + assertEquals("Wrong number of rows after allowing three words", 13, tab.getNumRows()); + + generator = new InstanceDictGenerator(conn, oInfo, 2, 1); + tab = generator.generate(); + // maxWords=2, specificityLimit=1 : expect 9 rows + assertEquals("Wrong number of rows with specificity set to 1", 9, tab.getNumRows()); + } + + @Test + public void manualOverride() throws Exception { + String connStr = + "{\"name\":\"ML4M ipd leb1acdev\",\"domain\":\"\",\"enableOwlImports\":true,\"model\":[{\"type\":\"fuseki\",\"url\":\"http://leb1acdev.hpc.ge.com:3030/ML4M\",\"graph\":\"http://research.ge.com/ipd/model\"}],\"data\":[{\"type\":\"fuseki\",\"url\":\"http://leb1acdev.hpc.ge.com:3030/ML4M\",\"graph\":\"http://research.ge.com/ipd/data\"}]}\n"; + //"{\"name\":\"RACK\",\"domain\":\"\",\"enableOwlImports\":false,\"model\":[{\"type\":\"fuseki\",\"url\":\"http://localhost:3030/RACK\",\"graph\":\"http://rack001/model\"}],\"data\":[{\"type\":\"fuseki\",\"url\":\"http://localhost:3030/RACK\",\"graph\":\"http://rack001/data\"}]}\n"; + SparqlConnection conn = new SparqlConnection(connStr); + OntologyInfo oInfo = new OntologyInfo(conn); + InstanceDictGenerator generator = new InstanceDictGenerator(conn, oInfo, 2, 2); + Table tab = generator.generate(); + System.out.println(tab.toCSVString()); + } + +} diff --git a/sparqlGraphLibrary/src/test/resources/sampleBatteryGuids.json b/sparqlGraphLibrary/src/test/resources/sampleBatteryGuids.json new file mode 100644 index 000000000..1098cf8c3 --- /dev/null +++ b/sparqlGraphLibrary/src/test/resources/sampleBatteryGuids.json @@ -0,0 +1,260 @@ +{ + "version": 3, + "sparqlConn": { + "name": "Junit fuseki", + "domain": "", + "enableOwlImports": true, + "model": [ + { + "type": "fuseki", + "url": "http://localhost:3030/JUNIT", + "graph": 
"http://junit/GG2NQYY2E/200001934/both" + } + ], + "data": [ + { + "type": "fuseki", + "url": "http://localhost:3030/JUNIT", + "graph": "http://junit/GG2NQYY2E/200001934/both" + } + ] + }, + "sNodeGroup": { + "version": 19, + "limit": 0, + "offset": 0, + "sNodeList": [ + { + "propList": [], + "nodeList": [], + "fullURIName": "http://kdl.ge.com/batterydemo#Color", + "SparqlID": "?Color", + "isReturned": false, + "isRuntimeConstrained": false, + "valueConstraint": "", + "instanceValue": null, + "deletionMode": "NO_DELETE" + }, + { + "propList": [ + { + "valueTypes": [ + "string" + ], + "rangeURI": "http://www.w3.org/2001/XMLSchema#string", + "UriRelationship": "http://kdl.ge.com/batterydemo#cellId", + "Constraints": "", + "SparqlID": "?CellId", + "isReturned": true, + "optMinus": 0, + "isRuntimeConstrained": false, + "instanceValues": [], + "isMarkedForDeletion": false + } + ], + "nodeList": [ + { + "SnodeSparqlIDs": [ + "?Color" + ], + "OptionalMinus": [ + 0 + ], + "Qualifiers": [ + "" + ], + "DeletionMarkers": [ + false + ], + "range": [ + "http://kdl.ge.com/batterydemo#Color" + ], + "ConnectBy": "color", + "Connected": true, + "UriConnectBy": "http://kdl.ge.com/batterydemo#color" + } + ], + "fullURIName": "http://kdl.ge.com/batterydemo#Cell", + "SparqlID": "?Cell", + "isReturned": false, + "isRuntimeConstrained": false, + "valueConstraint": "", + "instanceValue": null, + "deletionMode": "NO_DELETE" + }, + { + "propList": [ + { + "valueTypes": [ + "dateTime" + ], + "rangeURI": "http://www.w3.org/2001/XMLSchema#dateTime", + "UriRelationship": "http://kdl.ge.com/batterydemo#birthday", + "Constraints": "", + "SparqlID": "", + "isReturned": false, + "optMinus": 0, + "isRuntimeConstrained": false, + "instanceValues": [], + "isMarkedForDeletion": false + }, + { + "valueTypes": [ + "string" + ], + "rangeURI": "http://www.w3.org/2001/XMLSchema#string", + "UriRelationship": "http://kdl.ge.com/batterydemo#name", + "Constraints": "", + "SparqlID": "?Name", + "isReturned": 
true, + "optMinus": 0, + "isRuntimeConstrained": false, + "instanceValues": [], + "isMarkedForDeletion": false + } + ], + "nodeList": [ + { + "SnodeSparqlIDs": [ + "?Cell" + ], + "OptionalMinus": [ + 0 + ], + "Qualifiers": [ + "" + ], + "DeletionMarkers": [ + false + ], + "range": [ + "http://kdl.ge.com/batterydemo#Cell" + ], + "ConnectBy": "cell", + "Connected": true, + "UriConnectBy": "http://kdl.ge.com/batterydemo#cell" + } + ], + "fullURIName": "http://kdl.ge.com/batterydemo#Battery", + "SparqlID": "?Battery", + "isReturned": false, + "isRuntimeConstrained": false, + "valueConstraint": "", + "instanceValue": null, + "deletionMode": "NO_DELETE" + } + ], + "orderBy": [ + { + "sparqlID": "?Name" + }, + { + "sparqlID": "?CellId" + } + ], + "groupBy": [], + "unionHash": {}, + "columnOrder": [] + }, + "importSpec": { + "version": "1", + "baseURI": "", + "columns": [ + { + "colId": "col_0", + "colName": "Battery" + }, + { + "colId": "col_1", + "colName": "Cell" + }, + { + "colId": "col_2", + "colName": "birthday" + }, + { + "colId": "col_3", + "colName": "color" + } + ], + "dataValidator": [], + "texts": [ + { + "textId": "text_0", + "text": "Battery_" + }, + { + "textId": "text_1", + "text": "Cell_" + } + ], + "transforms": [ + { + "transId": "trans_0", + "name": "No_space", + "transType": "replaceAll", + "arg1": "\\s+", + "arg2": "_" + } + ], + "nodes": [ + { + "sparqlID": "?Battery", + "type": "http://kdl.ge.com/batterydemo#Battery", + "URILookupMode": "createIfMissing", + "mapping": [], + "props": [ + { + "URIRelation": "http://kdl.ge.com/batterydemo#birthday", + "mapping": [ + { + "colId": "col_2" + } + ] + }, + { + "URIRelation": "http://kdl.ge.com/batterydemo#name", + "URILookup": [ + "?Battery" + ], + "mapping": [ + { + "colId": "col_0" + } + ] + } + ] + }, + { + "sparqlID": "?Cell", + "type": "http://kdl.ge.com/batterydemo#Cell", + "URILookupMode": "createIfMissing", + "mapping": [], + "props": [ + { + "URIRelation": "http://kdl.ge.com/batterydemo#cellId", + 
"URILookup": [ + "?Cell" + ], + "mapping": [ + { + "colId": "col_1" + } + ] + } + ] + }, + { + "sparqlID": "?Color", + "type": "http://kdl.ge.com/batterydemo#Color", + "mapping": [ + { + "colId": "col_3" + } + ], + "props": [] + } + ] + }, + "plotSpecs": [] +} \ No newline at end of file