Skip to content

Commit

Permalink
added instance dictionary (for latimer)
Browse files Browse the repository at this point in the history
  • Loading branch information
Paul Cuddihy committed Aug 25, 2022
1 parent e953ac3 commit 5931660
Show file tree
Hide file tree
Showing 5 changed files with 562 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
import com.ge.research.semtk.edc.client.ResultsClient;
import com.ge.research.semtk.edc.client.ResultsClientConfig;
import com.ge.research.semtk.ontologyTools.DataDictionaryGenerator;
import com.ge.research.semtk.ontologyTools.InstanceDictGenerator;
import com.ge.research.semtk.ontologyTools.RestrictionChecker;
import com.ge.research.semtk.ontologyTools.OntologyClass;
import com.ge.research.semtk.ontologyTools.OntologyInfo;
Expand All @@ -53,6 +54,7 @@
import com.ge.research.semtk.resultSet.Table;
import com.ge.research.semtk.resultSet.TableResultSet;
import com.ge.research.semtk.services.ontologyinfo.requests.CardinalityReportRequest;
import com.ge.research.semtk.services.ontologyinfo.requests.InstanceDictionaryRequest;
import com.ge.research.semtk.services.ontologyinfo.requests.OntologyInfoClassRequestBody;
import com.ge.research.semtk.services.ontologyinfo.requests.OntologyInfoRequestBody;
import com.ge.research.semtk.services.ontologyinfo.requests.SparqlConnectionRequestBody;
Expand Down Expand Up @@ -536,4 +538,60 @@ public JSONObject getCachedPredicateStats(@RequestBody SparqlConnectionRequest r

return retval.toJson();
}

/**
 * Build a dictionary of identifiers: GUID, type(s), string.
 *
 * Asynchronous endpoint: a job is registered with the job tracker, the
 * dictionary is generated on a background thread, and the response only
 * carries the job id (result type TABLE).  The generated table is handed to
 * the results client under that job id, after which the job is marked as
 * succeeded; any exception on the background thread marks the job as failed.
 *
 * @param requestBody connection plus the maxWords / specificityLimit settings
 * @param headers     HTTP headers of the request, passed to HeadersManager
 * @return JSON form of a SimpleResultSet: the job id on success, or a
 *         rationale message if the job could not be set up
 */
@Operation(
	summary="Get table of URIs and string identifiers associated with them.",
	description="Async. Returns a jobID."
)
@CrossOrigin
@RequestMapping(value="/getInstanceDictionary", method= RequestMethod.POST)
public JSONObject getInstanceDictionary(@RequestBody InstanceDictionaryRequest requestBody, @RequestHeader HttpHeaders headers) {
	HeadersManager.setHeaders(headers);
	final String ENDPOINT_NAME = "getInstanceDictionary";
	SimpleResultSet res = new SimpleResultSet(false);

	try {
		// setup job tracker and results client
		String jobId = JobTracker.generateJobId();
		JobTracker tracker = new JobTracker(servicesgraph_props.buildSei());
		tracker.createJob(jobId);
		ResultsClient rclient = new ResultsClient(new ResultsClientConfig(results_props.getProtocol(), results_props.getServer(), results_props.getPort()));

		// spin up an async thread
		new Thread(() -> {
			try {
				// headers are applied again on the worker thread
				// NOTE(review): presumably HeadersManager keeps per-thread state - confirm
				HeadersManager.setHeaders(headers);

				SparqlConnection conn = requestBody.buildSparqlConnection();
				OntologyInfo oInfo = oInfoCache.get(conn);
				InstanceDictGenerator generator = new InstanceDictGenerator(conn, oInfo, requestBody.getMaxWords(), requestBody.getSpecificityLimit());
				Table tab = generator.generate();

				rclient.execStoreTableResults(jobId, tab);
				tracker.setJobSuccess(jobId);

			} catch (Exception e) {
				// keep the original stack trace: the job status only carries a one-line message
				LocalLogger.printStackTrace(e);

				// getMessage() may be null (e.g. NullPointerException); fall back to the exception's string form
				String failureMessage = (e.getMessage() != null) ? e.getMessage() : e.toString();
				try {
					tracker.setJobFailure(jobId, failureMessage);
				} catch (Exception ee) {
					LocalLogger.logToStdErr(ENDPOINT_NAME + " error accessing job tracker");
					LocalLogger.printStackTrace(ee);
				}
			}
		}).start();

		res.addJobId(jobId);
		res.addResultType(SparqlResultTypes.TABLE);
		res.setSuccess(true);

	} catch (Exception e) {
		// job setup itself failed: report synchronously in the response
		res.setSuccess(false);
		res.addRationaleMessage(SERVICE_NAME, ENDPOINT_NAME, e);
		LocalLogger.printStackTrace(e);
	}
	return res.toJson();
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
/**
** Copyright 2018 General Electric Company
**
**
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
**
** http://www.apache.org/licenses/LICENSE-2.0
**
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*/

package com.ge.research.semtk.services.ontologyinfo.requests;

import com.ge.research.semtk.springutilib.requests.SparqlConnectionRequest;

import io.swagger.v3.oas.annotations.media.Schema;

/**
 * A SparqlConnectionRequest extended with the two integer settings consumed
 * when building an instance dictionary.  Both fields are initialized to a
 * default value and are exposed through read-only getters.
 */
public class InstanceDictionaryRequest extends SparqlConnectionRequest {

	// ---- word limit for labels (field default: 2) ----

	@Schema(
			description = "Labels assigned to URIs may not have more than this many words",
			required = false,
			example = "2")
	private int maxWords = 2;

	/**
	 * @return the maximum number of words a label may contain
	 */
	public int getMaxWords() {
		return maxWords;
	}

	// ---- how many URIs a label may be associated with (field default: 1) ----

	@Schema(
			description = "Only find labels that associate with at most this many URIs",
			required = false,
			example = "1")
	private int specificityLimit = 1;

	/**
	 * @return the maximum number of URIs a reported label may be associated with
	 */
	public int getSpecificityLimit() {
		return specificityLimit;
	}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
package com.ge.research.semtk.ontologyTools;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;

import com.ge.research.semtk.resultSet.Table;
import com.ge.research.semtk.sparqlToXLib.SparqlToXLibUtil;
import com.ge.research.semtk.sparqlX.SparqlConnection;

/**
 * Generates an "instance dictionary": a table that pairs instance URIs with
 * short string values (labels / names) found on properties whose range
 * includes xsd:string.
 */
public class InstanceDictGenerator {
	/** Range URI that marks a property as a candidate source of labels. */
	private static final String XSD_STRING = "http://www.w3.org/2001/XMLSchema#string";

	/**
	 * One "word followed by whitespace" regex unit.  The backslashes are doubled
	 * (the Java string holds two backslash characters before each of w and s)
	 * because the text is pasted inside a quoted SPARQL string literal, where a
	 * backslash must itself be escaped.
	 */
	private static final String WORD_UNIT = "\\\\w+\\\\s+";

	SparqlConnection conn;
	OntologyInfo oInfo;
	int specificityLimit = 0;
	String wordRegex;

	/**
	 * @param conn - connection the queries are run against
	 * @param oInfo - ontology whose properties are scanned for string ranges
	 * @param maxWords - strings with more than this many words are not considered.
	 *                   Values below 1 behave like 1.
	 * @param specificityLimit - don't find a name/label if it identifies more than this many URI instances
	 */
	public InstanceDictGenerator(SparqlConnection conn, OntologyInfo oInfo, int maxWords, int specificityLimit) {
		this.conn = conn;
		this.oInfo = oInfo;
		this.specificityLimit = specificityLimit;

		// convert maxWords into a regex that will disqualify:
		// maxWords repetitions of "word + whitespace" can only match a string
		// that has something after the maxWords-th word.
		StringBuilder regex = new StringBuilder(WORD_UNIT);
		for (int i = 1; i < maxWords; i++) {
			regex.append(WORD_UNIT);
		}
		this.wordRegex = regex.toString();
	}

	/**
	 * Generate a table of instance URIs and the string labels attached to them.
	 * One query is run for every (property, domain) pair whose exact range
	 * contains xsd:string, and the results are appended into a single table.
	 *
	 * Columns:
	 *    instance_uri      - the subject URI
	 *    class_uris        - GROUP_CONCAT of the instance's distinct types (one or more classes)
	 *    label             - label (or name) associated with the instance.  NOT UNIQUE: see label_specificity
	 *    label_specificity - how many distinct subjects have this string as an object of any predicate
	 *    property          - what prop was used to associate label with instance_uri
	 *
	 * @return the combined table; never null
	 * @throws Exception if a query fails, or if the ontology has no property
	 *                   with a string range (previously this case returned null)
	 */
	public Table generate() throws Exception {

		Table table = null;

		for (String propUri : this.oInfo.getPropertyNames()) {
			OntologyProperty oProp = this.oInfo.getProperty(propUri);

			for (String domainUri : oProp.getRangeDomains()) {
				OntologyRange oRange = oProp.getExactRange(domainUri);
				if (!oRange.containsUri(XSD_STRING)) {
					continue;
				}

				// found oProp which can have a range of String when domain is domainUri
				Table tab = this.conn.getDefaultQueryInterface().executeToTable(this.buildQuery(domainUri, propUri));
				tab.appendColumn("property", "literal", propUri);

				if (table == null) {
					table = tab;
				} else {
					table.append(tab);
				}
			}
		}

		if (table == null) {
			// no query was ever run: fail with an explanation instead of handing null to the caller
			throw new Exception("Can not generate instance dictionary: ontology has no properties with range " + XSD_STRING);
		}
		return table;
	}

	/**
	 * Build the SPARQL query for one (domain, property) pair.
	 *
	 * selects:
	 *   - ?sub        a subject URI whose type is domainUri or a subclass of it
	 *   - concatenation of types
	 *   - ?str        string that could be identifier
	 *   - count of how many things this string might be an identifier for
	 *   - filters out ?str if it matches this.wordRegex (too many words)
	 *   - keeps only strings shared by at most specificityLimit subjects
	 */
	private String buildQuery(String domainUri, String propUri) throws Exception {
		return String.format(
				"select distinct (?sub as ?instance_uri) (GROUP_CONCAT(DISTINCT ?t) as ?class_uris) (?str as ?label) (COUNT(distinct ?sub2) as ?label_specificity) \n"
				+ " %s \n "
				+ " where {\n"
				+ " \n"
				+ " ?t <http://www.w3.org/2000/01/rdf-schema#subClassOf>* <%s> .\n"
				+ " ?sub a ?t .\n"
				+ " ?sub <%s> ?str.\n"
				+ " filter ( ! regex (?str, \"%s\")) .\n"
				+ " ?sub2 ?pred2 ?str ."
				+ "} \n"
				+ "GROUP BY ?sub ?str "
				+ "HAVING (COUNT(distinct ?sub2) < %d)",
				SparqlToXLibUtil.generateSparqlFromOrUsing("", "FROM", conn, this.oInfo), domainUri, propUri, this.wordRegex, this.specificityLimit + 1 );
	}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
/**
** Copyright 2020 General Electric Company
**
**
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
**
** http://www.apache.org/licenses/LICENSE-2.0
**
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*/

package com.ge.research.semtk.ontologyTools.test;

import static org.junit.Assert.*;

import java.util.ArrayList;

import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;

import com.ge.research.semtk.belmont.Node;
import com.ge.research.semtk.belmont.NodeGroup;
import com.ge.research.semtk.ontologyTools.ClassInstance;
import com.ge.research.semtk.ontologyTools.InstanceDictGenerator;
import com.ge.research.semtk.ontologyTools.NodeGroupCache;
import com.ge.research.semtk.ontologyTools.OntologyInfo;
import com.ge.research.semtk.ontologyTools.PathExplorer;
import com.ge.research.semtk.ontologyTools.PathItemRequest;
import com.ge.research.semtk.ontologyTools.ReturnRequest;
import com.ge.research.semtk.resultSet.Table;
import com.ge.research.semtk.sparqlX.SparqlConnection;
import com.ge.research.semtk.sparqlX.SparqlEndpointInterface;
import com.ge.research.semtk.test.IntegrationTestUtility;
import com.ge.research.semtk.test.TestGraph;

public class InstanceDictGeneratorTest_IT {

@BeforeClass
public static void setup() throws Exception {

}


@Test
public void testInstanceDict() throws Exception {
String data = "Battery,Cell,color,birthday\n"
+ "battA,cell200,red,1966-01-01T12:00:00\n"
+ "battA,cell300,blue,1979-01-01T12:00:00-04:00\n"
+ "battB,cell401,white,01/01/2000 00:00:01\n"
+ "battB,cell402,white,07-04-2016\n"
+ "both,both,red,\n"
+ "batt 2word,cell 2words,white,\n"
+ "batt three words,cell three words,blue,\n"
+ "triplet,,,\n";

TestGraph.clearGraph();
TestGraph.uploadOwlResource(this, "sampleBattery.owl");
TestGraph.ingestCsvString(getClass(), "sampleBatteryGuids.json", data);
SparqlConnection conn = TestGraph.getSparqlConn();
OntologyInfo oInfo = TestGraph.getOInfo();

InstanceDictGenerator generator = new InstanceDictGenerator(conn, oInfo, 2, 2);
Table tab = generator.generate();
// almost a no-op test : make sure nothing crashes
assertEquals("Wrong number of rows", 11, tab.getNumRows());

generator = new InstanceDictGenerator(conn, oInfo, 3, 2);
tab = generator.generate();
// almost a no-op test : make sure nothing crashes
assertEquals("Wrong number of rows after allowing three words", 13, tab.getNumRows());

generator = new InstanceDictGenerator(conn, oInfo, 2, 1);
tab = generator.generate();
// almost a no-op test : make sure nothing crashes
assertEquals("Wrong number of rows with specificity set to 1", 9, tab.getNumRows());
}

@Test
public void manualOverride() throws Exception {
String connStr =
"{\"name\":\"ML4M ipd leb1acdev\",\"domain\":\"\",\"enableOwlImports\":true,\"model\":[{\"type\":\"fuseki\",\"url\":\"http://leb1acdev.hpc.ge.com:3030/ML4M\",\"graph\":\"http://research.ge.com/ipd/model\"}],\"data\":[{\"type\":\"fuseki\",\"url\":\"http://leb1acdev.hpc.ge.com:3030/ML4M\",\"graph\":\"http://research.ge.com/ipd/data\"}]}\n";
//"{\"name\":\"RACK\",\"domain\":\"\",\"enableOwlImports\":false,\"model\":[{\"type\":\"fuseki\",\"url\":\"http://localhost:3030/RACK\",\"graph\":\"http://rack001/model\"}],\"data\":[{\"type\":\"fuseki\",\"url\":\"http://localhost:3030/RACK\",\"graph\":\"http://rack001/data\"}]}\n";
SparqlConnection conn = new SparqlConnection(connStr);
OntologyInfo oInfo = new OntologyInfo(conn);
InstanceDictGenerator generator = new InstanceDictGenerator(conn, oInfo, 2, 2);
Table tab = generator.generate();
System.out.println(tab.toCSVString());
}

}
Loading

0 comments on commit 5931660

Please sign in to comment.