Skip to content

Commit 4160ff6

Browse files
committedJan 17, 2020
add files from MIxS-ontology-translation to nmdc-metadata repo
1 parent d463f5c commit 4160ff6

7 files changed

+19335
-0
lines changed
 

‎MIxS-ontology-translation/README.md

+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# MIxS ontology translation
2+
Translates MIxS information in the MIxS spreadsheets into an ontology.
3+
4+
The project uses venv to create a virtual environment.
5+
To create a virtual environment, run the command python3 -m venv <environment name>.
6+
I typically name my environment .env, and configure .gitignore to ignore .env files. This prevents the environment libraries from being uploaded to the repository.
7+
8+
After the environment is created, run the command source <environment name>/bin/activate to enter the environment.
9+
Once in the environment, use pip to install libraries (e.g., pip install pandas). Once all the libraries have been installed you can export a list of them using the pip freeze command. Typically, the list is saved to a file named requirements.txt.
10+
e.g., pip freeze > requirements.txt
11+
12+
To load a list of dependencies, use the pip install -r command.
13+
e.g., pip install -r requirements.txt
14+
15+
You exit the environment by executing the command deactivate in the terminal.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,237 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 66,
6+
"metadata": {},
7+
"outputs": [],
8+
"source": [
9+
"import pandas as pds\n",
10+
"import json\n",
11+
"from rdflib import Graph, RDFS, RDF, OWL, URIRef, Literal\n",
12+
"from hashlib import md5\n",
13+
"from pandasql import sqldf\n",
14+
"from uuid import uuid4\n",
15+
"\n",
16+
"def pysqldf(q):\n",
17+
" return sqldf(q, globals())"
18+
]
19+
},
20+
{
21+
"cell_type": "code",
22+
"execution_count": 67,
23+
"metadata": {},
24+
"outputs": [],
25+
"source": [
26+
"mixsdf = pds.read_excel(\"data/mixs_v5_local.xlsx\",sheet_name=\"MIxS\")\n",
27+
"# mixsdf.head()"
28+
]
29+
},
30+
{
31+
"cell_type": "markdown",
32+
"metadata": {},
33+
"source": [
34+
"## Prep/clean dataframe\n",
35+
"* Trim spaces around column names\n",
36+
"* Replace spaces in column names with underscore\n",
37+
"* Make column names lower case\n",
38+
"* Replace NA data values with \"\""
39+
]
40+
},
41+
{
42+
"cell_type": "code",
43+
"execution_count": 68,
44+
"metadata": {},
45+
"outputs": [],
46+
"source": [
47+
"## prep column names\n",
48+
"cleanded_columns = mixsdf.columns.str.strip() # trim spaces\n",
49+
"cleanded_columns = cleanded_columns.str.lower() # make lower case\n",
50+
"cleanded_columns = cleanded_columns.str.replace(\" \", \"_\") # replace space with _\n",
51+
"mixsdf.columns = cleanded_columns\n",
52+
"\n",
53+
"## replace NaN\n",
54+
"mixsdf.fillna(\"\", inplace=True)"
55+
]
56+
},
57+
{
58+
"cell_type": "code",
59+
"execution_count": 69,
60+
"metadata": {},
61+
"outputs": [],
62+
"source": [
63+
"# mixsdf.columns"
64+
]
65+
},
66+
{
67+
"cell_type": "markdown",
68+
"metadata": {},
69+
"source": [
70+
"## Make dictionary mapping column names to IRIs"
71+
]
72+
},
73+
{
74+
"cell_type": "code",
75+
"execution_count": 78,
76+
"metadata": {},
77+
"outputs": [],
78+
"source": [
79+
"col_to_iri = {}\n",
80+
"for c in cleanded_columns:\n",
81+
" col_to_iri[c] = \"http://purl.obolibrary.org/obo/MIXS_\" + c\n",
82+
"# col_to_iri"
83+
]
84+
},
85+
{
86+
"cell_type": "markdown",
87+
"metadata": {},
88+
"source": [
89+
"## Create column containing the iri for each record; the iri will be based on the MIXS ID column"
90+
]
91+
},
92+
{
93+
"cell_type": "code",
94+
"execution_count": 79,
95+
"metadata": {},
96+
"outputs": [],
97+
"source": [
98+
"## helper function\n",
99+
"def make_mixs_record_iri(mixs_id, prefix=\"http://purl.obolibrary.org/obo/\"):\n",
100+
" mixs_id = str(mixs_id).strip()\n",
101+
" if len(mixs_id) > 0:\n",
102+
" if mixs_id.find(\":\") > -1:\n",
103+
" return prefix + mixs_id.replace(\":\", \"_\")\n",
104+
" else:\n",
105+
" return prefix + \"MIXS_\" + mixs_id\n",
106+
" else:\n",
107+
" return prefix + \"MIXS_\" + str(uuid4())\n",
108+
" "
109+
]
110+
},
111+
{
112+
"cell_type": "code",
113+
"execution_count": 80,
114+
"metadata": {},
115+
"outputs": [],
116+
"source": [
117+
"## add record_iri column\n",
118+
"mixsdf['record_iri'] = mixsdf.apply(lambda row: make_mixs_record_iri(row.mixs_id), axis=1)\n",
119+
"# mixsdf.head(10) # note: check the 'altitude' item use uuid() to create iri"
120+
]
121+
},
122+
{
123+
"cell_type": "markdown",
124+
"metadata": {},
125+
"source": [
126+
"## Create graph of MIxS records\n",
127+
"* add column headers as annotation properties\n",
128+
"* add rows as instances of class 'MIxS record'"
129+
]
130+
},
131+
{
132+
"cell_type": "code",
133+
"execution_count": 86,
134+
"metadata": {},
135+
"outputs": [],
136+
"source": [
137+
"g = Graph()"
138+
]
139+
},
140+
{
141+
"cell_type": "code",
142+
"execution_count": 87,
143+
"metadata": {},
144+
"outputs": [],
145+
"source": [
146+
"## add column headers as annotations\n",
147+
"for c in cleanded_columns:\n",
148+
" label = c.replace(\"_\", \" \")\n",
149+
" iri = URIRef(col_to_iri[c])\n",
150+
" g.add((iri, RDF.type, OWL.AnnotationProperty))\n",
151+
" g.add ((iri, RDFS.label, Literal(label)))\n",
152+
" "
153+
]
154+
},
155+
{
156+
"cell_type": "code",
157+
"execution_count": 88,
158+
"metadata": {},
159+
"outputs": [],
160+
"source": [
161+
"## create MIxS record class\n",
162+
"record_class_iri = URIRef(\"http://purl.obolibrary.org/MIXS_mixs_record\")\n",
163+
"g.add((record_class_iri, RDF.type, OWL.Class))\n",
164+
"g.add((record_class_iri, RDFS.label, Literal(\"MIxS record\")))"
165+
]
166+
},
167+
{
168+
"cell_type": "code",
169+
"execution_count": 89,
170+
"metadata": {},
171+
"outputs": [],
172+
"source": [
173+
"## add MIxS records to graph\n",
174+
"for ix, row in mixsdf.iterrows():\n",
175+
" record_iri = URIRef(row.record_iri) # create iris and labels\n",
176+
" label = row['item']\n",
177+
" \n",
178+
" g.add((record_iri, RDF.type, record_class_iri)) # add iri to graph\n",
179+
" g.add((record_iri, RDFS.label, Literal(label)))\n",
180+
" \n",
181+
" for c in cleanded_columns: # add each column value as an annotation\n",
182+
" annotation_value = str(row[c]).strip()\n",
183+
" annotation_iri = URIRef(col_to_iri[c])\n",
184+
" g.add((record_iri, annotation_iri, Literal(annotation_value)))"
185+
]
186+
},
187+
{
188+
"cell_type": "code",
189+
"execution_count": 90,
190+
"metadata": {},
191+
"outputs": [],
192+
"source": [
193+
"## add ontology iri\n",
194+
"ontology_iri = URIRef(\"http://purl.obolibrary.org/obo/MIxS-record-translation.owl\")\n",
195+
"g.add((ontology_iri, RDF.type, OWL.Ontology))"
196+
]
197+
},
198+
{
199+
"cell_type": "code",
200+
"execution_count": 91,
201+
"metadata": {},
202+
"outputs": [],
203+
"source": [
204+
"## save graph (note: different formats (e.g., turtle) are possible)\n",
205+
"g.serialize(destination='output/MIxS-record-translation.owl', format='xml')"
206+
]
207+
},
208+
{
209+
"cell_type": "code",
210+
"execution_count": null,
211+
"metadata": {},
212+
"outputs": [],
213+
"source": []
214+
}
215+
],
216+
"metadata": {
217+
"kernelspec": {
218+
"display_name": "Python 3",
219+
"language": "python",
220+
"name": "python3"
221+
},
222+
"language_info": {
223+
"codemirror_mode": {
224+
"name": "ipython",
225+
"version": 3
226+
},
227+
"file_extension": ".py",
228+
"mimetype": "text/x-python",
229+
"name": "python",
230+
"nbconvert_exporter": "python",
231+
"pygments_lexer": "ipython3",
232+
"version": "3.7.3"
233+
}
234+
},
235+
"nbformat": 4,
236+
"nbformat_minor": 2
237+
}
Binary file not shown.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,237 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 2,
6+
"metadata": {},
7+
"outputs": [],
8+
"source": [
9+
"import pandas as pds\n",
10+
"import json\n",
11+
"from rdflib import Graph, RDFS, RDF, OWL, URIRef, Literal\n",
12+
"from hashlib import md5\n",
13+
"from pandasql import sqldf\n",
14+
"from uuid import uuid4\n",
15+
"\n",
16+
"def pysqldf(q):\n",
17+
" return sqldf(q, globals())"
18+
]
19+
},
20+
{
21+
"cell_type": "code",
22+
"execution_count": 4,
23+
"metadata": {},
24+
"outputs": [],
25+
"source": [
26+
"epdf = pds.read_excel(\"data/mixs_v5_local.xlsx\",sheet_name=\"environmental_packages\")\n",
27+
"# epdf.head()"
28+
]
29+
},
30+
{
31+
"cell_type": "markdown",
32+
"metadata": {},
33+
"source": [
34+
"## Prep/clean dataframe\n",
35+
"* Trim spaces around column names\n",
36+
"* Replace spaces in column names with underscore\n",
37+
"* Make column names lower case\n",
38+
"* Replace NA data values with \"\""
39+
]
40+
},
41+
{
42+
"cell_type": "code",
43+
"execution_count": 5,
44+
"metadata": {},
45+
"outputs": [],
46+
"source": [
47+
"## prep column names\n",
48+
"cleanded_columns = epdf.columns.str.strip() # trim spaces\n",
49+
"cleanded_columns = cleanded_columns.str.lower() # make lower case\n",
50+
"cleanded_columns = cleanded_columns.str.replace(\" \", \"_\") # replace space with _\n",
51+
"epdf.columns = cleanded_columns\n",
52+
"\n",
53+
"## replace NaN\n",
54+
"epdf.fillna(\"\", inplace=True)"
55+
]
56+
},
57+
{
58+
"cell_type": "code",
59+
"execution_count": 7,
60+
"metadata": {},
61+
"outputs": [],
62+
"source": [
63+
"# epdf.columns"
64+
]
65+
},
66+
{
67+
"cell_type": "markdown",
68+
"metadata": {},
69+
"source": [
70+
"## Make dictionary mapping column names to IRIs"
71+
]
72+
},
73+
{
74+
"cell_type": "code",
75+
"execution_count": 8,
76+
"metadata": {},
77+
"outputs": [],
78+
"source": [
79+
"col_to_iri = {}\n",
80+
"for c in cleanded_columns:\n",
81+
" col_to_iri[c] = \"http://purl.obolibrary.org/obo/MIXS_\" + c\n",
82+
"# col_to_iri"
83+
]
84+
},
85+
{
86+
"cell_type": "markdown",
87+
"metadata": {},
88+
"source": [
89+
"## Create column containing the iri for each record; the iri will be based on the MIXS ID column"
90+
]
91+
},
92+
{
93+
"cell_type": "code",
94+
"execution_count": 9,
95+
"metadata": {},
96+
"outputs": [],
97+
"source": [
98+
"## helper function\n",
99+
"def make_mixs_record_iri(mixs_id, prefix=\"http://purl.obolibrary.org/obo/\"):\n",
100+
" mixs_id = str(mixs_id).strip()\n",
101+
" if len(mixs_id) > 0:\n",
102+
" if mixs_id.find(\":\") > -1:\n",
103+
" return prefix + mixs_id.replace(\":\", \"_\")\n",
104+
" else:\n",
105+
" return prefix + \"MIXS_\" + mixs_id\n",
106+
" else:\n",
107+
" return prefix + \"MIXS_\" + str(uuid4())\n",
108+
" "
109+
]
110+
},
111+
{
112+
"cell_type": "code",
113+
"execution_count": 12,
114+
"metadata": {},
115+
"outputs": [],
116+
"source": [
117+
"## add record_iri column\n",
118+
"epdf['record_iri'] = epdf.apply(lambda row: make_mixs_record_iri(row.mixs_id), axis=1)\n",
119+
"# epdf.head()"
120+
]
121+
},
122+
{
123+
"cell_type": "markdown",
124+
"metadata": {},
125+
"source": [
126+
"## Create graph of environmental package records\n",
127+
"* add column headers as annotation properties\n",
128+
"* add rows as instances of class 'environmental package record'"
129+
]
130+
},
131+
{
132+
"cell_type": "code",
133+
"execution_count": 23,
134+
"metadata": {},
135+
"outputs": [],
136+
"source": [
137+
"g = Graph()"
138+
]
139+
},
140+
{
141+
"cell_type": "code",
142+
"execution_count": 24,
143+
"metadata": {},
144+
"outputs": [],
145+
"source": [
146+
"## add column headers as annotations\n",
147+
"for c in cleanded_columns:\n",
148+
" label = c.replace(\"_\", \" \")\n",
149+
" iri = URIRef(col_to_iri[c])\n",
150+
" g.add((iri, RDF.type, OWL.AnnotationProperty))\n",
151+
" g.add ((iri, RDFS.label, Literal(label)))\n",
152+
" "
153+
]
154+
},
155+
{
156+
"cell_type": "code",
157+
"execution_count": 25,
158+
"metadata": {},
159+
"outputs": [],
160+
"source": [
161+
"## create MIxS record class\n",
162+
"record_class_iri = URIRef(\"http://purl.obolibrary.org/MIXS_environmental_package_record\")\n",
163+
"g.add((record_class_iri, RDF.type, OWL.Class))\n",
164+
"g.add((record_class_iri, RDFS.label, Literal(\"environmental package record\")))"
165+
]
166+
},
167+
{
168+
"cell_type": "code",
169+
"execution_count": 26,
170+
"metadata": {},
171+
"outputs": [],
172+
"source": [
173+
"## add MIxS records to graph\n",
174+
"for ix, row in epdf.iterrows():\n",
175+
" record_iri = URIRef(row.record_iri) # create iris and labels\n",
176+
" label = f\"{row['package_item']} ({row['environmental_package']})\"\n",
177+
" \n",
178+
" g.add((record_iri, RDF.type, record_class_iri)) # add iri to graph\n",
179+
" g.add((record_iri, RDFS.label, Literal(label)))\n",
180+
" \n",
181+
" for c in cleanded_columns: # add each column value as an annotation\n",
182+
" annotation_value = str(row[c]).strip()\n",
183+
" annotation_iri = URIRef(col_to_iri[c])\n",
184+
" g.add((record_iri, annotation_iri, Literal(annotation_value)))"
185+
]
186+
},
187+
{
188+
"cell_type": "code",
189+
"execution_count": 27,
190+
"metadata": {},
191+
"outputs": [],
192+
"source": [
193+
"## add ontology iri\n",
194+
"ontology_iri = URIRef(\"http://purl.obolibrary.org/obo/environmental-package-record-translation.owl\")\n",
195+
"g.add((ontology_iri, RDF.type, OWL.Ontology))"
196+
]
197+
},
198+
{
199+
"cell_type": "code",
200+
"execution_count": 28,
201+
"metadata": {},
202+
"outputs": [],
203+
"source": [
204+
"## save graph (note: different formats (e.g., turtle) are possible)\n",
205+
"g.serialize(destination='output/environmental-package-record-translation.owl', format='xml')"
206+
]
207+
},
208+
{
209+
"cell_type": "code",
210+
"execution_count": null,
211+
"metadata": {},
212+
"outputs": [],
213+
"source": []
214+
}
215+
],
216+
"metadata": {
217+
"kernelspec": {
218+
"display_name": "Python 3",
219+
"language": "python",
220+
"name": "python3"
221+
},
222+
"language_info": {
223+
"codemirror_mode": {
224+
"name": "ipython",
225+
"version": 3
226+
},
227+
"file_extension": ".py",
228+
"mimetype": "text/x-python",
229+
"name": "python",
230+
"nbconvert_exporter": "python",
231+
"pygments_lexer": "ipython3",
232+
"version": "3.7.3"
233+
}
234+
},
235+
"nbformat": 4,
236+
"nbformat_minor": 2
237+
}

‎MIxS-ontology-translation/notebooks/output/MIxS-record-translation.owl

+2,546
Large diffs are not rendered by default.

‎MIxS-ontology-translation/notebooks/output/environmental-package-record-translation.owl

+16,270
Large diffs are not rendered by default.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
appnope==0.1.0
2+
backcall==0.1.0
3+
decorator==4.4.0
4+
ipykernel==5.1.2
5+
ipython==7.8.0
6+
ipython-genutils==0.2.0
7+
isodate==0.6.0
8+
jedi==0.15.1
9+
jupyter-client==5.3.4
10+
jupyter-core==4.6.0
11+
numpy==1.17.2
12+
pandas==0.25.1
13+
pandasql==0.7.3
14+
parso==0.5.1
15+
pexpect==4.7.0
16+
pickleshare==0.7.5
17+
prompt-toolkit==2.0.10
18+
ptyprocess==0.6.0
19+
Pygments==2.4.2
20+
pyparsing==2.4.2
21+
python-dateutil==2.8.0
22+
pytz==2019.3
23+
PyYAML==5.1.2
24+
pyzmq==18.1.0
25+
rdflib==4.2.2
26+
six==1.12.0
27+
SQLAlchemy==1.3.10
28+
tornado==6.0.3
29+
traitlets==4.3.3
30+
wcwidth==0.1.7

0 commit comments

Comments
 (0)
Please sign in to comment.