StantonMartin · Jan 17, 2020
diff --git a/‎MIxS-ontology-translation/README.md
+15 b/‎MIxS-ontology-translation/README.md
+15
diff --git a/‎MIxS-ontology-translation/notebooks/MIxS-record-translation.ipynb
+237 b/‎MIxS-ontology-translation/notebooks/MIxS-record-translation.ipynb
+237
diff --git a/‎MIxS-ontology-translation/notebooks/data/mixs_v5_local.xlsx
131 KB b/‎MIxS-ontology-translation/notebooks/data/mixs_v5_local.xlsx
131 KB
diff --git a/‎MIxS-ontology-translation/notebooks/environmental-package-record-translation.ipynb
+237 b/‎MIxS-ontology-translation/notebooks/environmental-package-record-translation.ipynb
+237
diff --git a/‎MIxS-ontology-translation/notebooks/output/MIxS-record-translation.owl
+2,546 b/‎MIxS-ontology-translation/notebooks/output/MIxS-record-translation.owl
+2,546
diff --git a/‎MIxS-ontology-translation/notebooks/output/environmental-package-record-translation.owl
+16,270 b/‎MIxS-ontology-translation/notebooks/output/environmental-package-record-translation.owl
+16,270
diff --git a/‎MIxS-ontology-translation/notebooks/requirements.txt
+30 b/‎MIxS-ontology-translation/notebooks/requirements.txt
+30
@@ -0,0 +1,15 @@
+# MIxS ontology translation
+Translates MIxS information in the MIxS spreadsheets into an ontology.
+
+The project uses venv to create a virtural enironment.
+To create a virtual environment run the command python3 -m venv <environment name> .
+I typically name my environment .env, and configure .gitignore to ignore .env files. This prevents the environment libraries from being uploaded to the repository.
+
+After the environment is created, run the command source <environment name>/bin/activate to enter the environment.
+Once in the environment, use pip to install libraries (e.g., pip install pandas). Once all the libraries have been installed you can export a list of them using the pip freeze command. Typically, the list is saved to a file named requirements.txt.
+e.g., pip freeze > requirements.txt
+
+To load a list of dependencies, use the pip instal -r command.
+e.g., pip install -r requirements.txt
+
+You exit the environment by executing the command deactivate in the terminal.
@@ -0,0 +1,237 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 66,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pds\n",
+    "import json\n",
+    "from rdflib import Graph, RDFS, RDF, OWL, URIRef, Literal\n",
+    "from hashlib import md5\n",
+    "from pandasql import sqldf\n",
+    "from uuid import uuid4\n",
+    "\n",
+    "def pysqldf(q):\n",
+    "    return sqldf(q, globals())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 67,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "mixsdf = pds.read_excel(\"data/mixs_v5_local.xlsx\",sheet_name=\"MIxS\")\n",
+    "# mixsdf.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Prep/clean dataframe\n",
+    "* Trim spaces around column names\n",
+    "* Replace spaces in column names with underscore\n",
+    "* Make column names lower case\n",
+    "* Replace NA data values with \"\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 68,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## prep column names\n",
+    "cleanded_columns = mixsdf.columns.str.strip() # trim spaces\n",
+    "cleanded_columns = cleanded_columns.str.lower() # make lower case\n",
+    "cleanded_columns = cleanded_columns.str.replace(\" \", \"_\") # replace space with _\n",
+    "mixsdf.columns = cleanded_columns\n",
+    "\n",
+    "## replace NaN\n",
+    "mixsdf.fillna(\"\", inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 69,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# mixsdf.columns"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Make dictionary mapping column names iris"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 78,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "col_to_iri = {}\n",
+    "for c in cleanded_columns:\n",
+    "    col_to_iri[c] = \"http://purl.obolibrary.org/obo/MIXS_\" + c\n",
+    "# col_to_iri"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Create column containing the iri for each record; the iri will be based on the MIXS ID column"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 79,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## helper function\n",
+    "def make_mixs_record_iri(mixs_id, prefix=\"http://purl.obolibrary.org/obo/\"):\n",
+    "    mixs_id = str(mixs_id).strip()\n",
+    "    if len(mixs_id) > 0:\n",
+    "        if mixs_id.find(\":\") > -1:\n",
+    "            return prefix + mixs_id.replace(\":\", \"_\")\n",
+    "        else:\n",
+    "            return prefix + \"MIXS_\" + mixs_id\n",
+    "    else:\n",
+    "        return prefix + \"MIXS_\" + str(uuid4())\n",
+    "    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 80,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## add record_iri column\n",
+    "mixsdf['record_iri'] = mixsdf.apply(lambda row: make_mixs_record_iri(row.mixs_id), axis=1)\n",
+    "# mixsdf.head(10) # note: check the 'altitude' item use uuid() to create iri"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Create graph of MIxS records\n",
+    "* add column headers as annotation properties\n",
+    "* add rows as  instances of class 'MIxS record'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 86,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "g = Graph()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 87,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## add column headers as annotations\n",
+    "for c in cleanded_columns:\n",
+    "    label = c.replace(\"_\", \" \")\n",
+    "    iri = URIRef(col_to_iri[c])\n",
+    "    g.add((iri, RDF.type, OWL.AnnotationProperty))\n",
+    "    g.add ((iri, RDFS.label, Literal(label)))\n",
+    "    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 88,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## create MIxS record class\n",
+    "record_class_iri = URIRef(\"http://purl.obolibrary.org/MIXS_mixs_record\")\n",
+    "g.add((record_class_iri, RDF.type, OWL.Class))\n",
+    "g.add((record_class_iri, RDFS.label, Literal(\"MIxS record\")))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 89,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## add MIxS records to graph\n",
+    "for ix, row in mixsdf.iterrows():\n",
+    "    record_iri = URIRef(row.record_iri) # create iris and labels\n",
+    "    label = row['item']\n",
+    "    \n",
+    "    g.add((record_iri, RDF.type, record_class_iri)) # add iri to graph\n",
+    "    g.add((record_iri, RDFS.label, Literal(label)))\n",
+    "    \n",
+    "    for c in cleanded_columns: # add each column value as an annotation\n",
+    "        annotation_value = str(row[c]).strip()\n",
+    "        annotation_iri = URIRef(col_to_iri[c])\n",
+    "        g.add((record_iri, annotation_iri, Literal(annotation_value)))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 90,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## add ontology iri\n",
+    "ontology_iri = URIRef(\"http://purl.obolibrary.org/obo/MIxS-record-translation.owl\")\n",
+    "g.add((ontology_iri, RDF.type, OWL.Ontology))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 91,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## save graph (note: different formats (e.g., turtle) are possible)\n",
+    "g.serialize(destination='output/MIxS-record-translation.owl', format='xml')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
@@ -0,0 +1,237 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pds\n",
+    "import json\n",
+    "from rdflib import Graph, RDFS, RDF, OWL, URIRef, Literal\n",
+    "from hashlib import md5\n",
+    "from pandasql import sqldf\n",
+    "from uuid import uuid4\n",
+    "\n",
+    "def pysqldf(q):\n",
+    "    return sqldf(q, globals())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "epdf = pds.read_excel(\"data/mixs_v5_local.xlsx\",sheet_name=\"environmental_packages\")\n",
+    "# epdf.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Prep/clean dataframe\n",
+    "* Trim spaces around column names\n",
+    "* Replace spaces in column names with underscore\n",
+    "* Make column names lower case\n",
+    "* Replace NA data values with \"\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## prep column names\n",
+    "cleanded_columns = epdf.columns.str.strip() # trim spaces\n",
+    "cleanded_columns = cleanded_columns.str.lower() # make lower case\n",
+    "cleanded_columns = cleanded_columns.str.replace(\" \", \"_\") # replace space with _\n",
+    "epdf.columns = cleanded_columns\n",
+    "\n",
+    "## replace NaN\n",
+    "epdf.fillna(\"\", inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# epdf.columns"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Make dictionary mapping column names iris"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "col_to_iri = {}\n",
+    "for c in cleanded_columns:\n",
+    "    col_to_iri[c] = \"http://purl.obolibrary.org/obo/MIXS_\" + c\n",
+    "# col_to_iri"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Create column containing the iri for each record; the iri will be based on the MIXS ID column"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## helper function\n",
+    "def make_mixs_record_iri(mixs_id, prefix=\"http://purl.obolibrary.org/obo/\"):\n",
+    "    mixs_id = str(mixs_id).strip()\n",
+    "    if len(mixs_id) > 0:\n",
+    "        if mixs_id.find(\":\") > -1:\n",
+    "            return prefix + mixs_id.replace(\":\", \"_\")\n",
+    "        else:\n",
+    "            return prefix + \"MIXS_\" + mixs_id\n",
+    "    else:\n",
+    "        return prefix + \"MIXS_\" + str(uuid4())\n",
+    "    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## add record_iri column\n",
+    "epdf['record_iri'] = epdf.apply(lambda row: make_mixs_record_iri(row.mixs_id), axis=1)\n",
+    "# epdf.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Create graph of MIxS records\n",
+    "* add column headers as annotation properties\n",
+    "* add rows as  instances of class 'MIxS record'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "g = Graph()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## add column headers as annotations\n",
+    "for c in cleanded_columns:\n",
+    "    label = c.replace(\"_\", \" \")\n",
+    "    iri = URIRef(col_to_iri[c])\n",
+    "    g.add((iri, RDF.type, OWL.AnnotationProperty))\n",
+    "    g.add ((iri, RDFS.label, Literal(label)))\n",
+    "    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## create MIxS record class\n",
+    "record_class_iri = URIRef(\"http://purl.obolibrary.org/MIXS_environmental_package_record\")\n",
+    "g.add((record_class_iri, RDF.type, OWL.Class))\n",
+    "g.add((record_class_iri, RDFS.label, Literal(\"environmental package record\")))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## add MIxS records to graph\n",
+    "for ix, row in epdf.iterrows():\n",
+    "    record_iri = URIRef(row.record_iri) # create iris and labels\n",
+    "    label = f\"{row['package_item']} ({row['environmental_package']})\"\n",
+    "    \n",
+    "    g.add((record_iri, RDF.type, record_class_iri)) # add iri to graph\n",
+    "    g.add((record_iri, RDFS.label, Literal(label)))\n",
+    "    \n",
+    "    for c in cleanded_columns: # add each column value as an annotation\n",
+    "        annotation_value = str(row[c]).strip()\n",
+    "        annotation_iri = URIRef(col_to_iri[c])\n",
+    "        g.add((record_iri, annotation_iri, Literal(annotation_value)))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## add ontology iri\n",
+    "ontology_iri = URIRef(\"http://purl.obolibrary.org/obo/environmental-package-record-translation.owl\")\n",
+    "g.add((ontology_iri, RDF.type, OWL.Ontology))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## save graph (note: different formats (e.g., turtle) are possible)\n",
+    "g.serialize(destination='output/environmental-package-record-translation.owl', format='xml')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
@@ -0,0 +1,30 @@
+appnope==0.1.0
+backcall==0.1.0
+decorator==4.4.0
+ipykernel==5.1.2
+ipython==7.8.0
+ipython-genutils==0.2.0
+isodate==0.6.0
+jedi==0.15.1
+jupyter-client==5.3.4
+jupyter-core==4.6.0
+numpy==1.17.2
+pandas==0.25.1
+pandasql==0.7.3
+parso==0.5.1
+pexpect==4.7.0
+pickleshare==0.7.5
+prompt-toolkit==2.0.10
+ptyprocess==0.6.0
+Pygments==2.4.2
+pyparsing==2.4.2
+python-dateutil==2.8.0
+pytz==2019.3
+PyYAML==5.1.2
+pyzmq==18.1.0
+rdflib==4.2.2
+six==1.12.0
+SQLAlchemy==1.3.10
+tornado==6.0.3
+traitlets==4.3.3
+wcwidth==0.1.7