Skip to content

Commit 463947d

Browse files
committed
Initial ThermoML transform script
1 parent e18db75 commit 463947d

File tree

2 files changed

+143
-0
lines changed

2 files changed

+143
-0
lines changed
+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
git+https://github.com/sustainable-processes/thermopyl
2+
tqdm
3+
pyyaml

data/thermoml_archive/transform.py

+140
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
import hashlib
2+
import pathlib
3+
import tarfile
4+
import warnings
5+
6+
import pandas as pd
7+
import requests
8+
import tqdm
9+
import yaml
10+
from thermopyl import Parser
11+
12+
from chemnlp.data_val.model import Dataset
13+
14+
15+
def _sha256_of_file(path, desc="Checking hash"):
    """Return the hex SHA-256 digest of the file at *path*, read in 8 KiB chunks."""
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in tqdm.tqdm(iter(lambda: f.read(8192), b""), desc=desc):
            digest.update(chunk)
    return digest.hexdigest()


def _download_archive(url, destination):
    """Stream *url* to *destination*, only creating it once the download completes."""
    # Write to a sibling ".part" file first so that an interrupted download
    # never leaves a truncated archive that later runs would mistake for a
    # complete one (the caller only downloads when the archive is missing).
    partial_path = destination.with_name(destination.name + ".part")
    # stream=True avoids buffering the whole archive in memory before writing.
    with requests.get(url, stream=True) as response:
        # Fail early instead of saving an HTTP error page as the archive.
        response.raise_for_status()
        with open(partial_path, "wb") as f:
            for chunk in tqdm.tqdm(
                response.iter_content(chunk_size=8192), desc="Downloading archive"
            ):
                f.write(chunk)
    partial_path.replace(destination)


def get_and_transform_data():
    """Download the archived ThermoML release and convert it to a CSV file.

    Steps, all relative to the directory containing this script:

    1. Download ``ThermoML.v2020-09-30.tgz`` from NIST unless it is already
       present, then verify its SHA-256 checksum.
    2. Extract the archive next to this script.
    3. Move any pre-existing ``thermoml_archive.csv`` whose checksum does not
       match the expected one out of the way (``thermoml_archive.old.csv``).
    4. Parse every ``*.xml`` file found in the journal DOI-prefix folders with
       ``thermopyl.Parser`` and append the result to ``thermoml_archive.csv``.
    5. Write the dataset metadata to ``meta.yaml`` in the current working
       directory.

    Raises:
        RuntimeError: if the downloaded archive does not match the expected
            checksum.
        FileNotFoundError: if no XML file could be parsed, so that no CSV
            file was produced.
        requests.HTTPError: if the download request returns an error status.
    """
    # get raw data
    data_dir = pathlib.Path(__file__).parent
    fname = "ThermoML.v2020-09-30.tgz"
    download_path = data_dir / fname
    remote_data_path = f"https://data.nist.gov/od/ds/mds2-2422/{fname}"
    sha256_checksum = "231161b5e443dc1ae0e5da8429d86a88474cb722016e5b790817bb31c58d7ec2"
    final_csv_path = data_dir / "thermoml_archive.csv"
    final_expected_csv_checksum = ""

    if not download_path.exists():
        _download_archive(remote_data_path, download_path)

    # check if checksum is correct
    # NOTE: the digest is bound first and compared afterwards; writing
    # `x := a != b` would bind the boolean result of the comparison instead.
    received_hash = _sha256_of_file(download_path)
    if received_hash != sha256_checksum:
        raise RuntimeError(
            "Downloaded file did not match expected checksum -- "
            "either a new version has been released or something has gone wrong!\n"
            f"Expected: {sha256_checksum}\n"
            f"Received: {received_hash}"
        )

    # Extract tar.gz archive (its contents were verified by the checksum above)
    with tarfile.open(download_path, "r:*") as tar:
        tar.extractall(data_dir)

    # Move a stale CSV out of the way so that the append-mode writes below
    # start from a fresh file.
    if final_csv_path.exists():
        if _sha256_of_file(final_csv_path) != final_expected_csv_checksum:
            warnings.warn(
                "Old CSV file did not match expected checksum, will try to recreate."
            )
            # replace() also overwrites an existing backup on Windows,
            # where rename() would raise.
            final_csv_path.replace(final_csv_path.with_suffix(".old.csv"))

    # Loop through journal DOI folders and scrape files
    root_dois = ("10.1007", "10.1016", "10.1021")

    # Both counters are per XML file, not per data row.
    num_points = 0
    num_failed = 0
    for doi in root_dois:
        for path in tqdm.tqdm(
            (data_dir / doi).glob("*.xml"),
            desc=f"Looping over files in {doi}",
        ):
            try:
                # NOTE(review): append mode writes a header row (and the
                # index) for every file; confirm downstream readers expect this.
                pd.DataFrame(Parser(path).parse()).to_csv(final_csv_path, mode="a")
                num_points += 1
            except Exception:
                # Best effort: files that fail to parse are counted and skipped.
                num_failed += 1

    print(f"Ingested {num_points} with {num_failed} failures.")

    if not final_csv_path.exists():
        raise FileNotFoundError(
            f"No CSV file was written to {final_csv_path}: "
            "none of the ThermoML files could be parsed."
        )

    csv_hash = _sha256_of_file(final_csv_path)
    if csv_hash != final_expected_csv_checksum:
        warnings.warn(
            "Final CSV file did not match expected checksum!\n"
            f"Expected: {final_expected_csv_checksum}\n"
            f"Received: {csv_hash}"
        )

    # create metadata
    meta = Dataset(
        **{
            "name": "thermoml_archive",
            "description": "ThermoML is an XML-based IUPAC standard for the storage and exchange of experimental thermophysical and thermochemical property data. The ThermoML archive is a subset of Thermodynamics Research Center (TRC) data holdings corresponding to cooperation between NIST TRC and five journals.",  # noqa
            "identifiers": [
                {
                    "id": "",
                    "type": "inchi",
                },
                {
                    "id": "",
                    "type": "inchikey",
                },
            ],
            "license": "https://www.nist.gov/open/license",
            "links": [
                {
                    "url": "https://doi.org/10.18434/mds2-2422",
                    "description": "data publication",
                },
                {
                    "url": "https://www.nist.gov/publications/towards-improved-fairness-thermoml-archive",
                    "description": "NIST publication description",
                },
                {
                    "url": "https://trc.nist.gov/ThermoML",
                    "description": "Live database hosted at NIST Thermodynamics Research Center",
                },
            ],
            "num_points": num_points,
            "bibtex": [
                "@article{Riccardi2022,title = {Towards improved {{FAIRness}} of the {{ThermoML Archive}}},author = {Riccardi, Demian and Trautt, Zachary and Bazyleva, Ala and Paulechka, Eugene and Diky, Vladimir and Magee, Joseph W. and Kazakov, Andrei F. and Townsend, Scott A. and Muzny, Chris D.},year = {2022},journal = {Journal of Computational Chemistry},volume = {43},number = {12},pages = {879--887},doi = {10.1002/jcc.26842},langid = {english}}",  # noqa
            ],
        }
    )
    with open("meta.yaml", "w") as f:
        yaml.dump(meta.dict(), f, sort_keys=False)
137+
138+
139+
# Script entry point: run the whole download/transform pipeline only when
# this file is executed directly, not when it is imported.
if __name__ == "__main__":
    get_and_transform_data()

0 commit comments

Comments
 (0)