From 4fc746d491b1fe1dabcb1c53650a893cadd9d5c6 Mon Sep 17 00:00:00 2001 From: Will Usher Date: Thu, 14 Jul 2022 15:22:26 +0200 Subject: [PATCH] Write out datapackage schema from user-config (closes #122) --- src/otoole/input.py | 2 +- src/otoole/preprocess/create_datapackage.py | 102 +++++++++------- src/otoole/write_strategies.py | 84 ++++++++----- tests/test_write_strategies.py | 127 +++++++++++++++++++- 4 files changed, 243 insertions(+), 72 deletions(-) diff --git a/src/otoole/input.py b/src/otoole/input.py index 992a981e..02afe24d 100644 --- a/src/otoole/input.py +++ b/src/otoole/input.py @@ -253,7 +253,7 @@ def write(self, inputs: Dict, filepath: str, default_values: Dict): self._footer(handle) - if handle: + if isinstance(handle, TextIO): handle.close() diff --git a/src/otoole/preprocess/create_datapackage.py b/src/otoole/preprocess/create_datapackage.py index 6bd09866..ba6b2a89 100644 --- a/src/otoole/preprocess/create_datapackage.py +++ b/src/otoole/preprocess/create_datapackage.py @@ -6,80 +6,98 @@ import logging import os +from frictionless import Package +from typing import Dict -from datapackage import Package +logger = logging.getLogger() -from otoole.utils import read_packaged_file -logger = logging.getLogger() +def generate_package(package: Package, config: Dict[str, Dict]) -> Package: + """Adds schema information to a basic Resource + Arguments + --------- + package: Package + A frictionless Package + config: Dict[str, Dict] + A user-configuration dictionary -def generate_package(path_to_package): - """Creates a datapackage in folder ``path_to_package`` + Returns + ------- + dict + Notes + ----- [{'fields': 'REGION', 'reference': {'resource': 'REGION', 'fields': 'VALUE'}}] """ - datapath = os.path.join(path_to_package) - package = Package(base_path=datapath) - - package.infer("data/*.csv") + logger.debug(f"Auto-identified resources {package.resources}") - package.descriptor["licenses"] = [ - { - "name": "CC-BY-4.0", - "path": "https://creativecommons.org/licenses/by/4.0/", - "title": "Creative Commons Attribution 4.0", - } - ] + # package.licenses = [ + # { + # "name": "CC-BY-4.0", + # "path": "https://creativecommons.org/licenses/by/4.0/", + # "title": "Creative Commons Attribution 4.0", + # } + # ] - package.descriptor["title"] = "The OSeMOSYS Simplicity Example Model" + # package.title = "The OSeMOSYS Simplicity Example Model" - package.descriptor["name"] = "osemosys_model_simplicity" + # package.name = "osemosys_model_simplicity" - package.descriptor["contributors"] = [ - { - "title": "Will Usher", - "email": "wusher@kth.se", - "path": "http://www.kth.se/wusher", - "role": "author", - } - ] + # package.contributors = [ + # { + # "title": "Will Usher", + # "email": "wusher@kth.se", + # "path": "https://www.kth.se/profile/wusher/", + # "role": "author", + # } + # ] - package.commit() + for resource in package.resources: # typing: Resource - config = read_packaged_file("config.yaml", "otoole.preprocess") - - new_resources = [] - for resource in package.resources: + name = resource.title # Use the title which preserves case - descriptor = resource.descriptor + logger.debug(f"Updating resource '{name}'") - name = resource.name if config[name]["type"] == "param": indices = config[name]["indices"] logger.debug("Indices of %s are %s", name, indices) + fields = [] foreign_keys = [] for index in indices: key = { "fields": index, - "reference": {"resource": index, "fields": "VALUE"}, + "reference": {"resource": index.lower(), "fields": "VALUE"}, } foreign_keys.append(key) + field = {"name": index, "type": config[index]["dtype"]} + + fields.append(field) + + value_field = {"name": "VALUE", "type": config[name]["dtype"]} + + fields.append(value_field) + + resource.schema.fields = fields + resource.schema.foreign_keys = foreign_keys + resource.schema.primary_key = indices + resource.schema.missing_values = [""] + + elif config[name]["type"] == "set": - descriptor["schema"]["foreignKeys"] = foreign_keys - descriptor["schema"]["primaryKey"] = indices - descriptor["schema"]["missingValues"] = [""] + fields = [] + value_field = {"name": "VALUE", "type": config[name]["dtype"]} - new_resources.append(descriptor) + fields.append(value_field) + resource.schema.fields = fields + resource.schema.missing_values = [""] - package.descriptor["resources"] = new_resources - package.commit() + logger.debug(f"Schema for resource {name}: {resource.schema}") - filepath = os.path.join(path_to_package, "datapackage.json") - package.save(filepath) + return package def validate_contents(path_to_package): diff --git a/src/otoole/write_strategies.py b/src/otoole/write_strategies.py index 79afd8d6..3a01f2ab 100644 --- a/src/otoole/write_strategies.py +++ b/src/otoole/write_strategies.py @@ -1,12 +1,12 @@ import logging import os import pandas as pd -from json import dump -from typing import Any, TextIO +from frictionless import Package, Resource +from typing import TextIO from otoole.input import WriteStrategy +from otoole.preprocess.create_datapackage import generate_package from otoole.read_strategies import CSV_TO_EXCEL -from otoole.utils import read_packaged_file logger = logging.getLogger(__name__) @@ -155,17 +155,8 @@ class WriteCsv(WriteStrategy): user_config: dict, default=None """ - def _header(self) -> Any: - os.makedirs(os.path.join(self.filepath), exist_ok=True) - return None - - def _write_parameter( - self, df: pd.DataFrame, parameter_name: str, handle: TextIO, default: float - ) -> pd.DataFrame: - """Write parameter data""" - self._write_out_dataframe(self.filepath, parameter_name, df, index=True) - - def _write_out_dataframe(self, folder, parameter, df, index=False): + @staticmethod + def _write_out_dataframe(folder, parameter, df, index=False): """Writes out a dataframe as a csv into the data subfolder of a datapackage Arguments @@ -184,6 +175,16 @@ def _write_out_dataframe(self, folder, parameter, df, index=False): ) df.to_csv(csvfile, index=index) + def _header(self) -> Package: + os.makedirs(os.path.join(self.filepath), exist_ok=True) + return None + + def _write_parameter( + self, df: pd.DataFrame, parameter_name: str, handle: Package, default: float + ) -> pd.DataFrame: + """Write parameter data""" + self._write_out_dataframe(self.filepath, parameter_name, df, index=True) + def _write_set(self, df: pd.DataFrame, set_name, handle: TextIO) -> pd.DataFrame: """Write set data""" self._write_out_dataframe(self.filepath, set_name, df, index=False) @@ -192,11 +193,7 @@ def _footer(self, handle: TextIO): pass -class WriteDatapackage(WriteCsv): - def _header(self) -> Any: - os.makedirs(os.path.join(self.filepath, "data"), exist_ok=True) - return None - +class WriteDatapackage(WriteStrategy): def _write_out_dataframe(self, folder, parameter, df, index=False): """Writes out a dataframe as a csv into the data subfolder of a datapackage @@ -214,19 +211,52 @@ def _write_out_dataframe(self, folder, parameter, df, index=False): ) df.to_csv(csvfile, index=index) - def _footer(self, handle: TextIO): - datapackage = read_packaged_file("datapackage.json", "otoole.preprocess") - filepath = os.path.join(self.filepath, "datapackage.json") - with open(filepath, "w", newline="") as destination: - dump(datapackage, destination) - self._write_default_values() - - def _write_default_values(self): + def _write_default_values(self, handle): default_values_path = os.path.join(self.filepath, "data", "default_values.csv") with open(default_values_path, "w", newline="") as csv_file: + csv_file.write("name,default_value\n") + rows = [] for name, contents in self.user_config.items(): if contents["type"] == "param": csv_file.write("{},{}\n".format(name, contents["default"])) + rows.append([name, contents["default"]]) + + df = pd.DataFrame(rows, columns=["name", "default_value"]) + self._add_resource("default_values", df) + + def _header(self) -> Package: + os.makedirs(os.path.join(self.filepath, "data"), exist_ok=True) + return Package() + + def _add_resource(self, parameter_name: str, df: pd.DataFrame) -> Resource: + resource = Resource(df) + resource.name = parameter_name.lower() + resource.title = parameter_name + return resource + + def _write_parameter( + self, df: pd.DataFrame, parameter_name: str, handle: Package, default: float + ) -> pd.DataFrame: + """Write parameter data""" + self._write_out_dataframe(self.filepath, parameter_name, df, index=True) + + resource = self._add_resource(parameter_name, df) + handle.add_resource(resource) + + def _write_set(self, df: pd.DataFrame, set_name, handle: Package) -> pd.DataFrame: + """Write set data""" + self._write_out_dataframe(self.filepath, set_name, df, index=False) + + resource = self._add_resource(set_name, df) + handle.add_resource(resource) + + def _footer(self, handle: Package): + + self._write_default_values(handle) + package = generate_package(handle, self.user_config) + + filepath = os.path.join(self.filepath, "datapackage.yaml") + package.to_yaml(filepath) diff --git a/tests/test_write_strategies.py b/tests/test_write_strategies.py index 9daa8a74..1cb0a83e 100644 --- a/tests/test_write_strategies.py +++ b/tests/test_write_strategies.py @@ -1,8 +1,13 @@ +from pytest import fixture + import io import pandas as pd +from frictionless import Package, Resource from tempfile import NamedTemporaryFile +from yaml import SafeLoader, load # type: ignore -from otoole.write_strategies import WriteDatafile, WriteExcel +from otoole.preprocess.create_datapackage import generate_package +from otoole.write_strategies import WriteDatafile, WriteDatapackage, WriteExcel class TestWriteExcel: @@ -71,7 +76,7 @@ def test_form_three_columns(self, user_config): expected_data = [[41, 42]] expected = pd.DataFrame( data=expected_data, - columns=pd.Int64Index([2015, 2016], dtype="int64", name="YEAR"), + columns=pd.Index([2015, 2016], dtype="int64", name="YEAR"), index=pd.MultiIndex.from_tuples( [("SIMPLICITY", "COAL")], names=["REGION", "FUEL"] ), @@ -180,3 +185,121 @@ def test_write_set(self, user_config): for actual_line, expected_line in zip(actual, expected): assert actual_line == expected_line + + +class TestWriteDataPackage: + @fixture() + def simple_user_config(self): + + config = {} + config["TECH"] = {"dtype": "string", "type": "set"} + config["YEAR"] = {"dtype": "integer", "type": "set"} + config["CapitalCost"] = { + "indices": ["TECH", "YEAR"], + "type": "param", + "dtype": "float", + "default": 0.05, + } + + return config + + @fixture() + def simple_default_values(self): + + default_values = {} + default_values["CapitalCost"] = 0.05 + + return default_values + + @fixture + def simple_data(self): + data = {} + data["TECH"] = pd.DataFrame(data=["NGCC"], columns=["VALUE"]) + data["YEAR"] = pd.DataFrame(data=[2010, 2011], columns=["VALUE"]).astype( + {"VALUE": int} + ) + data["CapitalCost"] = pd.DataFrame( + data=[["NGCC", 2010, 1000.0], ["NGCC", 2011, 950.0]], + columns=["TECH", "YEAR", "VALUE"], + ).set_index(["TECH", "YEAR"]) + + return data + + @fixture + def expected_schema(self): + + schema = """ +resources: + - name: tech + title: TECH + schema: + fields: + - name: VALUE + type: string + missingValues: + - '' + - name: year + title: YEAR + schema: + fields: + - name: VALUE + type: integer + missingValues: + - '' + - name: capitalcost + title: CapitalCost + schema: + fields: + - name: TECH + type: string + - name: YEAR + type: integer + - name: VALUE + type: float + foreignKeys: + - fields: TECH + reference: + resource: tech + fields: VALUE + - fields: YEAR + reference: + resource: year + fields: VALUE + primaryKey: + - TECH + - YEAR + missingValues: + - '' + + """ + return load(schema, SafeLoader) + + def test_write_datapackage( + self, simple_user_config, simple_data, simple_default_values + ): + + filepath = NamedTemporaryFile().name + + writer = WriteDatapackage(user_config=simple_user_config) + writer.write(simple_data, filepath, simple_default_values) + + def test_create_json(self, simple_user_config, simple_data, expected_schema): + + filepath = NamedTemporaryFile().name + + resources = [] + for x, y in simple_data.items(): + resource = Resource(y) + resource.name = x.lower() + resource.title = x + resources.append(resource) + + package = Package(resources=resources, onerror="warn") + + package = generate_package(package, simple_user_config) + + actual = package.to_dict() + + package.to_yaml(filepath) + + assert actual == expected_schema