diff --git a/src/otoole/cli.py b/src/otoole/cli.py index 4eca058f..c422820b 100644 --- a/src/otoole/cli.py +++ b/src/otoole/cli.py @@ -88,6 +88,7 @@ def validate_model(args): "Reading from datapackage is deprecated, trying to read from CSVs" ) data_file = read_deprecated_datapackage(data_file) + logger.info("Successfully read folder of CSVs") read_strategy = ReadCsv(user_config=config) elif data_format == "csv": read_strategy = ReadCsv(user_config=config) @@ -152,6 +153,7 @@ def result_matrix(args): "Reading from datapackage is deprecated, trying to read from CSVs" ) input_csvs = read_deprecated_datapackage(args.input_datapackage) + logger.info("Successfully read folder of CSVs") input_data, _ = ReadCsv(user_config=config).read(input_csvs) elif args.input_datafile: input_data, _ = ReadDatafile(user_config=config).read(args.input_datafile) @@ -199,6 +201,8 @@ def conversion_matrix(args): # set read strategy + keep_whitespace = True if args.keep_whitespace else False + if args.from_format == "datafile": read_strategy = ReadDatafile(user_config=config) elif args.from_format == "datapackage": @@ -206,11 +210,12 @@ def conversion_matrix(args): "Reading from datapackage is deprecated, trying to read from CSVs" ) from_path = read_deprecated_datapackage(from_path) - read_strategy = ReadCsv(user_config=config) + logger.info("Successfully read folder of CSVs") + read_strategy = ReadCsv(user_config=config, keep_whitespace=keep_whitespace) elif args.from_format == "csv": - read_strategy = ReadCsv(user_config=config) + read_strategy = ReadCsv(user_config=config, keep_whitespace=keep_whitespace) elif args.from_format == "excel": - read_strategy = ReadExcel(user_config=config) + read_strategy = ReadExcel(user_config=config, keep_whitespace=keep_whitespace) input_data, _ = read_strategy.read(args.from_path) @@ -332,6 +337,11 @@ def get_parser(): help="Input GNUMathProg datafile required for OSeMOSYS short or fast results", default=None, ) + result_parser.add_argument( + 
"--input_datapackage", + help="Deprecated", + default=None, + ) result_parser.add_argument("config", help="Path to config YAML file") result_parser.add_argument( "--write_defaults", @@ -366,6 +376,12 @@ def get_parser(): default=False, action="store_true", ) + convert_parser.add_argument( + "--keep_whitespace", + help="Keeps leading/trailing whitespace in CSV files", + default=False, + action="store_true", + ) convert_parser.set_defaults(func=conversion_matrix) # Parser for validation @@ -424,6 +440,12 @@ def get_parser(): return parser +class DeprecateAction(argparse.Action): + def __call__(self, parser, namespace, values, option_string=None): + logger.warning(f"Argument {self.option_strings} is deprecated and is ignored.") + delattr(namespace, self.dest) + + def main(): parser = get_parser() diff --git a/src/otoole/read_strategies.py b/src/otoole/read_strategies.py index 2c148ccb..a74b4166 100644 --- a/src/otoole/read_strategies.py +++ b/src/otoole/read_strategies.py @@ -1,6 +1,6 @@ import logging import os -from typing import Any, Dict, List, TextIO, Tuple, Union +from typing import Any, Dict, List, Optional, TextIO, Tuple, Union import pandas as pd from amply import Amply @@ -34,6 +34,10 @@ def read( class _ReadTabular(ReadStrategy): + def __init__(self, user_config: Dict[str, Dict], keep_whitespace: bool = False): + super().__init__(user_config) + self.keep_whitespace = keep_whitespace + def _check_set(self, df: pd.DataFrame, config_details: Dict, name: str): logger.info("Checking set %s", name) @@ -86,6 +90,24 @@ def _check_parameter(self, df: pd.DataFrame, expected_headers: List, name: str): return narrow[all_headers].set_index(expected_headers) + def _whitespace_converter(self, indices: List[str]) -> Dict[str, Any]: + """Creates converter for stripping whitespace in dataframe + + Arguments + --------- + indices: List[str] + Column headers of dataframe + + Returns + ------- + Dict[str,Any] + Converter dictionary + """ + if self.keep_whitespace: + return {} + 
else: + return {x: str.strip for x in indices} + class ReadExcel(_ReadTabular): """Read in an Excel spreadsheet in wide format to a dict of Pandas DataFrames""" @@ -177,9 +199,13 @@ def read( logger.info("Looking for %s", parameter) entity_type = details["type"] + try: + converter = self._whitespace_converter(details["indices"]) + except KeyError: # sets do not define "indices" + converter = self._whitespace_converter(["VALUE"]) if entity_type == "param": - df = self._get_input_data(filepath, parameter, details) + df = self._get_input_data(filepath, parameter, details, converter) narrow = self._check_parameter(df, details["indices"], parameter) if not narrow.empty: narrow_checked = check_datatypes( @@ -189,7 +215,7 @@ def read( narrow_checked = narrow elif entity_type == "set": - df = self._get_input_data(filepath, parameter, details) + df = self._get_input_data(filepath, parameter, details, converter) narrow = self._check_set(df, details, parameter) if not narrow.empty: narrow_checked = check_set_datatype( @@ -214,9 +240,7 @@ def read( @staticmethod def _get_input_data( - filepath: str, - parameter: str, - details: Dict, + filepath: str, parameter: str, details: Dict, converter: Optional[Dict] = None ) -> pd.DataFrame: """Reads in and checks CSV data format. 
@@ -234,10 +258,10 @@ def _get_input_data( pd.DataFrame CSV data as a dataframe """ + converter = {} if not converter else converter csv_path = os.path.join(filepath, parameter + ".csv") - try: - df = pd.read_csv(csv_path) + df = pd.read_csv(csv_path, converters=converter) except pd.errors.EmptyDataError: logger.error("No data found in file for %s", parameter) expected_columns = details["indices"] diff --git a/tests/test_read_strategies.py b/tests/test_read_strategies.py index c3456b1c..dd106e9c 100644 --- a/tests/test_read_strategies.py +++ b/tests/test_read_strategies.py @@ -1066,3 +1066,28 @@ def test_read_default_values_csv(self, user_config): actual = reader._check_for_default_values_csv(filepath) expected = None assert actual == expected + + +class TestReadTabular: + """Methods shared for csv and excel""" + + test_data = [ + (True, ["REGION", "TECHNOLOGY"], {}), + ( + False, + ["REGION", "TECHNOLOGY"], + {"REGION": str.strip, "TECHNOLOGY": str.strip}, + ), + ] + + @mark.parametrize( + "keep_whitespace, indices, expected", + test_data, + ids=["create_empty", "create_full"], + ) + def test_whitespace_converter( + self, user_config, keep_whitespace, indices, expected + ): + reader = ReadCsv(user_config=user_config, keep_whitespace=keep_whitespace) + actual = reader._whitespace_converter(indices) + assert actual == expected