Merge pull request OSeMOSYS#146 from trevorb1/whitespace-removal

Whitespace removal on reading in of data
trevorb1 · Feb 23, 2023 · 0073c91 · 0073c91
2 parents fe537fb + c3450e3
commit 0073c91
Show file tree

Hide file tree

Showing 3 changed files with 68 additions and 11 deletions.
diff --git a/src/otoole/cli.py b/src/otoole/cli.py
@@ -199,18 +199,20 @@ def conversion_matrix(args):
 
     # set read strategy
 
+    keep_whitespace = True if args.keep_whitespace else False
+
     if args.from_format == "datafile":
         read_strategy = ReadDatafile(user_config=config)
     elif args.from_format == "datapackage":
         logger.warning(
             "Reading from datapackage is deprecated, trying to read from CSVs"
         )
         from_path = read_deprecated_datapackage(from_path)
-        read_strategy = ReadCsv(user_config=config)
+        read_strategy = ReadCsv(user_config=config, keep_whitespace=keep_whitespace)
     elif args.from_format == "csv":
-        read_strategy = ReadCsv(user_config=config)
+        read_strategy = ReadCsv(user_config=config, keep_whitespace=keep_whitespace)
     elif args.from_format == "excel":
-        read_strategy = ReadExcel(user_config=config)
+        read_strategy = ReadExcel(user_config=config, keep_whitespace=keep_whitespace)
 
     input_data, _ = read_strategy.read(args.from_path)
 
@@ -366,6 +368,12 @@ def get_parser():
         default=False,
         action="store_true",
     )
+    convert_parser.add_argument(
+        "--keep_whitespace",
+        help="Keeps leading/trailing whitespace in CSV files",
+        default=False,
+        action="store_true",
+    )
     convert_parser.set_defaults(func=conversion_matrix)
 
     # Parser for validation

diff --git a/src/otoole/read_strategies.py b/src/otoole/read_strategies.py
@@ -1,6 +1,6 @@
 import logging
 import os
-from typing import Any, Dict, List, TextIO, Tuple, Union
+from typing import Any, Dict, List, Optional, TextIO, Tuple, Union
 
 import pandas as pd
 from amply import Amply
@@ -34,6 +34,10 @@ def read(
 
 
 class _ReadTabular(ReadStrategy):
+    def __init__(self, user_config: Dict[str, Dict], keep_whitespace: bool = False):
+        super().__init__(user_config)
+        self.keep_whitespace = keep_whitespace
+
     def _check_set(self, df: pd.DataFrame, config_details: Dict, name: str):
 
         logger.info("Checking set %s", name)
@@ -86,6 +90,24 @@ def _check_parameter(self, df: pd.DataFrame, expected_headers: List, name: str):
 
         return narrow[all_headers].set_index(expected_headers)
 
+    def _whitespace_converter(self, indices: List[str]) -> Dict[str, Any]:
+        """Creates converter for stripping whitespace in dataframe
+
+        Arguments
+        ---------
+        indices: List[str]
+            Column headers of dataframe
+
+        Returns
+        -------
+        Dict[str,Any]
+            Converter dictionary
+        """
+        if self.keep_whitespace:
+            return {}
+        else:
+            return {x: str.strip for x in indices}
+
 
 class ReadExcel(_ReadTabular):
     """Read in an Excel spreadsheet in wide format to a dict of Pandas DataFrames"""
@@ -177,9 +199,13 @@ def read(
             logger.info("Looking for %s", parameter)
 
             entity_type = details["type"]
+            try:
+                converter = self._whitespace_converter(details["indices"])
+            except KeyError:  # set definitions have no "indices" key
+                converter = self._whitespace_converter(["VALUE"])
 
             if entity_type == "param":
-                df = self._get_input_data(filepath, parameter, details)
+                df = self._get_input_data(filepath, parameter, details, converter)
                 narrow = self._check_parameter(df, details["indices"], parameter)
                 if not narrow.empty:
                     narrow_checked = check_datatypes(
@@ -189,7 +215,7 @@ def read(
                     narrow_checked = narrow
 
             elif entity_type == "set":
-                df = self._get_input_data(filepath, parameter, details)
+                df = self._get_input_data(filepath, parameter, details, converter)
                 narrow = self._check_set(df, details, parameter)
                 if not narrow.empty:
                     narrow_checked = check_set_datatype(
@@ -214,9 +240,7 @@ def read(
 
     @staticmethod
     def _get_input_data(
-        filepath: str,
-        parameter: str,
-        details: Dict,
+        filepath: str, parameter: str, details: Dict, converter: Optional[Dict] = None
     ) -> pd.DataFrame:
         """Reads in and checks CSV data format.
 
@@ -234,10 +258,10 @@ def _get_input_data(
         pd.DataFrame
             CSV data as a dataframe
         """
+        converter = {} if not converter else converter
         csv_path = os.path.join(filepath, parameter + ".csv")
-
         try:
-            df = pd.read_csv(csv_path)
+            df = pd.read_csv(csv_path, converters=converter)
         except pd.errors.EmptyDataError:
             logger.error("No data found in file for %s", parameter)
             expected_columns = details["indices"]

diff --git a/tests/test_read_strategies.py b/tests/test_read_strategies.py
@@ -1066,3 +1066,28 @@ def test_read_default_values_csv(self, user_config):
         actual = reader._check_for_default_values_csv(filepath)
         expected = None
         assert actual == expected
+
+
+class TestReadTabular:
+    """Methods shared for csv and excel"""
+
+    test_data = [
+        (True, ["REGION", "TECHNOLOGY"], {}),
+        (
+            False,
+            ["REGION", "TECHNOLOGY"],
+            {"REGION": str.strip, "TECHNOLOGY": str.strip},
+        ),
+    ]
+
+    @mark.parametrize(
+        "keep_whitespace, indices, expected",
+        test_data,
+        ids=["create_empty", "create_full"],
+    )
+    def test_whitespace_converter(
+        self, user_config, keep_whitespace, indices, expected
+    ):
+        reader = ReadCsv(user_config=user_config, keep_whitespace=keep_whitespace)
+        actual = reader._whitespace_converter(indices)
+        assert actual == expected