Skip to content

Commit

Permalink
Merge pull request OSeMOSYS#146 from trevorb1/whitespace-removal
Browse files Browse the repository at this point in the history
Whitespace removal on reading in of data
  • Loading branch information
trevorb1 authored Feb 23, 2023
2 parents fe537fb + c3450e3 commit 0073c91
Show file tree
Hide file tree
Showing 3 changed files with 68 additions and 11 deletions.
14 changes: 11 additions & 3 deletions src/otoole/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,18 +199,20 @@ def conversion_matrix(args):

# set read strategy

keep_whitespace = True if args.keep_whitespace else False

if args.from_format == "datafile":
read_strategy = ReadDatafile(user_config=config)
elif args.from_format == "datapackage":
logger.warning(
"Reading from datapackage is deprecated, trying to read from CSVs"
)
from_path = read_deprecated_datapackage(from_path)
read_strategy = ReadCsv(user_config=config)
read_strategy = ReadCsv(user_config=config, keep_whitespace=keep_whitespace)
elif args.from_format == "csv":
read_strategy = ReadCsv(user_config=config)
read_strategy = ReadCsv(user_config=config, keep_whitespace=keep_whitespace)
elif args.from_format == "excel":
read_strategy = ReadExcel(user_config=config)
read_strategy = ReadExcel(user_config=config, keep_whitespace=keep_whitespace)

input_data, _ = read_strategy.read(args.from_path)

Expand Down Expand Up @@ -366,6 +368,12 @@ def get_parser():
default=False,
action="store_true",
)
convert_parser.add_argument(
"--keep_whitespace",
help="Keeps leading/trailing whitespace in CSV files",
default=False,
action="store_true",
)
convert_parser.set_defaults(func=conversion_matrix)

# Parser for validation
Expand Down
40 changes: 32 additions & 8 deletions src/otoole/read_strategies.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import logging
import os
from typing import Any, Dict, List, TextIO, Tuple, Union
from typing import Any, Dict, List, Optional, TextIO, Tuple, Union

import pandas as pd
from amply import Amply
Expand Down Expand Up @@ -34,6 +34,10 @@ def read(


class _ReadTabular(ReadStrategy):
def __init__(
    self,
    user_config: Dict[str, Dict],
    keep_whitespace: bool = False,
):
    """Set up the tabular reader.

    Arguments
    ---------
    user_config: Dict[str, Dict]
        User configuration, forwarded unchanged to the parent initialiser
    keep_whitespace: bool
        Stored on the instance; when True, ``_whitespace_converter``
        returns an empty converter dictionary. Defaults to False.
    """
    super().__init__(user_config)
    # Consulted by _whitespace_converter when building read converters.
    self.keep_whitespace = keep_whitespace

def _check_set(self, df: pd.DataFrame, config_details: Dict, name: str):

logger.info("Checking set %s", name)
Expand Down Expand Up @@ -86,6 +90,24 @@ def _check_parameter(self, df: pd.DataFrame, expected_headers: List, name: str):

return narrow[all_headers].set_index(expected_headers)

def _whitespace_converter(self, indices: List[str]) -> Dict[str, Any]:
    """Creates converter for stripping whitespace in dataframe

    Arguments
    ---------
    indices: List[str]
        Column headers of dataframe

    Returns
    -------
    Dict[str,Any]
        Converter dictionary mapping every header in ``indices`` to
        ``str.strip``; empty when ``self.keep_whitespace`` is truthy
    """
    # No converters at all means the values are left exactly as read.
    if self.keep_whitespace:
        return {}
    return {header: str.strip for header in indices}


class ReadExcel(_ReadTabular):
"""Read in an Excel spreadsheet in wide format to a dict of Pandas DataFrames"""
Expand Down Expand Up @@ -177,9 +199,13 @@ def read(
logger.info("Looking for %s", parameter)

entity_type = details["type"]
try:
converter = self._whitespace_converter(details["indices"])
except KeyError: # sets don't have indices def
converter = self._whitespace_converter(["VALUE"])

if entity_type == "param":
df = self._get_input_data(filepath, parameter, details)
df = self._get_input_data(filepath, parameter, details, converter)
narrow = self._check_parameter(df, details["indices"], parameter)
if not narrow.empty:
narrow_checked = check_datatypes(
Expand All @@ -189,7 +215,7 @@ def read(
narrow_checked = narrow

elif entity_type == "set":
df = self._get_input_data(filepath, parameter, details)
df = self._get_input_data(filepath, parameter, details, converter)
narrow = self._check_set(df, details, parameter)
if not narrow.empty:
narrow_checked = check_set_datatype(
Expand All @@ -214,9 +240,7 @@ def read(

@staticmethod
def _get_input_data(
filepath: str,
parameter: str,
details: Dict,
filepath: str, parameter: str, details: Dict, converter: Optional[Dict] = None
) -> pd.DataFrame:
"""Reads in and checks CSV data format.
Expand All @@ -234,10 +258,10 @@ def _get_input_data(
pd.DataFrame
CSV data as a dataframe
"""
converter = {} if not converter else converter
csv_path = os.path.join(filepath, parameter + ".csv")

try:
df = pd.read_csv(csv_path)
df = pd.read_csv(csv_path, converters=converter)
except pd.errors.EmptyDataError:
logger.error("No data found in file for %s", parameter)
expected_columns = details["indices"]
Expand Down
25 changes: 25 additions & 0 deletions tests/test_read_strategies.py
Original file line number Diff line number Diff line change
Expand Up @@ -1066,3 +1066,28 @@ def test_read_default_values_csv(self, user_config):
actual = reader._check_for_default_values_csv(filepath)
expected = None
assert actual == expected


class TestReadTabular:
    """Tests for methods shared by the csv and excel readers"""

    # Each case: (keep_whitespace flag, index headers, expected converter dict)
    whitespace_cases = [
        (True, ["REGION", "TECHNOLOGY"], {}),
        (
            False,
            ["REGION", "TECHNOLOGY"],
            {"REGION": str.strip, "TECHNOLOGY": str.strip},
        ),
    ]

    @mark.parametrize(
        "keep_whitespace, indices, expected",
        whitespace_cases,
        ids=["create_empty", "create_full"],
    )
    def test_whitespace_converter(
        self, user_config, keep_whitespace, indices, expected
    ):
        """The converter is empty when whitespace is kept, else one strip per header"""
        csv_reader = ReadCsv(
            user_config=user_config, keep_whitespace=keep_whitespace
        )
        assert csv_reader._whitespace_converter(indices) == expected

0 comments on commit 0073c91

Please sign in to comment.