diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2a579681d..dc0fb3d2a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -30,6 +30,7 @@ Removed:
 - World Bank indicator data is now downloaded directly from their API via the function `download_world_bank_indicator`, instead of relying on the `pandas-datareader` package [#1033](https://github.com/CLIMADA-project/climada_python/pull/1033)
 - `Exposures.write_hdf5` pickles geometry data in WKB format, which is faster and more sustainable. [#1051](https://github.com/CLIMADA-project/climada_python/pull/1051)
 - The online documentation has been completely overhauled, now uses PyData theme: [#977](https://github.com/CLIMADA-project/climada_python/pull/977)
+- Add `climada.hazard.xarray` module with helper structures for reading Hazard objects from `xarray` data [#1063](https://github.com/CLIMADA-project/climada_python/pull/1063)
 
 ### Fixed
 
@@ -38,6 +39,8 @@ Removed:
 
 ### Deprecated
 
+- `Hazard.from_xarray_raster_file`. Use `Hazard.from_xarray_raster` and pass the file path as the `data` argument [#1063](https://github.com/CLIMADA-project/climada_python/pull/1063)
+
 ### Removed
 
 - `climada.util.interpolation.round_to_sig_digits` [#1012](https://github.com/CLIMADA-project/climada_python/pull/1012)
diff --git a/climada/hazard/io.py b/climada/hazard/io.py
index e80531b95..6f8bd01b0 100644
--- a/climada/hazard/io.py
+++ b/climada/hazard/io.py
@@ -19,29 +19,29 @@
 Define Hazard IO Methods.
 """
 
-import copy
 import datetime as dt
 import itertools
 import logging
 import pathlib
 import warnings
 from collections.abc import Collection
-from typing import Any, Callable, Dict, Optional, Union
+from typing import Any, Dict, Optional, Union
 
 import h5py
 import numpy as np
 import pandas as pd
 import rasterio
-import sparse as sp
 import xarray as xr
+from deprecation import deprecated
 from scipy import sparse
 
 import climada.util.constants as u_const
 import climada.util.coordinates as u_coord
-import climada.util.dates_times as u_dt
 import climada.util.hdf5_handler as u_hdf5
 from climada.hazard.centroids.centr import Centroids
 
+from .xarray import HazardXarrayReader
+
 LOGGER = logging.getLogger(__name__)
 
 DEF_VAR_EXCEL = {
@@ -86,12 +86,6 @@
 }
 """MATLAB variable names"""
 
-DEF_COORDS = dict(event="time", longitude="longitude", latitude="latitude")
-"""Default coordinates when reading Hazard data from an xarray Dataset"""
-
-DEF_DATA_VARS = ["fraction", "frequency", "event_id", "event_name", "date"]
-"""Default keys for optional Hazard attributes when reading from an xarray Dataset"""
-
 
 # pylint: disable=no-member
 
@@ -281,18 +275,25 @@ def from_raster(
         )
 
     @classmethod
+    @deprecated(
+        6.0,
+        details="Hazard.from_xarray_raster now supports a file path as the 'data' parameter",
+    )
     def from_xarray_raster_file(
         cls, filepath: Union[pathlib.Path, str], *args, **kwargs
     ):
         """Read raster-like data from a file that can be loaded with xarray
 
-        This wraps :py:meth:`~Hazard.from_xarray_raster` by first opening the target file
+        .. deprecated:: 6.0
+           Pass ``filepath`` as the ``data`` argument to :py:meth:`from_xarray_raster`
+           instead.
+
+        This wraps :py:meth:`from_xarray_raster` by first opening the target file
         as xarray dataset and then passing it to that classmethod. Use this wrapper as a
         simple alternative to opening the file yourself. The signature is exactly the
         same, except for the first argument, which is replaced by a file path here.
 
-        Additional (keyword) arguments are passed to
-        :py:meth:`~Hazard.from_xarray_raster`.
+        Additional (keyword) arguments are passed to :py:meth:`from_xarray_raster`.
 
         Parameters
         ----------
@@ -304,29 +305,14 @@ def from_xarray_raster_file(
         -------
         hazard : climada.Hazard
             A hazard object created from the input data
-
-        Examples
-        --------
-
-        >>> hazard = Hazard.from_xarray_raster_file("path/to/file.nc", "", "")
-
-        Notes
-        -----
-
-        If you have specific requirements for opening a data file, prefer opening it
-        yourself and using :py:meth:`~Hazard.from_xarray_raster`, following this pattern:
-
-        >>> open_kwargs = dict(engine="h5netcdf", chunks=dict(x=-1, y="auto"))
-        >>> with xarray.open_dataset("path/to/file.nc", **open_kwargs) as dset:
-        ...     hazard = Hazard.from_xarray_raster(dset, "", "")
         """
-        with xr.open_dataset(filepath, chunks="auto") as dset:
-            return cls.from_xarray_raster(dset, *args, **kwargs)
+        args = (filepath,) + args
+        return cls.from_xarray_raster(*args, **kwargs)
 
     @classmethod
     def from_xarray_raster(
         cls,
-        data: xr.Dataset,
+        data: xr.Dataset | pathlib.Path | str,
         hazard_type: str,
         intensity_unit: str,
         *,
@@ -335,6 +321,7 @@ def from_xarray_raster(
         data_vars: Optional[Dict[str, str]] = None,
         crs: str = u_const.DEF_CRS,
         rechunk: bool = False,
+        open_dataset_kws: dict[str, Any] | None = None,
     ):
         """Read raster-like data from an xarray Dataset
 
@@ -358,26 +345,23 @@ def from_xarray_raster(
         meaning that the object can be used in all CLIMADA operations without throwing
         an error due to missing data or faulty data types.
 
-        Use :py:meth:`~Hazard.from_xarray_raster_file` to open a file on disk
-        and load the resulting dataset with this method in one step.
-
         Parameters
         ----------
-        data : xarray.Dataset
-            The dataset to read from.
+        data : xarray.Dataset or Path or str
+            The file path to read the data from, or an already opened dataset.
         hazard_type : str
             The type identifier of the hazard. Will be stored directly in the hazard
             object.
         intensity_unit : str
             The physical units of the intensity.
         intensity : str, optional
-            Identifier of the `xarray.DataArray` containing the hazard intensity data.
+            Identifier of the ``xarray.DataArray`` containing the hazard intensity data.
         coordinate_vars : dict(str, str), optional
             Mapping from default coordinate names to coordinate names used in the data
             to read. The default is
-            ``dict(event="time", longitude="longitude", latitude="latitude")``, as most
-            of the commonly used hazard data happens to have a "time" attribute but no
-            "event" attribute.
+            ``{"event": "time", "longitude": "longitude", "latitude": "latitude"}``, as
+            most of the commonly used hazard data happens to have a "time" attribute but
+            no "event" attribute.
         data_vars : dict(str, str), optional
             Mapping from default variable names to variable names used in the data
             to read. The default names are ``fraction``, ``hazard_type``, ``frequency``,
@@ -392,7 +376,7 @@ def from_xarray_raster(
             * ``date``: The ``event`` coordinate interpreted as date or ordinal, or
               ones if that fails (which will issue a warning).
             * ``fraction``: ``None``, which results in a value of 1.0 everywhere, see
-              :py:meth:`Hazard.__init__` for details.
+              :py:meth:`~climada.hazard.base.Hazard.__init__` for details.
             * ``hazard_type``: Empty string
             * ``frequency``: 1.0 for every event
             * ``event_name``: String representation of the event date or empty strings
@@ -401,8 +385,9 @@ def from_xarray_raster(
         crs : str, optional
             Identifier for the coordinate reference system of the coordinates. Defaults
-            to ``EPSG:4326`` (WGS 84), defined by ``climada.util.constants.DEF_CRS``.
-            See https://pyproj4.github.io/pyproj/dev/api/crs/crs.html#pyproj.crs.CRS.from_user_input
+            to ``"EPSG:4326"`` (WGS 84), defined by
+            :py:const:`climada.util.coordinates.DEF_CRS`. See
+            https://pyproj4.github.io/pyproj/dev/api/crs/crs.html#pyproj.crs.CRS.from_user_input
             for further information on how to specify the coordinate system.
         rechunk : bool, optional
             Rechunk the dataset before flattening. This might have serious performance
@@ -412,16 +397,19 @@ def from_xarray_raster(
             be forced by rechunking the data. Ideally, you would select the chunks in
             that manner when opening the dataset before passing it to this function.
             Defaults to ``False``.
+        open_dataset_kws : dict(str, any), optional
+            Keyword arguments passed to ``xarray.open_dataset`` if ``data`` is a file
+            path. Ignored otherwise.
 
         Returns
         -------
-        hazard : climada.Hazard
+        Hazard
             A hazard object created from the input data
 
         See Also
         --------
-        :py:meth:`~Hazard.from_xarray_raster_file`
-            Use this method if you want CLIMADA to open and read a file on disk for you.
+        :py:class:`climada.hazard.xarray.HazardXarrayReader`
+            The helper class used to read the data.
 
         Notes
        -----
@@ -438,8 +426,9 @@ def from_xarray_raster(
           ``data_vars`` parameter documentation above.
         * To avoid confusion in the call signature, several parameters are keyword-only
           arguments.
-        * The attributes ``Hazard.haz_type`` and ``Hazard.unit`` currently cannot be
-          read from the Dataset. Use the method parameters to set these attributes.
+        * The attributes :py:attr:`~climada.hazard.base.Hazard.haz_type` and
+          :py:attr:`~climada.hazard.base.Hazard.units` currently cannot be read from the
+          Dataset. Use the method parameters to set these attributes.
         * This method does not read coordinate system metadata. Use the ``crs``
           parameter to set a custom coordinate system identifier.
@@ -463,6 +452,21 @@ def from_xarray_raster(
         ...     )
         >>> hazard = Hazard.from_xarray_raster(dset, "", "")
 
+        Data can also be read from a file.
+
+        >>> dset.to_netcdf("path/to/file.nc")
+        >>> hazard = Hazard.from_xarray_raster("path/to/file.nc", "", "")
+
+        If you have specific requirements for opening a data file, you can pass
+        ``open_dataset_kws``.
+
+        >>> hazard = Hazard.from_xarray_raster(
+        ...     "path/to/file.nc",
+        ...     "",
+        ...     "",
+        ...     open_dataset_kws={"chunks": {"x": -1, "y": "auto"}, "engine": "netcdf4"}
+        ... )
+
         For non-default coordinate names, use the ``coordinate_vars`` argument.
 
         >>> dset = xr.Dataset(
@@ -574,338 +578,39 @@ def from_xarray_raster(
         >>> dset = dset.expand_dims(time=[numpy.datetime64("2000-01-01")])
         >>> hazard = Hazard.from_xarray_raster(dset, "", "")
         """
-        # Check data type for better error message
-        if not isinstance(data, xr.Dataset):
-            if isinstance(data, (pathlib.Path, str)):
-                raise TypeError(
-                    "Passing a path to this classmethod is not supported. "
-                    "Use Hazard.from_xarray_raster_file instead."
-                )
-
-            raise TypeError("This method only supports xarray.Dataset as input data")
-
-        # Initialize Hazard object
-        hazard_kwargs = dict(haz_type=hazard_type, units=intensity_unit)
-
-        # Update coordinate identifiers
-        coords = copy.deepcopy(DEF_COORDS)
-        coordinate_vars = coordinate_vars if coordinate_vars is not None else {}
-        unknown_coords = [co for co in coordinate_vars if co not in coords]
-        if unknown_coords:
-            raise ValueError(
-                f"Unknown coordinates passed: '{unknown_coords}'. Supported "
-                f"coordinates are {list(coords.keys())}."
-            )
-        coords.update(coordinate_vars)
-
-        # Retrieve dimensions of coordinates
-        try:
-            dims = dict(
-                event=data[coords["event"]].dims,
-                longitude=data[coords["longitude"]].dims,
-                latitude=data[coords["latitude"]].dims,
-            )
-        # Handle KeyError for better error message
-        except KeyError as err:
-            key = err.args[0]
-            raise RuntimeError(
-                f"Dataset is missing dimension/coordinate: {key}. Dataset dimensions: "
-                f"{list(data.dims.keys())}"
-            ) from err
-
-        # Try promoting single-value coordinates to dimensions
-        for key, val in dims.items():
-            if not val:
-                coord = coords[key]
-                LOGGER.debug("Promoting Dataset coordinate '%s' to dimension", coord)
-                data = data.expand_dims(coord)
-                dims[key] = data[coord].dims
-
-        # Try to rechunk the data to optimize the stack operation afterwards.
-        if rechunk:
-            # We want one event to be contained in one chunk
-            chunks = {dim: -1 for dim in dims["longitude"]}
-            chunks.update({dim: -1 for dim in dims["latitude"]})
-
-            # Chunks can be auto-sized along the event dimensions
-            chunks.update({dim: "auto" for dim in dims["event"]})
-            data = data.chunk(chunks=chunks)
-
-        # Stack (vectorize) the entire dataset into 2D (time, lat/lon)
-        # NOTE: We want the set union of the dimensions, but Python 'set' does not
-        #       preserve order. However, we want longitude to run faster than latitude.
-        #       So we use 'dict' without values, as 'dict' preserves insertion order
-        #       (dict keys behave like a set).
-        data = data.stack(
-            event=dims["event"],
-            lat_lon=dict.fromkeys(dims["latitude"] + dims["longitude"]),
-        )
-
-        # Transform coordinates into centroids
-        centroids = Centroids(
-            lat=data[coords["latitude"]].values,
-            lon=data[coords["longitude"]].values,
-            crs=crs,
-        )
-
-        def to_csr_matrix(array: xr.DataArray) -> sparse.csr_matrix:
-            """Store a numpy array as sparse matrix, optimizing storage space
-
-            The CSR matrix stores NaNs explicitly, so we set them to zero.
-            """
-            array = array.where(array.notnull(), 0)
-            array = xr.apply_ufunc(
-                sp.COO.from_numpy,
-                array,
-                dask="parallelized",
-                output_dtypes=[array.dtype],
-            )
-            sparse_coo = array.compute().data  # Load into memory
-            return sparse_coo.tocsr()  # Convert sparse.COO to scipy.sparse.csr_matrix
-
-        # Read the intensity data
-        LOGGER.debug("Loading Hazard intensity from DataArray '%s'", intensity)
-        intensity_matrix = to_csr_matrix(data[intensity])
-
-        # Define accessors for xarray DataArrays
-        def default_accessor(array: xr.DataArray) -> np.ndarray:
-            """Take a DataArray and return its numpy representation"""
-            return array.values
-
-        def strict_positive_int_accessor(array: xr.DataArray) -> np.ndarray:
-            """Take a positive int DataArray and return its numpy representation
-
-            Raises
-            ------
-            TypeError
-                If the underlying data type is not integer
-            ValueError
-                If any value is zero or less
-            """
-            if not np.issubdtype(array.dtype, np.integer):
-                raise TypeError(f"'{array.name}' data array must be integers")
-            if not (array > 0).all():
-                raise ValueError(f"'{array.name}' data must be larger than zero")
-            return array.values
-
-        def date_to_ordinal_accessor(
-            array: xr.DataArray, strict: bool = True
-        ) -> np.ndarray:
-            """Take a DataArray and transform it into ordinals"""
-            try:
-                if np.issubdtype(array.dtype, np.integer):
-                    # Assume that data is ordinals
-                    return strict_positive_int_accessor(array)
-
-                # Try transforming to ordinals
-                return np.array(u_dt.datetime64_to_ordinal(array.values))
-
-            # Handle access errors
-            except (ValueError, TypeError, AttributeError) as err:
-                if strict:
-                    raise err
-
-                LOGGER.warning(
-                    "Failed to read values of '%s' as dates or ordinals. Hazard.date "
-                    "will be ones only",
-                    array.name,
-                )
-                return np.ones(array.shape)
-
-        def year_month_day_accessor(
-            array: xr.DataArray, strict: bool = True
-        ) -> np.ndarray:
-            """Take an array and return am array of YYYY-MM-DD strings"""
-            try:
-                return array.dt.strftime("%Y-%m-%d").values
-
-            # Handle access errors
-            except (ValueError, TypeError, AttributeError) as err:
-                if strict:
-                    raise err
-
-                LOGGER.warning(
-                    "Failed to read values of '%s' as dates. Hazard.event_name will be "
-                    "empty strings",
-                    array.name,
-                )
-                return np.full(array.shape, "")
-
-        def maybe_repeat(values: np.ndarray, times: int) -> np.ndarray:
-            """Return the array or repeat a single-valued array
-
-            If ``values`` has size 1, return an array that repeats this value ``times``
-            times. If the size is different, just return the array.
-            """
-            if values.size == 1:
-                return np.array(list(itertools.repeat(values.flat[0], times)))
-
-            return values
-
-        # Create a DataFrame storing access information for each of data_vars
-        # NOTE: Each row will be passed as arguments to
-        #       `load_from_xarray_or_return_default`, see its docstring for further
-        #       explanation of the DataFrame columns / keywords.
-        num_events = data.sizes["event"]
-        data_ident = pd.DataFrame(
-            data=dict(
-                # The attribute of the Hazard class where the data will be stored
-                hazard_attr=DEF_DATA_VARS,
-                # The identifier and default key used in this method
-                default_key=DEF_DATA_VARS,
-                # The key assigned by the user
-                user_key=None,
-                # The default value for each attribute
-                default_value=[
-                    None,
-                    np.ones(num_events),
-                    np.array(range(num_events), dtype=int) + 1,
-                    list(
-                        year_month_day_accessor(
-                            data[coords["event"]], strict=False
-                        ).flat
-                    ),
-                    date_to_ordinal_accessor(data[coords["event"]], strict=False),
-                ],
-                # The accessor for the data in the Dataset
-                accessor=[
-                    to_csr_matrix,
-                    lambda x: maybe_repeat(default_accessor(x), num_events),
-                    strict_positive_int_accessor,
-                    lambda x: list(maybe_repeat(default_accessor(x), num_events).flat),
-                    lambda x: maybe_repeat(date_to_ordinal_accessor(x), num_events),
-                ],
+        reader_kwargs = {
+            "intensity": intensity,
+            "coordinate_vars": coordinate_vars,
+            "data_vars": data_vars,
+            "crs": crs,
+            "rechunk": rechunk,
+        }
+        if isinstance(data, xr.Dataset):
+            reader = HazardXarrayReader(data=data, **reader_kwargs)
+        elif isinstance(data, xr.DataArray):
+            raise TypeError(
+                "This method only supports passing xr.Dataset. Consider promoting "
+                "your xr.DataArray to a Dataset."
             )
-        )
-
-        # Check for unexpected keys
-        data_vars = data_vars if data_vars is not None else {}
-        default_keys = data_ident["default_key"]
-        unknown_keys = [
-            key for key in data_vars.keys() if not default_keys.str.contains(key).any()
-        ]
-        if unknown_keys:
-            raise ValueError(
-                f"Unknown data variables passed: '{unknown_keys}'. Supported "
-                f"data variables are {list(default_keys)}."
-            )
-
-        # Update with keys provided by the user
-        # NOTE: Keys in 'default_keys' missing from 'data_vars' will be set to 'None'
-        #       (which is exactly what we want) and the result is written into
-        #       'user_key'. 'default_keys' is not modified.
-        data_ident["user_key"] = default_keys.map(data_vars)
-
-        def load_from_xarray_or_return_default(
-            user_key: Optional[str],
-            default_key: str,
-            hazard_attr: str,
-            accessor: Callable[[xr.DataArray], Any],
-            default_value: Any,
-        ) -> Any:
-            """Load data for a single Hazard attribute or return the default value
-
-            Does the following based on the ``user_key``:
-            * If the key is an empty string, return the default value
-            * If the key is a non-empty string, load the data for that key and return it.
-            * If the key is ``None``, look for the ``default_key`` in the data. If it
-              exists, return that data. If not, return the default value.
-
-            Parameters
-            ----------
-            user_key : str or None
-                The key set by the user to identify the DataArray to read data from.
-            default_key : str
-                The default key identifying the DataArray to read data from.
-            hazard_attr : str
-                The name of the attribute of ``Hazard`` where the data will be stored in.
-            accessor : Callable
-                A callable that takes the DataArray as argument and returns the data
-                structure that is required by the ``Hazard`` attribute.
-            default_value
-                The default value/array to return in case the data could not be found.
-
-            Returns
-            -------
-            The object that will be stored in the ``Hazard`` attribute ``hazard_attr``.
-
-            Raises
-            ------
-            KeyError
-                If ``user_key`` was a non-empty string but no such key was found in the
-                data
-            RuntimeError
-                If the data structure loaded has a different shape than the default data
-                structure
-            """
-            # User does not want to read data
-            if user_key == "":
-                LOGGER.debug(
-                    "Using default values for Hazard.%s per user request", hazard_attr
-                )
-                return default_value
-
-            if not pd.isna(user_key):
-                # Read key exclusively
-                LOGGER.debug(
-                    "Reading data for Hazard.%s from DataArray '%s'",
-                    hazard_attr,
-                    user_key,
-                )
-                val = accessor(data[user_key])
-            else:
-                # Try default key
-                try:
-                    val = accessor(data[default_key])
-                    LOGGER.debug(
-                        "Reading data for Hazard.%s from DataArray '%s'",
-                        hazard_attr,
-                        default_key,
-                    )
-                except KeyError:
-                    LOGGER.debug(
-                        "Using default values for Hazard.%s. No data found", hazard_attr
-                    )
-                    return default_value
-
-            def vshape(array):
-                """Return a shape tuple for any array-like type we use"""
-                if isinstance(array, list):
-                    return len(array)
-                if isinstance(array, sparse.csr_matrix):
-                    return array.get_shape()
-                return array.shape
-
-            # Check size for read data
-            if default_value is not None and not np.array_equal(
-                vshape(val), vshape(default_value)
-            ):
-                raise RuntimeError(
-                    f"'{user_key if user_key else default_key}' must have shape "
-                    f"{vshape(default_value)}, but shape is {vshape(val)}"
-                )
-
-            # Return the data
-            return val
-
-        # Set the Hazard attributes
-        for _, ident in data_ident.iterrows():
-            hazard_kwargs[ident["hazard_attr"]] = load_from_xarray_or_return_default(
-                **ident
+        else:  # data is pathlike
+            reader = HazardXarrayReader.from_file(
+                filename=data, open_dataset_kws=open_dataset_kws, **reader_kwargs
             )
-        hazard_kwargs = cls._check_and_cast_attrs(hazard_kwargs)
-
-        # Done!
-        LOGGER.debug("Hazard successfully loaded. Number of events: %i", num_events)
-        return cls(centroids=centroids, intensity=intensity_matrix, **hazard_kwargs)
+        kwargs = reader.get_hazard_kwargs() | {
+            "haz_type": hazard_type,
+            "units": intensity_unit,
+        }
+        return cls(**cls._check_and_cast_attrs(kwargs))
 
     @staticmethod
     def _check_and_cast_attrs(attrs: Dict[str, Any]) -> Dict[str, Any]:
-        """Check the validity of the hazard attributes given and cast to correct type if required and possible.
+        """Check the validity of the hazard attributes given and cast to correct type if required
+        and possible.
 
         The current purpose is to check that event_name is a list of string
-        (and convert to string otherwise), although other checks and casting could be included here in the future.
+        (and convert to string otherwise), although other checks and casting could be included here
+        in the future.
 
         Parameters
         ----------
@@ -917,7 +622,8 @@ def _check_and_cast_attrs(attrs: Dict[str, Any]) -> Dict[str, Any]:
         -------
 
         attrs : dict
-            Attributes checked for type validity and casted otherwise (only event_name at the moment).
+            Attributes checked for type validity and cast otherwise (only event_name at the
+            moment).
 
         Warns
         -----
@@ -961,8 +667,8 @@ def _check_and_cast_container(
         def _check_and_cast_elements(
             attr_value: Any, expected_dtype: Union[Any, None]
         ) -> Any:
-            """Check if the elements of the container are of the expected dtype and cast if necessary,
-            while preserving the original container type.
+            """Check if the elements of the container are of the expected dtype and cast if
+            necessary, while preserving the original container type.
 
             Parameters
             ----------
@@ -975,7 +681,8 @@ def _check_and_cast_elements(
             Returns
             -------
             attr_value : any
-                The value with elements cast to the expected type, preserving the original container type.
+                The value with elements cast to the expected type, preserving the original
+                container type.
             """
             if expected_dtype is None:
                 # No dtype enforcement required
diff --git a/climada/hazard/test/test_io.py b/climada/hazard/test/test_io.py
index 63e35291f..4d93e0c11 100644
--- a/climada/hazard/test/test_io.py
+++ b/climada/hazard/test/test_io.py
@@ -19,584 +19,13 @@
 Test Hazard base class.
 """
 
-import datetime as dt
 import unittest
-from pathlib import Path
-from tempfile import TemporaryDirectory
-from unittest.mock import patch
 
 import numpy as np
-import xarray as xr
-from pyproj import CRS
-from scipy.sparse import csr_matrix
 
 from climada.hazard.base import Hazard
 from climada.hazard.test.test_base import DATA_DIR, dummy_hazard
-from climada.util.constants import DEF_CRS, DEF_FREQ_UNIT, HAZ_DEMO_FL, HAZ_TEMPLATE_XLS
-
-
-class TestReadDefaultNetCDF(unittest.TestCase):
-    """Test reading a NetCDF file where the coordinates to read match the dimensions"""
-
-    @classmethod
-    def setUpClass(cls):
-        """Write a simple NetCDF file to read"""
-        cls.tempdir = TemporaryDirectory()
-        cls.netcdf_path = Path(cls.tempdir.name) / "default.nc"
-        cls.intensity = np.array([[[0, 1, 2], [3, 4, 5]], [[6, 7, 8], [9, 10, 11]]])
-        cls.time = np.array([dt.datetime(1999, 1, 1), dt.datetime(2000, 1, 1)])
-        cls.latitude = np.array([0, 1])
-        cls.longitude = np.array([0, 1, 2])
-        dset = xr.Dataset(
-            {
-                "intensity": (["time", "latitude", "longitude"], cls.intensity),
-            },
-            dict(time=cls.time, latitude=cls.latitude, longitude=cls.longitude),
-        )
-        dset.to_netcdf(cls.netcdf_path)
-
-    @classmethod
-    def tearDownClass(cls):
-        """Delete the NetCDF file"""
-        cls.tempdir.cleanup()
-
-    def _assert_default(self, hazard):
-        """Assertions for the default hazard to be loaded"""
-        self._assert_default_types(hazard)
-        self._assert_default_values(hazard)
-
-    def _assert_default_values(self, hazard):
-        """Check the values of the default hazard to be loaded"""
-        # Hazard data
-        self.assertEqual(hazard.haz_type, "")
-        self.assertEqual(hazard.units, "")
-        np.testing.assert_array_equal(hazard.event_id, [1, 2])
-        np.testing.assert_array_equal(
-            hazard.event_name, [x.strftime("%Y-%m-%d") for x in self.time]
-        )
-        np.testing.assert_array_equal(
-            hazard.date, [val.toordinal() for val in self.time]
-        )
-        np.testing.assert_array_equal(hazard.frequency, np.ones(hazard.event_id.size))
-
-        # Centroids
-        np.testing.assert_array_equal(hazard.centroids.lat, [0, 0, 0, 1, 1, 1])
-        np.testing.assert_array_equal(hazard.centroids.lon, [0, 1, 2, 0, 1, 2])
-        self.assertEqual(hazard.centroids.geometry.crs, DEF_CRS)
-
-        # Intensity data
-        np.testing.assert_array_equal(
-            hazard.intensity.toarray(), [[0, 1, 2, 3, 4, 5], [6, 7, 8, 9, 10, 11]]
-        )
-
-        # Fraction default
-        self.assertEqual(hazard.fraction.nnz, 0)
-        np.testing.assert_array_equal(hazard.fraction.shape, hazard.intensity.shape)
-        np.testing.assert_array_equal(
-            hazard.fraction.toarray(), np.zeros_like(hazard.intensity.toarray())
-        )
-
-    def _assert_default_types(self, hazard):
-        """Check types of all hazard attributes"""
-        self.assertIsInstance(hazard.units, str)
-        self.assertIsInstance(hazard.haz_type, str)
-        self.assertIsInstance(hazard.event_id, np.ndarray)
-        self.assertIsInstance(hazard.event_name, list)
-        self.assertIsInstance(hazard.frequency, np.ndarray)
-        self.assertIsInstance(hazard.intensity, csr_matrix)
-        self.assertIsInstance(hazard.fraction, csr_matrix)
-        self.assertIsInstance(hazard.date, np.ndarray)
-
-    def test_load_path(self):
-        """Load the data with path as argument"""
-        hazard = Hazard.from_xarray_raster_file(self.netcdf_path, "", "")
-        self._assert_default(hazard)
-
-        # Check wrong paths
-        with self.assertRaises(FileNotFoundError) as cm:
-            Hazard.from_xarray_raster_file("file-does-not-exist.nc", "", "")
-        self.assertIn("file-does-not-exist.nc", str(cm.exception))
-        with self.assertRaises(KeyError) as cm:
-            Hazard.from_xarray_raster_file(
-                self.netcdf_path, "", "", intensity="wrong-intensity-path"
-            )
-        self.assertIn("wrong-intensity-path", str(cm.exception))
-
-    def test_load_dataset(self):
-        """Load the data from an opened dataset as argument"""
-
-        def _load_and_assert(chunks):
-            with xr.open_dataset(self.netcdf_path, chunks=chunks) as dataset:
-                hazard = Hazard.from_xarray_raster(dataset, "", "")
-                self._assert_default(hazard)
-
-        _load_and_assert(chunks=None)
-        _load_and_assert(chunks=dict(latitude=1, longitude=1, time=1))
-
-    def test_type_error(self):
-        """Calling 'from_xarray_raster' with wrong data type should throw"""
-        # Passing a path
-        with self.assertRaises(TypeError) as cm:
-            Hazard.from_xarray_raster(self.netcdf_path, "", "")
-        self.assertIn(
-            "Use Hazard.from_xarray_raster_file instead",
-            str(cm.exception),
-        )
-
-        # Passing a DataArray
-        with xr.open_dataset(self.netcdf_path) as dset, self.assertRaises(
-            TypeError
-        ) as cm:
-            Hazard.from_xarray_raster(dset["intensity"], "", "")
-        self.assertIn(
-            "This method only supports xarray.Dataset as input data",
-            str(cm.exception),
-        )
-
-    def test_type_and_unit(self):
-        """Test passing a custom type and unit"""
-        hazard = Hazard.from_xarray_raster_file(
-            self.netcdf_path, hazard_type="TC", intensity_unit="m/s"
-        )
-        self._assert_default_types(hazard)
-        self.assertEqual(hazard.haz_type, "TC")
-        self.assertEqual(hazard.units, "m/s")
-
-    def test_event_no_time(self):
-        """Test if an event coordinate that is not a time works"""
-        with xr.open_dataset(self.netcdf_path) as dataset:
-            size = dataset.sizes["time"]
-
-            # Positive integers (interpreted as ordinals)
-            time = [2, 1]
-            dataset["time"] = time
-            hazard = Hazard.from_xarray_raster(dataset, "", "")
-            self._assert_default_types(hazard)
-            np.testing.assert_array_equal(
-                hazard.intensity.toarray(), [[0, 1, 2, 3, 4, 5], [6, 7, 8, 9, 10, 11]]
-            )
-            np.testing.assert_array_equal(hazard.date, time)
-            np.testing.assert_array_equal(hazard.event_name, np.full(size, ""))
-
-            # Strings
-            dataset["time"] = ["a", "b"]
-            with self.assertLogs("climada.hazard.io", "WARNING") as cm:
-                hazard = Hazard.from_xarray_raster(dataset, "", "")
-            np.testing.assert_array_equal(hazard.date, np.ones(size))
-            np.testing.assert_array_equal(hazard.event_name, np.full(size, ""))
-            self.assertIn("Failed to read values of 'time' as dates.", cm.output[0])
-            self.assertIn(
-                "Failed to read values of 'time' as dates or ordinals.", cm.output[1]
-            )
-
-    def test_data_vars(self):
-        """Check handling of data variables"""
-        with xr.open_dataset(self.netcdf_path) as dataset:
-            size = dataset.sizes["time"]
-
-            # Set optionals in the dataset
-            frequency = np.ones(size) * 1.5
-            event_id = np.array(range(size), dtype=np.int64) + 3
-            event_name = ["bla"] * size
-            date = np.array(range(size)) + 100
-            dataset["event_id"] = event_id
-            dataset["event_name"] = event_name
-            dataset["date"] = date
-
-            # Assign a proper coordinate for a change
-            dataset = dataset.assign_coords(dict(frequency=("time", frequency)))
-
-            # Assign fraction
-            frac = xr.DataArray(
-                np.array([[[0, 0, 0], [0, 0, 0]], [[1, 1, 1], [1, 1, 1]]]),
-                dims=["time", "latitude", "longitude"],
-                coords=dict(
-                    time=self.time, latitude=self.latitude, longitude=self.longitude
-                ),
-            )
-            dataset["fraction"] = frac
-
-            # Optionals should be read automatically
-            hazard = Hazard.from_xarray_raster(dataset, "", "")
-            self._assert_default_types(hazard)
-            np.testing.assert_array_equal(hazard.frequency, frequency)
-            np.testing.assert_array_equal(hazard.event_id, event_id)
-            np.testing.assert_array_equal(hazard.event_name, event_name)
-            np.testing.assert_array_equal(hazard.date, date)
-            np.testing.assert_array_equal(
-                hazard.fraction.toarray(), [[0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1]]
-            )
-
-            # Ignore keys (should be default values)
-            hazard = Hazard.from_xarray_raster(
-                dataset,
-                "",
-                "",
-                data_vars=dict(
-                    frequency="", event_id="", event_name="", date="", fraction=""
-                ),
-            )
-            self._assert_default(hazard)
-
-            # Wrong key
-            with self.assertRaises(ValueError) as cm:
-                Hazard.from_xarray_raster(
-                    dataset, "", "", data_vars=dict(wrong_key="stuff")
-                )
-            self.assertIn(
-                "Unknown data variables passed: '['wrong_key']'.", str(cm.exception)
-            )
-
-            # Non-existent identifier
-            with self.assertRaises(KeyError) as cm:
-                Hazard.from_xarray_raster(
-                    dataset, "", "", data_vars=dict(frequency="freqqqqq")
-                )
-            self.assertIn("freqqqqq", str(cm.exception))
-
-            # Wrong data length
-            # NOTE: This also implicitly checks that 'frequency' is not read!
-            dataset["freq"] = np.array(range(size + 1), dtype=np.float64)
-            with self.assertRaises(RuntimeError) as cm:
-                Hazard.from_xarray_raster(
-                    dataset, "", "", data_vars=dict(frequency="freq")
-                )
-            self.assertIn(
-                f"'freq' must have shape ({size},), but shape is ({size + 1},)",
-                str(cm.exception),
-            )
-
-            # Integer data assertions
-            dset = dataset.copy(deep=True)
-            dset["event_id"] = np.array(range(size), dtype=np.float64) + 3.5
-            with self.assertRaises(TypeError) as cm:
-                Hazard.from_xarray_raster(dset, "", "")
-            self.assertIn("'event_id' data array must be integers", str(cm.exception))
-            dset["event_id"] = np.linspace(0, 10, size, dtype=np.int64)
-            with self.assertRaises(ValueError) as cm:
-                Hazard.from_xarray_raster(dset, "", "")
-            self.assertIn("'event_id' data must be larger than zero", str(cm.exception))
-
-            # Date as datetime
-            date_str = [f"2000-01-{i:02}" for i in range(1, size + 1)]
-            dataset["date"] = date_str
-            hazard = Hazard.from_xarray_raster(dataset, "", "")
-            np.testing.assert_array_equal(
-                hazard.date,
-                [dt.datetime(2000, 1, i).toordinal() for i in range(1, size + 1)],
-            )
-
-    def test_data_vars_repeat(self):
-        """Test if suitable data vars are repeated as expected"""
-        with xr.open_dataset(self.netcdf_path) as dataset:
-            size = dataset.sizes["time"]
-
-            # Set optionals in the dataset
-            frequency = [1.5]
-            event_name = ["bla"]
-            date = 1
-            dataset["event_name"] = event_name
-            dataset["date"] = date
-            dataset["frequency"] = frequency
-
-            # Check if single-valued arrays are repeated
-            hazard = Hazard.from_xarray_raster(dataset, "", "")
-
-            np.testing.assert_array_equal(hazard.date, [date] * size)
-            np.testing.assert_array_equal(hazard.event_name, event_name * size)
-            np.testing.assert_array_equal(hazard.frequency, frequency * size)
-
-    def test_nan(self):
-        """Check handling of NaNs in original data"""
-        with xr.open_dataset(self.netcdf_path) as dataset:
-            intensity = xr.DataArray(
-                np.array([[[0, np.nan, 2], [3, 4, 5]], [[6, np.nan, 8], [9, 10, 11]]]),
-                dims=["time", "latitude", "longitude"],
-                coords=dict(
-                    time=self.time, latitude=self.latitude, longitude=self.longitude
-                ),
-            )
-            dataset["intensity"] = intensity
-            fraction = xr.DataArray(
-                np.array([[[0, 0, 0], [0, 0, 0]], [[1, np.nan, 1], [np.nan, 1, 1]]]),
-                dims=["time", "latitude", "longitude"],
-                coords=dict(
-                    time=self.time, latitude=self.latitude, longitude=self.longitude
-                ),
-            )
-            dataset["fraction"] = fraction
-            frequency = np.ones(dataset.sizes["time"])
-            frequency[0] = np.nan
-            dataset["frequency"] = frequency
-
-            # Load hazard
-            hazard = Hazard.from_xarray_raster(dataset, "", "")
-
-        # Check types
-        self._assert_default_types(hazard)
-
-        # NaNs are set to zero in sparse data
-        np.testing.assert_array_equal(
-            hazard.intensity.toarray(),
-            [[0, 0, 2, 3, 4, 5], [6, 0, 8, 9, 10, 11]],
-        )
-        np.testing.assert_array_equal(
-            hazard.fraction.toarray(),
-            [[0, 0, 0, 0, 0, 0], [1, 0, 1, 0, 1, 1]],
-        )
-
-        # NaNs are propagated in dense data
-        np.testing.assert_array_equal(hazard.frequency, frequency)
-
-    def test_crs(self):
-        """Check if different CRS inputs are handled correctly"""
-
-        def test_crs_from_input(crs_input):
-            crs = CRS.from_user_input(crs_input)
-            hazard = Hazard.from_xarray_raster_file(
-                self.netcdf_path, "", "", crs=crs_input
-            )
-            self.assertEqual(hazard.centroids.geometry.crs, crs)
-
-        test_crs_from_input("EPSG:3857")
-        test_crs_from_input(3857)
-        test_crs_from_input("+proj=cea +lat_0=52.112866 +lon_0=5.150162 +units=m")
-
-    def test_missing_dims(self):
-        """Test if missing coordinates are expanded and correct errors are thrown"""
-        # Drop time as dimension, but not as coordinate!
-        with xr.open_dataset(self.netcdf_path) as ds:
-            ds = ds.isel(time=0).squeeze()
-            hazard = Hazard.from_xarray_raster(ds, "", "")
-            self._assert_default_types(hazard)
-            np.testing.assert_array_equal(
-                hazard.event_name, [self.time[0].strftime("%Y-%m-%d")]
-            )
-            np.testing.assert_array_equal(hazard.date, [self.time[0].toordinal()])
-            np.testing.assert_array_equal(hazard.centroids.lat, [0, 0, 0, 1, 1, 1])
-            np.testing.assert_array_equal(hazard.centroids.lon, [0, 1, 2, 0, 1, 2])
-            self.assertEqual(hazard.centroids.geometry.crs, DEF_CRS)
-            np.testing.assert_array_equal(
-                hazard.intensity.toarray(), [[0, 1, 2, 3, 4, 5]]
-            )
-            self.assertEqual(hazard.fraction.nnz, 0)
-            np.testing.assert_array_equal(
-                hazard.fraction.toarray(), [[0, 0, 0, 0, 0, 0]]
-            )
-
-        # Now drop variable altogether, should raise an error
-        ds = ds.drop_vars("time")
-        with self.assertRaisesRegex(RuntimeError, "time"):
-            Hazard.from_xarray_raster(ds, "", "")
-
-        # Expand time again
-        ds = ds.expand_dims(time=[np.datetime64("2022-01-01")])
-        hazard = Hazard.from_xarray_raster(ds, "", "")
-        self._assert_default_types(hazard)
-        np.testing.assert_array_equal(hazard.event_name, ["2022-01-01"])
-        np.testing.assert_array_equal(
-            hazard.date, [dt.datetime(2022, 1, 1).toordinal()]
-        )
-
-
-class TestReadDimsCoordsNetCDF(unittest.TestCase):
-    """Checks for dimensions and coordinates with different names and shapes"""
-
-    @classmethod
-    def setUpClass(cls):
-        """Write a NetCDF file with many coordinates"""
-        cls.tempdir = TemporaryDirectory()
-        cls.netcdf_path = Path(cls.tempdir.name) / "coords.nc"
-        cls.intensity = np.array([[[0, 1, 2], [3, 4, 5]]])
-        cls.fraction = np.array([[[0, 0, 0], [1, 1, 1]]])
-        cls.time = np.array([dt.datetime(2000, 1, 1)])
-        cls.x = np.array([0, 1, 2])
-        cls.y = np.array([0, 1])
-        cls.lon = np.array([1, 2, 3])
-        cls.lat = np.array([1, 2])
-        cls.years = np.array([dt.datetime(1999, 2, 2)])
-        cls.longitude = np.array([[10, 11, 12], [10, 11, 12]])
-        cls.latitude = np.array([[100, 100, 100], [200, 200, 200]])
-
-        dset = xr.Dataset(
-            {
-                "intensity": (["time", "y", "x"], cls.intensity),
-                "fraction": (["time", "y", "x"], cls.fraction),
-            },
-            {
-                "time": cls.time,
-                "x": cls.x,
-                "y": cls.y,
-                "lon": (["x"], cls.lon),
-                "lat": (["y"], cls.lat),
-                "years": (["time"], cls.years),
-                "latitude": (["y", "x"], cls.latitude),
-                "longitude": (["y", "x"], cls.longitude),
-            },
-        )
-        dset.to_netcdf(cls.netcdf_path)
-
-    @classmethod
-    def tearDownClass(cls):
-        """Delete the NetCDF file"""
-        cls.tempdir.cleanup()
-
-    def _assert_intensity_fraction(self, hazard):
-        """Check if intensity and fraction data are read correctly"""
-        np.testing.assert_array_equal(hazard.intensity.toarray(), [[0, 1, 2, 3, 4, 5]])
-        np.testing.assert_array_equal(hazard.fraction.toarray(), [[0, 0, 0, 1, 1, 1]])
-
-    def test_dimension_naming(self):
-        """Test if dimensions with different names can be read"""
-        hazard = Hazard.from_xarray_raster_file(
-            self.netcdf_path,
-            "",
-            "",
-            coordinate_vars=dict(latitude="y", longitude="x"),  # 'time' stays default
-        )
-        np.testing.assert_array_equal(hazard.centroids.lat, [0, 0, 0, 1, 1, 1])
-        np.testing.assert_array_equal(hazard.centroids.lon, [0, 1, 2, 0, 1, 2])
-        np.testing.assert_array_equal(
-            hazard.date, [val.toordinal() for val in self.time]
-        )
-        self._assert_intensity_fraction(hazard)
-
-    def test_coordinate_naming(self):
-        """Test if coordinates with different names than dimensions can be read"""
-        hazard = Hazard.from_xarray_raster_file(
-            self.netcdf_path,
-            "",
-            "",
-            coordinate_vars=dict(latitude="lat", longitude="lon", event="years"),
-        )
-        np.testing.assert_array_equal(hazard.centroids.lat, [1, 1, 1, 2, 2, 2])
-        np.testing.assert_array_equal(hazard.centroids.lon, [1, 2, 3, 1, 2, 3])
-        np.testing.assert_array_equal(
-            hazard.date, [val.toordinal() for val in self.years]
-        )
-        self._assert_intensity_fraction(hazard)
-
-    def test_2D_coordinates(self):
-        """Test if read method correctly handles 2D coordinates"""
-        hazard = Hazard.from_xarray_raster_file(
-            self.netcdf_path,
-            "",
-            "",
-            coordinate_vars=dict(latitude="latitude", longitude="longitude"),
-        )
-        np.testing.assert_array_equal(
-            hazard.centroids.lat, [100, 100, 100, 200, 200, 200]
-        )
-        np.testing.assert_array_equal(hazard.centroids.lon, [10, 11, 12, 10, 11, 12])
-        self._assert_intensity_fraction(hazard)
-
-    def test_load_dataset_rechunk(self):
-        """Load the data from an opened dataset and force rechunking"""
-        with xr.open_dataset(self.netcdf_path) as dataset:
-            hazard = Hazard.from_xarray_raster(
-                dataset,
-                "",
-                "",
-                coordinate_vars=dict(latitude="latitude", longitude="longitude"),
-                rechunk=True,
-            )
-
-        np.testing.assert_array_equal(
-            hazard.centroids.lat, [100, 100, 100, 200, 200, 200]
-        )
-        np.testing.assert_array_equal(hazard.centroids.lon, [10, 11, 12, 10, 11, 12])
-        self._assert_intensity_fraction(hazard)
-
-        # Assert that .chunk is called the right way
-        with patch("xarray.Dataset.chunk") as mock:
-            with xr.open_dataset(self.netcdf_path) as dataset:
-                mock.return_value = dataset
-                Hazard.from_xarray_raster(
-                    dataset,
-                    "",
-                    "",
-                    coordinate_vars=dict(latitude="latitude", longitude="longitude"),
-                    rechunk=True,
-                )
-
-            # First latitude dim, then longitude dim, then event dim
-            mock.assert_called_once_with(chunks=dict(y=-1, x=-1, time="auto"))
-
-            # Should not be called by default
-            mock.reset_mock()
-            with xr.open_dataset(self.netcdf_path) as dataset:
-                mock.return_value = dataset
-                Hazard.from_xarray_raster(
-                    dataset,
-                    "",
-                    "",
-                    coordinate_vars=dict(latitude="latitude", longitude="longitude"),
-                )
-
-            mock.assert_not_called()
-
-    def test_2D_time(self):
-        """Test if stacking multiple time dimensions works out"""
-        time = np.array(
-            [
-                [dt.datetime(1999, 1, 1), dt.datetime(1999, 2, 1)],
-                [dt.datetime(2000, 1, 1), dt.datetime(2000, 2, 1)],
-            ]
-        )
-        ds = xr.Dataset(
-            {
-                "intensity": (
-                    ["year", "month", "latitude", "longitude"],
-                    [[[[1]], [[2]]], [[[3]], [[4]]]],
-                ),
-                "fraction": (
-                    ["year", "month", "latitude", "longitude"],
-                    [[[[10]], [[20]]], [[[30]], [[40]]]],
-                ),
-            },
-            {
-                "latitude": [1],
-                "longitude": [2],
-                "year": [1999, 2000],
-                "month": [1, 2],
-                "time": (["year", "month"], time),
-            },
-        )
-        hazard = Hazard.from_xarray_raster(ds, "", "")
-
-        np.testing.assert_array_equal(hazard.intensity.toarray(), [[1], [2], [3], [4]])
-        np.testing.assert_array_equal(
-            hazard.fraction.toarray(), [[10], [20], [30], [40]]
-        )
-        np.testing.assert_array_equal(hazard.centroids.lat, [1])
-        np.testing.assert_array_equal(hazard.centroids.lon, [2])
-        np.testing.assert_array_equal(
-            hazard.date, [val.toordinal() for val in time.flat]
-        )
-        np.testing.assert_array_equal(
-            hazard.event_name, ["1999-01-01", "1999-02-01", "2000-01-01", "2000-02-01"]
-        )
-
-    def test_errors(self):
-        """Check if expected errors are thrown"""
-        # Wrong coordinate key
-        with self.assertRaises(ValueError) as cm:
-            Hazard.from_xarray_raster_file(
-                self.netcdf_path,
-                "",
-                "",
-                coordinate_vars=dict(bar="latitude", longitude="baz"),
-            )
-        self.assertIn("Unknown coordinates passed: '['bar']'.", str(cm.exception))
-
-        # Correctly specified, but the custom dimension does not exist
-        with self.assertRaisesRegex(RuntimeError, "lalalatitude"):
-            Hazard.from_xarray_raster_file(
-                self.netcdf_path,
-                "",
-                "",
-                coordinate_vars=dict(latitude="lalalatitude"),
-            )
+from climada.util.constants import DEF_FREQ_UNIT, HAZ_TEMPLATE_XLS
 
 
 class TestReaderExcel(unittest.TestCase):
@@ -688,10 +117,6 @@ class CustomID:
 
 # Execute Tests
 if __name__ == "__main__":
-    TESTS = unittest.TestLoader().loadTestsFromTestCase(TestReadDefaultNetCDF)
-    TESTS.addTests(
-        unittest.TestLoader().loadTestsFromTestCase(TestReadDimsCoordsNetCDF)
-    )
-    TESTS.addTests(unittest.TestLoader().loadTestsFromTestCase(TestReaderExcel))
+    TESTS = unittest.TestLoader().loadTestsFromTestCase(TestReaderExcel)
     TESTS.addTests(unittest.TestLoader().loadTestsFromTestCase(TestHDF5))
     unittest.TextTestRunner(verbosity=2).run(TESTS)
diff --git a/climada/hazard/test/test_xarray.py b/climada/hazard/test/test_xarray.py
new file mode 100644
index 000000000..cf481e3c4
--- /dev/null
+++ b/climada/hazard/test/test_xarray.py
@@ -0,0 +1,605 @@
+"""
+This file is part of CLIMADA.
+
+Copyright (C) 2017 ETH Zurich, CLIMADA contributors listed in AUTHORS.
+
+CLIMADA is free software: you can redistribute it and/or modify it under the
+terms of the GNU General Public License as published by the Free
+Software Foundation, version 3.
+
+CLIMADA is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with CLIMADA. If not, see <https://www.gnu.org/licenses/>.
+
+---
+
+Test Hazard base class Xarray reader.
+""" + +import datetime as dt +import unittest +from pathlib import Path +from tempfile import TemporaryDirectory +from unittest.mock import patch + +import numpy as np +import xarray as xr +from pyproj import CRS +from scipy.sparse import csr_matrix + +from climada.hazard.base import Hazard +from climada.util.constants import DEF_CRS + + +class TestReadDefaultNetCDF(unittest.TestCase): + """Test reading a NetCDF file where the coordinates to read match the dimensions""" + + @classmethod + def setUpClass(cls): + """Write a simple NetCDF file to read""" + cls.tempdir = TemporaryDirectory() + cls.netcdf_path = Path(cls.tempdir.name) / "default.nc" + cls.intensity = np.array([[[0, 1, 2], [3, 4, 5]], [[6, 7, 8], [9, 10, np.nan]]]) + cls.time = np.array([dt.datetime(1999, 1, 1), dt.datetime(2000, 1, 1)]) + cls.latitude = np.array([0, 1]) + cls.longitude = np.array([0, 1, 2]) + cls.dset = xr.Dataset( + { + "intensity": (["time", "latitude", "longitude"], cls.intensity), + }, + dict(time=cls.time, latitude=cls.latitude, longitude=cls.longitude), + ) + cls.dset.to_netcdf(cls.netcdf_path) + + @classmethod + def tearDownClass(cls): + """Delete the NetCDF file""" + cls.tempdir.cleanup() + + def _assert_default(self, hazard): + """Assertions for the default hazard to be loaded""" + self._assert_default_types(hazard) + self._assert_default_values(hazard) + + def _assert_default_values(self, hazard): + """Check the values of the default hazard to be loaded""" + # Hazard data + self.assertEqual(hazard.haz_type, "") + self.assertEqual(hazard.units, "") + np.testing.assert_array_equal(hazard.event_id, [1, 2]) + np.testing.assert_array_equal( + hazard.event_name, [x.strftime("%Y-%m-%d") for x in self.time] + ) + np.testing.assert_array_equal( + hazard.date, [val.toordinal() for val in self.time] + ) + np.testing.assert_array_equal(hazard.frequency, np.ones(hazard.event_id.size)) + + # Centroids + np.testing.assert_array_equal(hazard.centroids.lat, [0, 0, 0, 1, 1, 1]) + np.testing.assert_array_equal(hazard.centroids.lon, [0, 1, 2, 0, 1, 2]) + self.assertEqual(hazard.centroids.geometry.crs, DEF_CRS) + + # Intensity data + # NOTE: NaN converted to zero + self.assertEqual(hazard.intensity.nnz, 10) + np.testing.assert_array_equal( + hazard.intensity.toarray(), [[0, 1, 2, 3, 4, 5], [6, 7, 8, 9, 10, 0]] + ) + + # Fraction default + self.assertEqual(hazard.fraction.nnz, 0) + np.testing.assert_array_equal(hazard.fraction.shape, hazard.intensity.shape) + np.testing.assert_array_equal( + hazard.fraction.toarray(), np.zeros_like(hazard.intensity.toarray()) + ) + + def _assert_default_types(self, hazard): + """Check types of all hazard attributes""" + self.assertIsInstance(hazard.units, str) + self.assertIsInstance(hazard.haz_type, str) + self.assertIsInstance(hazard.event_id, np.ndarray) + self.assertIsInstance(hazard.event_name, list) + self.assertIsInstance(hazard.frequency, np.ndarray) + self.assertIsInstance(hazard.intensity, csr_matrix) + self.assertIsInstance(hazard.fraction, csr_matrix) + self.assertIsInstance(hazard.date, np.ndarray) + + def test_load_path(self): + """Load the data with path as argument""" + hazard = Hazard.from_xarray_raster(self.netcdf_path, "", "") + self._assert_default(hazard) + + # Check deprecated method + hazard = Hazard.from_xarray_raster_file(self.netcdf_path, "", "") + self._assert_default(hazard) + + # Check wrong paths + with self.assertRaisesRegex(FileNotFoundError, "file-does-not-exist.nc"): + Hazard.from_xarray_raster("file-does-not-exist.nc", "", "") + with 
self.assertRaisesRegex(KeyError, "wrong-intensity-path"): + Hazard.from_xarray_raster( + self.netcdf_path, "", "", intensity="wrong-intensity-path" + ) + + @patch("climada.hazard.xarray.xr.open_dataset") + def test_load_dataset(self, open_dataset_mock): + """Load the data from an opened dataset as argument""" + open_dataset_mock.return_value.__enter__.return_value = self.dset + + def _load_and_assert(**kwargs): + hazard = Hazard.from_xarray_raster( + self.netcdf_path, "", "", open_dataset_kws=kwargs + ) + self._assert_default(hazard) + + _load_and_assert() + open_dataset_mock.assert_called_once_with(self.netcdf_path, chunks="auto") + open_dataset_mock.reset_mock() + _load_and_assert(chunks=dict(latitude=1, longitude=1, time=1), engine="netcdf4") + open_dataset_mock.assert_called_once_with( + self.netcdf_path, + chunks=dict(latitude=1, longitude=1, time=1), + engine="netcdf4", + ) + + def test_type_error(self): + """Calling 'from_xarray_raster' with wrong data type should throw""" + # Passing a DataArray + with xr.open_dataset(self.netcdf_path) as dset, self.assertRaisesRegex( + TypeError, "This method only supports passing xr.Dataset" + ): + Hazard.from_xarray_raster(dset["intensity"], "", "") + + def test_type_and_unit(self): + """Test passing a custom type and unit""" + hazard = Hazard.from_xarray_raster_file( + self.netcdf_path, hazard_type="TC", intensity_unit="m/s" + ) + self._assert_default_types(hazard) + self.assertEqual(hazard.haz_type, "TC") + self.assertEqual(hazard.units, "m/s") + + def test_event_no_time(self): + """Test if an event coordinate that is not a time works""" + with xr.open_dataset(self.netcdf_path) as dataset: + size = dataset.sizes["time"] + + # Positive integers (interpreted as ordinals) + time = [2, 1] + dataset["time"] = time + hazard = Hazard.from_xarray_raster(dataset, "", "") + self._assert_default_types(hazard) + np.testing.assert_array_equal( + hazard.intensity.toarray(), [[0, 1, 2, 3, 4, 5], [6, 7, 8, 9, 10, 0]] + ) + np.testing.assert_array_equal(hazard.date, time) + np.testing.assert_array_equal(hazard.event_name, np.full(size, "")) + + # Strings + dataset["time"] = ["a", "b"] + with self.assertLogs("climada.hazard.xarray", "WARNING") as cm: + hazard = Hazard.from_xarray_raster(dataset, "", "") + np.testing.assert_array_equal(hazard.date, np.ones(size)) + np.testing.assert_array_equal(hazard.event_name, np.full(size, "")) + self.assertIn("Failed to read values of 'time' as dates.", cm.output[0]) + self.assertIn( + "Failed to read values of 'time' as dates or ordinals.", cm.output[1] + ) + + def test_data_vars(self): + """Check handling of data variables""" + with xr.open_dataset(self.netcdf_path) as dataset: + size = dataset.sizes["time"] + + # Set optionals in the dataset + frequency = np.ones(size) * 1.5 + event_id = np.array(range(size), dtype=np.int64) + 3 + event_name = ["bla"] * size + date = np.array(range(size)) + 100 + dataset["event_id"] = event_id + dataset["event_name"] = event_name + dataset["date"] = date + + # Assign a proper coordinate for a change + dataset = dataset.assign_coords(dict(frequency=("time", frequency))) + + # Assign fraction + frac = xr.DataArray( + np.array([[[0, 0, 0], [0, 0, 0]], [[1, 1, 1], [1, 1, 1]]]), + dims=["time", "latitude", "longitude"], + coords=dict( + time=self.time, latitude=self.latitude, longitude=self.longitude + ), + ) + dataset["fraction"] = frac + + # Optionals should be read automatically + hazard = Hazard.from_xarray_raster(dataset, "", "") + self._assert_default_types(hazard) + 
np.testing.assert_array_equal(hazard.frequency, frequency) + np.testing.assert_array_equal(hazard.event_id, event_id) + np.testing.assert_array_equal(hazard.event_name, event_name) + np.testing.assert_array_equal(hazard.date, date) + np.testing.assert_array_equal( + hazard.fraction.toarray(), [[0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1]] + ) + + # Ignore keys (should be default values) + hazard = Hazard.from_xarray_raster( + dataset, + "", + "", + data_vars=dict( + frequency="", event_id="", event_name="", date="", fraction="" + ), + ) + self._assert_default(hazard) + + # Wrong key + with self.assertRaises(ValueError) as cm: + Hazard.from_xarray_raster( + dataset, "", "", data_vars=dict(wrong_key="stuff") + ) + self.assertIn( + "Unknown data variables passed: '['wrong_key']'.", str(cm.exception) + ) + + # Non-existent identifier + with self.assertRaises(KeyError) as cm: + Hazard.from_xarray_raster( + dataset, "", "", data_vars=dict(frequency="freqqqqq") + ) + self.assertIn("freqqqqq", str(cm.exception)) + + # Wrong data length + # NOTE: This also implicitly checks that 'frequency' is not read! + dataset["freq"] = np.array(range(size + 1), dtype=np.float64) + with self.assertRaises(RuntimeError) as cm: + Hazard.from_xarray_raster( + dataset, "", "", data_vars=dict(frequency="freq") + ) + self.assertIn( + f"'freq' must have shape ({size},), but shape is ({size + 1},)", + str(cm.exception), + ) + + # Integer data assertions + dset = dataset.copy(deep=True) + dset["event_id"] = np.array(range(size), dtype=np.float64) + 3.5 + with self.assertRaises(TypeError) as cm: + Hazard.from_xarray_raster(dset, "", "") + self.assertIn("'event_id' data array must be integers", str(cm.exception)) + dset["event_id"] = np.linspace(0, 10, size, dtype=np.int64) + with self.assertRaises(ValueError) as cm: + Hazard.from_xarray_raster(dset, "", "") + self.assertIn("'event_id' data must be larger than zero", str(cm.exception)) + + # Date as datetime + date_str = [f"2000-01-{i:02}" for i in range(1, size + 1)] + dataset["date"] = date_str + hazard = Hazard.from_xarray_raster(dataset, "", "") + np.testing.assert_array_equal( + hazard.date, + [dt.datetime(2000, 1, i).toordinal() for i in range(1, size + 1)], + ) + + def test_data_vars_repeat(self): + """Test if suitable data vars are repeated as expected""" + with xr.open_dataset(self.netcdf_path) as dataset: + size = dataset.sizes["time"] + + # Set optionals in the dataset + frequency = [1.5] + event_name = ["bla"] + date = 1 + dataset["event_name"] = event_name + dataset["date"] = date + dataset["frequency"] = frequency + + # Check if single-valued arrays are repeated + hazard = Hazard.from_xarray_raster(dataset, "", "") + + np.testing.assert_array_equal(hazard.date, [date] * size) + np.testing.assert_array_equal(hazard.event_name, event_name * size) + np.testing.assert_array_equal(hazard.frequency, frequency * size) + + def test_nan(self): + """Check handling of NaNs in original data""" + with xr.open_dataset(self.netcdf_path) as dataset: + intensity = xr.DataArray( + np.array([[[0, np.nan, 2], [3, 4, 5]], [[6, np.nan, 8], [9, 10, 11]]]), + dims=["time", "latitude", "longitude"], + coords=dict( + time=self.time, latitude=self.latitude, longitude=self.longitude + ), + ) + dataset["intensity"] = intensity + fraction = xr.DataArray( + np.array([[[0, 0, 0], [0, 0, 0]], [[1, np.nan, 1], [np.nan, 1, 1]]]), + dims=["time", "latitude", "longitude"], + coords=dict( + time=self.time, latitude=self.latitude, longitude=self.longitude + ), + ) + dataset["fraction"] = fraction + 
frequency = np.ones(dataset.sizes["time"]) + frequency[0] = np.nan + dataset["frequency"] = frequency + + # Load hazard + hazard = Hazard.from_xarray_raster(dataset, "", "") + + # Check types + self._assert_default_types(hazard) + + # NaNs are set to zero in sparse data + np.testing.assert_array_equal( + hazard.intensity.toarray(), + [[0, 0, 2, 3, 4, 5], [6, 0, 8, 9, 10, 11]], + ) + np.testing.assert_array_equal( + hazard.fraction.toarray(), + [[0, 0, 0, 0, 0, 0], [1, 0, 1, 0, 1, 1]], + ) + + # NaNs are propagated in dense data + np.testing.assert_array_equal(hazard.frequency, frequency) + + def test_crs(self): + """Check if different CRS inputs are handled correctly""" + + def test_crs_from_input(crs_input): + crs = CRS.from_user_input(crs_input) + hazard = Hazard.from_xarray_raster(self.netcdf_path, "", "", crs=crs_input) + self.assertEqual(hazard.centroids.geometry.crs, crs) + + test_crs_from_input("EPSG:3857") + test_crs_from_input(3857) + test_crs_from_input("+proj=cea +lat_0=52.112866 +lon_0=5.150162 +units=m") + + def test_missing_dims(self): + """Test if missing coordinates are expanded and correct errors are thrown""" + # Drop time as dimension, but not as coordinate! + with xr.open_dataset(self.netcdf_path) as ds: + ds = ds.isel(time=0).squeeze() + print(ds) + hazard = Hazard.from_xarray_raster(ds, "", "") + self._assert_default_types(hazard) + np.testing.assert_array_equal( + hazard.event_name, [self.time[0].strftime("%Y-%m-%d")] + ) + np.testing.assert_array_equal(hazard.date, [self.time[0].toordinal()]) + np.testing.assert_array_equal(hazard.centroids.lat, [0, 0, 0, 1, 1, 1]) + np.testing.assert_array_equal(hazard.centroids.lon, [0, 1, 2, 0, 1, 2]) + self.assertEqual(hazard.centroids.geometry.crs, DEF_CRS) + np.testing.assert_array_equal( + hazard.intensity.toarray(), [[0, 1, 2, 3, 4, 5]] + ) + self.assertEqual(hazard.fraction.nnz, 0) + np.testing.assert_array_equal( + hazard.fraction.toarray(), [[0, 0, 0, 0, 0, 0]] + ) + + # Now drop variable altogether, should raise an error + ds = ds.drop_vars("time") + with self.assertRaisesRegex(RuntimeError, "time"): + Hazard.from_xarray_raster(ds, "", "") + + # Expand time again + ds = ds.expand_dims(time=[np.datetime64("2022-01-01")]) + hazard = Hazard.from_xarray_raster(ds, "", "") + self._assert_default_types(hazard) + np.testing.assert_array_equal(hazard.event_name, ["2022-01-01"]) + np.testing.assert_array_equal( + hazard.date, [dt.datetime(2022, 1, 1).toordinal()] + ) + + +class TestReadDimsCoordsNetCDF(unittest.TestCase): + """Checks for dimensions and coordinates with different names and shapes""" + + @classmethod + def setUpClass(cls): + """Write a NetCDF file with many coordinates""" + cls.tempdir = TemporaryDirectory() + cls.netcdf_path = Path(cls.tempdir.name) / "coords.nc" + cls.intensity = np.array([[[0, 1, 2], [3, 4, 5]]]) + cls.fraction = np.array([[[0, 0, 0], [1, 1, 1]]]) + cls.time = np.array([dt.datetime(2000, 1, 1)]) + cls.x = np.array([0, 1, 2]) + cls.y = np.array([0, 1]) + cls.lon = np.array([1, 2, 3]) + cls.lat = np.array([1, 2]) + cls.years = np.array([dt.datetime(1999, 2, 2)]) + cls.longitude = np.array([[10, 11, 12], [10, 11, 12]]) + cls.latitude = np.array([[100, 100, 100], [200, 200, 200]]) + + dset = xr.Dataset( + { + "intensity": (["time", "y", "x"], cls.intensity), + "fraction": (["time", "y", "x"], cls.fraction), + }, + { + "time": cls.time, + "x": cls.x, + "y": cls.y, + "lon": (["x"], cls.lon), + "lat": (["y"], cls.lat), + "years": (["time"], cls.years), + "latitude": (["y", "x"], cls.latitude), + 
"longitude": (["y", "x"], cls.longitude), + }, + ) + dset.to_netcdf(cls.netcdf_path) + + @classmethod + def tearDownClass(cls): + """Delete the NetCDF file""" + cls.tempdir.cleanup() + + def _assert_intensity_fraction(self, hazard): + """Check if intensity and fraction data are read correctly""" + np.testing.assert_array_equal(hazard.intensity.toarray(), [[0, 1, 2, 3, 4, 5]]) + np.testing.assert_array_equal(hazard.fraction.toarray(), [[0, 0, 0, 1, 1, 1]]) + + def test_dimension_naming(self): + """Test if dimensions with different names can be read""" + hazard = Hazard.from_xarray_raster( + self.netcdf_path, + "", + "", + coordinate_vars=dict(latitude="y", longitude="x"), # 'time' stays default + ) + np.testing.assert_array_equal(hazard.centroids.lat, [0, 0, 0, 1, 1, 1]) + np.testing.assert_array_equal(hazard.centroids.lon, [0, 1, 2, 0, 1, 2]) + np.testing.assert_array_equal( + hazard.date, [val.toordinal() for val in self.time] + ) + self._assert_intensity_fraction(hazard) + + def test_coordinate_naming(self): + """Test if coordinates with different names than dimensions can be read""" + hazard = Hazard.from_xarray_raster( + self.netcdf_path, + "", + "", + coordinate_vars=dict(latitude="lat", longitude="lon", event="years"), + ) + np.testing.assert_array_equal(hazard.centroids.lat, [1, 1, 1, 2, 2, 2]) + np.testing.assert_array_equal(hazard.centroids.lon, [1, 2, 3, 1, 2, 3]) + np.testing.assert_array_equal( + hazard.date, [val.toordinal() for val in self.years] + ) + self._assert_intensity_fraction(hazard) + + def test_2D_coordinates(self): + """Test if read method correctly handles 2D coordinates""" + hazard = Hazard.from_xarray_raster( + self.netcdf_path, + "", + "", + coordinate_vars=dict(latitude="latitude", longitude="longitude"), + ) + np.testing.assert_array_equal( + hazard.centroids.lat, [100, 100, 100, 200, 200, 200] + ) + np.testing.assert_array_equal(hazard.centroids.lon, [10, 11, 12, 10, 11, 12]) + self._assert_intensity_fraction(hazard) + + def test_load_dataset_rechunk(self): + """Load the data from an opened dataset and force rechunking""" + with xr.open_dataset(self.netcdf_path) as dataset: + hazard = Hazard.from_xarray_raster( + dataset, + "", + "", + coordinate_vars=dict(latitude="latitude", longitude="longitude"), + rechunk=True, + ) + + np.testing.assert_array_equal( + hazard.centroids.lat, [100, 100, 100, 200, 200, 200] + ) + np.testing.assert_array_equal(hazard.centroids.lon, [10, 11, 12, 10, 11, 12]) + self._assert_intensity_fraction(hazard) + + # Assert that .chunk is called the right way + with patch("xarray.Dataset.chunk") as mock: + with xr.open_dataset(self.netcdf_path) as dataset: + mock.return_value = dataset + Hazard.from_xarray_raster( + dataset, + "", + "", + coordinate_vars=dict(latitude="latitude", longitude="longitude"), + rechunk=True, + ) + + # First latitude dim, then longitude dim, then event dim + mock.assert_called_once_with(chunks=dict(y=-1, x=-1, time="auto")) + + # Should not be called by default + mock.reset_mock() + with xr.open_dataset(self.netcdf_path) as dataset: + mock.return_value = dataset + Hazard.from_xarray_raster( + dataset, + "", + "", + coordinate_vars=dict(latitude="latitude", longitude="longitude"), + ) + + mock.assert_not_called() + + def test_2D_time(self): + """Test if stacking multiple time dimensions works out""" + time = np.array( + [ + [dt.datetime(1999, 1, 1), dt.datetime(1999, 2, 1)], + [dt.datetime(2000, 1, 1), dt.datetime(2000, 2, 1)], + ] + ) + ds = xr.Dataset( + { + "intensity": ( + ["year", "month", "latitude", 
"longitude"], + [[[[1]], [[2]]], [[[3]], [[4]]]], + ), + "fraction": ( + ["year", "month", "latitude", "longitude"], + [[[[10]], [[20]]], [[[30]], [[40]]]], + ), + }, + { + "latitude": [1], + "longitude": [2], + "year": [1999, 2000], + "month": [1, 2], + "time": (["year", "month"], time), + }, + ) + hazard = Hazard.from_xarray_raster(ds, "", "") + + np.testing.assert_array_equal(hazard.intensity.toarray(), [[1], [2], [3], [4]]) + np.testing.assert_array_equal( + hazard.fraction.toarray(), [[10], [20], [30], [40]] + ) + np.testing.assert_array_equal(hazard.centroids.lat, [1]) + np.testing.assert_array_equal(hazard.centroids.lon, [2]) + np.testing.assert_array_equal( + hazard.date, [val.toordinal() for val in time.flat] + ) + np.testing.assert_array_equal( + hazard.event_name, ["1999-01-01", "1999-02-01", "2000-01-01", "2000-02-01"] + ) + + def test_errors(self): + """Check if expected errors are thrown""" + # Wrong coordinate key + with self.assertRaisesRegex(ValueError, "Unknown coordinates passed"): + Hazard.from_xarray_raster( + self.netcdf_path, + "", + "", + coordinate_vars=dict(bar="latitude", longitude="baz"), + ) + + # Correctly specified, but the custom dimension does not exist + with self.assertRaisesRegex(RuntimeError, "lalalatitude"): + Hazard.from_xarray_raster( + self.netcdf_path, + "", + "", + coordinate_vars=dict(latitude="lalalatitude"), + ) + + +if __name__ == "__main__": + TESTS = unittest.TestLoader().loadTestsFromTestCase(TestReadDefaultNetCDF) + TESTS.addTests( + unittest.TestLoader().loadTestsFromTestCase(TestReadDimsCoordsNetCDF) + ) diff --git a/climada/hazard/xarray.py b/climada/hazard/xarray.py new file mode 100644 index 000000000..df7fc9bf6 --- /dev/null +++ b/climada/hazard/xarray.py @@ -0,0 +1,420 @@ +""" +This file is part of CLIMADA. + +Copyright (C) 2017 ETH Zurich, CLIMADA contributors listed in AUTHORS. + +CLIMADA is free software: you can redistribute it and/or modify it under the +terms of the GNU General Public License as published by the Free +Software Foundation, version 3. + +CLIMADA is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A +PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with CLIMADA. If not, see . + +--- + +Hazard input using xarray +""" + +import copy +import itertools +import logging +from dataclasses import InitVar, dataclass, field +from pathlib import Path +from typing import Any, Callable, Hashable + +import numpy as np +import pandas as pd +import sparse as sp +import xarray as xr +from scipy import sparse + +import climada.util.constants as u_const +import climada.util.dates_times as u_dt +from climada.hazard.centroids.centr import Centroids + +LOGGER = logging.getLogger(__name__) + +DEF_COORDS = {"event": "time", "longitude": "longitude", "latitude": "latitude"} +"""Default coordinates when reading Hazard data from an xarray Dataset""" + +DEF_DATA_VARS = ["fraction", "frequency", "event_id", "event_name", "date"] +"""Default keys for optional Hazard attributes when reading from an xarray Dataset""" + + +def _to_csr_matrix(array: xr.DataArray) -> sparse.csr_matrix: + """Store a numpy array as sparse matrix, optimizing storage space + + The CSR matrix stores NaNs explicitly, so we set them to zero. 
+ """ + array = array.where(lambda x: ~np.isnan(x), 0) + array = xr.apply_ufunc( + sp.COO.from_numpy, + array, + dask="parallelized", + output_dtypes=[array.dtype], + ) + sparse_coo = array.compute().data # Load into memory + return sparse_coo.tocsr() # Convert sparse.COO to scipy.sparse.csr_matrix + + +# Define accessors for xarray DataArrays +def _default_accessor(array: xr.DataArray) -> np.ndarray: + """Take a DataArray and return its numpy representation""" + return array.to_numpy() + + +def _strict_positive_int_accessor(array: xr.DataArray) -> np.ndarray: + """Take a positive int DataArray and return its numpy representation + + Raises + ------ + TypeError + If the underlying data type is not integer + ValueError + If any value is zero or less + """ + if not np.issubdtype(array.dtype, np.integer): + raise TypeError(f"'{array.name}' data array must be integers") + if not (array > 0).all(): + raise ValueError(f"'{array.name}' data must be larger than zero") + return array.to_numpy() + + +def _date_to_ordinal_accessor(array: xr.DataArray, strict: bool = True) -> np.ndarray: + """Take a DataArray and transform it into ordinals""" + try: + if np.issubdtype(array.dtype, np.integer): + # Assume that data is ordinals + return _strict_positive_int_accessor(array) + + # Try transforming to ordinals + return np.array(u_dt.datetime64_to_ordinal(array.to_numpy())) + + # Handle access errors + except (ValueError, TypeError, AttributeError) as err: + if strict: + raise err + + LOGGER.warning( + "Failed to read values of '%s' as dates or ordinals. Hazard.date " + "will be ones only", + array.name, + ) + return np.ones(array.shape) + + +def _year_month_day_accessor(array: xr.DataArray, strict: bool = True) -> np.ndarray: + """Take an array and return am array of YYYY-MM-DD strings""" + try: + return array.dt.strftime("%Y-%m-%d").to_numpy() + + # Handle access errors + except (ValueError, TypeError, AttributeError) as err: + if strict: + raise err + + LOGGER.warning( + "Failed to read values of '%s' as dates. Hazard.event_name will be " + "empty strings", + array.name, + ) + return np.full(array.shape, "") + + +def _maybe_repeat(values: np.ndarray, times: int) -> np.ndarray: + """Return the array or repeat a single-valued array + + If ``values`` has size 1, return an array that repeats this value ``times`` + times. If the size is different, just return the array. + """ + if values.size == 1: + return np.array(list(itertools.repeat(values.flat[0], times))) + + return values + + +def _load_from_xarray_or_return_default( + data: xr.Dataset, + user_key: str | None, + default_key: str, + hazard_attr: str, + accessor: Callable[[xr.DataArray], Any], + default_value: Any, +) -> Any: + """Load data for a single Hazard attribute or return the default value + + Does the following based on the ``user_key``: + + * If the key is an empty string, return the default value + * If the key is a non-empty string, load the data for that key and return it. + * If the key is ``None``, look for the ``default_key`` in the data. If it exists, + return that data. If not, return the default value. + + Parameters + ---------- + user_key : str or None + The key set by the user to identify the DataArray to read data from. + default_key : str + The default key identifying the DataArray to read data from. + hazard_attr : str + The name of the attribute of ``Hazard`` where the data will be stored in. 
+
+
+@dataclass(repr=False, eq=False)
+class HazardXarrayReader:
+    """A helper class for creating a Hazard object from an xarray dataset
+
+    Initialize this class, then use :py:meth:`get_hazard_kwargs` to retrieve the kwargs
+    to be passed to the :py:class:`~climada.hazard.base.Hazard` initializer.
+
+    Attributes
+    ----------
+    data : xr.Dataset
+        The data to be read as hazard.
+    intensity : str
+        The name of the variable containing the hazard intensity information.
+        Default: ``"intensity"``
+    coordinate_vars : dict(str, str)
+        Mapping from default coordinate names to coordinate names in the dataset.
+    data_vars : dict(str, str)
+        Mapping from default variable names to variable names in the dataset.
+    crs : str
+        Coordinate reference system of the data to be read. Defaults to ``"EPSG:4326"``
+        (WGS 84).
+    rechunk : bool
+        If ``True``, automatically rechunk the data for more efficient reads from
+        disk. Default: ``False``.
+ """ + + data: xr.Dataset + intensity: str = "intensity" + coordinate_vars: InitVar[dict[str, str] | None] = field(default=None, kw_only=True) + data_vars: dict[str, str] | None = field(default=None, kw_only=True) + crs: str = field(default=u_const.DEF_CRS, kw_only=True) + rechunk: bool = field(default=False, kw_only=True) + + coords: dict[str, str] = field( + init=False, default_factory=lambda: copy.deepcopy(DEF_COORDS) + ) + data_dims: dict[str, tuple[Hashable, ...]] = field(init=False, default_factory=dict) + hazard_kwargs: dict[str, Any] = field(init=False, default_factory=dict) + + def __post_init__(self, coordinate_vars): + """Update coordinate and dimension names""" + # Update coordinate identifiers + coordinate_vars = coordinate_vars or {} + unknown_coords = [co for co in coordinate_vars if co not in self.coords] + if unknown_coords: + raise ValueError( + f"Unknown coordinates passed: '{unknown_coords}'. Supported " + f"coordinates are {list(self.coords.keys())}." + ) + self.coords.update(coordinate_vars) + + # Retrieve dimensions of coordinates + try: + self.data_dims = { + "event": self.data[self.coords["event"]].dims, + "longitude": self.data[self.coords["longitude"]].dims, + "latitude": self.data[self.coords["latitude"]].dims, + } + # Handle KeyError for better error message + except KeyError as err: + key = err.args[0] + raise RuntimeError( + f"Dataset is missing dimension/coordinate: {key}. Dataset dimensions: " + f"{list(self.data.dims.keys())}" + ) from err + + # Check for unexpected keys + self.data_vars = self.data_vars or {} + default_keys = copy.deepcopy(DEF_DATA_VARS) + unknown_keys = [key for key in self.data_vars.keys() if key not in default_keys] + if unknown_keys: + raise ValueError( + f"Unknown data variables passed: '{unknown_keys}'. Supported data " + f"variables are {default_keys}." + ) + + @classmethod + def from_file(cls, filename: Path | str, *args, open_dataset_kws=None, **kwargs): + """Open reader from a file""" + open_dataset_kws = open_dataset_kws or {} + open_dataset_kws = {"chunks": "auto"} | open_dataset_kws + with xr.open_dataset(filename, **open_dataset_kws) as dset: + return cls(dset, *args, **kwargs) + + def rechunk_data(self, data: xr.Dataset) -> xr.Dataset: + """Try to rechunk the data to optimize the stack operation afterwards.""" + chunks = ( + # We want one event to be contained in one chunk + {dim: -1 for dim in self.data_dims["longitude"]} + | {dim: -1 for dim in self.data_dims["latitude"]} + # Automated chunking in the event dimensions (as many as fit) + | {dim: "auto" for dim in self.data_dims["event"]} + ) + return data.chunk(chunks=chunks) + + def get_hazard_kwargs(self) -> dict[str, Any]: + """Return kwargs to initialize the hazard""" + # Shallow copy of the data + data = self.data.copy() + + # Try promoting single-value coordinates to dimensions + for key, val in self.data_dims.items(): + if not val: + coord = self.coords[key] + LOGGER.debug("Promoting Dataset coordinate '%s' to dimension", coord) + data = data.expand_dims(coord) + self.data_dims[key] = data[coord].dims + + # Maybe rechunk + if self.rechunk: + data = self.rechunk_data(data) + + # Stack (vectorize) the entire dataset into 2D (time, lat/lon) + # NOTE: We want the set union of the dimensions, but Python 'set' does not + # preserve order. However, we want longitude to run faster than latitude. + # So we use 'dict' without values, as 'dict' preserves insertion order + # (dict keys behave like a set). 
+
+    def get_hazard_kwargs(self) -> dict[str, Any]:
+        """Return kwargs to initialize the hazard"""
+        # Shallow copy of the data
+        data = self.data.copy()
+
+        # Try promoting single-value coordinates to dimensions
+        for key, val in self.data_dims.items():
+            if not val:
+                coord = self.coords[key]
+                LOGGER.debug("Promoting Dataset coordinate '%s' to dimension", coord)
+                data = data.expand_dims(coord)
+                self.data_dims[key] = data[coord].dims
+
+        # Maybe rechunk
+        if self.rechunk:
+            data = self.rechunk_data(data)
+
+        # Stack (vectorize) the entire dataset into 2D (time, lat/lon)
+        # NOTE: We want the set union of the dimensions, but Python 'set' does not
+        #       preserve order. However, we want longitude to run faster than
+        #       latitude. So we use 'dict' without values, as 'dict' preserves
+        #       insertion order (dict keys behave like a set).
+        data = data.stack(
+            event=self.data_dims["event"],
+            lat_lon=list(
+                dict.fromkeys(
+                    self.data_dims["latitude"] + self.data_dims["longitude"]
+                ).keys()
+            ),
+        )
+
+        # Transform coordinates into centroids
+        centroids = Centroids(
+            lat=data[self.coords["latitude"]].to_numpy(),
+            lon=data[self.coords["longitude"]].to_numpy(),
+            crs=self.crs,
+        )
+
+        # Read the intensity data
+        LOGGER.debug("Loading Hazard intensity from DataArray '%s'", self.intensity)
+        intensity_matrix = _to_csr_matrix(data[self.intensity])
+
+        # Create a DataFrame storing access information for each of data_vars
+        # NOTE: Each row will be passed as arguments to
+        #       `_load_from_xarray_or_return_default`, see its docstring for further
+        #       explanation of the DataFrame columns / keywords.
+        num_events = data.sizes["event"]
+        data_ident = pd.DataFrame(
+            data={
+                # The attribute of the Hazard class where the data will be stored
+                "hazard_attr": DEF_DATA_VARS,
+                # The identifier and default key used in this method
+                "default_key": DEF_DATA_VARS,
+                # The key assigned by the user
+                "user_key": None,
+                # The default value for each attribute
+                "default_value": [
+                    None,
+                    np.ones(num_events),
+                    np.array(range(num_events), dtype=int) + 1,
+                    list(
+                        _year_month_day_accessor(
+                            data[self.coords["event"]], strict=False
+                        ).flat
+                    ),
+                    _date_to_ordinal_accessor(
+                        data[self.coords["event"]], strict=False
+                    ),
+                ],
+                # The accessor for the data in the Dataset
+                "accessor": [
+                    _to_csr_matrix,
+                    lambda x: _maybe_repeat(_default_accessor(x), num_events),
+                    _strict_positive_int_accessor,
+                    lambda x: list(
+                        _maybe_repeat(_default_accessor(x), num_events).flat
+                    ),
+                    lambda x: _maybe_repeat(_date_to_ordinal_accessor(x), num_events),
+                ],
+            }
+        )
+
+        # Update with keys provided by the user
+        # NOTE: Keys in 'default_keys' missing from 'data_vars' will be set to 'None'
+        #       (which is exactly what we want) and the result is written into
+        #       'user_key'. 'default_keys' is not modified.
+        data_ident["user_key"] = data_ident["default_key"].map(self.data_vars)
+
+        # Set the Hazard attributes
+        for _, ident in data_ident.iterrows():
+            self.hazard_kwargs[ident["hazard_attr"]] = (
+                _load_from_xarray_or_return_default(data=data, **ident)
+            )
+
+        # Done!
+        LOGGER.debug("Hazard successfully loaded. Number of events: %i", num_events)
+        self.hazard_kwargs.update(centroids=centroids, intensity=intensity_matrix)
+        return self.hazard_kwargs
diff --git a/doc/api/climada/climada.hazard.rst b/doc/api/climada/climada.hazard.rst
index 3b3bef00b..cb58d5077 100644
--- a/doc/api/climada/climada.hazard.rst
+++ b/doc/api/climada/climada.hazard.rst
@@ -22,6 +22,16 @@ climada\.hazard\.io module
    :undoc-members:
    :show-inheritance:
 
+climada\.hazard\.xarray module
+------------------------------
+
+Helper module to read xarray data as Hazard.
+
+.. automodule:: climada.hazard.xarray
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
 climada\.hazard\.isimip\_data module
 ------------------------------------
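Putting the pieces together, the typical entry points after this change are `Hazard.from_xarray_raster` with a file path, or the reader class directly. A usage sketch (file path, hazard type, and units are placeholders; `open_dataset_kws` is forwarded to `xarray.open_dataset`):

```python
import xarray as xr
from climada.hazard import Hazard
from climada.hazard.xarray import HazardXarrayReader

# Passing the file path directly replaces the deprecated from_xarray_raster_file
hazard = Hazard.from_xarray_raster(
    "path/to/file.nc", "TC", "m/s", open_dataset_kws={"engine": "h5netcdf"}
)

# An already opened dataset still works as before
with xr.open_dataset("path/to/file.nc", chunks="auto") as dset:
    hazard = Hazard.from_xarray_raster(dset, "TC", "m/s")

# Or use the helper class to obtain the Hazard constructor kwargs yourself
reader = HazardXarrayReader.from_file("path/to/file.nc")
kwargs = reader.get_hazard_kwargs()
```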