diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2a579681d..dc0fb3d2a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -30,6 +30,7 @@ Removed:
- World Bank indicator data is now downloaded directly from their API via the function `download_world_bank_indicator`, instead of relying on the `pandas-datareader` package [#1033](https://github.com/CLIMADA-project/climada_python/pull/1033)
- `Exposures.write_hdf5` pickles geometry data in WKB format, which is faster and more sustainable. [#1051](https://github.com/CLIMADA-project/climada_python/pull/1051)
- The online documentation has been completely overhauled, now uses PyData theme: [#977](https://github.com/CLIMADA-project/climada_python/pull/977)
+- Add `climada.hazard.xarray` module with helper structures for reading Hazard objects from `xarray` data [#1063](https://github.com/CLIMADA-project/climada_python/pull/1063)
### Fixed
@@ -38,6 +39,8 @@ Removed:
### Deprecated
+- `Hazard.from_xarray_raster_file`. Use `Hazard.from_xarray_raster` and pass the file path as the `data` argument [#1063](https://github.com/CLIMADA-project/climada_python/pull/1063)
+
### Removed
- `climada.util.interpolation.round_to_sig_digits` [#1012](https://github.com/CLIMADA-project/climada_python/pull/1012)
diff --git a/climada/hazard/io.py b/climada/hazard/io.py
index e80531b95..6f8bd01b0 100644
--- a/climada/hazard/io.py
+++ b/climada/hazard/io.py
@@ -19,29 +19,29 @@
Define Hazard IO Methods.
"""
-import copy
import datetime as dt
import itertools
import logging
import pathlib
import warnings
from collections.abc import Collection
-from typing import Any, Callable, Dict, Optional, Union
+from typing import Any, Dict, Optional, Union
import h5py
import numpy as np
import pandas as pd
import rasterio
-import sparse as sp
import xarray as xr
+from deprecation import deprecated
from scipy import sparse
import climada.util.constants as u_const
import climada.util.coordinates as u_coord
-import climada.util.dates_times as u_dt
import climada.util.hdf5_handler as u_hdf5
from climada.hazard.centroids.centr import Centroids
+from .xarray import HazardXarrayReader
+
LOGGER = logging.getLogger(__name__)
DEF_VAR_EXCEL = {
@@ -86,12 +86,6 @@
}
"""MATLAB variable names"""
-DEF_COORDS = dict(event="time", longitude="longitude", latitude="latitude")
-"""Default coordinates when reading Hazard data from an xarray Dataset"""
-
-DEF_DATA_VARS = ["fraction", "frequency", "event_id", "event_name", "date"]
-"""Default keys for optional Hazard attributes when reading from an xarray Dataset"""
-
# pylint: disable=no-member
@@ -281,18 +275,25 @@ def from_raster(
)
@classmethod
+ @deprecated(
+        "6.0",
+        details="Hazard.from_xarray_raster now supports a file path as the 'data' parameter",
+ )
def from_xarray_raster_file(
cls, filepath: Union[pathlib.Path, str], *args, **kwargs
):
"""Read raster-like data from a file that can be loaded with xarray
- This wraps :py:meth:`~Hazard.from_xarray_raster` by first opening the target file
+ .. deprecated:: 6.0
+        Pass ``filepath`` as the ``data`` argument to :py:meth:`from_xarray_raster`
+ instead.
+
+ This wraps :py:meth:`from_xarray_raster` by first opening the target file
as xarray dataset and then passing it to that classmethod. Use this wrapper as a
simple alternative to opening the file yourself. The signature is exactly the
same, except for the first argument, which is replaced by a file path here.
- Additional (keyword) arguments are passed to
- :py:meth:`~Hazard.from_xarray_raster`.
+ Additional (keyword) arguments are passed to :py:meth:`from_xarray_raster`.
Parameters
----------
@@ -304,29 +305,14 @@ def from_xarray_raster_file(
-------
hazard : climada.Hazard
A hazard object created from the input data
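+
+    Examples
+    --------
+    The following calls are equivalent, but prefer the second one, since this
+    method is deprecated:
+
+    >>> hazard = Hazard.from_xarray_raster_file("path/to/file.nc", "", "")
+    >>> hazard = Hazard.from_xarray_raster("path/to/file.nc", "", "")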
-
- Examples
- --------
-
- >>> hazard = Hazard.from_xarray_raster_file("path/to/file.nc", "", "")
-
- Notes
- -----
-
- If you have specific requirements for opening a data file, prefer opening it
- yourself and using :py:meth:`~Hazard.from_xarray_raster`, following this pattern:
-
- >>> open_kwargs = dict(engine="h5netcdf", chunks=dict(x=-1, y="auto"))
- >>> with xarray.open_dataset("path/to/file.nc", **open_kwargs) as dset:
- ... hazard = Hazard.from_xarray_raster(dset, "", "")
"""
- with xr.open_dataset(filepath, chunks="auto") as dset:
- return cls.from_xarray_raster(dset, *args, **kwargs)
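+        # Prepend the file path so it is passed as the 'data' argument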
+ args = (filepath,) + args
+ return cls.from_xarray_raster(*args, **kwargs)
@classmethod
def from_xarray_raster(
cls,
- data: xr.Dataset,
+ data: xr.Dataset | pathlib.Path | str,
hazard_type: str,
intensity_unit: str,
*,
@@ -335,6 +321,7 @@ def from_xarray_raster(
data_vars: Optional[Dict[str, str]] = None,
crs: str = u_const.DEF_CRS,
rechunk: bool = False,
+ open_dataset_kws: dict[str, Any] | None = None,
):
"""Read raster-like data from an xarray Dataset
@@ -358,26 +345,23 @@ def from_xarray_raster(
meaning that the object can be used in all CLIMADA operations without throwing
an error due to missing data or faulty data types.
- Use :py:meth:`~Hazard.from_xarray_raster_file` to open a file on disk
- and load the resulting dataset with this method in one step.
-
Parameters
----------
- data : xarray.Dataset
- The dataset to read from.
+ data : xarray.Dataset or Path or str
+        The file path to read the data from, or an already opened dataset.
hazard_type : str
The type identifier of the hazard. Will be stored directly in the hazard
object.
intensity_unit : str
The physical units of the intensity.
intensity : str, optional
- Identifier of the `xarray.DataArray` containing the hazard intensity data.
+ Identifier of the ``xarray.DataArray`` containing the hazard intensity data.
coordinate_vars : dict(str, str), optional
Mapping from default coordinate names to coordinate names used in the data
to read. The default is
- ``dict(event="time", longitude="longitude", latitude="latitude")``, as most
- of the commonly used hazard data happens to have a "time" attribute but no
- "event" attribute.
+ ``{"event": "time", "longitude": "longitude", "latitude": "latitude"}``, as
+ most of the commonly used hazard data happens to have a "time" attribute but
+ no "event" attribute.
data_vars : dict(str, str), optional
Mapping from default variable names to variable names used in the data
to read. The default names are ``fraction``, ``hazard_type``, ``frequency``,
@@ -392,7 +376,7 @@ def from_xarray_raster(
* ``date``: The ``event`` coordinate interpreted as date or ordinal, or
ones if that fails (which will issue a warning).
* ``fraction``: ``None``, which results in a value of 1.0 everywhere, see
- :py:meth:`Hazard.__init__` for details.
+ :py:meth:`~climada.hazard.base.Hazard.__init__` for details.
* ``hazard_type``: Empty string
* ``frequency``: 1.0 for every event
* ``event_name``: String representation of the event date or empty strings
@@ -401,8 +385,9 @@ def from_xarray_raster(
crs : str, optional
Identifier for the coordinate reference system of the coordinates. Defaults
- to ``EPSG:4326`` (WGS 84), defined by ``climada.util.constants.DEF_CRS``.
- See https://pyproj4.github.io/pyproj/dev/api/crs/crs.html#pyproj.crs.CRS.from_user_input
+ to ``"EPSG:4326"`` (WGS 84), defined by
+ :py:const:`climada.util.coordinates.DEF_CRS`. See
+ https://pyproj4.github.io/pyproj/dev/api/crs/crs.html#pyproj.crs.CRS.from_user_input
for further information on how to specify the coordinate system.
rechunk : bool, optional
Rechunk the dataset before flattening. This might have serious performance
@@ -412,16 +397,19 @@ def from_xarray_raster(
be forced by rechunking the data. Ideally, you would select the chunks in
that manner when opening the dataset before passing it to this function.
Defaults to ``False``.
+        open_dataset_kws : dict(str, any), optional
+            Keyword arguments passed to ``xarray.open_dataset`` if ``data`` is a file
+            path. Ignored otherwise. If not given, the file is opened with
+            ``chunks="auto"``.
Returns
-------
- hazard : climada.Hazard
+ Hazard
A hazard object created from the input data
See Also
--------
- :py:meth:`~Hazard.from_xarray_raster_file`
- Use this method if you want CLIMADA to open and read a file on disk for you.
+ :py:class:`climada.hazard.xarray.HazardXarrayReader`
+ The helper class used to read the data.
Notes
-----
@@ -438,8 +426,9 @@ def from_xarray_raster(
-        ``data_vars``` parameter documentation above.
+        ``data_vars`` parameter documentation above.
* To avoid confusion in the call signature, several parameters are keyword-only
arguments.
- * The attributes ``Hazard.haz_type`` and ``Hazard.unit`` currently cannot be
- read from the Dataset. Use the method parameters to set these attributes.
+ * The attributes :py:attr:`~climada.hazard.base.Hazard.haz_type` and
+ :py:attr:`~climada.hazard.base.Hazard.units` currently cannot be read from the
+ Dataset. Use the method parameters to set these attributes.
* This method does not read coordinate system metadata. Use the ``crs`` parameter
to set a custom coordinate system identifier.
@@ -463,6 +452,21 @@ def from_xarray_raster(
... )
>>> hazard = Hazard.from_xarray_raster(dset, "", "")
+ Data can also be read from a file.
+
+ >>> dset.to_netcdf("path/to/file.nc")
+ >>> hazard = Hazard.from_xarray_raster("path/to/file.nc", "", "")
+
+ If you have specific requirements for opening a data file, you can pass
+ ``open_dataset_kws``.
+
+ >>> hazard = Hazard.from_xarray_raster(
+    ...     "path/to/file.nc",
+ ... "",
+ ... "",
+ ... open_dataset_kws={"chunks": {"x": -1, "y": "auto"}, "engine": "netcdf4"}
+ ... )
+
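+    If you need more control over the conversion, the helper class doing the
+    actual work can be used directly. This is a minimal sketch mirroring what
+    this method does internally (see
+    :py:class:`climada.hazard.xarray.HazardXarrayReader`):
+
+    >>> from climada.hazard.xarray import HazardXarrayReader
+    >>> reader = HazardXarrayReader(data=dset, intensity="intensity")
+    >>> hazard = Hazard(haz_type="", units="", **reader.get_hazard_kwargs())
+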
For non-default coordinate names, use the ``coordinate_vars`` argument.
>>> dset = xr.Dataset(
@@ -574,338 +578,39 @@ def from_xarray_raster(
>>> dset = dset.expand_dims(time=[numpy.datetime64("2000-01-01")])
>>> hazard = Hazard.from_xarray_raster(dset, "", "")
"""
- # Check data type for better error message
- if not isinstance(data, xr.Dataset):
- if isinstance(data, (pathlib.Path, str)):
- raise TypeError(
- "Passing a path to this classmethod is not supported. "
- "Use Hazard.from_xarray_raster_file instead."
- )
-
- raise TypeError("This method only supports xarray.Dataset as input data")
-
- # Initialize Hazard object
- hazard_kwargs = dict(haz_type=hazard_type, units=intensity_unit)
-
- # Update coordinate identifiers
- coords = copy.deepcopy(DEF_COORDS)
- coordinate_vars = coordinate_vars if coordinate_vars is not None else {}
- unknown_coords = [co for co in coordinate_vars if co not in coords]
- if unknown_coords:
- raise ValueError(
- f"Unknown coordinates passed: '{unknown_coords}'. Supported "
- f"coordinates are {list(coords.keys())}."
- )
- coords.update(coordinate_vars)
-
- # Retrieve dimensions of coordinates
- try:
- dims = dict(
- event=data[coords["event"]].dims,
- longitude=data[coords["longitude"]].dims,
- latitude=data[coords["latitude"]].dims,
- )
- # Handle KeyError for better error message
- except KeyError as err:
- key = err.args[0]
- raise RuntimeError(
- f"Dataset is missing dimension/coordinate: {key}. Dataset dimensions: "
- f"{list(data.dims.keys())}"
- ) from err
-
- # Try promoting single-value coordinates to dimensions
- for key, val in dims.items():
- if not val:
- coord = coords[key]
- LOGGER.debug("Promoting Dataset coordinate '%s' to dimension", coord)
- data = data.expand_dims(coord)
- dims[key] = data[coord].dims
-
- # Try to rechunk the data to optimize the stack operation afterwards.
- if rechunk:
- # We want one event to be contained in one chunk
- chunks = {dim: -1 for dim in dims["longitude"]}
- chunks.update({dim: -1 for dim in dims["latitude"]})
-
- # Chunks can be auto-sized along the event dimensions
- chunks.update({dim: "auto" for dim in dims["event"]})
- data = data.chunk(chunks=chunks)
-
- # Stack (vectorize) the entire dataset into 2D (time, lat/lon)
- # NOTE: We want the set union of the dimensions, but Python 'set' does not
- # preserve order. However, we want longitude to run faster than latitude.
- # So we use 'dict' without values, as 'dict' preserves insertion order
- # (dict keys behave like a set).
- data = data.stack(
- event=dims["event"],
- lat_lon=dict.fromkeys(dims["latitude"] + dims["longitude"]),
- )
-
- # Transform coordinates into centroids
- centroids = Centroids(
- lat=data[coords["latitude"]].values,
- lon=data[coords["longitude"]].values,
- crs=crs,
- )
-
- def to_csr_matrix(array: xr.DataArray) -> sparse.csr_matrix:
- """Store a numpy array as sparse matrix, optimizing storage space
-
- The CSR matrix stores NaNs explicitly, so we set them to zero.
- """
- array = array.where(array.notnull(), 0)
- array = xr.apply_ufunc(
- sp.COO.from_numpy,
- array,
- dask="parallelized",
- output_dtypes=[array.dtype],
- )
- sparse_coo = array.compute().data # Load into memory
- return sparse_coo.tocsr() # Convert sparse.COO to scipy.sparse.csr_matrix
-
- # Read the intensity data
- LOGGER.debug("Loading Hazard intensity from DataArray '%s'", intensity)
- intensity_matrix = to_csr_matrix(data[intensity])
-
- # Define accessors for xarray DataArrays
- def default_accessor(array: xr.DataArray) -> np.ndarray:
- """Take a DataArray and return its numpy representation"""
- return array.values
-
- def strict_positive_int_accessor(array: xr.DataArray) -> np.ndarray:
- """Take a positive int DataArray and return its numpy representation
-
- Raises
- ------
- TypeError
- If the underlying data type is not integer
- ValueError
- If any value is zero or less
- """
- if not np.issubdtype(array.dtype, np.integer):
- raise TypeError(f"'{array.name}' data array must be integers")
- if not (array > 0).all():
- raise ValueError(f"'{array.name}' data must be larger than zero")
- return array.values
-
- def date_to_ordinal_accessor(
- array: xr.DataArray, strict: bool = True
- ) -> np.ndarray:
- """Take a DataArray and transform it into ordinals"""
- try:
- if np.issubdtype(array.dtype, np.integer):
- # Assume that data is ordinals
- return strict_positive_int_accessor(array)
-
- # Try transforming to ordinals
- return np.array(u_dt.datetime64_to_ordinal(array.values))
-
- # Handle access errors
- except (ValueError, TypeError, AttributeError) as err:
- if strict:
- raise err
-
- LOGGER.warning(
- "Failed to read values of '%s' as dates or ordinals. Hazard.date "
- "will be ones only",
- array.name,
- )
- return np.ones(array.shape)
-
- def year_month_day_accessor(
- array: xr.DataArray, strict: bool = True
- ) -> np.ndarray:
- """Take an array and return am array of YYYY-MM-DD strings"""
- try:
- return array.dt.strftime("%Y-%m-%d").values
-
- # Handle access errors
- except (ValueError, TypeError, AttributeError) as err:
- if strict:
- raise err
-
- LOGGER.warning(
- "Failed to read values of '%s' as dates. Hazard.event_name will be "
- "empty strings",
- array.name,
- )
- return np.full(array.shape, "")
-
- def maybe_repeat(values: np.ndarray, times: int) -> np.ndarray:
- """Return the array or repeat a single-valued array
-
- If ``values`` has size 1, return an array that repeats this value ``times``
- times. If the size is different, just return the array.
- """
- if values.size == 1:
- return np.array(list(itertools.repeat(values.flat[0], times)))
-
- return values
-
- # Create a DataFrame storing access information for each of data_vars
- # NOTE: Each row will be passed as arguments to
- # `load_from_xarray_or_return_default`, see its docstring for further
- # explanation of the DataFrame columns / keywords.
- num_events = data.sizes["event"]
- data_ident = pd.DataFrame(
- data=dict(
- # The attribute of the Hazard class where the data will be stored
- hazard_attr=DEF_DATA_VARS,
- # The identifier and default key used in this method
- default_key=DEF_DATA_VARS,
- # The key assigned by the user
- user_key=None,
- # The default value for each attribute
- default_value=[
- None,
- np.ones(num_events),
- np.array(range(num_events), dtype=int) + 1,
- list(
- year_month_day_accessor(
- data[coords["event"]], strict=False
- ).flat
- ),
- date_to_ordinal_accessor(data[coords["event"]], strict=False),
- ],
- # The accessor for the data in the Dataset
- accessor=[
- to_csr_matrix,
- lambda x: maybe_repeat(default_accessor(x), num_events),
- strict_positive_int_accessor,
- lambda x: list(maybe_repeat(default_accessor(x), num_events).flat),
- lambda x: maybe_repeat(date_to_ordinal_accessor(x), num_events),
- ],
+ reader_kwargs = {
+ "intensity": intensity,
+ "coordinate_vars": coordinate_vars,
+ "data_vars": data_vars,
+ "crs": crs,
+ "rechunk": rechunk,
+ }
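+        # Dispatch on the input type: an open dataset is handed to the reader
+        # directly, a file path is opened by the reader itself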
+ if isinstance(data, xr.Dataset):
+ reader = HazardXarrayReader(data=data, **reader_kwargs)
+ elif isinstance(data, xr.DataArray):
+ raise TypeError(
+ "This method only supports passing xr.Dataset. Consider promoting "
+ "your xr.DataArray to a Dataset."
)
- )
-
- # Check for unexpected keys
- data_vars = data_vars if data_vars is not None else {}
- default_keys = data_ident["default_key"]
- unknown_keys = [
- key for key in data_vars.keys() if not default_keys.str.contains(key).any()
- ]
- if unknown_keys:
- raise ValueError(
- f"Unknown data variables passed: '{unknown_keys}'. Supported "
- f"data variables are {list(default_keys)}."
- )
-
- # Update with keys provided by the user
- # NOTE: Keys in 'default_keys' missing from 'data_vars' will be set to 'None'
- # (which is exactly what we want) and the result is written into
- # 'user_key'. 'default_keys' is not modified.
- data_ident["user_key"] = default_keys.map(data_vars)
-
- def load_from_xarray_or_return_default(
- user_key: Optional[str],
- default_key: str,
- hazard_attr: str,
- accessor: Callable[[xr.DataArray], Any],
- default_value: Any,
- ) -> Any:
- """Load data for a single Hazard attribute or return the default value
-
- Does the following based on the ``user_key``:
- * If the key is an empty string, return the default value
- * If the key is a non-empty string, load the data for that key and return it.
- * If the key is ``None``, look for the ``default_key`` in the data. If it
- exists, return that data. If not, return the default value.
-
- Parameters
- ----------
- user_key : str or None
- The key set by the user to identify the DataArray to read data from.
- default_key : str
- The default key identifying the DataArray to read data from.
- hazard_attr : str
- The name of the attribute of ``Hazard`` where the data will be stored in.
- accessor : Callable
- A callable that takes the DataArray as argument and returns the data
- structure that is required by the ``Hazard`` attribute.
- default_value
- The default value/array to return in case the data could not be found.
-
- Returns
- -------
- The object that will be stored in the ``Hazard`` attribute ``hazard_attr``.
-
- Raises
- ------
- KeyError
- If ``user_key`` was a non-empty string but no such key was found in the
- data
- RuntimeError
- If the data structure loaded has a different shape than the default data
- structure
- """
- # User does not want to read data
- if user_key == "":
- LOGGER.debug(
- "Using default values for Hazard.%s per user request", hazard_attr
- )
- return default_value
-
- if not pd.isna(user_key):
- # Read key exclusively
- LOGGER.debug(
- "Reading data for Hazard.%s from DataArray '%s'",
- hazard_attr,
- user_key,
- )
- val = accessor(data[user_key])
- else:
- # Try default key
- try:
- val = accessor(data[default_key])
- LOGGER.debug(
- "Reading data for Hazard.%s from DataArray '%s'",
- hazard_attr,
- default_key,
- )
- except KeyError:
- LOGGER.debug(
- "Using default values for Hazard.%s. No data found", hazard_attr
- )
- return default_value
-
- def vshape(array):
- """Return a shape tuple for any array-like type we use"""
- if isinstance(array, list):
- return len(array)
- if isinstance(array, sparse.csr_matrix):
- return array.get_shape()
- return array.shape
-
- # Check size for read data
- if default_value is not None and not np.array_equal(
- vshape(val), vshape(default_value)
- ):
- raise RuntimeError(
- f"'{user_key if user_key else default_key}' must have shape "
- f"{vshape(default_value)}, but shape is {vshape(val)}"
- )
-
- # Return the data
- return val
-
- # Set the Hazard attributes
- for _, ident in data_ident.iterrows():
- hazard_kwargs[ident["hazard_attr"]] = load_from_xarray_or_return_default(
- **ident
+ else: # data is pathlike
+ reader = HazardXarrayReader.from_file(
+ filename=data, open_dataset_kws=open_dataset_kws, **reader_kwargs
)
- hazard_kwargs = cls._check_and_cast_attrs(hazard_kwargs)
-
- # Done!
- LOGGER.debug("Hazard successfully loaded. Number of events: %i", num_events)
- return cls(centroids=centroids, intensity=intensity_matrix, **hazard_kwargs)
+ kwargs = reader.get_hazard_kwargs() | {
+ "haz_type": hazard_type,
+ "units": intensity_unit,
+ }
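+        # Validate the collected attributes (e.g. cast 'event_name' entries to
+        # str) before constructing the Hazard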
+ return cls(**cls._check_and_cast_attrs(kwargs))
@staticmethod
def _check_and_cast_attrs(attrs: Dict[str, Any]) -> Dict[str, Any]:
- """Check the validity of the hazard attributes given and cast to correct type if required and possible.
+ """Check the validity of the hazard attributes given and cast to correct type if required
+ and possible.
The current purpose is to check that event_name is a list of string
- (and convert to string otherwise), although other checks and casting could be included here in the future.
+ (and convert to string otherwise), although other checks and casting could be included here
+ in the future.
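+
+    For example, an ``event_name`` value of ``[2000, 2001]`` would be cast to
+    ``["2000", "2001"]``, with a warning being issued.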
Parameters
----------
@@ -917,7 +622,8 @@ def _check_and_cast_attrs(attrs: Dict[str, Any]) -> Dict[str, Any]:
-------
attrs : dict
- Attributes checked for type validity and casted otherwise (only event_name at the moment).
+ Attributes checked for type validity and casted otherwise (only event_name at the
+ moment).
Warns
-----
@@ -961,8 +667,8 @@ def _check_and_cast_container(
def _check_and_cast_elements(
attr_value: Any, expected_dtype: Union[Any, None]
) -> Any:
- """Check if the elements of the container are of the expected dtype and cast if necessary,
- while preserving the original container type.
+ """Check if the elements of the container are of the expected dtype and cast if
+ necessary, while preserving the original container type.
Parameters
----------
@@ -975,7 +681,8 @@ def _check_and_cast_elements(
Returns
-------
attr_value : any
- The value with elements cast to the expected type, preserving the original container type.
+ The value with elements cast to the expected type, preserving the original
+ container type.
"""
if expected_dtype is None:
# No dtype enforcement required
diff --git a/climada/hazard/test/test_io.py b/climada/hazard/test/test_io.py
index 63e35291f..4d93e0c11 100644
--- a/climada/hazard/test/test_io.py
+++ b/climada/hazard/test/test_io.py
@@ -19,584 +19,13 @@
Test Hazard base class.
"""
-import datetime as dt
import unittest
-from pathlib import Path
-from tempfile import TemporaryDirectory
-from unittest.mock import patch
import numpy as np
-import xarray as xr
-from pyproj import CRS
-from scipy.sparse import csr_matrix
from climada.hazard.base import Hazard
from climada.hazard.test.test_base import DATA_DIR, dummy_hazard
-from climada.util.constants import DEF_CRS, DEF_FREQ_UNIT, HAZ_DEMO_FL, HAZ_TEMPLATE_XLS
-
-
-class TestReadDefaultNetCDF(unittest.TestCase):
- """Test reading a NetCDF file where the coordinates to read match the dimensions"""
-
- @classmethod
- def setUpClass(cls):
- """Write a simple NetCDF file to read"""
- cls.tempdir = TemporaryDirectory()
- cls.netcdf_path = Path(cls.tempdir.name) / "default.nc"
- cls.intensity = np.array([[[0, 1, 2], [3, 4, 5]], [[6, 7, 8], [9, 10, 11]]])
- cls.time = np.array([dt.datetime(1999, 1, 1), dt.datetime(2000, 1, 1)])
- cls.latitude = np.array([0, 1])
- cls.longitude = np.array([0, 1, 2])
- dset = xr.Dataset(
- {
- "intensity": (["time", "latitude", "longitude"], cls.intensity),
- },
- dict(time=cls.time, latitude=cls.latitude, longitude=cls.longitude),
- )
- dset.to_netcdf(cls.netcdf_path)
-
- @classmethod
- def tearDownClass(cls):
- """Delete the NetCDF file"""
- cls.tempdir.cleanup()
-
- def _assert_default(self, hazard):
- """Assertions for the default hazard to be loaded"""
- self._assert_default_types(hazard)
- self._assert_default_values(hazard)
-
- def _assert_default_values(self, hazard):
- """Check the values of the default hazard to be loaded"""
- # Hazard data
- self.assertEqual(hazard.haz_type, "")
- self.assertEqual(hazard.units, "")
- np.testing.assert_array_equal(hazard.event_id, [1, 2])
- np.testing.assert_array_equal(
- hazard.event_name, [x.strftime("%Y-%m-%d") for x in self.time]
- )
- np.testing.assert_array_equal(
- hazard.date, [val.toordinal() for val in self.time]
- )
- np.testing.assert_array_equal(hazard.frequency, np.ones(hazard.event_id.size))
-
- # Centroids
- np.testing.assert_array_equal(hazard.centroids.lat, [0, 0, 0, 1, 1, 1])
- np.testing.assert_array_equal(hazard.centroids.lon, [0, 1, 2, 0, 1, 2])
- self.assertEqual(hazard.centroids.geometry.crs, DEF_CRS)
-
- # Intensity data
- np.testing.assert_array_equal(
- hazard.intensity.toarray(), [[0, 1, 2, 3, 4, 5], [6, 7, 8, 9, 10, 11]]
- )
-
- # Fraction default
- self.assertEqual(hazard.fraction.nnz, 0)
- np.testing.assert_array_equal(hazard.fraction.shape, hazard.intensity.shape)
- np.testing.assert_array_equal(
- hazard.fraction.toarray(), np.zeros_like(hazard.intensity.toarray())
- )
-
- def _assert_default_types(self, hazard):
- """Check types of all hazard attributes"""
- self.assertIsInstance(hazard.units, str)
- self.assertIsInstance(hazard.haz_type, str)
- self.assertIsInstance(hazard.event_id, np.ndarray)
- self.assertIsInstance(hazard.event_name, list)
- self.assertIsInstance(hazard.frequency, np.ndarray)
- self.assertIsInstance(hazard.intensity, csr_matrix)
- self.assertIsInstance(hazard.fraction, csr_matrix)
- self.assertIsInstance(hazard.date, np.ndarray)
-
- def test_load_path(self):
- """Load the data with path as argument"""
- hazard = Hazard.from_xarray_raster_file(self.netcdf_path, "", "")
- self._assert_default(hazard)
-
- # Check wrong paths
- with self.assertRaises(FileNotFoundError) as cm:
- Hazard.from_xarray_raster_file("file-does-not-exist.nc", "", "")
- self.assertIn("file-does-not-exist.nc", str(cm.exception))
- with self.assertRaises(KeyError) as cm:
- Hazard.from_xarray_raster_file(
- self.netcdf_path, "", "", intensity="wrong-intensity-path"
- )
- self.assertIn("wrong-intensity-path", str(cm.exception))
-
- def test_load_dataset(self):
- """Load the data from an opened dataset as argument"""
-
- def _load_and_assert(chunks):
- with xr.open_dataset(self.netcdf_path, chunks=chunks) as dataset:
- hazard = Hazard.from_xarray_raster(dataset, "", "")
- self._assert_default(hazard)
-
- _load_and_assert(chunks=None)
- _load_and_assert(chunks=dict(latitude=1, longitude=1, time=1))
-
- def test_type_error(self):
- """Calling 'from_xarray_raster' with wrong data type should throw"""
- # Passing a path
- with self.assertRaises(TypeError) as cm:
- Hazard.from_xarray_raster(self.netcdf_path, "", "")
- self.assertIn(
- "Use Hazard.from_xarray_raster_file instead",
- str(cm.exception),
- )
-
- # Passing a DataArray
- with xr.open_dataset(self.netcdf_path) as dset, self.assertRaises(
- TypeError
- ) as cm:
- Hazard.from_xarray_raster(dset["intensity"], "", "")
- self.assertIn(
- "This method only supports xarray.Dataset as input data",
- str(cm.exception),
- )
-
- def test_type_and_unit(self):
- """Test passing a custom type and unit"""
- hazard = Hazard.from_xarray_raster_file(
- self.netcdf_path, hazard_type="TC", intensity_unit="m/s"
- )
- self._assert_default_types(hazard)
- self.assertEqual(hazard.haz_type, "TC")
- self.assertEqual(hazard.units, "m/s")
-
- def test_event_no_time(self):
- """Test if an event coordinate that is not a time works"""
- with xr.open_dataset(self.netcdf_path) as dataset:
- size = dataset.sizes["time"]
-
- # Positive integers (interpreted as ordinals)
- time = [2, 1]
- dataset["time"] = time
- hazard = Hazard.from_xarray_raster(dataset, "", "")
- self._assert_default_types(hazard)
- np.testing.assert_array_equal(
- hazard.intensity.toarray(), [[0, 1, 2, 3, 4, 5], [6, 7, 8, 9, 10, 11]]
- )
- np.testing.assert_array_equal(hazard.date, time)
- np.testing.assert_array_equal(hazard.event_name, np.full(size, ""))
-
- # Strings
- dataset["time"] = ["a", "b"]
- with self.assertLogs("climada.hazard.io", "WARNING") as cm:
- hazard = Hazard.from_xarray_raster(dataset, "", "")
- np.testing.assert_array_equal(hazard.date, np.ones(size))
- np.testing.assert_array_equal(hazard.event_name, np.full(size, ""))
- self.assertIn("Failed to read values of 'time' as dates.", cm.output[0])
- self.assertIn(
- "Failed to read values of 'time' as dates or ordinals.", cm.output[1]
- )
-
- def test_data_vars(self):
- """Check handling of data variables"""
- with xr.open_dataset(self.netcdf_path) as dataset:
- size = dataset.sizes["time"]
-
- # Set optionals in the dataset
- frequency = np.ones(size) * 1.5
- event_id = np.array(range(size), dtype=np.int64) + 3
- event_name = ["bla"] * size
- date = np.array(range(size)) + 100
- dataset["event_id"] = event_id
- dataset["event_name"] = event_name
- dataset["date"] = date
-
- # Assign a proper coordinate for a change
- dataset = dataset.assign_coords(dict(frequency=("time", frequency)))
-
- # Assign fraction
- frac = xr.DataArray(
- np.array([[[0, 0, 0], [0, 0, 0]], [[1, 1, 1], [1, 1, 1]]]),
- dims=["time", "latitude", "longitude"],
- coords=dict(
- time=self.time, latitude=self.latitude, longitude=self.longitude
- ),
- )
- dataset["fraction"] = frac
-
- # Optionals should be read automatically
- hazard = Hazard.from_xarray_raster(dataset, "", "")
- self._assert_default_types(hazard)
- np.testing.assert_array_equal(hazard.frequency, frequency)
- np.testing.assert_array_equal(hazard.event_id, event_id)
- np.testing.assert_array_equal(hazard.event_name, event_name)
- np.testing.assert_array_equal(hazard.date, date)
- np.testing.assert_array_equal(
- hazard.fraction.toarray(), [[0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1]]
- )
-
- # Ignore keys (should be default values)
- hazard = Hazard.from_xarray_raster(
- dataset,
- "",
- "",
- data_vars=dict(
- frequency="", event_id="", event_name="", date="", fraction=""
- ),
- )
- self._assert_default(hazard)
-
- # Wrong key
- with self.assertRaises(ValueError) as cm:
- Hazard.from_xarray_raster(
- dataset, "", "", data_vars=dict(wrong_key="stuff")
- )
- self.assertIn(
- "Unknown data variables passed: '['wrong_key']'.", str(cm.exception)
- )
-
- # Non-existent identifier
- with self.assertRaises(KeyError) as cm:
- Hazard.from_xarray_raster(
- dataset, "", "", data_vars=dict(frequency="freqqqqq")
- )
- self.assertIn("freqqqqq", str(cm.exception))
-
- # Wrong data length
- # NOTE: This also implicitly checks that 'frequency' is not read!
- dataset["freq"] = np.array(range(size + 1), dtype=np.float64)
- with self.assertRaises(RuntimeError) as cm:
- Hazard.from_xarray_raster(
- dataset, "", "", data_vars=dict(frequency="freq")
- )
- self.assertIn(
- f"'freq' must have shape ({size},), but shape is ({size + 1},)",
- str(cm.exception),
- )
-
- # Integer data assertions
- dset = dataset.copy(deep=True)
- dset["event_id"] = np.array(range(size), dtype=np.float64) + 3.5
- with self.assertRaises(TypeError) as cm:
- Hazard.from_xarray_raster(dset, "", "")
- self.assertIn("'event_id' data array must be integers", str(cm.exception))
- dset["event_id"] = np.linspace(0, 10, size, dtype=np.int64)
- with self.assertRaises(ValueError) as cm:
- Hazard.from_xarray_raster(dset, "", "")
- self.assertIn("'event_id' data must be larger than zero", str(cm.exception))
-
- # Date as datetime
- date_str = [f"2000-01-{i:02}" for i in range(1, size + 1)]
- dataset["date"] = date_str
- hazard = Hazard.from_xarray_raster(dataset, "", "")
- np.testing.assert_array_equal(
- hazard.date,
- [dt.datetime(2000, 1, i).toordinal() for i in range(1, size + 1)],
- )
-
- def test_data_vars_repeat(self):
- """Test if suitable data vars are repeated as expected"""
- with xr.open_dataset(self.netcdf_path) as dataset:
- size = dataset.sizes["time"]
-
- # Set optionals in the dataset
- frequency = [1.5]
- event_name = ["bla"]
- date = 1
- dataset["event_name"] = event_name
- dataset["date"] = date
- dataset["frequency"] = frequency
-
- # Check if single-valued arrays are repeated
- hazard = Hazard.from_xarray_raster(dataset, "", "")
-
- np.testing.assert_array_equal(hazard.date, [date] * size)
- np.testing.assert_array_equal(hazard.event_name, event_name * size)
- np.testing.assert_array_equal(hazard.frequency, frequency * size)
-
- def test_nan(self):
- """Check handling of NaNs in original data"""
- with xr.open_dataset(self.netcdf_path) as dataset:
- intensity = xr.DataArray(
- np.array([[[0, np.nan, 2], [3, 4, 5]], [[6, np.nan, 8], [9, 10, 11]]]),
- dims=["time", "latitude", "longitude"],
- coords=dict(
- time=self.time, latitude=self.latitude, longitude=self.longitude
- ),
- )
- dataset["intensity"] = intensity
- fraction = xr.DataArray(
- np.array([[[0, 0, 0], [0, 0, 0]], [[1, np.nan, 1], [np.nan, 1, 1]]]),
- dims=["time", "latitude", "longitude"],
- coords=dict(
- time=self.time, latitude=self.latitude, longitude=self.longitude
- ),
- )
- dataset["fraction"] = fraction
- frequency = np.ones(dataset.sizes["time"])
- frequency[0] = np.nan
- dataset["frequency"] = frequency
-
- # Load hazard
- hazard = Hazard.from_xarray_raster(dataset, "", "")
-
- # Check types
- self._assert_default_types(hazard)
-
- # NaNs are set to zero in sparse data
- np.testing.assert_array_equal(
- hazard.intensity.toarray(),
- [[0, 0, 2, 3, 4, 5], [6, 0, 8, 9, 10, 11]],
- )
- np.testing.assert_array_equal(
- hazard.fraction.toarray(),
- [[0, 0, 0, 0, 0, 0], [1, 0, 1, 0, 1, 1]],
- )
-
- # NaNs are propagated in dense data
- np.testing.assert_array_equal(hazard.frequency, frequency)
-
- def test_crs(self):
- """Check if different CRS inputs are handled correctly"""
-
- def test_crs_from_input(crs_input):
- crs = CRS.from_user_input(crs_input)
- hazard = Hazard.from_xarray_raster_file(
- self.netcdf_path, "", "", crs=crs_input
- )
- self.assertEqual(hazard.centroids.geometry.crs, crs)
-
- test_crs_from_input("EPSG:3857")
- test_crs_from_input(3857)
- test_crs_from_input("+proj=cea +lat_0=52.112866 +lon_0=5.150162 +units=m")
-
- def test_missing_dims(self):
- """Test if missing coordinates are expanded and correct errors are thrown"""
- # Drop time as dimension, but not as coordinate!
- with xr.open_dataset(self.netcdf_path) as ds:
- ds = ds.isel(time=0).squeeze()
- hazard = Hazard.from_xarray_raster(ds, "", "")
- self._assert_default_types(hazard)
- np.testing.assert_array_equal(
- hazard.event_name, [self.time[0].strftime("%Y-%m-%d")]
- )
- np.testing.assert_array_equal(hazard.date, [self.time[0].toordinal()])
- np.testing.assert_array_equal(hazard.centroids.lat, [0, 0, 0, 1, 1, 1])
- np.testing.assert_array_equal(hazard.centroids.lon, [0, 1, 2, 0, 1, 2])
- self.assertEqual(hazard.centroids.geometry.crs, DEF_CRS)
- np.testing.assert_array_equal(
- hazard.intensity.toarray(), [[0, 1, 2, 3, 4, 5]]
- )
- self.assertEqual(hazard.fraction.nnz, 0)
- np.testing.assert_array_equal(
- hazard.fraction.toarray(), [[0, 0, 0, 0, 0, 0]]
- )
-
- # Now drop variable altogether, should raise an error
- ds = ds.drop_vars("time")
- with self.assertRaisesRegex(RuntimeError, "time"):
- Hazard.from_xarray_raster(ds, "", "")
-
- # Expand time again
- ds = ds.expand_dims(time=[np.datetime64("2022-01-01")])
- hazard = Hazard.from_xarray_raster(ds, "", "")
- self._assert_default_types(hazard)
- np.testing.assert_array_equal(hazard.event_name, ["2022-01-01"])
- np.testing.assert_array_equal(
- hazard.date, [dt.datetime(2022, 1, 1).toordinal()]
- )
-
-
-class TestReadDimsCoordsNetCDF(unittest.TestCase):
- """Checks for dimensions and coordinates with different names and shapes"""
-
- @classmethod
- def setUpClass(cls):
- """Write a NetCDF file with many coordinates"""
- cls.tempdir = TemporaryDirectory()
- cls.netcdf_path = Path(cls.tempdir.name) / "coords.nc"
- cls.intensity = np.array([[[0, 1, 2], [3, 4, 5]]])
- cls.fraction = np.array([[[0, 0, 0], [1, 1, 1]]])
- cls.time = np.array([dt.datetime(2000, 1, 1)])
- cls.x = np.array([0, 1, 2])
- cls.y = np.array([0, 1])
- cls.lon = np.array([1, 2, 3])
- cls.lat = np.array([1, 2])
- cls.years = np.array([dt.datetime(1999, 2, 2)])
- cls.longitude = np.array([[10, 11, 12], [10, 11, 12]])
- cls.latitude = np.array([[100, 100, 100], [200, 200, 200]])
-
- dset = xr.Dataset(
- {
- "intensity": (["time", "y", "x"], cls.intensity),
- "fraction": (["time", "y", "x"], cls.fraction),
- },
- {
- "time": cls.time,
- "x": cls.x,
- "y": cls.y,
- "lon": (["x"], cls.lon),
- "lat": (["y"], cls.lat),
- "years": (["time"], cls.years),
- "latitude": (["y", "x"], cls.latitude),
- "longitude": (["y", "x"], cls.longitude),
- },
- )
- dset.to_netcdf(cls.netcdf_path)
-
- @classmethod
- def tearDownClass(cls):
- """Delete the NetCDF file"""
- cls.tempdir.cleanup()
-
- def _assert_intensity_fraction(self, hazard):
- """Check if intensity and fraction data are read correctly"""
- np.testing.assert_array_equal(hazard.intensity.toarray(), [[0, 1, 2, 3, 4, 5]])
- np.testing.assert_array_equal(hazard.fraction.toarray(), [[0, 0, 0, 1, 1, 1]])
-
- def test_dimension_naming(self):
- """Test if dimensions with different names can be read"""
- hazard = Hazard.from_xarray_raster_file(
- self.netcdf_path,
- "",
- "",
- coordinate_vars=dict(latitude="y", longitude="x"), # 'time' stays default
- )
- np.testing.assert_array_equal(hazard.centroids.lat, [0, 0, 0, 1, 1, 1])
- np.testing.assert_array_equal(hazard.centroids.lon, [0, 1, 2, 0, 1, 2])
- np.testing.assert_array_equal(
- hazard.date, [val.toordinal() for val in self.time]
- )
- self._assert_intensity_fraction(hazard)
-
- def test_coordinate_naming(self):
- """Test if coordinates with different names than dimensions can be read"""
- hazard = Hazard.from_xarray_raster_file(
- self.netcdf_path,
- "",
- "",
- coordinate_vars=dict(latitude="lat", longitude="lon", event="years"),
- )
- np.testing.assert_array_equal(hazard.centroids.lat, [1, 1, 1, 2, 2, 2])
- np.testing.assert_array_equal(hazard.centroids.lon, [1, 2, 3, 1, 2, 3])
- np.testing.assert_array_equal(
- hazard.date, [val.toordinal() for val in self.years]
- )
- self._assert_intensity_fraction(hazard)
-
- def test_2D_coordinates(self):
- """Test if read method correctly handles 2D coordinates"""
- hazard = Hazard.from_xarray_raster_file(
- self.netcdf_path,
- "",
- "",
- coordinate_vars=dict(latitude="latitude", longitude="longitude"),
- )
- np.testing.assert_array_equal(
- hazard.centroids.lat, [100, 100, 100, 200, 200, 200]
- )
- np.testing.assert_array_equal(hazard.centroids.lon, [10, 11, 12, 10, 11, 12])
- self._assert_intensity_fraction(hazard)
-
- def test_load_dataset_rechunk(self):
- """Load the data from an opened dataset and force rechunking"""
- with xr.open_dataset(self.netcdf_path) as dataset:
- hazard = Hazard.from_xarray_raster(
- dataset,
- "",
- "",
- coordinate_vars=dict(latitude="latitude", longitude="longitude"),
- rechunk=True,
- )
-
- np.testing.assert_array_equal(
- hazard.centroids.lat, [100, 100, 100, 200, 200, 200]
- )
- np.testing.assert_array_equal(hazard.centroids.lon, [10, 11, 12, 10, 11, 12])
- self._assert_intensity_fraction(hazard)
-
- # Assert that .chunk is called the right way
- with patch("xarray.Dataset.chunk") as mock:
- with xr.open_dataset(self.netcdf_path) as dataset:
- mock.return_value = dataset
- Hazard.from_xarray_raster(
- dataset,
- "",
- "",
- coordinate_vars=dict(latitude="latitude", longitude="longitude"),
- rechunk=True,
- )
-
- # First latitude dim, then longitude dim, then event dim
- mock.assert_called_once_with(chunks=dict(y=-1, x=-1, time="auto"))
-
- # Should not be called by default
- mock.reset_mock()
- with xr.open_dataset(self.netcdf_path) as dataset:
- mock.return_value = dataset
- Hazard.from_xarray_raster(
- dataset,
- "",
- "",
- coordinate_vars=dict(latitude="latitude", longitude="longitude"),
- )
-
- mock.assert_not_called()
-
- def test_2D_time(self):
- """Test if stacking multiple time dimensions works out"""
- time = np.array(
- [
- [dt.datetime(1999, 1, 1), dt.datetime(1999, 2, 1)],
- [dt.datetime(2000, 1, 1), dt.datetime(2000, 2, 1)],
- ]
- )
- ds = xr.Dataset(
- {
- "intensity": (
- ["year", "month", "latitude", "longitude"],
- [[[[1]], [[2]]], [[[3]], [[4]]]],
- ),
- "fraction": (
- ["year", "month", "latitude", "longitude"],
- [[[[10]], [[20]]], [[[30]], [[40]]]],
- ),
- },
- {
- "latitude": [1],
- "longitude": [2],
- "year": [1999, 2000],
- "month": [1, 2],
- "time": (["year", "month"], time),
- },
- )
- hazard = Hazard.from_xarray_raster(ds, "", "")
-
- np.testing.assert_array_equal(hazard.intensity.toarray(), [[1], [2], [3], [4]])
- np.testing.assert_array_equal(
- hazard.fraction.toarray(), [[10], [20], [30], [40]]
- )
- np.testing.assert_array_equal(hazard.centroids.lat, [1])
- np.testing.assert_array_equal(hazard.centroids.lon, [2])
- np.testing.assert_array_equal(
- hazard.date, [val.toordinal() for val in time.flat]
- )
- np.testing.assert_array_equal(
- hazard.event_name, ["1999-01-01", "1999-02-01", "2000-01-01", "2000-02-01"]
- )
-
- def test_errors(self):
- """Check if expected errors are thrown"""
- # Wrong coordinate key
- with self.assertRaises(ValueError) as cm:
- Hazard.from_xarray_raster_file(
- self.netcdf_path,
- "",
- "",
- coordinate_vars=dict(bar="latitude", longitude="baz"),
- )
- self.assertIn("Unknown coordinates passed: '['bar']'.", str(cm.exception))
-
- # Correctly specified, but the custom dimension does not exist
- with self.assertRaisesRegex(RuntimeError, "lalalatitude"):
- Hazard.from_xarray_raster_file(
- self.netcdf_path,
- "",
- "",
- coordinate_vars=dict(latitude="lalalatitude"),
- )
+from climada.util.constants import DEF_FREQ_UNIT, HAZ_TEMPLATE_XLS
class TestReaderExcel(unittest.TestCase):
@@ -688,10 +117,6 @@ class CustomID:
# Execute Tests
if __name__ == "__main__":
- TESTS = unittest.TestLoader().loadTestsFromTestCase(TestReadDefaultNetCDF)
- TESTS.addTests(
- unittest.TestLoader().loadTestsFromTestCase(TestReadDimsCoordsNetCDF)
- )
- TESTS.addTests(unittest.TestLoader().loadTestsFromTestCase(TestReaderExcel))
+ TESTS = unittest.TestLoader().loadTestsFromTestCase(TestReaderExcel)
TESTS.addTests(unittest.TestLoader().loadTestsFromTestCase(TestHDF5))
unittest.TextTestRunner(verbosity=2).run(TESTS)
diff --git a/climada/hazard/test/test_xarray.py b/climada/hazard/test/test_xarray.py
new file mode 100644
index 000000000..cf481e3c4
--- /dev/null
+++ b/climada/hazard/test/test_xarray.py
@@ -0,0 +1,605 @@
+"""
+This file is part of CLIMADA.
+
+Copyright (C) 2017 ETH Zurich, CLIMADA contributors listed in AUTHORS.
+
+CLIMADA is free software: you can redistribute it and/or modify it under the
+terms of the GNU General Public License as published by the Free
+Software Foundation, version 3.
+
+CLIMADA is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with CLIMADA. If not, see <https://www.gnu.org/licenses/>.
+
+---
+
+Test Hazard base class Xarray reader.
+"""
+
+import datetime as dt
+import unittest
+from pathlib import Path
+from tempfile import TemporaryDirectory
+from unittest.mock import patch
+
+import numpy as np
+import xarray as xr
+from pyproj import CRS
+from scipy.sparse import csr_matrix
+
+from climada.hazard.base import Hazard
+from climada.util.constants import DEF_CRS
+
+
+class TestReadDefaultNetCDF(unittest.TestCase):
+ """Test reading a NetCDF file where the coordinates to read match the dimensions"""
+
+ @classmethod
+ def setUpClass(cls):
+ """Write a simple NetCDF file to read"""
+ cls.tempdir = TemporaryDirectory()
+ cls.netcdf_path = Path(cls.tempdir.name) / "default.nc"
+ cls.intensity = np.array([[[0, 1, 2], [3, 4, 5]], [[6, 7, 8], [9, 10, np.nan]]])
+ cls.time = np.array([dt.datetime(1999, 1, 1), dt.datetime(2000, 1, 1)])
+ cls.latitude = np.array([0, 1])
+ cls.longitude = np.array([0, 1, 2])
+ cls.dset = xr.Dataset(
+ {
+ "intensity": (["time", "latitude", "longitude"], cls.intensity),
+ },
+ dict(time=cls.time, latitude=cls.latitude, longitude=cls.longitude),
+ )
+ cls.dset.to_netcdf(cls.netcdf_path)
+
+ @classmethod
+ def tearDownClass(cls):
+ """Delete the NetCDF file"""
+ cls.tempdir.cleanup()
+
+ def _assert_default(self, hazard):
+ """Assertions for the default hazard to be loaded"""
+ self._assert_default_types(hazard)
+ self._assert_default_values(hazard)
+
+ def _assert_default_values(self, hazard):
+ """Check the values of the default hazard to be loaded"""
+ # Hazard data
+ self.assertEqual(hazard.haz_type, "")
+ self.assertEqual(hazard.units, "")
+ np.testing.assert_array_equal(hazard.event_id, [1, 2])
+ np.testing.assert_array_equal(
+ hazard.event_name, [x.strftime("%Y-%m-%d") for x in self.time]
+ )
+ np.testing.assert_array_equal(
+ hazard.date, [val.toordinal() for val in self.time]
+ )
+ np.testing.assert_array_equal(hazard.frequency, np.ones(hazard.event_id.size))
+
+ # Centroids
+ np.testing.assert_array_equal(hazard.centroids.lat, [0, 0, 0, 1, 1, 1])
+ np.testing.assert_array_equal(hazard.centroids.lon, [0, 1, 2, 0, 1, 2])
+ self.assertEqual(hazard.centroids.geometry.crs, DEF_CRS)
+
+ # Intensity data
+ # NOTE: NaN converted to zero
+ self.assertEqual(hazard.intensity.nnz, 10)
+ np.testing.assert_array_equal(
+ hazard.intensity.toarray(), [[0, 1, 2, 3, 4, 5], [6, 7, 8, 9, 10, 0]]
+ )
+
+ # Fraction default
+ self.assertEqual(hazard.fraction.nnz, 0)
+ np.testing.assert_array_equal(hazard.fraction.shape, hazard.intensity.shape)
+ np.testing.assert_array_equal(
+ hazard.fraction.toarray(), np.zeros_like(hazard.intensity.toarray())
+ )
+
+ def _assert_default_types(self, hazard):
+ """Check types of all hazard attributes"""
+ self.assertIsInstance(hazard.units, str)
+ self.assertIsInstance(hazard.haz_type, str)
+ self.assertIsInstance(hazard.event_id, np.ndarray)
+ self.assertIsInstance(hazard.event_name, list)
+ self.assertIsInstance(hazard.frequency, np.ndarray)
+ self.assertIsInstance(hazard.intensity, csr_matrix)
+ self.assertIsInstance(hazard.fraction, csr_matrix)
+ self.assertIsInstance(hazard.date, np.ndarray)
+
+ def test_load_path(self):
+ """Load the data with path as argument"""
+ hazard = Hazard.from_xarray_raster(self.netcdf_path, "", "")
+ self._assert_default(hazard)
+
+ # Check deprecated method
+ hazard = Hazard.from_xarray_raster_file(self.netcdf_path, "", "")
+ self._assert_default(hazard)
+
+ # Check wrong paths
+ with self.assertRaisesRegex(FileNotFoundError, "file-does-not-exist.nc"):
+ Hazard.from_xarray_raster("file-does-not-exist.nc", "", "")
+ with self.assertRaisesRegex(KeyError, "wrong-intensity-path"):
+ Hazard.from_xarray_raster(
+ self.netcdf_path, "", "", intensity="wrong-intensity-path"
+ )
+
+ @patch("climada.hazard.xarray.xr.open_dataset")
+ def test_load_dataset(self, open_dataset_mock):
+ """Load the data from an opened dataset as argument"""
+ open_dataset_mock.return_value.__enter__.return_value = self.dset
+
+ def _load_and_assert(**kwargs):
+ hazard = Hazard.from_xarray_raster(
+ self.netcdf_path, "", "", open_dataset_kws=kwargs
+ )
+ self._assert_default(hazard)
+
+ _load_and_assert()
+ open_dataset_mock.assert_called_once_with(self.netcdf_path, chunks="auto")
+ open_dataset_mock.reset_mock()
+ _load_and_assert(chunks=dict(latitude=1, longitude=1, time=1), engine="netcdf4")
+ open_dataset_mock.assert_called_once_with(
+ self.netcdf_path,
+ chunks=dict(latitude=1, longitude=1, time=1),
+ engine="netcdf4",
+ )
+
+ def test_type_error(self):
+ """Calling 'from_xarray_raster' with wrong data type should throw"""
+ # Passing a DataArray
+ with xr.open_dataset(self.netcdf_path) as dset, self.assertRaisesRegex(
+ TypeError, "This method only supports passing xr.Dataset"
+ ):
+ Hazard.from_xarray_raster(dset["intensity"], "", "")
+
+ def test_type_and_unit(self):
+ """Test passing a custom type and unit"""
+ hazard = Hazard.from_xarray_raster_file(
+ self.netcdf_path, hazard_type="TC", intensity_unit="m/s"
+ )
+ self._assert_default_types(hazard)
+ self.assertEqual(hazard.haz_type, "TC")
+ self.assertEqual(hazard.units, "m/s")
+
+ def test_event_no_time(self):
+ """Test if an event coordinate that is not a time works"""
+ with xr.open_dataset(self.netcdf_path) as dataset:
+ size = dataset.sizes["time"]
+
+ # Positive integers (interpreted as ordinals)
+ time = [2, 1]
+ dataset["time"] = time
+ hazard = Hazard.from_xarray_raster(dataset, "", "")
+ self._assert_default_types(hazard)
+ np.testing.assert_array_equal(
+ hazard.intensity.toarray(), [[0, 1, 2, 3, 4, 5], [6, 7, 8, 9, 10, 0]]
+ )
+ np.testing.assert_array_equal(hazard.date, time)
+ np.testing.assert_array_equal(hazard.event_name, np.full(size, ""))
+
+ # Strings
+ dataset["time"] = ["a", "b"]
+ with self.assertLogs("climada.hazard.xarray", "WARNING") as cm:
+ hazard = Hazard.from_xarray_raster(dataset, "", "")
+ np.testing.assert_array_equal(hazard.date, np.ones(size))
+ np.testing.assert_array_equal(hazard.event_name, np.full(size, ""))
+ self.assertIn("Failed to read values of 'time' as dates.", cm.output[0])
+ self.assertIn(
+ "Failed to read values of 'time' as dates or ordinals.", cm.output[1]
+ )
+
+ def test_data_vars(self):
+ """Check handling of data variables"""
+ with xr.open_dataset(self.netcdf_path) as dataset:
+ size = dataset.sizes["time"]
+
+ # Set optionals in the dataset
+ frequency = np.ones(size) * 1.5
+ event_id = np.array(range(size), dtype=np.int64) + 3
+ event_name = ["bla"] * size
+ date = np.array(range(size)) + 100
+ dataset["event_id"] = event_id
+ dataset["event_name"] = event_name
+ dataset["date"] = date
+
+ # Assign a proper coordinate for a change
+ dataset = dataset.assign_coords(dict(frequency=("time", frequency)))
+
+ # Assign fraction
+ frac = xr.DataArray(
+ np.array([[[0, 0, 0], [0, 0, 0]], [[1, 1, 1], [1, 1, 1]]]),
+ dims=["time", "latitude", "longitude"],
+ coords=dict(
+ time=self.time, latitude=self.latitude, longitude=self.longitude
+ ),
+ )
+ dataset["fraction"] = frac
+
+ # Optionals should be read automatically
+ hazard = Hazard.from_xarray_raster(dataset, "", "")
+ self._assert_default_types(hazard)
+ np.testing.assert_array_equal(hazard.frequency, frequency)
+ np.testing.assert_array_equal(hazard.event_id, event_id)
+ np.testing.assert_array_equal(hazard.event_name, event_name)
+ np.testing.assert_array_equal(hazard.date, date)
+ np.testing.assert_array_equal(
+ hazard.fraction.toarray(), [[0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1]]
+ )
+
+ # Ignore keys (should be default values)
+ hazard = Hazard.from_xarray_raster(
+ dataset,
+ "",
+ "",
+ data_vars=dict(
+ frequency="", event_id="", event_name="", date="", fraction=""
+ ),
+ )
+ self._assert_default(hazard)
+
+ # Wrong key
+ with self.assertRaises(ValueError) as cm:
+ Hazard.from_xarray_raster(
+ dataset, "", "", data_vars=dict(wrong_key="stuff")
+ )
+ self.assertIn(
+ "Unknown data variables passed: '['wrong_key']'.", str(cm.exception)
+ )
+
+ # Non-existent identifier
+ with self.assertRaises(KeyError) as cm:
+ Hazard.from_xarray_raster(
+ dataset, "", "", data_vars=dict(frequency="freqqqqq")
+ )
+ self.assertIn("freqqqqq", str(cm.exception))
+
+ # Wrong data length
+ # NOTE: This also implicitly checks that 'frequency' is not read!
+ dataset["freq"] = np.array(range(size + 1), dtype=np.float64)
+ with self.assertRaises(RuntimeError) as cm:
+ Hazard.from_xarray_raster(
+ dataset, "", "", data_vars=dict(frequency="freq")
+ )
+ self.assertIn(
+ f"'freq' must have shape ({size},), but shape is ({size + 1},)",
+ str(cm.exception),
+ )
+
+ # Integer data assertions
+ dset = dataset.copy(deep=True)
+ dset["event_id"] = np.array(range(size), dtype=np.float64) + 3.5
+ with self.assertRaises(TypeError) as cm:
+ Hazard.from_xarray_raster(dset, "", "")
+ self.assertIn("'event_id' data array must be integers", str(cm.exception))
+ dset["event_id"] = np.linspace(0, 10, size, dtype=np.int64)
+ with self.assertRaises(ValueError) as cm:
+ Hazard.from_xarray_raster(dset, "", "")
+ self.assertIn("'event_id' data must be larger than zero", str(cm.exception))
+
+ # Date as datetime
+ date_str = [f"2000-01-{i:02}" for i in range(1, size + 1)]
+ dataset["date"] = date_str
+ hazard = Hazard.from_xarray_raster(dataset, "", "")
+ np.testing.assert_array_equal(
+ hazard.date,
+ [dt.datetime(2000, 1, i).toordinal() for i in range(1, size + 1)],
+ )
+
+ def test_data_vars_repeat(self):
+ """Test if suitable data vars are repeated as expected"""
+ with xr.open_dataset(self.netcdf_path) as dataset:
+ size = dataset.sizes["time"]
+
+ # Set optionals in the dataset
+ frequency = [1.5]
+ event_name = ["bla"]
+ date = 1
+ dataset["event_name"] = event_name
+ dataset["date"] = date
+ dataset["frequency"] = frequency
+
+ # Check if single-valued arrays are repeated
+ hazard = Hazard.from_xarray_raster(dataset, "", "")
+
+ np.testing.assert_array_equal(hazard.date, [date] * size)
+ np.testing.assert_array_equal(hazard.event_name, event_name * size)
+ np.testing.assert_array_equal(hazard.frequency, frequency * size)
+
+ def test_nan(self):
+ """Check handling of NaNs in original data"""
+ with xr.open_dataset(self.netcdf_path) as dataset:
+ intensity = xr.DataArray(
+ np.array([[[0, np.nan, 2], [3, 4, 5]], [[6, np.nan, 8], [9, 10, 11]]]),
+ dims=["time", "latitude", "longitude"],
+ coords=dict(
+ time=self.time, latitude=self.latitude, longitude=self.longitude
+ ),
+ )
+ dataset["intensity"] = intensity
+ fraction = xr.DataArray(
+ np.array([[[0, 0, 0], [0, 0, 0]], [[1, np.nan, 1], [np.nan, 1, 1]]]),
+ dims=["time", "latitude", "longitude"],
+ coords=dict(
+ time=self.time, latitude=self.latitude, longitude=self.longitude
+ ),
+ )
+ dataset["fraction"] = fraction
+ frequency = np.ones(dataset.sizes["time"])
+ frequency[0] = np.nan
+ dataset["frequency"] = frequency
+
+ # Load hazard
+ hazard = Hazard.from_xarray_raster(dataset, "", "")
+
+ # Check types
+ self._assert_default_types(hazard)
+
+ # NaNs are set to zero in sparse data
+ np.testing.assert_array_equal(
+ hazard.intensity.toarray(),
+ [[0, 0, 2, 3, 4, 5], [6, 0, 8, 9, 10, 11]],
+ )
+ np.testing.assert_array_equal(
+ hazard.fraction.toarray(),
+ [[0, 0, 0, 0, 0, 0], [1, 0, 1, 0, 1, 1]],
+ )
+
+ # NaNs are propagated in dense data
+ np.testing.assert_array_equal(hazard.frequency, frequency)
+
+ def test_crs(self):
+ """Check if different CRS inputs are handled correctly"""
+
+ def test_crs_from_input(crs_input):
+ crs = CRS.from_user_input(crs_input)
+ hazard = Hazard.from_xarray_raster(self.netcdf_path, "", "", crs=crs_input)
+ self.assertEqual(hazard.centroids.geometry.crs, crs)
+
+ test_crs_from_input("EPSG:3857")
+ test_crs_from_input(3857)
+ test_crs_from_input("+proj=cea +lat_0=52.112866 +lon_0=5.150162 +units=m")
+
+ def test_missing_dims(self):
+ """Test if missing coordinates are expanded and correct errors are thrown"""
+ # Drop time as dimension, but not as coordinate!
+ with xr.open_dataset(self.netcdf_path) as ds:
+ ds = ds.isel(time=0).squeeze()
+ hazard = Hazard.from_xarray_raster(ds, "", "")
+ self._assert_default_types(hazard)
+ np.testing.assert_array_equal(
+ hazard.event_name, [self.time[0].strftime("%Y-%m-%d")]
+ )
+ np.testing.assert_array_equal(hazard.date, [self.time[0].toordinal()])
+ np.testing.assert_array_equal(hazard.centroids.lat, [0, 0, 0, 1, 1, 1])
+ np.testing.assert_array_equal(hazard.centroids.lon, [0, 1, 2, 0, 1, 2])
+ self.assertEqual(hazard.centroids.geometry.crs, DEF_CRS)
+ np.testing.assert_array_equal(
+ hazard.intensity.toarray(), [[0, 1, 2, 3, 4, 5]]
+ )
+ self.assertEqual(hazard.fraction.nnz, 0)
+ np.testing.assert_array_equal(
+ hazard.fraction.toarray(), [[0, 0, 0, 0, 0, 0]]
+ )
+
+ # Now drop variable altogether, should raise an error
+ ds = ds.drop_vars("time")
+ with self.assertRaisesRegex(RuntimeError, "time"):
+ Hazard.from_xarray_raster(ds, "", "")
+
+ # Expand time again
+ ds = ds.expand_dims(time=[np.datetime64("2022-01-01")])
+ hazard = Hazard.from_xarray_raster(ds, "", "")
+ self._assert_default_types(hazard)
+ np.testing.assert_array_equal(hazard.event_name, ["2022-01-01"])
+ np.testing.assert_array_equal(
+ hazard.date, [dt.datetime(2022, 1, 1).toordinal()]
+ )
+
+
+class TestReadDimsCoordsNetCDF(unittest.TestCase):
+ """Checks for dimensions and coordinates with different names and shapes"""
+
+ @classmethod
+ def setUpClass(cls):
+ """Write a NetCDF file with many coordinates"""
+ cls.tempdir = TemporaryDirectory()
+ cls.netcdf_path = Path(cls.tempdir.name) / "coords.nc"
+ cls.intensity = np.array([[[0, 1, 2], [3, 4, 5]]])
+ cls.fraction = np.array([[[0, 0, 0], [1, 1, 1]]])
+ cls.time = np.array([dt.datetime(2000, 1, 1)])
+ cls.x = np.array([0, 1, 2])
+ cls.y = np.array([0, 1])
+ cls.lon = np.array([1, 2, 3])
+ cls.lat = np.array([1, 2])
+ cls.years = np.array([dt.datetime(1999, 2, 2)])
+ cls.longitude = np.array([[10, 11, 12], [10, 11, 12]])
+ cls.latitude = np.array([[100, 100, 100], [200, 200, 200]])
+
+ dset = xr.Dataset(
+ {
+ "intensity": (["time", "y", "x"], cls.intensity),
+ "fraction": (["time", "y", "x"], cls.fraction),
+ },
+ {
+ "time": cls.time,
+ "x": cls.x,
+ "y": cls.y,
+ "lon": (["x"], cls.lon),
+ "lat": (["y"], cls.lat),
+ "years": (["time"], cls.years),
+ "latitude": (["y", "x"], cls.latitude),
+ "longitude": (["y", "x"], cls.longitude),
+ },
+ )
+ dset.to_netcdf(cls.netcdf_path)
+
+ @classmethod
+ def tearDownClass(cls):
+ """Delete the NetCDF file"""
+ cls.tempdir.cleanup()
+
+ def _assert_intensity_fraction(self, hazard):
+ """Check if intensity and fraction data are read correctly"""
+ np.testing.assert_array_equal(hazard.intensity.toarray(), [[0, 1, 2, 3, 4, 5]])
+ np.testing.assert_array_equal(hazard.fraction.toarray(), [[0, 0, 0, 1, 1, 1]])
+
+ def test_dimension_naming(self):
+ """Test if dimensions with different names can be read"""
+ hazard = Hazard.from_xarray_raster(
+ self.netcdf_path,
+ "",
+ "",
+ coordinate_vars=dict(latitude="y", longitude="x"), # 'time' stays default
+ )
+ np.testing.assert_array_equal(hazard.centroids.lat, [0, 0, 0, 1, 1, 1])
+ np.testing.assert_array_equal(hazard.centroids.lon, [0, 1, 2, 0, 1, 2])
+ np.testing.assert_array_equal(
+ hazard.date, [val.toordinal() for val in self.time]
+ )
+ self._assert_intensity_fraction(hazard)
+
+ def test_coordinate_naming(self):
+ """Test if coordinates with different names than dimensions can be read"""
+ hazard = Hazard.from_xarray_raster(
+ self.netcdf_path,
+ "",
+ "",
+ coordinate_vars=dict(latitude="lat", longitude="lon", event="years"),
+ )
+ np.testing.assert_array_equal(hazard.centroids.lat, [1, 1, 1, 2, 2, 2])
+ np.testing.assert_array_equal(hazard.centroids.lon, [1, 2, 3, 1, 2, 3])
+ np.testing.assert_array_equal(
+ hazard.date, [val.toordinal() for val in self.years]
+ )
+ self._assert_intensity_fraction(hazard)
+
+ def test_2D_coordinates(self):
+ """Test if read method correctly handles 2D coordinates"""
+ hazard = Hazard.from_xarray_raster(
+ self.netcdf_path,
+ "",
+ "",
+ coordinate_vars=dict(latitude="latitude", longitude="longitude"),
+ )
+ np.testing.assert_array_equal(
+ hazard.centroids.lat, [100, 100, 100, 200, 200, 200]
+ )
+ np.testing.assert_array_equal(hazard.centroids.lon, [10, 11, 12, 10, 11, 12])
+ self._assert_intensity_fraction(hazard)
+
+ def test_load_dataset_rechunk(self):
+ """Load the data from an opened dataset and force rechunking"""
+ with xr.open_dataset(self.netcdf_path) as dataset:
+ hazard = Hazard.from_xarray_raster(
+ dataset,
+ "",
+ "",
+ coordinate_vars=dict(latitude="latitude", longitude="longitude"),
+ rechunk=True,
+ )
+
+ np.testing.assert_array_equal(
+ hazard.centroids.lat, [100, 100, 100, 200, 200, 200]
+ )
+ np.testing.assert_array_equal(hazard.centroids.lon, [10, 11, 12, 10, 11, 12])
+ self._assert_intensity_fraction(hazard)
+
+ # Assert that .chunk is called the right way
+ with patch("xarray.Dataset.chunk") as mock:
+ with xr.open_dataset(self.netcdf_path) as dataset:
+ mock.return_value = dataset
+ Hazard.from_xarray_raster(
+ dataset,
+ "",
+ "",
+ coordinate_vars=dict(latitude="latitude", longitude="longitude"),
+ rechunk=True,
+ )
+
+ # First latitude dim, then longitude dim, then event dim
+ mock.assert_called_once_with(chunks=dict(y=-1, x=-1, time="auto"))
+
+ # Should not be called by default
+ mock.reset_mock()
+ with xr.open_dataset(self.netcdf_path) as dataset:
+ mock.return_value = dataset
+ Hazard.from_xarray_raster(
+ dataset,
+ "",
+ "",
+ coordinate_vars=dict(latitude="latitude", longitude="longitude"),
+ )
+
+ mock.assert_not_called()
+
+ def test_2D_time(self):
+ """Test if stacking multiple time dimensions works out"""
+ time = np.array(
+ [
+ [dt.datetime(1999, 1, 1), dt.datetime(1999, 2, 1)],
+ [dt.datetime(2000, 1, 1), dt.datetime(2000, 2, 1)],
+ ]
+ )
+ ds = xr.Dataset(
+ {
+ "intensity": (
+ ["year", "month", "latitude", "longitude"],
+ [[[[1]], [[2]]], [[[3]], [[4]]]],
+ ),
+ "fraction": (
+ ["year", "month", "latitude", "longitude"],
+ [[[[10]], [[20]]], [[[30]], [[40]]]],
+ ),
+ },
+ {
+ "latitude": [1],
+ "longitude": [2],
+ "year": [1999, 2000],
+ "month": [1, 2],
+ "time": (["year", "month"], time),
+ },
+ )
+ hazard = Hazard.from_xarray_raster(ds, "", "")
+
+ np.testing.assert_array_equal(hazard.intensity.toarray(), [[1], [2], [3], [4]])
+ np.testing.assert_array_equal(
+ hazard.fraction.toarray(), [[10], [20], [30], [40]]
+ )
+ np.testing.assert_array_equal(hazard.centroids.lat, [1])
+ np.testing.assert_array_equal(hazard.centroids.lon, [2])
+ np.testing.assert_array_equal(
+ hazard.date, [val.toordinal() for val in time.flat]
+ )
+ np.testing.assert_array_equal(
+ hazard.event_name, ["1999-01-01", "1999-02-01", "2000-01-01", "2000-02-01"]
+ )
+
+ def test_errors(self):
+ """Check if expected errors are thrown"""
+ # Wrong coordinate key
+ with self.assertRaisesRegex(ValueError, "Unknown coordinates passed"):
+ Hazard.from_xarray_raster(
+ self.netcdf_path,
+ "",
+ "",
+ coordinate_vars=dict(bar="latitude", longitude="baz"),
+ )
+
+ # Correctly specified, but the custom dimension does not exist
+ with self.assertRaisesRegex(RuntimeError, "lalalatitude"):
+ Hazard.from_xarray_raster(
+ self.netcdf_path,
+ "",
+ "",
+ coordinate_vars=dict(latitude="lalalatitude"),
+ )
+
+
+if __name__ == "__main__":
+ TESTS = unittest.TestLoader().loadTestsFromTestCase(TestReadDefaultNetCDF)
+ TESTS.addTests(
+ unittest.TestLoader().loadTestsFromTestCase(TestReadDimsCoordsNetCDF)
+ )
+    unittest.TextTestRunner(verbosity=2).run(TESTS)
diff --git a/climada/hazard/xarray.py b/climada/hazard/xarray.py
new file mode 100644
index 000000000..df7fc9bf6
--- /dev/null
+++ b/climada/hazard/xarray.py
@@ -0,0 +1,420 @@
+"""
+This file is part of CLIMADA.
+
+Copyright (C) 2017 ETH Zurich, CLIMADA contributors listed in AUTHORS.
+
+CLIMADA is free software: you can redistribute it and/or modify it under the
+terms of the GNU General Public License as published by the Free
+Software Foundation, version 3.
+
+CLIMADA is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with CLIMADA. If not, see <https://www.gnu.org/licenses/>.
+
+---
+
+Hazard input using xarray
+"""
+
+import copy
+import itertools
+import logging
+from dataclasses import InitVar, dataclass, field
+from pathlib import Path
+from typing import Any, Callable, Hashable
+
+import numpy as np
+import pandas as pd
+import sparse as sp
+import xarray as xr
+from scipy import sparse
+
+import climada.util.constants as u_const
+import climada.util.dates_times as u_dt
+from climada.hazard.centroids.centr import Centroids
+
+LOGGER = logging.getLogger(__name__)
+
+DEF_COORDS = {"event": "time", "longitude": "longitude", "latitude": "latitude"}
+"""Default coordinates when reading Hazard data from an xarray Dataset"""
+
+DEF_DATA_VARS = ["fraction", "frequency", "event_id", "event_name", "date"]
+"""Default keys for optional Hazard attributes when reading from an xarray Dataset"""
+
+
+def _to_csr_matrix(array: xr.DataArray) -> sparse.csr_matrix:
+ """Store a numpy array as sparse matrix, optimizing storage space
+
+ The CSR matrix stores NaNs explicitly, so we set them to zero.
+ """
+ array = array.where(lambda x: ~np.isnan(x), 0)
+ array = xr.apply_ufunc(
+ sp.COO.from_numpy,
+ array,
+ dask="parallelized",
+ output_dtypes=[array.dtype],
+ )
+ sparse_coo = array.compute().data # Load into memory
+ return sparse_coo.tocsr() # Convert sparse.COO to scipy.sparse.csr_matrix
+
+
+# Define accessors for xarray DataArrays
+def _default_accessor(array: xr.DataArray) -> np.ndarray:
+ """Take a DataArray and return its numpy representation"""
+ return array.to_numpy()
+
+
+def _strict_positive_int_accessor(array: xr.DataArray) -> np.ndarray:
+ """Take a positive int DataArray and return its numpy representation
+
+ Raises
+ ------
+ TypeError
+ If the underlying data type is not integer
+ ValueError
+ If any value is zero or less
+ """
+ if not np.issubdtype(array.dtype, np.integer):
+ raise TypeError(f"'{array.name}' data array must be integers")
+ if not (array > 0).all():
+ raise ValueError(f"'{array.name}' data must be larger than zero")
+ return array.to_numpy()
+
+
+def _date_to_ordinal_accessor(array: xr.DataArray, strict: bool = True) -> np.ndarray:
+ """Take a DataArray and transform it into ordinals"""
+ try:
+ if np.issubdtype(array.dtype, np.integer):
+ # Assume that data is ordinals
+ return _strict_positive_int_accessor(array)
+
+ # Try transforming to ordinals
+ return np.array(u_dt.datetime64_to_ordinal(array.to_numpy()))
+
+ # Handle access errors
+ except (ValueError, TypeError, AttributeError) as err:
+ if strict:
+ raise err
+
+ LOGGER.warning(
+ "Failed to read values of '%s' as dates or ordinals. Hazard.date "
+ "will be ones only",
+ array.name,
+ )
+ return np.ones(array.shape)
+
+
+def _year_month_day_accessor(array: xr.DataArray, strict: bool = True) -> np.ndarray:
+ """Take an array and return am array of YYYY-MM-DD strings"""
+ try:
+ return array.dt.strftime("%Y-%m-%d").to_numpy()
+
+ # Handle access errors
+ except (ValueError, TypeError, AttributeError) as err:
+ if strict:
+ raise err
+
+ LOGGER.warning(
+ "Failed to read values of '%s' as dates. Hazard.event_name will be "
+ "empty strings",
+ array.name,
+ )
+ return np.full(array.shape, "")
+
+
+def _maybe_repeat(values: np.ndarray, times: int) -> np.ndarray:
+ """Return the array or repeat a single-valued array
+
+ If ``values`` has size 1, return an array that repeats this value ``times``
+ times. If the size is different, just return the array.
+ """
+ if values.size == 1:
+ return np.array(list(itertools.repeat(values.flat[0], times)))
+
+ return values
+
+
+def _load_from_xarray_or_return_default(
+ data: xr.Dataset,
+ user_key: str | None,
+ default_key: str,
+ hazard_attr: str,
+ accessor: Callable[[xr.DataArray], Any],
+ default_value: Any,
+) -> Any:
+ """Load data for a single Hazard attribute or return the default value
+
+ Does the following based on the ``user_key``:
+
+ * If the key is an empty string, return the default value
+ * If the key is a non-empty string, load the data for that key and return it.
+ * If the key is ``None``, look for the ``default_key`` in the data. If it exists,
+ return that data. If not, return the default value.
+
+ Parameters
+ ----------
+ user_key : str or None
+ The key set by the user to identify the DataArray to read data from.
+ default_key : str
+ The default key identifying the DataArray to read data from.
+ hazard_attr : str
+ The name of the attribute of ``Hazard`` where the data will be stored in.
+ accessor : Callable
+ A callable that takes the DataArray as argument and returns the data structure
+ that is required by the ``Hazard`` attribute.
+ default_value
+ The default value/array in case the data could not be found.
+
+ Returns
+ -------
+ The object that will be stored in the ``Hazard`` attribute with name ``hazard_attr``
+
+ Raises
+ ------
+ KeyError
+ If ``user_key`` was a non-empty string but no such key was found in the data
+ RuntimeError
+ If the data structure loaded has a different shape than the default data
+ structure
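+
+    Examples
+    --------
+    A sketch of the dispatch logic, assuming ``data`` contains a ``"frequency"``
+    variable and three events (the values here are illustrative)::
+
+        # Empty key: skip reading, return the default
+        _load_from_xarray_or_return_default(
+            data, "", "frequency", "frequency", _default_accessor, np.ones(3)
+        )  # returns np.ones(3)
+
+        # No key given: fall back to the default key if it exists in the data
+        _load_from_xarray_or_return_default(
+            data, None, "frequency", "frequency", _default_accessor, np.ones(3)
+        )  # returns data["frequency"] as a numpy array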
+ """
+ # User does not want to read data
+ if user_key == "":
+ LOGGER.debug("Using default values for Hazard.%s per user request", hazard_attr)
+ return default_value
+
+ if not pd.isna(user_key):
+ # Read key exclusively
+ LOGGER.debug(
+ "Reading data for Hazard.%s from DataArray '%s'", hazard_attr, user_key
+ )
+ val = accessor(data[user_key])
+ else:
+ # Try default key
+ try:
+ val = accessor(data[default_key])
+ LOGGER.debug(
+ "Reading data for Hazard.%s from DataArray '%s'",
+ hazard_attr,
+ default_key,
+ )
+ except KeyError:
+ LOGGER.debug(
+ "Using default values for Hazard.%s. No data found", hazard_attr
+ )
+ return default_value
+
+ def vshape(array):
+ """Return a shape tuple for any array-like type we use"""
+ if isinstance(array, list):
+ return len(array)
+ if isinstance(array, sparse.csr_matrix):
+ return array.get_shape()
+ return array.shape
+
+ # Check size for read data
+ if default_value is not None and not np.array_equal(
+ vshape(val), vshape(default_value)
+ ):
+ raise RuntimeError(
+ f"'{user_key if user_key else default_key}' must have shape "
+ f"{vshape(default_value)}, but shape is {vshape(val)}"
+ )
+
+ # Return the data
+ return val
+
+
+@dataclass(repr=False, eq=False)
+class HazardXarrayReader:
+ """A helper class for creating a Hazard object from an xarray dataset
+
+ Initialize this class, then use :py:meth:`get_hazard_kwargs` to retrieve the kwargs
+ to be passed to the :py:class:`~climada.hazard.base.Hazard` initializer.
+
+ Attributes
+ ----------
+ data : xr.Dataset
+ The data to be read as hazard.
+ intensity : str
+ The name of the variable containing the hazard intensity information.
+ Default: ``"intensity"``
+ coordinate_vars : dict(str, str)
+ Mapping from default coordinate names to coordinate names in the dataset.
+ data_vars : dict(str, str)
+ Mapping from default variable names to variable names in the dataset.
+ crs : str
+ Coordinate reference system of the data to be read. Defaults to ``"EPSG:4326"``
+ (WGS 84).
+    rechunk : bool
+        If ``True``, automatically rechunk the data for more efficient reads from
+        disk. Default: ``False``.
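+
+    Examples
+    --------
+    A minimal usage sketch, assuming ``dataset`` is an ``xarray.Dataset`` with the
+    default coordinates (``time``, ``latitude``, ``longitude``) and an
+    ``"intensity"`` data variable::
+
+        from climada.hazard import Hazard
+
+        reader = HazardXarrayReader(data=dataset)
+        hazard = Hazard(**reader.get_hazard_kwargs())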
+ """
+
+ data: xr.Dataset
+ intensity: str = "intensity"
+ coordinate_vars: InitVar[dict[str, str] | None] = field(default=None, kw_only=True)
+ data_vars: dict[str, str] | None = field(default=None, kw_only=True)
+ crs: str = field(default=u_const.DEF_CRS, kw_only=True)
+ rechunk: bool = field(default=False, kw_only=True)
+
+ coords: dict[str, str] = field(
+ init=False, default_factory=lambda: copy.deepcopy(DEF_COORDS)
+ )
+ data_dims: dict[str, tuple[Hashable, ...]] = field(init=False, default_factory=dict)
+ hazard_kwargs: dict[str, Any] = field(init=False, default_factory=dict)
+
+ def __post_init__(self, coordinate_vars):
+ """Update coordinate and dimension names"""
+ # Update coordinate identifiers
+ coordinate_vars = coordinate_vars or {}
+ unknown_coords = [co for co in coordinate_vars if co not in self.coords]
+ if unknown_coords:
+ raise ValueError(
+ f"Unknown coordinates passed: '{unknown_coords}'. Supported "
+ f"coordinates are {list(self.coords.keys())}."
+ )
+ self.coords.update(coordinate_vars)
+
+ # Retrieve dimensions of coordinates
+ try:
+ self.data_dims = {
+ "event": self.data[self.coords["event"]].dims,
+ "longitude": self.data[self.coords["longitude"]].dims,
+ "latitude": self.data[self.coords["latitude"]].dims,
+ }
+ # Handle KeyError for better error message
+ except KeyError as err:
+ key = err.args[0]
+ raise RuntimeError(
+ f"Dataset is missing dimension/coordinate: {key}. Dataset dimensions: "
+ f"{list(self.data.dims.keys())}"
+ ) from err
+
+ # Check for unexpected keys
+ self.data_vars = self.data_vars or {}
+ default_keys = copy.deepcopy(DEF_DATA_VARS)
+ unknown_keys = [key for key in self.data_vars.keys() if key not in default_keys]
+ if unknown_keys:
+ raise ValueError(
+ f"Unknown data variables passed: '{unknown_keys}'. Supported data "
+ f"variables are {default_keys}."
+ )
+
+ @classmethod
+ def from_file(cls, filename: Path | str, *args, open_dataset_kws=None, **kwargs):
+ """Open reader from a file"""
+ open_dataset_kws = open_dataset_kws or {}
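+        # Lazy loading by default; user-supplied kwargs win in the dict merge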
+ open_dataset_kws = {"chunks": "auto"} | open_dataset_kws
+ with xr.open_dataset(filename, **open_dataset_kws) as dset:
+ return cls(dset, *args, **kwargs)
+
+ def rechunk_data(self, data: xr.Dataset) -> xr.Dataset:
+ """Try to rechunk the data to optimize the stack operation afterwards."""
+ chunks = (
+ # We want one event to be contained in one chunk
+ {dim: -1 for dim in self.data_dims["longitude"]}
+ | {dim: -1 for dim in self.data_dims["latitude"]}
+ # Automated chunking in the event dimensions (as many as fit)
+ | {dim: "auto" for dim in self.data_dims["event"]}
+ )
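+        # E.g. for dims ("y", "x", "time") this resolves to
+        # {"y": -1, "x": -1, "time": "auto"}: one chunk spans the full spatial
+        # extent, and dask chooses how many events fit into each chunk.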
+ return data.chunk(chunks=chunks)
+
+ def get_hazard_kwargs(self) -> dict[str, Any]:
+ """Return kwargs to initialize the hazard"""
+ # Shallow copy of the data
+ data = self.data.copy()
+
+ # Try promoting single-value coordinates to dimensions
+ for key, val in self.data_dims.items():
+ if not val:
+ coord = self.coords[key]
+ LOGGER.debug("Promoting Dataset coordinate '%s' to dimension", coord)
+ data = data.expand_dims(coord)
+ self.data_dims[key] = data[coord].dims
+
+ # Maybe rechunk
+ if self.rechunk:
+ data = self.rechunk_data(data)
+
+ # Stack (vectorize) the entire dataset into 2D (time, lat/lon)
+ # NOTE: We want the set union of the dimensions, but Python 'set' does not
+ # preserve order. However, we want longitude to run faster than latitude.
+ # So we use 'dict' without values, as 'dict' preserves insertion order
+ # (dict keys behave like a set).
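+        #       E.g. latitude dims ("y",) and longitude dims ("x",) stack in the
+        #       order ("y", "x"), so longitude runs fastest.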
+ data = data.stack(
+ event=self.data_dims["event"],
+ lat_lon=list(
+ dict.fromkeys(
+ self.data_dims["latitude"] + self.data_dims["longitude"]
+ ).keys()
+ ),
+ )
+
+ # Transform coordinates into centroids
+ centroids = Centroids(
+ lat=data[self.coords["latitude"]].to_numpy(),
+ lon=data[self.coords["longitude"]].to_numpy(),
+ crs=self.crs,
+ )
+
+ # Read the intensity data
+ LOGGER.debug("Loading Hazard intensity from DataArray '%s'", self.intensity)
+ intensity_matrix = _to_csr_matrix(data[self.intensity])
+
+ # Create a DataFrame storing access information for each of data_vars
+ # NOTE: Each row will be passed as arguments to
+ # `load_from_xarray_or_return_default`, see its docstring for further
+ # explanation of the DataFrame columns / keywords.
+ num_events = data.sizes["event"]
+ data_ident = pd.DataFrame(
+ data={
+ # The attribute of the Hazard class where the data will be stored
+ "hazard_attr": DEF_DATA_VARS,
+ # The identifier and default key used in this method
+ "default_key": DEF_DATA_VARS,
+ # The key assigned by the user
+ "user_key": None,
+ # The default value for each attribute
+ "default_value": [
+ None,
+ np.ones(num_events),
+ np.array(range(num_events), dtype=int) + 1,
+ list(
+ _year_month_day_accessor(
+ data[self.coords["event"]], strict=False
+ ).flat
+ ),
+ _date_to_ordinal_accessor(data[self.coords["event"]], strict=False),
+ ],
+ # The accessor for the data in the Dataset
+ "accessor": [
+ _to_csr_matrix,
+ lambda x: _maybe_repeat(_default_accessor(x), num_events),
+ _strict_positive_int_accessor,
+ lambda x: list(
+ _maybe_repeat(_default_accessor(x), num_events).flat
+ ),
+ lambda x: _maybe_repeat(_date_to_ordinal_accessor(x), num_events),
+ ],
+ }
+ )
+
+ # Update with keys provided by the user
+        # NOTE: Keys in 'default_key' missing from 'data_vars' are mapped to NaN
+        #       (which is exactly what we want, see the 'pd.isna' check in
+        #       '_load_from_xarray_or_return_default'). The result is written into
+        #       'user_key'; 'default_key' is not modified.
+ data_ident["user_key"] = data_ident["default_key"].map(self.data_vars)
+
+ # Set the Hazard attributes
+ for _, ident in data_ident.iterrows():
+ self.hazard_kwargs[ident["hazard_attr"]] = (
+ _load_from_xarray_or_return_default(data=data, **ident)
+ )
+
+ # Done!
+ LOGGER.debug("Hazard successfully loaded. Number of events: %i", num_events)
+ self.hazard_kwargs.update(centroids=centroids, intensity=intensity_matrix)
+ return self.hazard_kwargs
diff --git a/doc/api/climada/climada.hazard.rst b/doc/api/climada/climada.hazard.rst
index 3b3bef00b..cb58d5077 100644
--- a/doc/api/climada/climada.hazard.rst
+++ b/doc/api/climada/climada.hazard.rst
@@ -22,6 +22,16 @@ climada\.hazard\.io module
:undoc-members:
:show-inheritance:
+climada\.hazard\.xarray module
+------------------------------
+
+Helper module to read xarray data as Hazard.
+
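+A minimal sketch of the typical entry point, assuming ``ds`` is an
+``xarray.Dataset`` following the default conventions:
+
+.. code-block:: python
+
+   from climada.hazard import Hazard
+
+   hazard = Hazard.from_xarray_raster(ds, "", "")
+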
+.. automodule:: climada.hazard.xarray
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
climada\.hazard\.isimip\_data module
------------------------------------