diff --git a/hydromt/data_catalog/data_catalog.py b/hydromt/data_catalog/data_catalog.py index 5aac640bb..6b8562d5f 100644 --- a/hydromt/data_catalog/data_catalog.py +++ b/hydromt/data_catalog/data_catalog.py @@ -1006,7 +1006,7 @@ def export_data( metadata: Optional[Dict[str, Any]] = None, force_overwrite: bool = False, append: bool = False, - handle_nodata: NoDataStrategy = NoDataStrategy.IGNORE, + handle_nodata: NoDataStrategy = NoDataStrategy.WARN, ) -> None: """Export a data slice of each dataset and a data_catalog.yml file to disk. @@ -1035,6 +1035,9 @@ def export_data( override any existing files if True. False by default. append: bool, optional If True, append to existing data catalog, by default False. + handle_nodata: NoDataStrategy, optional + Strategy to handle no data situations when exporting data. By default + it will log a warning message. """ if time_range is not None: time_range = TimeRange.create(time_range) @@ -1180,7 +1183,8 @@ def export_data( for key, available_variants in sources_out.items(): for _provider, available_versions in available_variants.items(): for _version, adapter in available_versions.items(): - data_catalog_out.add_source(key, adapter) + if adapter is not None: + data_catalog_out.add_source(key, adapter) data_catalog_out.to_yml(path, root="auto", meta=metadata) diff --git a/hydromt/data_catalog/drivers/dataframe/pandas_driver.py b/hydromt/data_catalog/drivers/dataframe/pandas_driver.py index f9e0f6b8a..c24971935 100644 --- a/hydromt/data_catalog/drivers/dataframe/pandas_driver.py +++ b/hydromt/data_catalog/drivers/dataframe/pandas_driver.py @@ -91,8 +91,11 @@ def read( df = pd.read_fwf(uri, **self.options.get_kwargs()) else: raise IOError(f"DataFrame: extension {extension} unknown.") - if df.index.size == 0: - exec_nodata_strat(f"No data from driver {self}'.", strategy=handle_nodata) + if df.empty: + exec_nodata_strat( + f"No data from {self.name} driver for file uris: {', '.join(uris)}.", + strategy=handle_nodata, + ) return 
df def write( diff --git a/hydromt/data_catalog/drivers/geodataframe/pyogrio_driver.py b/hydromt/data_catalog/drivers/geodataframe/pyogrio_driver.py index b738cf7ed..9002c6736 100644 --- a/hydromt/data_catalog/drivers/geodataframe/pyogrio_driver.py +++ b/hydromt/data_catalog/drivers/geodataframe/pyogrio_driver.py @@ -103,7 +103,10 @@ def read( raise IOError(f"DataFrame from uri: '{_uri}' contains no geometry column.") if gdf.index.size == 0: - exec_nodata_strat(f"No data from driver {self}'.", strategy=handle_nodata) + exec_nodata_strat( + f"No data from {self.name} driver for file uris: {', '.join(uris)}.", + strategy=handle_nodata, + ) return gdf def write( diff --git a/hydromt/data_catalog/drivers/geodataframe/table_driver.py b/hydromt/data_catalog/drivers/geodataframe/table_driver.py index 60cf82150..5e1071fb6 100644 --- a/hydromt/data_catalog/drivers/geodataframe/table_driver.py +++ b/hydromt/data_catalog/drivers/geodataframe/table_driver.py @@ -114,7 +114,10 @@ def read( **self.options.get_kwargs(), ) if gdf.index.size == 0: - exec_nodata_strat(f"No data from driver {self}'.", strategy=handle_nodata) + exec_nodata_strat( + f"No data from {self.name} driver for file uris: {', '.join(uris)}.", + strategy=handle_nodata, + ) return gdf def write( diff --git a/hydromt/data_catalog/drivers/geodataset/vector_driver.py b/hydromt/data_catalog/drivers/geodataset/vector_driver.py index 6f9d8ef44..35bd5e962 100644 --- a/hydromt/data_catalog/drivers/geodataset/vector_driver.py +++ b/hydromt/data_catalog/drivers/geodataset/vector_driver.py @@ -103,14 +103,15 @@ def read( if isinstance(out, xr.DataArray): if out.size == 0: exec_nodata_strat( - f"No data from driver {self}'.", strategy=handle_nodata + f"No data from {self.name} driver for file uris: {', '.join(uris)}.", + strategy=handle_nodata, ) return out.to_dataset() else: for variable in out.data_vars: if out[variable].size == 0: exec_nodata_strat( - f"No data from driver {self}' for variable {variable}.", + f"No data 
from {self.name} driver for file uris: {', '.join(uris)}.", strategy=handle_nodata, ) return out diff --git a/hydromt/data_catalog/sources/dataframe.py b/hydromt/data_catalog/sources/dataframe.py index 1176acd3b..2e4ed1c8e 100644 --- a/hydromt/data_catalog/sources/dataframe.py +++ b/hydromt/data_catalog/sources/dataframe.py @@ -11,7 +11,7 @@ from hydromt.data_catalog.adapters import DataFrameAdapter from hydromt.data_catalog.drivers import DataFrameDriver from hydromt.data_catalog.sources import DataSource -from hydromt.error import NoDataStrategy +from hydromt.error import NoDataStrategy, exec_nodata_strat from hydromt.typing import TimeRange from hydromt.typing.fsspec_types import FSSpecFileSystem @@ -97,6 +97,10 @@ def to_file( variables=variables, time_range=time_range, handle_nodata=handle_nodata ) if df is None: + exec_nodata_strat( + f"Reading file(s) for {self.name} returned no data.", + handle_nodata, + ) return None # driver can return different path if file ext changes diff --git a/hydromt/data_catalog/sources/dataset.py b/hydromt/data_catalog/sources/dataset.py index c41931106..769bfee17 100644 --- a/hydromt/data_catalog/sources/dataset.py +++ b/hydromt/data_catalog/sources/dataset.py @@ -16,7 +16,7 @@ from hydromt.data_catalog.adapters.dataset import DatasetAdapter from hydromt.data_catalog.drivers import DatasetDriver from hydromt.data_catalog.sources.data_source import DataSource -from hydromt.error import NoDataStrategy +from hydromt.error import NoDataStrategy, exec_nodata_strat from hydromt.typing import ( TimeRange, ) @@ -120,6 +120,10 @@ def to_file( time_range=time_range, handle_nodata=handle_nodata ) if ds is None: + exec_nodata_strat( + f"Reading file(s) for {self.name} returned no data.", + handle_nodata, + ) return None # driver can return different path if file ext changes diff --git a/hydromt/data_catalog/sources/geodataframe.py b/hydromt/data_catalog/sources/geodataframe.py index b702f4e94..182ded46e 100644 ---
a/hydromt/data_catalog/sources/geodataframe.py +++ b/hydromt/data_catalog/sources/geodataframe.py @@ -17,7 +17,7 @@ from hydromt.data_catalog.adapters.geodataframe import GeoDataFrameAdapter from hydromt.data_catalog.drivers import GeoDataFrameDriver from hydromt.data_catalog.sources.data_source import DataSource -from hydromt.error import NoDataStrategy +from hydromt.error import NoDataStrategy, exec_nodata_strat from hydromt.gis.gis_utils import _parse_geom_bbox_buffer from hydromt.typing import ( Bbox, @@ -127,6 +127,10 @@ def to_file( handle_nodata=handle_nodata, ) if gdf is None: # handle_nodata == ignore + exec_nodata_strat( + f"Reading file(s) for {self.name} returned no data.", + handle_nodata, + ) return None dest_path = driver.write(file_path, gdf, write_kwargs=write_kwargs) diff --git a/hydromt/data_catalog/sources/geodataset.py b/hydromt/data_catalog/sources/geodataset.py index 0b2de0b3c..28accc648 100644 --- a/hydromt/data_catalog/sources/geodataset.py +++ b/hydromt/data_catalog/sources/geodataset.py @@ -16,7 +16,7 @@ from hydromt.data_catalog.adapters.geodataset import GeoDatasetAdapter from hydromt.data_catalog.drivers.geodataset.geodataset_driver import GeoDatasetDriver from hydromt.data_catalog.sources.data_source import DataSource -from hydromt.error import NoDataStrategy +from hydromt.error import NoDataStrategy, exec_nodata_strat from hydromt.gis.gis_utils import _parse_geom_bbox_buffer from hydromt.typing import ( Bbox, @@ -134,6 +134,10 @@ def to_file( handle_nodata=handle_nodata, ) if ds is None: # handle_nodata == ignore + exec_nodata_strat( + f"Reading file(s) for {self.name} returned no data.", + handle_nodata, + ) return None dest_path = driver.write(file_path, ds, write_kwargs=write_kwargs) diff --git a/hydromt/data_catalog/sources/rasterdataset.py b/hydromt/data_catalog/sources/rasterdataset.py index 261a5a874..f9f3ac5ae 100644 --- a/hydromt/data_catalog/sources/rasterdataset.py +++ b/hydromt/data_catalog/sources/rasterdataset.py @@
-17,7 +17,7 @@ from hydromt.data_catalog.adapters.rasterdataset import RasterDatasetAdapter from hydromt.data_catalog.drivers import RasterDatasetDriver from hydromt.data_catalog.sources.data_source import DataSource -from hydromt.error import NoDataStrategy +from hydromt.error import NoDataStrategy, exec_nodata_strat from hydromt.gis.gis_utils import _parse_geom_bbox_buffer from hydromt.typing import ( Bbox, @@ -139,7 +139,11 @@ def to_file( zoom=zoom, handle_nodata=handle_nodata, ) - if ds is None: # handle_nodata == ignore + if ds is None: + exec_nodata_strat( + f"Reading file(s) for {self.name} returned no data.", + handle_nodata, + ) return None dest_path = driver.write(file_path, ds, write_kwargs=write_kwargs) diff --git a/tests/data_catalog/drivers/geodataframe/test_table_driver.py b/tests/data_catalog/drivers/geodataframe/test_table_driver.py index d90ecd608..bc83af8a1 100644 --- a/tests/data_catalog/drivers/geodataframe/test_table_driver.py +++ b/tests/data_catalog/drivers/geodataframe/test_table_driver.py @@ -9,6 +9,7 @@ from hydromt.data_catalog.drivers.geodataframe.table_driver import ( GeoDataFrameTableDriver, ) +from hydromt.error import NoDataException class TestGeoDataFrameTableDriver: @@ -79,3 +80,14 @@ def test_header_case_insensitive( driver = GeoDataFrameTableDriver() gdf = driver.read(uris=[uri]) pd.testing.assert_frame_equal(gdf, geodf) + + def test_read_no_data(self, mocker): + mocker.patch( + "hydromt.data_catalog.drivers.geodataframe.table_driver.open_vector_from_table", + return_value=gpd.GeoDataFrame(), + ) + with pytest.raises( + NoDataException, + match="No data from geodataframe_table driver for file uris: some_path.csv.", + ): + GeoDataFrameTableDriver().read(uris=["some_path.csv"]) diff --git a/tests/data_catalog/drivers/test_pandas_driver.py b/tests/data_catalog/drivers/test_pandas_driver.py index c1738e4c5..1d4ab6d0b 100644 --- a/tests/data_catalog/drivers/test_pandas_driver.py +++ b/tests/data_catalog/drivers/test_pandas_driver.py 
@@ -8,6 +8,7 @@ from hydromt._compat import HAS_OPENPYXL from hydromt.data_catalog.drivers.dataframe import PandasDriver +from hydromt.error import NoDataException class TestPandasDriver: @@ -101,6 +102,13 @@ def test_read_with_filters( marks=pytest.mark.skipif(not HAS_OPENPYXL, reason="openpyxl is not installed"), ) + def test_read_no_data(self, driver: PandasDriver, tmp_path): + with pytest.raises( + NoDataException, + match="No data from pandas driver for file uris", + ): + driver.read([]) + @pytest.mark.parametrize( "filename", ["temp.csv", "temp.parquet", temp_xls_param, temp_xlsx_param] ) diff --git a/tests/data_catalog/sources/test_dataframe_source.py b/tests/data_catalog/sources/test_dataframe_source.py index 2ad023fcc..7e1d052c0 100644 --- a/tests/data_catalog/sources/test_dataframe_source.py +++ b/tests/data_catalog/sources/test_dataframe_source.py @@ -1,23 +1,26 @@ +import re from pathlib import Path from typing import Type import pandas as pd +import pytest from hydromt.data_catalog.adapters import DataFrameAdapter from hydromt.data_catalog.drivers import DataFrameDriver from hydromt.data_catalog.sources import DataFrameSource from hydromt.data_catalog.uri_resolvers import URIResolver +from hydromt.error import NoDataException class TestDataFrameSource: - def test_read_data( + @pytest.fixture + def MockDataFrameSource( self, MockDataFrameDriver: Type[DataFrameDriver], mock_resolver: URIResolver, mock_df_adapter: DataFrameAdapter, - df: pd.DataFrame, managed_tmp_path: Path, - ): + ) -> DataFrameSource: managed_tmp_path.touch("test.xls") source = DataFrameSource( root=".", @@ -27,4 +30,23 @@ def test_read_data( data_adapter=mock_df_adapter, uri=str(managed_tmp_path / "test.xls"), ) - pd.testing.assert_frame_equal(df, source.read_data()) + return source + + def test_read_data( + self, + MockDataFrameSource: DataFrameSource, + df: pd.DataFrame, + ): + pd.testing.assert_frame_equal(df, MockDataFrameSource.read_data()) + + def test_to_file_nodata(self, 
mocker, MockDataFrameSource: DataFrameSource): + mocker.patch.object( + DataFrameSource, + "read_data", + return_value=None, + ) + with pytest.raises( + NoDataException, + match=re.escape("Reading file(s) for example_source returned no data."), + ): + MockDataFrameSource.to_file("file.csv") diff --git a/tests/data_catalog/sources/test_raster_dataset_source.py b/tests/data_catalog/sources/test_raster_dataset_source.py index ab3b0281f..837c94cc3 100644 --- a/tests/data_catalog/sources/test_raster_dataset_source.py +++ b/tests/data_catalog/sources/test_raster_dataset_source.py @@ -1,3 +1,4 @@ +import re from pathlib import Path from typing import Type @@ -9,6 +10,7 @@ from hydromt.data_catalog.drivers import RasterDatasetDriver from hydromt.data_catalog.sources import RasterDatasetSource from hydromt.data_catalog.uri_resolvers import URIResolver +from hydromt.error import NoDataException, NoDataStrategy from hydromt.gis.gis_utils import _to_geographic_bbox from hydromt.typing import SourceMetadata @@ -64,3 +66,11 @@ def test_detect_extent( ) def test_infer_default_driver(self, uri, expected_driver): assert RasterDatasetSource._infer_default_driver(uri) == expected_driver + + def test_to_file_nodata(self, writable_source: RasterDatasetSource, mocker): + mocker.patch.object(RasterDatasetSource, "read_data", return_value=None) + with pytest.raises( + NoDataException, + match=re.escape("Reading file(s) for test returned no data."), + ): + writable_source.to_file("output.zarr", handle_nodata=NoDataStrategy.RAISE) diff --git a/tests/data_catalog/test_data_catalog.py b/tests/data_catalog/test_data_catalog.py index e7870a3bc..9e60c6985 100644 --- a/tests/data_catalog/test_data_catalog.py +++ b/tests/data_catalog/test_data_catalog.py @@ -589,6 +589,28 @@ def test_export_dataframe(tmp_path: Path, df, df_time): assert isinstance(obj, dtypes), key +@pytest.mark.integration +def test_export_data_bulk(tmp_path: Path, caplog: pytest.LogCaptureFixture): + data_catalog = 
DataCatalog(data_libs=["artifact_data"]) + data_catalog_reread_path = tmp_path / "bulk_exported" + data_catalog_reread_path.mkdir(exist_ok=True) + bbox = [11.989, 46.02, 12.253, 46.166] # Small bounding box in Piave basin + caplog.set_level(WARNING) + data_catalog.export_data(data_catalog_reread_path, bbox=bbox, force_overwrite=True) + # test if data catalog can be read + new_data_catalog = DataCatalog( + data_libs=[str(data_catalog_reread_path / "data_catalog.yml")] + ) + assert ( + len(new_data_catalog) == 45 + ) # Number of exported sources, not all exported due to no data for this bbox + + with pytest.raises(NoDataException): + data_catalog.export_data( + data_catalog_reread_path, bbox=bbox, handle_nodata=NoDataStrategy.RAISE + ) + + @pytest.mark.skip("flakey test due to external http issues") @pytest.mark.integration def test_http_data():