diff --git a/fiboa_cli/datasets/fieldscapes_germany.py b/fiboa_cli/datasets/fieldscapes_germany.py
new file mode 100644
index 00000000..9b032666
--- /dev/null
+++ b/fiboa_cli/datasets/fieldscapes_germany.py
@@ -0,0 +1,147 @@
+# TEMPLATE FOR A FIBOA CONVERTER
+#
+# Copy this file and rename it to something sensible.
+# The name of the file will be the name of the converter in the cli.
+# If you name it 'de_abc' you'll be able to run `fiboa convert de_abc` in the cli.
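+#
+# A usage sketch for this converter (the `-o` output flag is an assumption
+# based on typical fiboa CLI usage and is not verified here; see
+# `fiboa convert --help`):
+#
+#   fiboa convert fieldscapes_germany -o fieldscapes_germany.parquet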
+
+from ..convert_utils import convert as convert_
+
+# File to read the data from
+# Can read any tabular data format that GeoPandas can read through read_file()
+# Supported protocols: HTTP(S), GCS, S3, or the local file system
+
+# Local URI added to the repository for initial conversion. Original source: https://beta.source.coop/esa/fusion-competition/
+URI = "/home/byteboogie/work/labwork_hkerner/fieldscapes/germany/boundaries_germany.gpkg"
+
+# Unique identifier for the collection
+ID = "fieldscapes_germany"
+# Title of the collection
+TITLE = "Field boundaries for Germany, Brandenburg (Fieldscapes)"
+# Description of the collection. Can be multiline and include CommonMark.
+DESCRIPTION = """The dataset contains field boundaries for the German state of Brandenburg."""
+# Bounding box of the data in WGS84 coordinates
+BBOX = [13.635334610075107, 52.41814553442972, 14.35270427904761, 52.849468757681805]
+
+# Provider name, can be None if not applicable, must be provided if PROVIDER_URL is provided
+PROVIDER_NAME = "ESA"
+# URL to the homepage of the data or the provider, can be None if not applicable
+PROVIDER_URL = "https://beta.source.coop/esa/fusion-competition/"
+# Attribution, can be None if not applicable
+ATTRIBUTION = "© GeoBasis-DE/LGB"
+
+# License of the data, either
+# 1. a SPDX license identifier (including "dl-de/by-2-0" / "dl-de/zero-2-0"), or
+LICENSE = "dl-de/by-2-0"
+# 2. a STAC Link Object with relation type "license"
+# LICENSE = {"title": "CC-BY-4.0", "href": "https://creativecommons.org/licenses/by/4.0/", "type": "text/html", "rel": "license"}
+
+# Map original column names to fiboa property names
+# You also need to list any column that you may have added in the MIGRATION function (see below).
+COLUMNS = {
+    "id": "id",
+    "SHAPE_AREA": "area",
+    "SHAPE_LEN": "perimeter",
+    "geometry": "geometry",
+    "crop_id": "crop_id",
+    "crop_name": "crop_name",
+    "determination_datetime": "determination_datetime"
+}
+
+# Add columns with constant values.
+# The key is the column name, the value is a constant value that's used for all rows.
+ADD_COLUMNS = {
+
+}
+
+# A list of implemented extension identifiers
+EXTENSIONS = []
+
+# Functions to migrate data in columns to match the fiboa specification.
+# Example: You have a column area_m in square meters and want to convert
+# to hectares as required for the area field in fiboa.
+# Function signature:
+# func(column: pd.Series) -> pd.Series
+COLUMN_MIGRATIONS = {
+
+}
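+# A minimal sketch of the migration described above, assuming a hypothetical
+# source column "area_m" holding square meters (this dataset needs no
+# migrations, which is why the dict above stays empty):
+# COLUMN_MIGRATIONS = {
+#     "area_m": lambda column: column / 10000.0  # m² -> ha
+# }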
+
+# Filter columns to only include the ones that are relevant for the collection,
+# e.g. only rows that contain the word "agriculture" but not "forest" in the column "land_cover_type".
+# Lambda function accepts a Pandas Series and returns a Series, or a Tuple with a Series and True to inverse the mask.
+COLUMN_FILTERS = {
+
+}
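+# A hedged sketch of the "agriculture but not forest" filter described above,
+# assuming a hypothetical "land_cover_type" column (not present in this dataset):
+# COLUMN_FILTERS = {
+#     "land_cover_type": lambda col: col.str.contains("agriculture") & ~col.str.contains("forest")
+# }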
+
+# Custom function to migrate the GeoDataFrame if the other options are not sufficient.
+# This should be the last resort!
+# Function signature:
+# func(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame
+MIGRATION = None
+
+# Schemas for the fields that are not defined in fiboa
+# Keys must be the values from the COLUMNS dict, not the keys
+MISSING_SCHEMAS = {
+    "required": ["crop_id", "crop_name"],  # i.e. non-nullable properties
+    "properties": {
+        "crop_id": {
+            "type": "int64"
+        },
+        "crop_name": {
+            "type": "string"
+        }
+    }
+}
+
+
+# Conversion function, usually no changes required
+def convert(output_file, cache_file = None, source_coop_url = None, collection = False, compression = None):
+    """
+    Converts the field boundary datasets to fiboa.
+
+    For reference, this is the order in which the conversion steps are applied:
+    0. Read GeoDataFrame from file
+    1. Run global migration (if provided through MIGRATION)
+    2. Run filters to remove rows that shall not be in the final data
+       (if provided through COLUMN_FILTERS)
+    3. Add columns with constant values
+    4. Run column migrations (if provided through COLUMN_MIGRATIONS)
+    5. Duplicate columns (if an array is provided as the value in COLUMNS)
+    6. Rename columns (as provided in COLUMNS)
+    7. Remove columns (if column is not present as value in COLUMNS)
+    8. Create the collection
+    9. Change data types of the columns based on the provided schemas
+       (fiboa spec, extensions, and MISSING_SCHEMAS)
+    10. Write the data to the Parquet file
+
+    Parameters:
+    output_file (str): Path where the Parquet file shall be stored.
+    cache_file (str): Path to a cached file of the data. Default: None.
+        Can be used to avoid repetitive downloads from the original data source.
+    source_coop_url (str): URL to the (future) Source Cooperative repository. Default: None
+    collection (bool): Additionally, store the collection separately from the Parquet file. Default: False
+    compression (str): Compression method for the Parquet file. Default: zstd
+    kwargs: Additional keyword arguments for GeoPandas' read_file() or read_parquet() function.
+    """
+    convert_(
+        output_file,
+        cache_file,
+        URI,
+        COLUMNS,
+        ID,
+        TITLE,
+        DESCRIPTION,
+        BBOX,
+        provider_name=PROVIDER_NAME,
+        provider_url=PROVIDER_URL,
+        source_coop_url=source_coop_url,
+        extensions=EXTENSIONS,
+        missing_schemas=MISSING_SCHEMAS,
+        column_additions=ADD_COLUMNS,
+        column_migrations=COLUMN_MIGRATIONS,
+        column_filters=COLUMN_FILTERS,
+        migration=MIGRATION,
+        attribution=ATTRIBUTION,
+        store_collection=collection,
+        license=LICENSE,
+        compression=compression,
+    )
\ No newline at end of file
diff --git a/fiboa_cli/datasets/fieldscapes_southafrica_2018.py b/fiboa_cli/datasets/fieldscapes_southafrica_2018.py
new file mode 100644
index 00000000..68ad8664
--- /dev/null
+++ b/fiboa_cli/datasets/fieldscapes_southafrica_2018.py
@@ -0,0 +1,146 @@
+# TEMPLATE FOR A FIBOA CONVERTER
+#
+# Copy this file and rename it to something sensible.
+# The name of the file will be the name of the converter in the cli.
+# If you name it 'de_abc' you'll be able to run `fiboa convert de_abc` in the cli.
+
+from ..convert_utils import convert as convert_
+
+# File to read the data from
+# Can read any tabular data format that GeoPandas can read through read_file()
+# Supported protocols: HTTP(S), GCS, S3, or the local file system
+
+# Local URI added to the repository for initial conversion. Original source: https://beta.source.coop/esa/fusion-competition/
+URI = "/home/byteboogie/work/labwork_hkerner/fieldscapes/southafrica/boundaries_southafrica_2018.gpkg"
+
+# Unique identifier for the collection
+ID = "boundaries_southafrica_2018"
+# Title of the collection
+TITLE = "Field boundaries for Cape Town, South Africa"
+# Description of the collection. Can be multiline and include CommonMark.
+DESCRIPTION = """The dataset contains field boundaries for Cape Town, South Africa."""
+# Bounding box of the data in WGS84 coordinates
+BBOX = [20.521492384730347, -34.39922362572791, 21.04341451023305, -33.980506187460875]
+
+# Provider name, can be None if not applicable, must be provided if PROVIDER_URL is provided
+PROVIDER_NAME = "Planet, Radiant Earth Foundation, Western Cape Department of Agriculture, & German Aerospace Center (DLR)"
+# URL to the homepage of the data or the provider, can be None if not applicable
+PROVIDER_URL = "https://beta.source.coop/esa/fusion-competition/"
+# Attribution, can be None if not applicable
+ATTRIBUTION = "ESA Fusion Competition"
+
+# License of the data, either
+# 1. a SPDX license identifier (including "dl-de/by-2-0" / "dl-de/zero-2-0"), or
+LICENSE = "CC-BY-NC-SA-4.0"
+# 2. a STAC Link Object with relation type "license"
+# LICENSE = {"title": "CC-BY-4.0", "href": "https://creativecommons.org/licenses/by/4.0/", "type": "text/html", "rel": "license"}
+
+# Map original column names to fiboa property names
+# You also need to list any column that you may have added in the MIGRATION function (see below).
+COLUMNS = {
+    "id": "id",
+    "SHAPE_AREA": "area",
+    "SHAPE_LEN": "perimeter",
+    "geometry": "geometry",
+    "crop_id": "crop_id",
+    "crop_name": "crop_name"
+}
+
+# Add columns with constant values.
+# The key is the column name, the value is a constant value that's used for all rows.
+ADD_COLUMNS = {
+    "determination_datetime": "2018-03-31T00:00:00Z"
+}
+
+# A list of implemented extension identifiers
+EXTENSIONS = []
+
+# Functions to migrate data in columns to match the fiboa specification.
+# Example: You have a column area_m in square meters and want to convert
+# to hectares as required for the area field in fiboa.
+# Function signature:
+# func(column: pd.Series) -> pd.Series
+COLUMN_MIGRATIONS = {
+
+}
+
+# Filter columns to only include the ones that are relevant for the collection,
+# e.g. only rows that contain the word "agriculture" but not "forest" in the column "land_cover_type".
+# Lambda function accepts a Pandas Series and returns a Series, or a Tuple with a Series and True to inverse the mask.
+COLUMN_FILTERS = {
+
+}
+
+# Custom function to migrate the GeoDataFrame if the other options are not sufficient.
+# This should be the last resort!
+# Function signature:
+# func(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame
+MIGRATION = None
+
+# Schemas for the fields that are not defined in fiboa
+# Keys must be the values from the COLUMNS dict, not the keys
+MISSING_SCHEMAS = {
+    "required": ["crop_id", "crop_name"],  # i.e. non-nullable properties
+    "properties": {
+        "crop_id": {
+            "type": "int64"
+        },
+        "crop_name": {
+            "type": "string"
+        }
+    }
+}
+
+
+# Conversion function, usually no changes required
+def convert(output_file, cache_file = None, source_coop_url = None, collection = False, compression = None):
+    """
+    Converts the field boundary datasets to fiboa.
+
+    For reference, this is the order in which the conversion steps are applied:
+    0. Read GeoDataFrame from file
+    1. Run global migration (if provided through MIGRATION)
+    2. Run filters to remove rows that shall not be in the final data
+       (if provided through COLUMN_FILTERS)
+    3. Add columns with constant values
+    4. Run column migrations (if provided through COLUMN_MIGRATIONS)
+    5. Duplicate columns (if an array is provided as the value in COLUMNS)
+    6. Rename columns (as provided in COLUMNS)
+    7. Remove columns (if column is not present as value in COLUMNS)
+    8. Create the collection
+    9. Change data types of the columns based on the provided schemas
+       (fiboa spec, extensions, and MISSING_SCHEMAS)
+    10. Write the data to the Parquet file
+
+    Parameters:
+    output_file (str): Path where the Parquet file shall be stored.
+    cache_file (str): Path to a cached file of the data. Default: None.
+        Can be used to avoid repetitive downloads from the original data source.
+    source_coop_url (str): URL to the (future) Source Cooperative repository. Default: None
+    collection (bool): Additionally, store the collection separately from the Parquet file. Default: False
+    compression (str): Compression method for the Parquet file. Default: zstd
+    kwargs: Additional keyword arguments for GeoPandas' read_file() or read_parquet() function.
+    """
+    convert_(
+        output_file,
+        cache_file,
+        URI,
+        COLUMNS,
+        ID,
+        TITLE,
+        DESCRIPTION,
+        BBOX,
+        provider_name=PROVIDER_NAME,
+        provider_url=PROVIDER_URL,
+        source_coop_url=source_coop_url,
+        extensions=EXTENSIONS,
+        missing_schemas=MISSING_SCHEMAS,
+        column_additions=ADD_COLUMNS,
+        column_migrations=COLUMN_MIGRATIONS,
+        column_filters=COLUMN_FILTERS,
+        migration=MIGRATION,
+        attribution=ATTRIBUTION,
+        store_collection=collection,
+        license=LICENSE,
+        compression=compression,
+    )
\ No newline at end of file