From 19ffc268203451d39a798b1d3a7ab3359813accd Mon Sep 17 00:00:00 2001 From: HarshShinde0 Date: Sat, 5 Jul 2025 20:06:45 +0000 Subject: [PATCH 1/2] Add GeoCroissant support for geospatial datasets --- .../job_runners/dataset/croissant_crumbs.py | 61 ++++++++++++++++--- .../dataset/test_croissant_crumbs.py | 19 ++++++ 2 files changed, 71 insertions(+), 9 deletions(-) diff --git a/services/worker/src/worker/job_runners/dataset/croissant_crumbs.py b/services/worker/src/worker/job_runners/dataset/croissant_crumbs.py index 7b8b2b1e47..f40dea31da 100644 --- a/services/worker/src/worker/job_runners/dataset/croissant_crumbs.py +++ b/services/worker/src/worker/job_runners/dataset/croissant_crumbs.py @@ -70,6 +70,32 @@ def get_croissant_crumbs_from_dataset_infos( ) ] record_set = [] + + # Check if dataset has geospatial modality + is_geospatial = False + try: + dataset_modalities_response = get_previous_step_or_raise(kind="dataset-modalities", dataset=dataset) + modalities = dataset_modalities_response["content"].get("modalities", []) + is_geospatial = "geospatial" in modalities + except Exception: + # If modalities step fails, try direct file detection + try: + dataset_filetypes_response = get_previous_step_or_raise(kind="dataset-filetypes", dataset=dataset) + content = dataset_filetypes_response["content"] + if "filetypes" in content and isinstance(content["filetypes"], list): + geospatial_extensions = { + ".shp", ".shx", ".dbf", ".prj", ".cpg", ".kml", ".kmz", ".gpx", + ".geojson", ".topojson", ".gml", ".geoparquet", ".fgb", + ".img", ".bil", ".bip", ".bsq", ".gpkg", ".mbtiles", ".pmtiles", + ".tif", ".tiff" # GeoTIFF files + } + for filetype in content["filetypes"]: + if filetype["extension"] in geospatial_extensions and filetype["count"] > 0: + is_geospatial = True + break + except Exception: + pass + for info in infos: description_body = "" config = info["config_name"] @@ -197,16 +223,33 @@ def get_croissant_crumbs_from_dataset_infos( "source": "cr:source", "subField": "cr:subField", "transform": "cr:transform", + # GeoCroissant properties + "geo": "http://mlcommons.org/croissant/geo/1.0", + "geo:BoundingBox": "geocr:BoundingBox", + "geo:Geometry": "geocr:Geometry", + "geo:Resolution": "geocr:Resolution", + "geo:CRS": "geocr:CRS", + "geo:TemporalExtent": "geocr:TemporalExtent", + "geo:spatialResolution": "geocr:spatialResolution", + "geo:temporalResolution": "geocr:temporalResolution", + "geo:Label": "geocr:Label", + "geo:Image": "geocr:Image", + } + # Prepare base output + output = { + "@context": context, + "@type": "sc:Dataset", + "conformsTo": "http://mlcommons.org/croissant/1.1", + "distribution": distribution, + "recordSet": record_set, } - return _remove_none_values( - { - "@context": context, - "@type": "sc:Dataset", - "conformsTo": "http://mlcommons.org/croissant/1.1", - "distribution": distribution, - "recordSet": record_set, - } - ) + + # Add GeoCroissant properties if dataset is geospatial + if is_geospatial: + # TODO: Extract geospatial metadata from user-provided metadata.json or dataset card + pass + + return _remove_none_values(output) def compute_croissant_crumbs_response(dataset: str) -> Mapping[str, Any]: diff --git a/services/worker/tests/job_runners/dataset/test_croissant_crumbs.py b/services/worker/tests/job_runners/dataset/test_croissant_crumbs.py index 90bc1b0778..f26b444d99 100644 --- a/services/worker/tests/job_runners/dataset/test_croissant_crumbs.py +++ b/services/worker/tests/job_runners/dataset/test_croissant_crumbs.py @@ -93,6 +93,17 @@ "source": "cr:source", "subField": "cr:subField", "transform": "cr:transform", + # GeoCroissant properties + "geo": "http://mlcommons.org/croissant/geo/1.0", + "geo:BoundingBox": "geocr:BoundingBox", + "geo:Geometry": "geocr:Geometry", + "geo:Resolution": "geocr:Resolution", + "geo:CRS": "geocr:CRS", + "geo:TemporalExtent": "geocr:TemporalExtent", + "geo:spatialResolution": "geocr:spatialResolution", + "geo:temporalResolution": "geocr:temporalResolution", + "geo:Label": "geocr:Label", + "geo:Image": "geocr:Image", } @@ -180,3 +191,11 @@ def test_get_croissant_crumbs_from_dataset_infos() -> None: assert "@id" in distribution if "containedIn" in distribution: assert "@id" in distribution["containedIn"] + + # Test that GeoCroissant context is included + assert "geo" in croissant_crumbs["@context"] + assert croissant_crumbs["@context"]["geo"] == "http://mlcommons.org/croissant/geo/1.0" + assert "geo:BoundingBox" in croissant_crumbs["@context"] + assert "geo:CRS" in croissant_crumbs["@context"] + assert "geo:Resolution" in croissant_crumbs["@context"] + assert "geo:Geometry" in croissant_crumbs["@context"] From 0086d316948c085dd2bd0f96e509adb264fade1d Mon Sep 17 00:00:00 2001 From: HarshShinde0 Date: Mon, 7 Jul 2025 16:30:40 +0000 Subject: [PATCH 2/2] Add GeoCroissant support for HF --- .../job_runners/dataset/croissant_crumbs.py | 20 ++++++------ .../dataset/test_croissant_crumbs.py | 32 +++++++++---------- 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/services/worker/src/worker/job_runners/dataset/croissant_crumbs.py b/services/worker/src/worker/job_runners/dataset/croissant_crumbs.py index f40dea31da..90eec21a80 100644 --- a/services/worker/src/worker/job_runners/dataset/croissant_crumbs.py +++ b/services/worker/src/worker/job_runners/dataset/croissant_crumbs.py @@ -224,16 +224,16 @@ def get_croissant_crumbs_from_dataset_infos( "subField": "cr:subField", "transform": "cr:transform", # GeoCroissant properties - "geo": "http://mlcommons.org/croissant/geo/1.0", - "geo:BoundingBox": "geocr:BoundingBox", - "geo:Geometry": "geocr:Geometry", - "geo:Resolution": "geocr:Resolution", - "geo:CRS": "geocr:CRS", - "geo:TemporalExtent": "geocr:TemporalExtent", - "geo:spatialResolution": "geocr:spatialResolution", - "geo:temporalResolution": "geocr:temporalResolution", - "geo:Label": "geocr:Label", - "geo:Image": "geocr:Image", + "geocr": "http://mlcommons.org/croissant/geo/1.0", + "boundingBox": "geocr:boundingBox", + "geometry": "geocr:geometry", + "resolution": "geocr:resolution", + "crs": "geocr:crs", + "temporalExtent": "geocr:temporalExtent", + "spatialResolution": "geocr:spatialResolution", + "temporalResolution": "geocr:temporalResolution", + "label": "geocr:label", + "image": "geocr:image", } # Prepare base output output = { diff --git a/services/worker/tests/job_runners/dataset/test_croissant_crumbs.py b/services/worker/tests/job_runners/dataset/test_croissant_crumbs.py index f26b444d99..678f1789ac 100644 --- a/services/worker/tests/job_runners/dataset/test_croissant_crumbs.py +++ b/services/worker/tests/job_runners/dataset/test_croissant_crumbs.py @@ -94,16 +94,16 @@ "subField": "cr:subField", "transform": "cr:transform", # GeoCroissant properties - "geo": "http://mlcommons.org/croissant/geo/1.0", - "geo:BoundingBox": "geocr:BoundingBox", - "geo:Geometry": "geocr:Geometry", - "geo:Resolution": "geocr:Resolution", - "geo:CRS": "geocr:CRS", - "geo:TemporalExtent": "geocr:TemporalExtent", - "geo:spatialResolution": "geocr:spatialResolution", - "geo:temporalResolution": "geocr:temporalResolution", - "geo:Label": "geocr:Label", - "geo:Image": "geocr:Image", + "geocr": "http://mlcommons.org/croissant/geo/1.0", + "boundingBox": "geocr:boundingBox", + "geometry": "geocr:geometry", + "resolution": "geocr:resolution", + "crs": "geocr:crs", + "temporalExtent": "geocr:temporalExtent", + "spatialResolution": "geocr:spatialResolution", + "temporalResolution": "geocr:temporalResolution", + "label": "geocr:label", + "image": "geocr:image", } @@ -193,9 +193,9 @@ def test_get_croissant_crumbs_from_dataset_infos() -> None: assert "@id" in distribution["containedIn"] # Test that GeoCroissant context is included - assert "geo" in croissant_crumbs["@context"] - assert croissant_crumbs["@context"]["geo"] == "http://mlcommons.org/croissant/geo/1.0" - assert "geo:BoundingBox" in croissant_crumbs["@context"] - assert "geo:CRS" in croissant_crumbs["@context"] - assert "geo:Resolution" in croissant_crumbs["@context"] - assert "geo:Geometry" in croissant_crumbs["@context"] + assert "geocr" in croissant_crumbs["@context"] + assert croissant_crumbs["@context"]["geocr"] == "http://mlcommons.org/croissant/geo/1.0" + assert "boundingBox" in croissant_crumbs["@context"] + assert "crs" in croissant_crumbs["@context"] + assert "resolution" in croissant_crumbs["@context"] + assert "geometry" in croissant_crumbs["@context"] \ No newline at end of file