Fix/pq dataset typology #282

Open
wants to merge 58 commits into base: main

Commits (58)
3bfc2cc
Use passed dataset name typology from the spec.
cjohns-scottlogic Nov 25, 2024
df661b6
Changed dataset used in test.
cjohns-scottlogic Nov 25, 2024
41c112e
Ensure 'organisation' is not included in json_fields
alexglasertpx Nov 25, 2024
ce10425
Ran black on code
alexglasertpx Nov 25, 2024
133d577
Print statements to search where _geoms are
alexglasertpx Nov 25, 2024
96fa0bd
Removed '_geom' from null fields statement
alexglasertpx Nov 25, 2024
e956449
Removed print statements
alexglasertpx Nov 25, 2024
b0b391c
Altered code as '_geom' columns no longer in output
alexglasertpx Nov 25, 2024
b8c5b04
Commented out parquet commands to check old sqlite outputs
alexglasertpx Nov 26, 2024
4b5cfab
Added parquet commands back in
alexglasertpx Nov 26, 2024
c382ae9
Added print statement to find where 'organisation' is in dataset_dump_…
alexglasertpx Nov 26, 2024
aab2246
Filtered 'field_names'
alexglasertpx Nov 26, 2024
dc14e9f
Print every row for debug purposes
alexglasertpx Nov 26, 2024
09a34e8
More print statements for debug purposes
alexglasertpx Nov 26, 2024
cdbabe7
More print statements for debug purposes
alexglasertpx Nov 26, 2024
cfccd9c
Add dataset to parquet path.
cjohns-scottlogic Nov 26, 2024
887eedc
Merge branch 'fix/pq-dataset-typology' of github-second.com:digital-l…
alexglasertpx Nov 26, 2024
b56e774
More print statements for debug purposes
alexglasertpx Nov 26, 2024
c48729f
More print statements for debug purposes
alexglasertpx Nov 26, 2024
6168a8e
More print statements for debug purposes
alexglasertpx Nov 26, 2024
c66e231
Updated test.
cjohns-scottlogic Nov 26, 2024
88c4f92
Fixed black issues.
cjohns-scottlogic Nov 26, 2024
9e1e384
More print statements for debugging
alexglasertpx Nov 26, 2024
7c5d407
Merge branch 'fix/pq-dataset-typology' of github-second.com:digital-l…
alexglasertpx Nov 26, 2024
c3b2e88
Use dataset name in duckdb file.
cjohns-scottlogic Nov 26, 2024
ac111df
More print statements for debugging
alexglasertpx Nov 26, 2024
e649379
More print statements for debugging
alexglasertpx Nov 26, 2024
7d34d4c
More print statements for debugging
alexglasertpx Nov 26, 2024
6df361a
More print statements for debugging
alexglasertpx Nov 26, 2024
ce7e58c
More print statements for debugging
alexglasertpx Nov 26, 2024
d689d95
More print statements for debugging
alexglasertpx Nov 26, 2024
1f492ec
Trying os.environ in subprocess
alexglasertpx Nov 26, 2024
a93cde0
Trying os.environ in subprocess
alexglasertpx Nov 26, 2024
7dea1c9
Trying os.environ in subprocess
alexglasertpx Nov 26, 2024
3ac12f1
Added print statements to debug
alexglasertpx Nov 26, 2024
4ff0972
Insert into the SQLite table rather than recreate it.
cjohns-scottlogic Nov 27, 2024
53a4b3d
Trying garbage collect
alexglasertpx Nov 27, 2024
b0cfdef
Merge branch 'fix/pq-dataset-typology' of github-second.com:digital-l…
alexglasertpx Nov 27, 2024
7afa43f
Trying garbage collect
alexglasertpx Nov 27, 2024
a1d22f8
Added print statements to debug
alexglasertpx Nov 27, 2024
7545342
Replace empty json with NULL
cjohns-scottlogic Nov 27, 2024
0d29d72
Get schema from specification
cjohns-scottlogic Nov 27, 2024
12a59a4
Updated tests.
cjohns-scottlogic Nov 27, 2024
408fdf3
Replace empty data with blank strings to match sqlite version.
cjohns-scottlogic Nov 27, 2024
130aade
Put the duckdb file in the cache.
cjohns-scottlogic Nov 27, 2024
3ab2e72
Tests relating to missing points
alexiglaser Nov 29, 2024
f19304b
Fix json field names.
cjohns-scottlogic Nov 29, 2024
cb8564e
Don't try to compute point if geometry is blank.
cjohns-scottlogic Nov 29, 2024
8b08b2b
Reduce the computed points to 6dp
cjohns-scottlogic Nov 29, 2024
9db4bbb
Added new tests and edited point data
alexiglaser Nov 29, 2024
6bf3c2e
black
cjohns-scottlogic Nov 29, 2024
63abd9b
Removed print statements
alexiglaser Nov 29, 2024
8da6d75
Merge branch 'fix/pq-dataset-typology' of github-second.com:digital-l…
alexiglaser Nov 29, 2024
ab3904a
Using row_number to split ties
alexiglaser Nov 29, 2024
28c00c6
Removing row_number
alexiglaser Nov 29, 2024
69ba0ad
Added an end date to choice of entity and field
alexiglaser Dec 2, 2024
2f9000e
Updated SQL
cjohns-scottlogic Dec 2, 2024
a079ccb
Added resource end_date
alexiglaser Dec 2, 2024
3 changes: 3 additions & 0 deletions digital_land/cli.py
@@ -142,6 +142,7 @@ def convert_cmd(input_path, output_path):
@dataset_resource_dir
@issue_dir
@click.option("--cache-dir", type=click.Path(), default="var/cache/parquet")
@click.option("--resource-path", type=click.Path(), default="collection/resource.csv")
@click.argument("input-paths", nargs=-1, type=click.Path(exists=True))
@click.pass_context
def dataset_create_cmd(
@@ -153,6 +154,7 @@ def dataset_create_cmd(
dataset_resource_dir,
issue_dir,
cache_dir,
resource_path,
):
return dataset_create(
input_paths=input_paths,
@@ -165,6 +167,7 @@ def dataset_create_cmd(
dataset_resource_dir=dataset_resource_dir,
issue_dir=issue_dir,
cache_dir=cache_dir,
resource_path=resource_path,
)


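For context, a minimal sketch of exercising the new --resource-path option through click's test runner. The command name, entry group, and input path below are illustrative assumptions; only --cache-dir, --resource-path, and their defaults come from this diff:

```python
# Hypothetical invocation sketch; command/group names and paths are assumptions.
from click.testing import CliRunner

from digital_land.cli import cli  # assumed top-level click group

runner = CliRunner()
result = runner.invoke(
    cli,
    [
        "dataset-create",                              # assumed command name
        "--cache-dir", "var/cache/parquet",            # default from this diff
        "--resource-path", "collection/resource.csv",  # new option, default from this diff
        "transformed/example.csv",                     # illustrative input path
    ],
)
print(result.exit_code)
```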
36 changes: 26 additions & 10 deletions digital_land/commands.py
@@ -12,6 +12,8 @@
import geojson
import shapely

import subprocess

from digital_land.package.organisation import OrganisationPackage
from digital_land.check import duplicate_reference_check
from digital_land.specification import Specification
@@ -359,7 +361,10 @@ def dataset_create(
column_field_dir="var/column-field",
dataset_resource_dir="var/dataset-resource",
cache_dir="var/cache/parquet",
resource_path="collection/resource.csv",
):
cache_dir = os.path.join(cache_dir, dataset)

if not output_path:
print("missing output path", file=sys.stderr)
sys.exit(2)
@@ -402,20 +407,22 @@

pqpackage = DatasetParquetPackage(
dataset,
organisation=organisation,
path=output_path,
input_paths=input_paths,
cache_dir=cache_dir,
resource_path=resource_path,
specification_dir=None, # TBD: package should use this specification object
)
pqpackage.create_temp_table(input_paths)
pqpackage.load_facts(input_paths, cache_dir)
pqpackage.load_fact_resource(input_paths, cache_dir)
pqpackage.load_entities(input_paths, cache_dir, organisation_path)
pqpackage.pq_to_sqlite(output_path, cache_dir)
pqpackage.load_facts()
pqpackage.load_fact_resource()
pqpackage.load_entities()
pqpackage.pq_to_sqlite()
pqpackage.close_conn()
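The calls above show the refactor's shape: the organisation, cache dir, and resource path are injected once through the DatasetParquetPackage constructor, so the load methods no longer take arguments. A minimal sketch of what that implies, with assumed attribute names and elided bodies, not code from this PR:

```python
# Sketch of the constructor-injection pattern implied by the calls above;
# attribute names and method bodies are assumptions for illustration only.
class DatasetParquetPackage:
    def __init__(
        self, dataset, organisation, path, input_paths,
        cache_dir, resource_path, specification_dir=None,
    ):
        # Capture everything once so each load step can stay argument-free.
        self.dataset = dataset
        self.organisation = organisation
        self.path = path
        self.input_paths = input_paths
        self.cache_dir = cache_dir
        self.resource_path = resource_path

    def load_facts(self):
        ...  # would read self.input_paths and write parquet under self.cache_dir
```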


def dataset_dump(input_path, output_path):
cmd = f"sqlite3 -header -csv {input_path} 'select * from entity;' > {output_path}"
cmd = f"sqlite3 -header -csv {input_path} 'select * from entity order by entity;' > {output_path}"
logging.info(cmd)
os.system(cmd)
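Adding order by entity makes the dumped CSV deterministic, so packages built via parquet and via the old sqlite path can be diffed directly. For illustration, a hedged equivalent using the sqlite3 and csv standard-library modules rather than shelling out; this is not the implementation used here:

```python
# Sketch of an equivalent, shell-free dump; dataset_dump above uses the sqlite3 CLI.
import csv
import sqlite3

def dump_entity_csv(input_path, output_path):
    conn = sqlite3.connect(input_path)
    cursor = conn.execute("select * from entity order by entity;")
    with open(output_path, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow([col[0] for col in cursor.description])  # header row
        writer.writerows(cursor)  # rows in deterministic entity order
    conn.close()
```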

@@ -427,7 +434,7 @@ def dataset_dump_flattened(csv_path, flattened_dir, specification, dataset):
elif isinstance(csv_path, Path):
dataset_name = csv_path.stem
else:
logging.error(f"Can't extract datapackage name from {csv_path}")
logging.error(f"Can't extract datapackage name from {csv_path}")
sys.exit(-1)

flattened_csv_path = os.path.join(flattened_dir, f"{dataset_name}.csv")
@@ -474,6 +481,7 @@ def dataset_dump_flattened(csv_path, flattened_dir, specification, dataset):
batch_size = 100000
temp_geojson_files = []
geography_entities = [e for e in entities if e["typology"] == "geography"]

for i in range(0, len(geography_entities), batch_size):
batch = geography_entities[i : i + batch_size]
feature_collection = process_data_in_batches(batch, flattened_dir, dataset_name)
@@ -488,6 +496,13 @@ def dataset_dump_flattened(csv_path, flattened_dir, specification, dataset):

if all(os.path.isfile(path) for path in temp_geojson_files):
rfc7946_geojson_path = os.path.join(flattened_dir, f"{dataset_name}.geojson")
env = os.environ.copy()

out, _ = subprocess.Popen(
["ogr2ogr", "--version"],
stdout=subprocess.PIPE,
stderr=subprocess.DEVNULL,
).communicate()
env = (
dict(os.environ, OGR_GEOJSON_MAX_OBJ_SIZE="0")
if get_gdal_version() >= Version("3.5.2")
@@ -892,9 +907,10 @@ def process_data_in_batches(entities, flattened_dir, dataset_name):
logging.error(f"Error loading wkt from entity {entity['entity']}")
logging.error(e)
else:
logging.error(
f"No geometry or point data for entity {entity['entity']} with typology 'geography'"
)
pass
# logging.error(
# f"No geometry or point data for entity {entity['entity']} with typology 'geography'"
# )

if features:
feature_collection = geojson.FeatureCollection(
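The new env handling in dataset_dump_flattened gates OGR_GEOJSON_MAX_OBJ_SIZE="0" on the GDAL version, and the ogr2ogr --version probe suggests how get_gdal_version is obtained. A sketch of one way that helper could work; its actual implementation is not shown in this diff:

```python
# Assumed shape of get_gdal_version; parses output like
# "GDAL 3.5.2, released 2022/09/02" from `ogr2ogr --version`.
import re
import subprocess

from packaging.version import Version

def get_gdal_version() -> Version:
    out = subprocess.run(
        ["ogr2ogr", "--version"], capture_output=True, text=True, check=True
    ).stdout
    return Version(re.search(r"(\d+\.\d+\.\d+)", out).group(1))
```

Setting OGR_GEOJSON_MAX_OBJ_SIZE to 0 lifts the GeoJSON driver's default object-size cap, which is why the diff applies it only on the newer GDAL versions that enforce the limit.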