Fix/pq dataset typology #282

Open. Wants to merge 58 commits into main from fix/pq-dataset-typology.

Changes from 8 commits

Commits (58)
3bfc2cc
Use passed dataset name typology from the spec.
cjohns-scottlogic Nov 25, 2024
df661b6
Changed dataset used in test.
cjohns-scottlogic Nov 25, 2024
41c112e
Ensure 'organisation' is not included in json_fields
alexglasertpx Nov 25, 2024
ce10425
Ran black on code
alexglasertpx Nov 25, 2024
133d577
Print statements to search where _geoms are
alexglasertpx Nov 25, 2024
96fa0bd
Removed '_geom' from null fields statement
alexglasertpx Nov 25, 2024
e956449
Removed print statements
alexglasertpx Nov 25, 2024
b0b391c
Altered code as '_geom' columns no longer in output
alexglasertpx Nov 25, 2024
b8c5b04
Commented out parquet commands to check old sqlite outputs
alexglasertpx Nov 26, 2024
4b5cfab
Added parquet commands back in
alexglasertpx Nov 26, 2024
c382ae9
Added print statement to find where 'organisation' is in dataset_dump_…
alexglasertpx Nov 26, 2024
aab2246
Filtered 'field_names'
alexglasertpx Nov 26, 2024
dc14e9f
Print every row for debug purposes
alexglasertpx Nov 26, 2024
09a34e8
More print statements for debug purposes
alexglasertpx Nov 26, 2024
cdbabe7
More print statements for debug purposes
alexglasertpx Nov 26, 2024
cfccd9c
Add dataset to parquet path.
cjohns-scottlogic Nov 26, 2024
887eedc
Merge branch 'fix/pq-dataset-typology' of github-second.com:digital-l…
alexglasertpx Nov 26, 2024
b56e774
More print statements for debug purposes
alexglasertpx Nov 26, 2024
c48729f
More print statements for debug purposes
alexglasertpx Nov 26, 2024
6168a8e
More print statements for debug purposes
alexglasertpx Nov 26, 2024
c66e231
Updated test.
cjohns-scottlogic Nov 26, 2024
88c4f92
Fixed black issues.
cjohns-scottlogic Nov 26, 2024
9e1e384
More print statements for debugging
alexglasertpx Nov 26, 2024
7c5d407
Merge branch 'fix/pq-dataset-typology' of github-second.com:digital-l…
alexglasertpx Nov 26, 2024
c3b2e88
Use dataset name in duckdb file.
cjohns-scottlogic Nov 26, 2024
ac111df
More print statements for debugging
alexglasertpx Nov 26, 2024
e649379
More print statements for debugging
alexglasertpx Nov 26, 2024
7d34d4c
More print statements for debugging
alexglasertpx Nov 26, 2024
6df361a
More print statements for debugging
alexglasertpx Nov 26, 2024
ce7e58c
More print statements for debugging
alexglasertpx Nov 26, 2024
d689d95
More print statements for debugging
alexglasertpx Nov 26, 2024
1f492ec
Trying os.environ in subprocess
alexglasertpx Nov 26, 2024
a93cde0
Trying os.environ in subprocess
alexglasertpx Nov 26, 2024
7dea1c9
Trying os.environ in subprocess
alexglasertpx Nov 26, 2024
3ac12f1
Added print statements to debug
alexglasertpx Nov 26, 2024
4ff0972
Insert into the SQLite table rather than recreate it.
cjohns-scottlogic Nov 27, 2024
53a4b3d
Trying garbage collect
alexglasertpx Nov 27, 2024
b0cfdef
Merge branch 'fix/pq-dataset-typology' of github-second.com:digital-l…
alexglasertpx Nov 27, 2024
7afa43f
Trying garbage collect
alexglasertpx Nov 27, 2024
a1d22f8
Added print statements to debug
alexglasertpx Nov 27, 2024
7545342
Replace empty json with NULL
cjohns-scottlogic Nov 27, 2024
0d29d72
Get schema from specification
cjohns-scottlogic Nov 27, 2024
12a59a4
Updated tests.
cjohns-scottlogic Nov 27, 2024
408fdf3
Replace empty data with blank strings to match sqlite version.
cjohns-scottlogic Nov 27, 2024
130aade
Put the duckdb file in the cache.
cjohns-scottlogic Nov 27, 2024
3ab2e72
Tests relating to missing points
alexiglaser Nov 29, 2024
f19304b
Fix json field names.
cjohns-scottlogic Nov 29, 2024
cb8564e
Don't try to compute point if geometry is blank.
cjohns-scottlogic Nov 29, 2024
8b08b2b
Reduce the computed points to 6dp
cjohns-scottlogic Nov 29, 2024
9db4bbb
Added new tests and edited point data
alexiglaser Nov 29, 2024
6bf3c2e
black
cjohns-scottlogic Nov 29, 2024
63abd9b
Removed print statements
alexiglaser Nov 29, 2024
8da6d75
Merge branch 'fix/pq-dataset-typology' of github-second.com:digital-l…
alexiglaser Nov 29, 2024
ab3904a
Using row_number to split ties
alexiglaser Nov 29, 2024
28c00c6
Removing row_number
alexiglaser Nov 29, 2024
69ba0ad
Added an end date to choice of entity and field
alexiglaser Dec 2, 2024
2f9000e
Updated SQL
cjohns-scottlogic Dec 2, 2024
a079ccb
Added resource end_date
alexiglaser Dec 2, 2024
21 changes: 15 additions & 6 deletions digital_land/package/datasetparquet.py
@@ -1,6 +1,5 @@
import os
import logging
from pathlib import Path
import duckdb
from .package import Package

@@ -37,6 +36,7 @@ def __init__(self, dataset, input_paths, **kwargs):
self.duckdb_file = "input_paths_database.duckdb"
self.conn = duckdb.connect(self.duckdb_file)
self.schema = self.get_schema(input_paths)
self.typology = self.specification.schema[dataset]["typology"]

def get_schema(self, input_paths):
# There are issues with the schema when reading in lots of files, namely smaller files have few or zero rows
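
The constructor change above stops reusing the dataset name as the typology and instead resolves it from the specification. A minimal sketch of that lookup, assuming a dict-shaped specification schema keyed by dataset name (the shape and example entry here are illustrative, not the project's actual Specification API):

    # Illustrative stand-in for self.specification.schema in the diff above.
    schema = {
        "conservation-area": {"typology": "geography"},  # assumed entry
    }

    def typology_for(dataset: str) -> str:
        # Mirrors self.specification.schema[dataset]["typology"]:
        # the typology comes from the spec, not from the dataset name.
        return schema[dataset]["typology"]

    print(typology_for("conservation-area"))  # -> geography
    # An unknown dataset (e.g. the old "test_dataset") would raise KeyError here.
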
@@ -149,7 +149,12 @@ def load_entities(

# json fields - list of fields which are present in the fact table which
# do not exist separately in the entity table
json_fields = [field for field in distinct_fields if field not in entity_fields]
# Need to ensure that 'organisation' is not included either
json_fields = [
field
for field in distinct_fields
if field not in entity_fields + ["organisation"]
]

# null fields - list of fields which are not present in the fact tables which have
# to be in the entity table as a column
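
A quick worked example of the new json_fields filter, with made-up field lists; 'organisation' is dropped because the entity query selects it separately as organisation_entity rather than packing it into the JSON blob:

    # Worked example of the filter above (field lists are made up).
    distinct_fields = ["name", "reference", "organisation", "entry-date", "document-url"]
    entity_fields = ["name", "reference", "entry-date"]

    json_fields = [
        field
        for field in distinct_fields
        if field not in entity_fields + ["organisation"]
    ]
    print(json_fields)  # -> ['document-url']
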
@@ -207,8 +212,13 @@ def load_entities(
# include columns in the json statement
# Collate list of fields which don't exist but need to be in the final table
select_statement = ", ".join([f"t1.{field}" for field in select_fields])
# Don't want to include anything that ends with "_geom"
null_fields_statement = ", ".join(
[f'NULL::VARCHAR AS "{field}"' for field in null_fields]
[
f'NULL::VARCHAR AS "{field}"'
for field in null_fields
if not field.endswith("_geom")
]
)
json_statement = ", ".join(
[
@@ -223,7 +233,6 @@
SELECT * FROM read_csv_auto('{org_csv}')
"""

dataset = Path(output_path).name
sql = f"""
INSTALL spatial; LOAD spatial;
COPY(
@@ -236,8 +245,8 @@
ELSE point
END AS point
FROM (
SELECT '{dataset}' as dataset,
'{dataset}' as typology,
SELECT '{self.dataset}' as dataset,
'{self.typology}' as typology,
t2.entity as organisation_entity,
{select_statement},
{null_fields_statement},
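
Taken together, the changes in this file mean the entity SELECT now uses the real dataset name plus its specification typology, and no longer emits any *_geom columns. A simplified sketch of how the generated SQL fragments would look (values are made up; only the string-building mirrors the diff):

    # Simplified sketch of the generated SQL fragments (values made up).
    dataset = "conservation-area"
    typology = "geography"  # looked up from the specification, not the dataset name
    null_fields = ["end_date", "geojson", "geometry_geom", "point_geom"]

    null_fields_statement = ", ".join(
        f'NULL::VARCHAR AS "{field}"'
        for field in null_fields
        if not field.endswith("_geom")  # *_geom columns are no longer emitted
    )
    select_head = f"SELECT '{dataset}' AS dataset, '{typology}' AS typology"

    print(select_head)
    print(null_fields_statement)
    # -> NULL::VARCHAR AS "end_date", NULL::VARCHAR AS "geojson"
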
8 changes: 2 additions & 6 deletions tests/integration/test_package_datasetparquet.py
@@ -199,8 +199,6 @@ def test_dataset_parquet_package(temp_dir):
for row in data:
f.write(",".join(map(str, row)) + "\n")

# df = pd.read_csv(input_paths[0])

# Test data for the tables. This has plenty of 'duplicates' to check
data = [
[
@@ -462,7 +460,7 @@ def test_dataset_parquet_package(temp_dir):

# Instantiate the DatasetParquetPackage with temp_dir input paths and a mock schema
package = DatasetParquetPackage(
dataset="test_dataset", input_paths=input_paths, specification_dir=None
dataset="conservation-area", input_paths=input_paths, specification_dir=None
)
package.create_temp_table(input_paths)

@@ -534,11 +532,9 @@ def test_load_entities_basic(test_dataset_parquet_package, temp_dir):
df = pd.read_parquet(output_file)
assert len(df) > 0, "No data in entity.parquet file"
assert len(df) == 11, "No. of entities is not correct"
assert df.shape[1] == 16, "Not all columns saved in entity.parquet file"
assert df.shape[1] == 14, "Not all columns saved in entity.parquet file"
assert df["end_date"].isnull().all() # Check null handling
assert df["geojson"].isnull().all() # Check null handling
assert df["geometry_geom"].isnull().all() # Check null handling
assert df["point_geom"].isnull().all() # Check null handling


def test_load_pq_to_sqlite_basic(test_dataset_parquet_package, temp_dir):
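
The test updates above follow from the production changes: the package is now built with a real dataset name ("conservation-area") because the typology is looked up in the specification, and the expected column count drops from 16 to 14 because the two *_geom columns are gone. A rough sketch of checking the written output independently, assuming an entity.parquet produced by load_entities (the path is hypothetical; column expectations follow the updated test):

    # Rough sketch of independently inspecting the written parquet file
    # (path is hypothetical; column expectations follow the updated test).
    import pandas as pd

    df = pd.read_parquet("entity.parquet")
    assert df.shape[1] == 14, "expected 14 columns now that *_geom columns are dropped"
    assert "geometry_geom" not in df.columns and "point_geom" not in df.columns
    assert df["end_date"].isnull().all() and df["geojson"].isnull().all()
    print(df.columns.tolist())
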