Skip to content

Commit

Permalink
Merge pull request #317 from digital-land/feat/deleted-entities-expectation
Browse files Browse the repository at this point in the history

Adds expectation for deleted entities
  • Loading branch information
kena-SL authored Jan 22, 2025
2 parents 431eca7 + c52f25d commit 9130178
Show file tree
Hide file tree
Showing 3 changed files with 125 additions and 3 deletions.
7 changes: 5 additions & 2 deletions digital_land/expectations/checkpoints/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from .base import BaseCheckpoint
from ..log import ExpectationLog
from ..operation import count_lpa_boundary
from ..operation import count_lpa_boundary, count_deleted_entities


class DatasetCheckpoint(BaseCheckpoint):
Expand All @@ -26,7 +26,10 @@ def operation_factory(self, operation_string: str):
Args
operation: a string representing an operation
"""
operation_map = {"count_lpa_boundary": count_lpa_boundary}
operation_map = {
"count_lpa_boundary": count_lpa_boundary,
"count_deleted_entities": count_deleted_entities,
}
operation = operation_map[operation_string]
return operation

Expand Down
55 changes: 55 additions & 0 deletions digital_land/expectations/operation.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
import requests
import pandas as pd
import urllib
import os


# # TODO is there a way to represent this in a generalised count or not
Expand Down Expand Up @@ -119,3 +122,55 @@ def count_lpa_boundary(
}

return result, message, details


def count_deleted_entities(
    conn,
    expected: int,
    organisation_entity: int = None,
):
    """Check that the number of deleted entities matches the expected count.

    An entity counts as "deleted" when its reference appears in the dataset's
    ``entity`` table for the given organisation but is absent from every one
    of the organisation's currently active resources (looked up from the
    planning.data.gov.uk datasette).

    Args:
        conn: open sqlite connection to the dataset database; the database
            file name (minus extension) is used as the dataset/pipeline name.
        expected: the number of deleted entities expected.
        organisation_entity: entity number of the organisation to check.

    Returns:
        Tuple of (result, message, details): result is True when the actual
        count equals ``expected``; message is a short human-readable summary;
        details holds the actual/expected counts and the deleted references.
    """
    # derive the dataset (pipeline) name from the attached database file path
    db_path = conn.execute("PRAGMA database_list").fetchall()[0][2]
    db_name = os.path.splitext(os.path.basename(db_path))[0]

    # fetch the organisation's active resources from datasette.
    # NOTE(review): values are interpolated into the SQL text because it is
    # shipped as a URL query parameter; organisation_entity is expected to be
    # an int, which limits injection exposure — confirm callers validate it.
    params = urllib.parse.urlencode(
        {
            "sql": f"""select * from reporting_historic_endpoints rhe join organisation o on rhe.organisation=o.organisation
            where pipeline == '{db_name}' and o.entity='{organisation_entity}' and resource_end_date == "" group by endpoint""",
            "_size": "max",
        }
    )
    base_url = f"https://datasette.planning.data.gov.uk/digital-land.csv?{params}"
    get_resource = pd.read_csv(base_url)
    resource_list = get_resource["resource"].to_list()

    # references of entities still present in at least one active resource;
    # a set gives O(1) membership tests in the comparison below
    active_references = set()
    if resource_list:
        # guard: an empty list would otherwise yield "in ()" which sqlite
        # rejects as a syntax error.  Parameterize rather than interpolate.
        placeholders = ",".join("?" for _ in resource_list)
        query = f"""select e.reference,fe.entry_number,f.entity,e.name,e.organisation_entity
        from fact_resource fe join fact f on fe.fact=f.fact join entity e on f.entity=e.entity
        where resource in ({placeholders})
        group by reference
        """
        rows = conn.execute(query, resource_list).fetchall()
        active_references = {row[0] for row in rows}

    # all entity references recorded for the organisation in the entity table
    # (str() preserves the original text comparison; sqlite column affinity
    # still matches integer-typed columns)
    rows = conn.execute(
        "select reference from entity where organisation_entity = ?",
        (str(organisation_entity),),
    ).fetchall()
    get_entities = [row[0] for row in rows]

    # deleted = present in the entity table but missing from every active resource
    entities = [item for item in get_entities if item not in active_references]
    actual = len(entities)

    result = actual == expected
    message = f"there were {actual} entities found"
    details = {
        "actual": actual,
        "expected": expected,
        "entities": entities,
    }

    return result, message, details
66 changes: 65 additions & 1 deletion tests/integration/expectations/test_operation.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@
import pytest
import pandas as pd

from digital_land.expectations.operation import count_lpa_boundary
from digital_land.expectations.operation import (
count_lpa_boundary,
count_deleted_entities,
)


@pytest.fixture
Expand Down Expand Up @@ -109,3 +112,64 @@ def test_count_lpa_boundary_passes(
detail_keys = ["actual", "expected"]
for key in detail_keys:
assert key in details, f"{key} missing from details"


def test_count_deleted_entities(dataset_path, mocker):
    """count_deleted_entities passes when no entities have been deleted."""
    # fixed parameters for the expectation under test
    organisation_entity = 109
    expected = 0

    # rows to load into the sqlite entity, fact_resource and fact tables
    entity_rows = pd.DataFrame(
        {
            "entity": ["1001"],
            "name": ["test1"],
            "organisation_entity": [109],
            "reference": ["ref1"],
        }
    )
    fact_resource_rows = pd.DataFrame(
        {
            "fact": ["036d2b946bd41", "16bf38800aafd"],
            "resource": ["2f7d900dd48fd02", "2f7d900dd48fd02"],
            "entry_number": ["1", "1"],
        }
    )
    fact_rows = pd.DataFrame(
        {
            "fact": ["036d2b946bd41", "16bf38800aafd"],
            "entity": ["1001", "1001"],
            "field": ["name", "reference"],
            "value": ["abc", "ref1"],
        }
    )

    # stub the datasette lookup so the operation sees one active resource
    # and makes no network call
    mocker.patch(
        "pandas.read_csv",
        return_value=pd.DataFrame({"resource": ["2f7d900dd48fd02"]}),
    )

    with spatialite.connect(dataset_path) as conn:
        # populate the tables the operation reads from
        entity_rows.to_sql("entity", conn, if_exists="replace", index=False)
        fact_resource_rows.to_sql(
            "fact_resource", conn, if_exists="replace", index=False
        )
        fact_rows.to_sql("fact", conn, if_exists="replace", index=False)

        # run the expectation
        passed, message, details = count_deleted_entities(
            conn,
            expected=expected,
            organisation_entity=organisation_entity,
        )

    assert (
        passed
    ), f"test failed : expected {details['expected']} but got {details['actual']} entities"
    assert message, "test requires a message"

    for key in ["actual", "expected"]:
        assert key in details, f"{key} missing from details"

0 comments on commit 9130178

Please sign in to comment.