Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds expectation for deleted entities #317

Merged
merged 2 commits into from
Jan 22, 2025
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions digital_land/expectations/checkpoints/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from .base import BaseCheckpoint
from ..log import ExpectationLog
from ..operation import count_lpa_boundary
from ..operation import count_lpa_boundary, count_deleted_entities


class DatasetCheckpoint(BaseCheckpoint):
Expand All @@ -26,7 +26,10 @@ def operation_factory(self, operation_string: str):
Args
operation: a string representing an operation
"""
operation_map = {"count_lpa_boundary": count_lpa_boundary}
operation_map = {
"count_lpa_boundary": count_lpa_boundary,
"count_deleted_entities": count_deleted_entities,
}
operation = operation_map[operation_string]
return operation

Expand Down
55 changes: 55 additions & 0 deletions digital_land/expectations/operation.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
import requests
import pandas as pd
import urllib
import os


# # TODO is there a way to represent this in a generalised count or not
Expand Down Expand Up @@ -119,3 +122,55 @@ def count_lpa_boundary(
}

return result, message, details


def count_deleted_entities(
conn,
expected: int,
organisation_entity: int = None,
):
# get database name to identify dataset
db_path = conn.execute("PRAGMA database_list").fetchall()[0][2]
db_name = os.path.splitext(os.path.basename(db_path))[0]

# get dataset specific active resource list
params = urllib.parse.urlencode(
{
"sql": f"""select *,o.entity from reporting_historic_endpoints rhe join organisation o on rhe.organisation=o.organisation
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For 'select *,o.entity ...' should there be an alias in front of the star or is 'o.entity' not required as the star will bring all of the columns in?

where pipeline == '{db_name}' and o.entity='{organisation_entity}' and resource_end_date == "" group by endpoint""",
"_size": "max",
}
)
base_url = f"https://datasette.planning.data.gov.uk/digital-land.csv?{params}"
get_resource = pd.read_csv(base_url)
resource_list = get_resource["resource"].to_list()

# use resource list to get current entities
query = f"""select e.reference,fe.entry_number,f.entity,e.name,e.organisation_entity
from fact_resource fe join fact f on fe.fact=f.fact join entity e on f.entity=e.entity
where resource in ({','.join(f"'{x}'" for x in resource_list)})
group by reference
"""
rows = conn.execute(query).fetchall()
get_active_entities = [row[0] for row in rows]

# get entities from entity table to compare against resource entities
query = f"""
select reference from entity where organisation_entity = '{organisation_entity}';
"""
rows = conn.execute(query).fetchall()
get_entities = [row[0] for row in rows]

# identify entities present in the entity table but missing from the resource
entities = [item for item in get_entities if item not in get_active_entities]
actual = len(entities)

result = bool(actual == expected)
message = f"there were {actual} entities found"
details = {
"actual": actual,
"expected": expected,
"entities": entities,
}

return result, message, details
66 changes: 65 additions & 1 deletion tests/integration/expectations/test_operation.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@
import pytest
import pandas as pd

from digital_land.expectations.operation import count_lpa_boundary
from digital_land.expectations.operation import (
count_lpa_boundary,
count_deleted_entities,
)


@pytest.fixture
Expand Down Expand Up @@ -109,3 +112,64 @@ def test_count_lpa_boundary_passes(
detail_keys = ["actual", "expected"]
for key in detail_keys:
assert key in details, f"{key} missing from details"


def test_count_deleted_entities(dataset_path, mocker):
# define constant parameters
organisation_entity = 109
expected = 0

# load data into sqlite for entity, fact_resource and fact table
test_entity_data = pd.DataFrame.from_dict(
{
"entity": ["1001"],
"name": ["test1"],
"organisation_entity": [109],
"reference": ["ref1"],
}
)

test_fact_resource_data = pd.DataFrame.from_dict(
{
"fact": ["036d2b946bd41", "16bf38800aafd"],
"resource": ["2f7d900dd48fd02", "2f7d900dd48fd02"],
"entry_number": ["1", "1"],
},
)

test_fact_data = pd.DataFrame.from_dict(
{
"fact": ["036d2b946bd41", "16bf38800aafd"],
"entity": ["1001", "1001"],
"field": ["name", "reference"],
"value": ["abc", "ref1"],
}
)

# mock `pandas.read_csv` to return the mock DataFrame
mock_df = pd.DataFrame({"resource": ["2f7d900dd48fd02"]})
mocker.patch("pandas.read_csv", return_value=mock_df)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are these lines necessary? I can't see where 'mock_df' is required.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The setup mocks the read_csv() function, which is used to fetch resource data from the digital-land database.
Since pd.read_csv(base_url) returns a DataFrame, the mock_df is used to replicate that behaviour.


with spatialite.connect(dataset_path) as conn:
# load data into required tables
test_entity_data.to_sql("entity", conn, if_exists="append", index=False)
test_fact_resource_data.to_sql(
"fact_resource", conn, if_exists="append", index=False
)
test_fact_data.to_sql("fact", conn, if_exists="append", index=False)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would we want the if_exists to be 'replace' as the tables are only meant for testing purposes it might append extra rows to a test table?


# run expectation
passed, message, details = count_deleted_entities(
conn,
expected=expected,
organisation_entity=organisation_entity,
)

assert (
passed
), f"test failed : expected {details['expected']} but got {details['actual']} entities"
assert message, "test requires a message"

detail_keys = ["actual", "expected"]
for key in detail_keys:
assert key in details, f"{key} missing from details"