digital-land · kena-SL · Jan 22, 2025 · Jan 21, 2025 · Jan 21, 2025 · alexglasertpx
diff --git a/digital_land/expectations/checkpoints/dataset.py b/digital_land/expectations/checkpoints/dataset.py
@@ -7,7 +7,7 @@
 
 from .base import BaseCheckpoint
 from ..log import ExpectationLog
-from ..operation import count_lpa_boundary
+from ..operation import count_lpa_boundary, count_deleted_entities
 
 
 class DatasetCheckpoint(BaseCheckpoint):
@@ -26,7 +26,10 @@ def operation_factory(self, operation_string: str):
         Args
             operation: a string representing an operation
         """
-        operation_map = {"count_lpa_boundary": count_lpa_boundary}
+        operation_map = {
+            "count_lpa_boundary": count_lpa_boundary,
+            "count_deleted_entities": count_deleted_entities,
+        }
         operation = operation_map[operation_string]
         return operation
 

diff --git a/digital_land/expectations/operation.py b/digital_land/expectations/operation.py
@@ -1,4 +1,7 @@
 import requests
+import pandas as pd
+import urllib
+import os
 
 
 # # TODO is there a way to represent this in a generalised count or not
@@ -119,3 +122,55 @@ def count_lpa_boundary(
     }
 
     return result, message, details
+
+
+def count_deleted_entities(
+    conn,
+    expected: int,
+    organisation_entity: int = None,
+):
+    # get database name to identify dataset
+    db_path = conn.execute("PRAGMA database_list").fetchall()[0][2]
+    db_name = os.path.splitext(os.path.basename(db_path))[0]
+
+    # get dataset specific active resource list
+    params = urllib.parse.urlencode(
+        {
+            "sql": f"""select *,o.entity from reporting_historic_endpoints rhe join organisation o on rhe.organisation=o.organisation
+                        where pipeline == '{db_name}' and o.entity='{organisation_entity}' and resource_end_date == "" group by endpoint""",
+            "_size": "max",
+        }
+    )
+    base_url = f"https://datasette.planning.data.gov.uk/digital-land.csv?{params}"
+    get_resource = pd.read_csv(base_url)
+    resource_list = get_resource["resource"].to_list()
+
+    # use resource list to get current entities
+    query = f"""select e.reference,fe.entry_number,f.entity,e.name,e.organisation_entity
+                from fact_resource fe join fact f on fe.fact=f.fact join entity e on f.entity=e.entity
+                where resource in ({','.join(f"'{x}'" for x in resource_list)})
+                group by reference
+    """
+    rows = conn.execute(query).fetchall()
+    get_active_entities = [row[0] for row in rows]
+
+    # get entities from entity table to compare against resource entities
+    query = f"""
+    select reference from entity where organisation_entity = '{organisation_entity}';
+    """
+    rows = conn.execute(query).fetchall()
+    get_entities = [row[0] for row in rows]
+
+    # identify entities present in the entity table but missing from the resource
+    entities = [item for item in get_entities if item not in get_active_entities]
+    actual = len(entities)
+
+    result = bool(actual == expected)
+    message = f"there were {actual} entities found"
+    details = {
+        "actual": actual,
+        "expected": expected,
+        "entities": entities,
+    }
+
+    return result, message, details
diff --git a/tests/integration/expectations/test_operation.py b/tests/integration/expectations/test_operation.py
@@ -2,7 +2,10 @@
 import pytest
 import pandas as pd
 
-from digital_land.expectations.operation import count_lpa_boundary
+from digital_land.expectations.operation import (
+    count_lpa_boundary,
+    count_deleted_entities,
+)
 
 
 @pytest.fixture
@@ -109,3 +112,64 @@ def test_count_lpa_boundary_passes(
     detail_keys = ["actual", "expected"]
     for key in detail_keys:
         assert key in details, f"{key} missing from details"
+
+
+def test_count_deleted_entities(dataset_path, mocker):
+    # define constant parameters
+    organisation_entity = 109
+    expected = 0
+
+    # load data into sqlite for entity, fact_resource and fact table
+    test_entity_data = pd.DataFrame.from_dict(
+        {
+            "entity": ["1001"],
+            "name": ["test1"],
+            "organisation_entity": [109],
+            "reference": ["ref1"],
+        }
+    )
+
+    test_fact_resource_data = pd.DataFrame.from_dict(
+        {
+            "fact": ["036d2b946bd41", "16bf38800aafd"],
+            "resource": ["2f7d900dd48fd02", "2f7d900dd48fd02"],
+            "entry_number": ["1", "1"],
+        },
+    )
+
+    test_fact_data = pd.DataFrame.from_dict(
+        {
+            "fact": ["036d2b946bd41", "16bf38800aafd"],
+            "entity": ["1001", "1001"],
+            "field": ["name", "reference"],
+            "value": ["abc", "ref1"],
+        }
+    )
+
+    # mock `pandas.read_csv` to return the mock DataFrame
+    mock_df = pd.DataFrame({"resource": ["2f7d900dd48fd02"]})
+    mocker.patch("pandas.read_csv", return_value=mock_df)
+
+    with spatialite.connect(dataset_path) as conn:
+        # load data into required tables
+        test_entity_data.to_sql("entity", conn, if_exists="append", index=False)
+        test_fact_resource_data.to_sql(
+            "fact_resource", conn, if_exists="append", index=False
+        )
+        test_fact_data.to_sql("fact", conn, if_exists="append", index=False)
+
+        # run expectation
+        passed, message, details = count_deleted_entities(
+            conn,
+            expected=expected,
+            organisation_entity=organisation_entity,
+        )
+
+    assert (
+        passed
+    ), f"test failed : expected {details['expected']} but got {details['actual']} entities"
+    assert message, "test requires a message"
+
+    detail_keys = ["actual", "expected"]
+    for key in detail_keys:
+        assert key in details, f"{key} missing from details"