Update documentation to clarify usage of boundary_timestamp and add tests for new behavior, including resetting to current load time.

alkaline-0 · alkaline-0 · commit 0adf7802266e · 2025-11-18T16:32:40.000+01:00
diff --git a/dlt/destinations/impl/sqlalchemy/merge_job.py b/dlt/destinations/impl/sqlalchemy/merge_job.py
@@ -1,7 +1,8 @@
-from typing import Sequence, Tuple, Optional, List, Union
+from typing import Sequence, Tuple, Optional, List, Union, cast
 import operator
 import sqlalchemy as sa
 
+from dlt.common.typing import TAnyDateTime
 from dlt.common.utils import uniq_id
 from dlt.common.destination import PreparedTableSchema, DestinationCapabilitiesContext
 from dlt.common.schema.utils import (
@@ -374,14 +375,10 @@ def gen_scd2_sql(
             format_datetime_literal = (
                 DestinationCapabilitiesContext.generic_capabilities().format_datetime_literal
             )
-
-        boundary_ts = ensure_pendulum_datetime_utc(
-            current_load_package()["state"]["created_at"]
-            or root_table.get(  # type: ignore[arg-type]
-                "x-boundary-timestamp",
-                current_load_package()["state"]["created_at"],
-            )
-        )
+        created_at = current_load_package()["state"]["created_at"]
+        _boundary_ts = cast(Optional[TAnyDateTime], root_table.get("x-boundary-timestamp"))
+        boundary_ts: TAnyDateTime = _boundary_ts if _boundary_ts is not None else created_at
+        boundary_ts = ensure_pendulum_datetime_utc(boundary_ts)
 
         boundary_literal = format_datetime_literal(boundary_ts, caps.timestamp_precision)
 
diff --git a/dlt/destinations/sql_jobs.py b/dlt/destinations/sql_jobs.py
@@ -4,7 +4,7 @@
 from dlt.common.time import ensure_pendulum_datetime_utc
 from dlt.common.destination import PreparedTableSchema
 from dlt.common.destination.utils import resolve_merge_strategy
-from dlt.common.typing import TypedDict
+from dlt.common.typing import TAnyDateTime, TypedDict
 
 from dlt.common.schema.typing import (
     TSortOrder,
@@ -845,13 +845,11 @@ def gen_scd2_sql(
                 DestinationCapabilitiesContext.generic_capabilities().format_datetime_literal
             )
 
-        boundary_ts = ensure_pendulum_datetime_utc(
-            current_load_package()["state"]["created_at"]
-            or root_table.get(  # type: ignore[arg-type]
-                "x-boundary-timestamp",
-                current_load_package()["state"]["created_at"],
-            )
-        )
+        created_at = current_load_package()["state"]["created_at"]
+        _boundary_ts = cast(Optional[TAnyDateTime], root_table.get("x-boundary-timestamp"))
+        boundary_ts: TAnyDateTime = _boundary_ts if _boundary_ts is not None else created_at
+        boundary_ts = ensure_pendulum_datetime_utc(boundary_ts)
+
         boundary_literal = format_datetime_literal(
             boundary_ts,
             caps.timestamp_precision,
diff --git a/dlt/extract/hints.py b/dlt/extract/hints.py
@@ -823,22 +823,24 @@ def validate_write_disposition_hint(template: TResourceHints) -> None:
 
             if wd.get("strategy") == "scd2":
                 wd = cast(TScd2StrategyDict, wd)
-                for ts in ("active_record_timestamp", "boundary_timestamp"):
-                    # if (
-                    #     ts == "active_record_timestamp"
-                    #     and wd.get("active_record_timestamp") is None
-                    # ):
-                    #     continue  # None is allowed for active_record_timestamp
-                    if ts in wd:
-                        if wd[ts] is None:
-                            continue
-                        else:
-                            try:
-                                ensure_pendulum_datetime_utc(wd[ts])  # type: ignore[literal-required]
-                            except Exception:
-                                raise ValueError(
-                                    f"could not parse `{ts}` value `{wd[ts]}`"  # type: ignore[literal-required]
-                                )
+
+                art = wd.get("active_record_timestamp")
+                if art is not None:
+                    try:
+                        ensure_pendulum_datetime_utc(art)
+                    except (ValueError, TypeError) as exc:
+                        raise ValueError(
+                            f"could not parse `active_record_timestamp` value `{art}`"
+                        ) from exc
+
+                bt = wd.get("boundary_timestamp")
+                if bt is not None:
+                    try:
+                        ensure_pendulum_datetime_utc(bt)
+                    except (ValueError, TypeError) as exc:
+                        raise ValueError(
+                            f"could not parse `boundary_timestamp` value `{bt}`"
+                        ) from exc
 
     @staticmethod
     def validate_reference_hint(template: TResourceHints) -> None:
diff --git a/docs/website/docs/general-usage/merge-loading.md b/docs/website/docs/general-usage/merge-loading.md
@@ -567,6 +567,35 @@ def dim_customer():
     ...
 ```
 
+#### Reset boundary timestamp to the current load time
+To stop using a previously set `boundary_timestamp` and revert to the default (the current load package creation time), set `boundary_timestamp` to `None`. You can do this either at definition time or dynamically with `apply_hints` before a run.
+
+At definition time (always use the current load time):
+```py
+@dlt.resource(
+    write_disposition={
+        "disposition": "merge",
+        "strategy": "scd2",
+        "boundary_timestamp": None,  # reset to current load time
+    }
+)
+def dim_customer():
+    ...
+```
+
+Dynamically with `apply_hints` (call it before the run that should use the current load time):
+```py
+r.apply_hints(
+    write_disposition={
+        "disposition": "merge",
+        "strategy": "scd2",
+        "boundary_timestamp": None,  # reset to current load time for this run
+    }
+)
+pipeline.run(r(...))
+```
+When `boundary_timestamp` is `None` (or omitted), `dlt` uses the load package's creation timestamp as the boundary for both retiring existing versions and creating new versions.
+
 ### Example: Use your own row hash
 By default, `dlt` generates a row hash based on all columns provided by the resource and stores it in `_dlt_id`. You can use your own hash instead by specifying `row_version_column_name` in the `write_disposition` dictionary. You might already have a column present in your resource that can naturally serve as a row hash, in which case it's more efficient to use those pre-existing hash values than to generate new artificial ones. This option also allows you to use hashes based on a subset of columns, in case you want to ignore changes in some of the columns. When using your own hash, values for `_dlt_id` are randomly generated.
 ```py
diff --git a/tests/load/pipeline/test_scd2.py b/tests/load/pipeline/test_scd2.py
@@ -634,7 +634,7 @@ def r():
 
 @pytest.mark.parametrize(
     "destination_config",
-    destinations_configs(default_sql_configs=True, subset=["sqlalchemy"]),
+    destinations_configs(default_sql_configs=True, subset=["sqlalchemy", "duckdb"]),
     ids=lambda x: x.name,
 )
 def test_boundary_timestamp(
@@ -659,73 +659,87 @@ def test_boundary_timestamp(
     def r(data):
         yield data
 
+    # normalize timestamps once for assertions
+    ts1_dt = strip_timezone(ts1)
+    ts2_dt = strip_timezone(ts2)
+    ts3_dt = strip_timezone(ts3)
+    ts5_dt = strip_timezone(ts5)
+
     # load 1 — initial load
     dim_snap = [
         l1_1 := {"nk": 1, "foo": "foo"},
         l1_2 := {"nk": 2, "foo": "foo"},
     ]
-    current_time = {"ts": None}
+    current_time: dict[str, float | None] = {"ts": None}
     with mock.patch(
         "dlt.common.storages.load_package.precise_time",
         side_effect=lambda: current_time["ts"],
     ):
         # load 1 — initial load
-        current_time["ts"] = pendulum.parse(ts1).timestamp()
+        current_time["ts"] = pendulum.datetime(2024, 8, 21, 12, 15, tz="UTC").timestamp()
+        r.apply_hints(
+            write_disposition={
+                "disposition": "merge",
+                "strategy": "scd2",
+                "boundary_timestamp": ts1,
+            }
+        )
         info = p.run(r(dim_snap), **destination_config.run_kwargs)
         assert_load_info(info)
         assert load_table_counts(p, "dim_test")["dim_test"] == 2
         expected = [
-            {**{FROM: strip_timezone(ts1), TO: None}, **l1_1},
-            {**{FROM: strip_timezone(ts1), TO: None}, **l1_2},
+            {**{FROM: ts1_dt, TO: None}, **l1_1},
+            {**{FROM: ts1_dt, TO: None}, **l1_2},
         ]
         assert get_table(p, "dim_test", "nk", ts_columns=[FROM, TO]) == expected
 
         # load 2 — different source records, different boundary timestamp
-        current_time["ts"] = pendulum.parse(ts2).timestamp()
+        current_time["ts"] = pendulum.datetime(2024, 8, 22, tz="UTC").timestamp()
+        dim_snap = [
+            l2_1 := {"nk": 1, "foo": "bar"},  # natural key 1 updated
+            # l1_2,  # natural key 2 no longer present
+            l2_3 := {"nk": 3, "foo": "foo"},  # new natural key
+        ]
         r.apply_hints(
             write_disposition={
                 "disposition": "merge",
                 "strategy": "scd2",
                 "boundary_timestamp": ts2,
             }
         )
-        dim_snap = [
-            l2_1 := {"nk": 1, "foo": "bar"},  # natural key 1 updated
-            # l1_2,  # natural key 2 no longer present
-            l2_3 := {"nk": 3, "foo": "foo"},  # new natural key
-        ]
         info = p.run(r(dim_snap), **destination_config.run_kwargs)
         assert_load_info(info)
         assert load_table_counts(p, "dim_test")["dim_test"] == 4
         expected = [
-            {**{FROM: strip_timezone(ts1), TO: strip_timezone(ts2)}, **l1_1},  # retired
-            {**{FROM: strip_timezone(ts1), TO: strip_timezone(ts2)}, **l1_2},  # retired
-            {**{FROM: strip_timezone(ts2), TO: None}, **l2_1},  # new
-            {**{FROM: strip_timezone(ts2), TO: None}, **l2_3},  # new
+            {**{FROM: ts1_dt, TO: ts2_dt}, **l1_1},  # retired
+            {**{FROM: ts1_dt, TO: ts2_dt}, **l1_2},  # retired
+            {**{FROM: ts2_dt, TO: None}, **l2_1},  # new
+            {**{FROM: ts2_dt, TO: None}, **l2_3},  # new
         ]
-        assert_records_as_set(get_table(p, "dim_test"), expected)
+        assert_records_as_set(get_table(p, "dim_test", ts_columns=[FROM, TO]), expected)
 
         # load 3 — earlier boundary timestamp
         # we naively apply any valid timestamp
         # may lead to "valid from" > "valid to", as in this test case
+        current_time["ts"] = pendulum.datetime(2024, 8, 22, 0, 0, 1, tz="UTC").timestamp()
+        dim_snap = [l2_1]  # natural key 3 no longer present
         r.apply_hints(
             write_disposition={
                 "disposition": "merge",
                 "strategy": "scd2",
                 "boundary_timestamp": ts3,
             }
         )
-        dim_snap = [l2_1]  # natural key 3 no longer present
         info = p.run(r(dim_snap), **destination_config.run_kwargs)
         assert_load_info(info)
         assert load_table_counts(p, "dim_test")["dim_test"] == 4
         expected = [
-            {**{FROM: strip_timezone(ts1), TO: strip_timezone(ts2)}, **l1_1},  # unchanged
-            {**{FROM: strip_timezone(ts1), TO: strip_timezone(ts2)}, **l1_2},  # unchanged
-            {**{FROM: strip_timezone(ts2), TO: None}, **l2_1},  # unchanged
-            {**{FROM: strip_timezone(ts2), TO: strip_timezone(ts3)}, **l2_3},  # retired
+            {**{FROM: ts1_dt, TO: ts2_dt}, **l1_1},  # unchanged
+            {**{FROM: ts1_dt, TO: ts2_dt}, **l1_2},  # unchanged
+            {**{FROM: ts2_dt, TO: None}, **l2_1},  # unchanged
+            {**{FROM: ts2_dt, TO: ts3_dt}, **l2_3},  # retired
         ]
-        assert_records_as_set(get_table(p, "dim_test"), expected)
+        assert_records_as_set(get_table(p, "dim_test", ts_columns=[FROM, TO]), expected)
 
         # invalid boundary timestamp should raise error
         with pytest.raises(ValueError):
@@ -738,7 +752,7 @@ def r(data):
             )
 
         # run 4 — no boundary timestamp (use current precise_time)
-        current_time["ts"] = pendulum.parse(ts5).timestamp()
+        current_time["ts"] = ts5
         dim_snap = [
             l3_1 := {"nk": 1, "foo": "foobar"},  # updated
         ]
@@ -753,19 +767,19 @@ def r(data):
         assert_load_info(info)
         assert load_table_counts(p, "dim_test")["dim_test"] == 5
         expected = [
-            {**{FROM: strip_timezone(ts1), TO: strip_timezone(ts2)}, **l1_1},  # unchanged
-            {**{FROM: strip_timezone(ts1), TO: strip_timezone(ts2)}, **l1_2},  # unchanged
+            {**{FROM: ts1_dt, TO: ts2_dt}, **l1_1},  # unchanged
+            {**{FROM: ts1_dt, TO: ts2_dt}, **l1_2},  # unchanged
             {
-                **{FROM: strip_timezone(ts2), TO: strip_timezone(ts5)},
+                **{FROM: ts2_dt, TO: ts5_dt},
                 **l2_1,
             },  # retired in this run
             {
-                **{FROM: strip_timezone(ts2), TO: strip_timezone(ts3)},
+                **{FROM: ts2_dt, TO: ts3_dt},
                 **l2_3,
             },  # unchanged (already retired in load 3)
-            {**{FROM: strip_timezone(ts5), TO: None}, **l3_1},  # new current version
+            {**{FROM: ts5_dt, TO: None}, **l3_1},  # new current version
         ]
-        assert_records_as_set(get_table(p, "dim_test"), expected)
+        assert_records_as_set(get_table(p, "dim_test", ts_columns=[FROM, TO]), expected)
 
 
 @pytest.mark.essential