
Commit cfb4f25

kevinjqliu authored and Fokko committed
fix upsert with null values (#1861)
# Rationale for this change

Closes #1835

The original implementation used `!=` ([not_equal](https://arrow.apache.org/docs/python/generated/pyarrow.compute.not_equal.html#pyarrow.compute.not_equal)), which does not account for `null` values: it emits `null` whenever either side is `null`, so rows whose only change involves a `null` are never flagged as different.

The new implementation first checks `not_equal`, and additionally treats a row as changed when that comparison yields `null` (i.e. when at least one side is `null`), using `pc.or_kleene` and `pc.is_null`. Similar to apache/iceberg-rust#1045.

# Are these changes tested?

Yes

# Are there any user-facing changes?

No
1 parent 903fb91 commit cfb4f25
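
As a minimal, self-contained illustration of the behaviour described above (not taken from the PyIceberg code; the array values below are made up), `pc.not_equal` propagates nulls, so the old filter mask is `null` for any row involving a `null`, while the Kleene-OR variant resolves those rows to `true`:

```python
import pyarrow as pa
import pyarrow.compute as pc

# Target-side (lhs) vs. source-side (rhs) values of one non-key column.
lhs = pa.array([1, None, None, 3], type=pa.int32())
rhs = pa.array([1, 7, None, None], type=pa.int32())

# Old logic: `!=` propagates nulls, so the mask is null wherever a null is involved,
# and Table.filter drops null-mask rows by default -> changes to/from null are missed.
neq = pc.not_equal(lhs, rhs)
print(neq)   # values: false, null, null, null

# New logic: also treat a null comparison as "changed".
diff = pc.or_kleene(neq, pc.is_null(neq))
print(diff)  # values: false, true, true, true
```

With the new expression, rows where the comparison is `null` (either side `null`) evaluate to `true` and are kept by the filter instead of being silently dropped.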

File tree

2 files changed: +53 −1 lines changed

pyiceberg/table/upsert_util.py

+10 −1

@@ -71,7 +71,16 @@ def get_rows_to_update(source_table: pa.Table, target_table: pa.Table, join_cols
         # When the target table is empty, there is nothing to update :)
         return source_table.schema.empty_table()

-    diff_expr = functools.reduce(operator.or_, [pc.field(f"{col}-lhs") != pc.field(f"{col}-rhs") for col in non_key_cols])
+    diff_expr = functools.reduce(
+        operator.or_,
+        [
+            pc.or_kleene(
+                pc.not_equal(pc.field(f"{col}-lhs"), pc.field(f"{col}-rhs")),
+                pc.is_null(pc.not_equal(pc.field(f"{col}-lhs"), pc.field(f"{col}-rhs"))),
+            )
+            for col in non_key_cols
+        ],
+    )

     return (
         source_table
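
The same expression also works at the PyArrow `Expression` level. A short sketch, assuming a hypothetical joined table whose columns carry the `-lhs`/`-rhs` suffixes used in `get_rows_to_update` (the table contents and the single non-key column are invented for illustration, and a reasonably recent PyArrow that accepts expressions in `Table.filter` is assumed):

```python
import functools
import operator

import pyarrow as pa
import pyarrow.compute as pc

# Hypothetical join result: target values as "-lhs", incoming source values as "-rhs".
joined = pa.table(
    {
        "foo": ["apple", "banana"],
        "bar-lhs": pa.array([None, 2], type=pa.int32()),
        "bar-rhs": pa.array([7, 2], type=pa.int32()),
    }
)

non_key_cols = ["bar"]
diff_expr = functools.reduce(
    operator.or_,
    [
        pc.or_kleene(
            pc.not_equal(pc.field(f"{col}-lhs"), pc.field(f"{col}-rhs")),
            pc.is_null(pc.not_equal(pc.field(f"{col}-lhs"), pc.field(f"{col}-rhs"))),
        )
        for col in non_key_cols
    ],
)

# "apple" changed from null to 7; with the old `!=` expression its mask would have
# been null and the row would have been filtered out instead of updated.
print(joined.filter(diff_expr)["foo"].to_pylist())  # ['apple']
```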

tests/table/test_upsert.py

+43

@@ -496,3 +496,46 @@ def test_upsert_without_identifier_fields(catalog: Catalog) -> None:
         ValueError, match="Join columns could not be found, please set identifier-field-ids or pass in explicitly."
     ):
         tbl.upsert(df)
+
+
+def test_upsert_with_nulls(catalog: Catalog) -> None:
+    identifier = "default.test_upsert_with_nulls"
+    _drop_table(catalog, identifier)
+
+    schema = pa.schema(
+        [
+            ("foo", pa.string()),
+            ("bar", pa.int32()),
+            ("baz", pa.bool_()),
+        ]
+    )
+
+    # create table with null value
+    table = catalog.create_table(identifier, schema)
+    data_with_null = pa.Table.from_pylist(
+        [
+            {"foo": "apple", "bar": None, "baz": False},
+            {"foo": "banana", "bar": None, "baz": False},
+        ],
+        schema=schema,
+    )
+    table.append(data_with_null)
+    assert table.scan().to_arrow()["bar"].is_null()
+
+    # upsert table with non-null value
+    data_without_null = pa.Table.from_pylist(
+        [
+            {"foo": "apple", "bar": 7, "baz": False},
+        ],
+        schema=schema,
+    )
+    upd = table.upsert(data_without_null, join_cols=["foo"])
+    assert upd.rows_updated == 1
+    assert upd.rows_inserted == 0
+    assert table.scan().to_arrow() == pa.Table.from_pylist(
+        [
+            {"foo": "apple", "bar": 7, "baz": False},
+            {"foo": "banana", "bar": None, "baz": False},
+        ],
+        schema=schema,
+    )
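
The `catalog` fixture and the `_drop_table` helper used above are defined elsewhere in the test suite and are not part of this diff. To run a test like this locally, they could look roughly like the sketch below; `SqlCatalog` and `NoSuchTableError` are real PyIceberg APIs, but the fixture shape is an assumption for illustration, not the repository's actual conftest:

```python
import pytest
from pyiceberg.catalog import Catalog
from pyiceberg.catalog.sql import SqlCatalog
from pyiceberg.exceptions import NoSuchTableError


@pytest.fixture
def catalog(tmp_path) -> Catalog:
    # In-memory SQL catalog backed by a temporary local warehouse directory.
    cat = SqlCatalog(
        "default",
        uri="sqlite:///:memory:",
        warehouse=f"file://{tmp_path}",
    )
    cat.create_namespace("default")
    return cat


def _drop_table(catalog: Catalog, identifier: str) -> None:
    # Ignore missing tables so each test starts from a clean slate.
    try:
        catalog.drop_table(identifier)
    except NoSuchTableError:
        pass
```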

0 commit comments
