diff --git a/tests/conftest.py b/tests/conftest.py
index 879b154701..493684162c 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -2143,31 +2143,31 @@ def arrow_table_with_null(pa_schema: "pa.Schema") -> "pa.Table":
     return pa.Table.from_pydict(
         {
-            'bool': [False, None, True],
-            'string': ['a', None, 'z'],
+            "bool": [False, None, True],
+            "string": ["a", None, "z"],
             # Go over the 16 bytes to kick in truncation
-            'string_long': ['a' * 22, None, 'z' * 22],
-            'int': [1, None, 9],
-            'long': [1, None, 9],
-            'float': [0.0, None, 0.9],
-            'double': [0.0, None, 0.9],
+            "string_long": ["a" * 22, None, "z" * 22],
+            "int": [1, None, 9],
+            "long": [1, None, 9],
+            "float": [0.0, None, 0.9],
+            "double": [0.0, None, 0.9],
             # 'time': [1_000_000, None, 3_000_000],  # Example times: 1s, none, and 3s past midnight #Spark does not support time fields
-            'timestamp': [datetime(2023, 1, 1, 19, 25, 00), None, datetime(2023, 3, 1, 19, 25, 00)],
-            'timestamptz': [
+            "timestamp": [datetime(2023, 1, 1, 19, 25, 00), None, datetime(2023, 3, 1, 19, 25, 00)],
+            "timestamptz": [
                 datetime(2023, 1, 1, 19, 25, 00, tzinfo=timezone.utc),
                 None,
                 datetime(2023, 3, 1, 19, 25, 00, tzinfo=timezone.utc),
             ],
-            'date': [date(2023, 1, 1), None, date(2023, 3, 1)],
+            "date": [date(2023, 1, 1), None, date(2023, 3, 1)],
             # Not supported by Spark
             # 'time': [time(1, 22, 0), None, time(19, 25, 0)],
             # Not natively supported by Arrow
             # 'uuid': [uuid.UUID('00000000-0000-0000-0000-000000000000').bytes, None, uuid.UUID('11111111-1111-1111-1111-111111111111').bytes],
-            'binary': [b'\01', None, b'\22'],
-            'fixed': [
-                uuid.UUID('00000000-0000-0000-0000-000000000000').bytes,
+            "binary": [b"\01", None, b"\22"],
+            "fixed": [
+                uuid.UUID("00000000-0000-0000-0000-000000000000").bytes,
                 None,
-                uuid.UUID('11111111-1111-1111-1111-111111111111').bytes,
+                uuid.UUID("11111111-1111-1111-1111-111111111111").bytes,
             ],
         },
         schema=pa_schema,
diff --git a/tests/integration/test_deletes.py b/tests/integration/test_deletes.py
index 175d03888c..b5202a5526 100644
--- a/tests/integration/test_deletes.py
+++ b/tests/integration/test_deletes.py
@@ -38,7 +38,7 @@ def run_spark_commands(spark: SparkSession, sqls: List[str]) -> None:
 @pytest.mark.integration
 @pytest.mark.parametrize("format_version", [1, 2])
 def test_partitioned_table_delete_full_file(spark: SparkSession, session_catalog: RestCatalog, format_version: int) -> None:
-    identifier = 'default.table_partitioned_delete'
+    identifier = "default.table_partitioned_delete"

     run_spark_commands(
         spark,
@@ -66,14 +66,14 @@ def test_partitioned_table_delete_full_file(spark: SparkSession, session_catalog
     tbl.delete(EqualTo("number_partitioned", 10))

     # No overwrite operation
-    assert [snapshot.summary.operation.value for snapshot in tbl.snapshots()] == ['append', 'append', 'delete']
-    assert tbl.scan().to_arrow().to_pydict() == {'number_partitioned': [11, 11], 'number': [20, 30]}
+    assert [snapshot.summary.operation.value for snapshot in tbl.snapshots()] == ["append", "append", "delete"]
+    assert tbl.scan().to_arrow().to_pydict() == {"number_partitioned": [11, 11], "number": [20, 30]}


 @pytest.mark.integration
 @pytest.mark.parametrize("format_version", [1, 2])
 def test_partitioned_table_rewrite(spark: SparkSession, session_catalog: RestCatalog, format_version: int) -> None:
-    identifier = 'default.table_partitioned_delete'
+    identifier = "default.table_partitioned_delete"

     run_spark_commands(
         spark,
@@ -101,14 +101,14 @@
     tbl.delete(EqualTo("number", 20))

     # We don't delete a whole partition, so there is only a overwrite
-    assert [snapshot.summary.operation.value for snapshot in tbl.snapshots()] == ['append', 'append', 'overwrite']
-    assert tbl.scan().to_arrow().to_pydict() == {'number_partitioned': [11, 10], 'number': [30, 30]}
+    assert [snapshot.summary.operation.value for snapshot in tbl.snapshots()] == ["append", "append", "overwrite"]
+    assert tbl.scan().to_arrow().to_pydict() == {"number_partitioned": [11, 10], "number": [30, 30]}


 @pytest.mark.integration
 @pytest.mark.parametrize("format_version", [1, 2])
 def test_partitioned_table_no_match(spark: SparkSession, session_catalog: RestCatalog, format_version: int) -> None:
-    identifier = 'default.table_partitioned_delete'
+    identifier = "default.table_partitioned_delete"

     run_spark_commands(
         spark,
@@ -132,13 +132,13 @@ def test_partitioned_table_no_match(spark: SparkSession, session_catalog: RestCa
     tbl = session_catalog.load_table(identifier)
     tbl.delete(EqualTo("number_partitioned", 22))  # Does not affect any data

-    assert [snapshot.summary.operation.value for snapshot in tbl.snapshots()] == ['append']
-    assert tbl.scan().to_arrow().to_pydict() == {'number_partitioned': [10, 10], 'number': [20, 30]}
+    assert [snapshot.summary.operation.value for snapshot in tbl.snapshots()] == ["append"]
+    assert tbl.scan().to_arrow().to_pydict() == {"number_partitioned": [10, 10], "number": [20, 30]}


 @pytest.mark.integration
 def test_partitioned_table_positional_deletes(spark: SparkSession, session_catalog: RestCatalog) -> None:
-    identifier = 'default.table_partitioned_delete'
+    identifier = "default.table_partitioned_delete"

     run_spark_commands(
         spark,
@@ -180,13 +180,13 @@ def test_partitioned_table_positional_deletes(spark: SparkSession, session_catal
     # One positional delete has been added, but an OVERWRITE status is set
     # https://github.com/apache/iceberg/issues/10122
-    assert [snapshot.summary.operation.value for snapshot in tbl.snapshots()] == ['append', 'overwrite', 'overwrite']
-    assert tbl.scan().to_arrow().to_pydict() == {'number_partitioned': [10], 'number': [20]}
+    assert [snapshot.summary.operation.value for snapshot in tbl.snapshots()] == ["append", "overwrite", "overwrite"]
+    assert tbl.scan().to_arrow().to_pydict() == {"number_partitioned": [10], "number": [20]}


 @pytest.mark.integration
 def test_partitioned_table_positional_deletes_sequence_number(spark: SparkSession, session_catalog: RestCatalog) -> None:
-    identifier = 'default.table_partitioned_delete_sequence_number'
+    identifier = "default.table_partitioned_delete_sequence_number"

     # This test case is a bit more complex. Here we run a MoR delete on a file, we make sure that
     # the manifest gets rewritten (but not the data file with a MoR), and check if the delete is still there
@@ -234,31 +234,31 @@ def test_partitioned_table_positional_deletes_sequence_number(spark: SparkSessio
     assert len(snapshots) == 3

     # Snapshots produced by Spark
-    assert [snapshot.summary.operation.value for snapshot in tbl.snapshots()[0:2]] == ['append', 'overwrite']
+    assert [snapshot.summary.operation.value for snapshot in tbl.snapshots()[0:2]] == ["append", "overwrite"]

     # Will rewrite one parquet file
     assert snapshots[2].summary == Summary(
         Operation.OVERWRITE,
         **{
-            'added-files-size': '1145',
-            'added-data-files': '1',
-            'added-records': '2',
-            'changed-partition-count': '1',
-            'total-files-size': snapshots[2].summary['total-files-size'],
-            'total-delete-files': '0',
-            'total-data-files': '1',
-            'total-position-deletes': '0',
-            'total-records': '2',
-            'total-equality-deletes': '0',
-            'deleted-data-files': '2',
-            'removed-delete-files': '1',
-            'deleted-records': '5',
-            'removed-files-size': snapshots[2].summary['removed-files-size'],
-            'removed-position-deletes': '1',
+            "added-files-size": "1145",
+            "added-data-files": "1",
+            "added-records": "2",
+            "changed-partition-count": "1",
+            "total-files-size": snapshots[2].summary["total-files-size"],
+            "total-delete-files": "0",
+            "total-data-files": "1",
+            "total-position-deletes": "0",
+            "total-records": "2",
+            "total-equality-deletes": "0",
+            "deleted-data-files": "2",
+            "removed-delete-files": "1",
+            "deleted-records": "5",
+            "removed-files-size": snapshots[2].summary["removed-files-size"],
+            "removed-position-deletes": "1",
         },
     )

-    assert tbl.scan().to_arrow().to_pydict() == {'number_partitioned': [20, 20, 10], 'number': [200, 202, 100]}
+    assert tbl.scan().to_arrow().to_pydict() == {"number_partitioned": [20, 20, 10], "number": [200, 202, 100]}


 @pytest.mark.integration
@@ -266,8 +266,8 @@ def test_delete_no_match(session_catalog: RestCatalog) -> None:
     arrow_schema = pa.schema([pa.field("ints", pa.int32())])
     arrow_tbl = pa.Table.from_pylist(
         [
-            {'ints': 1},
-            {'ints': 3},
+            {"ints": 1},
+            {"ints": 3},
         ],
         schema=arrow_schema,
     )
@@ -286,7 +286,7 @@ def test_delete_no_match(session_catalog: RestCatalog) -> None:
     assert [snapshot.summary.operation for snapshot in tbl.snapshots()] == [Operation.APPEND]

-    tbl.delete('ints == 2')  # Only 1 and 3 in the file, but is between the lower and upper bound
+    tbl.delete("ints == 2")  # Only 1 and 3 in the file, but is between the lower and upper bound

     assert [snapshot.summary.operation for snapshot in tbl.snapshots()] == [Operation.APPEND]
@@ -296,8 +296,8 @@ def test_delete_overwrite(session_catalog: RestCatalog) -> None:
     arrow_schema = pa.schema([pa.field("ints", pa.int32())])
     arrow_tbl = pa.Table.from_pylist(
         [
-            {'ints': 1},
-            {'ints': 2},
+            {"ints": 1},
+            {"ints": 2},
         ],
         schema=arrow_schema,
     )
@@ -318,12 +318,12 @@ def test_delete_overwrite(session_catalog: RestCatalog) -> None:
     arrow_tbl_overwrite = pa.Table.from_pylist(
         [
-            {'ints': 3},
-            {'ints': 4},
+            {"ints": 3},
+            {"ints": 4},
         ],
         schema=arrow_schema,
     )
-    tbl.overwrite(arrow_tbl_overwrite, 'ints == 2')  # Should rewrite one file
+    tbl.overwrite(arrow_tbl_overwrite, "ints == 2")  # Should rewrite one file

     assert [snapshot.summary.operation for snapshot in tbl.snapshots()] == [
         Operation.APPEND,
@@ -331,7 +331,7 @@ def test_delete_overwrite(session_catalog: RestCatalog) -> None:
         Operation.APPEND,
     ]

-    assert tbl.scan().to_arrow()['ints'].to_pylist() == [3, 4, 1]
+    assert tbl.scan().to_arrow()["ints"].to_pylist() == [3, 4, 1]


 @pytest.mark.integration
@@ -339,7 +339,7 @@ def test_delete_truncate(session_catalog: RestCatalog) -> None:
     arrow_schema = pa.schema([pa.field("ints", pa.int32())])
     arrow_tbl = pa.Table.from_pylist(
         [
-            {'ints': 1},
+            {"ints": 1},
         ],
         schema=arrow_schema,
     )
diff --git a/tests/integration/test_inspect_table.py b/tests/integration/test_inspect_table.py
index ee83880a39..753d976ac2 100644
--- a/tests/integration/test_inspect_table.py
+++ b/tests/integration/test_inspect_table.py
@@ -102,38 +102,38 @@ def test_inspect_snapshots(
     for snapshot_id in df["snapshot_id"]:
         assert isinstance(snapshot_id.as_py(), int)

-    assert df['parent_id'][0].as_py() is None
-    assert df['parent_id'][1:].to_pylist() == df['snapshot_id'][:-1].to_pylist()
+    assert df["parent_id"][0].as_py() is None
+    assert df["parent_id"][1:].to_pylist() == df["snapshot_id"][:-1].to_pylist()

-    assert [operation.as_py() for operation in df['operation']] == ['append', 'delete', 'append', 'append']
+    assert [operation.as_py() for operation in df["operation"]] == ["append", "delete", "append", "append"]

     for manifest_list in df["manifest_list"]:
         assert manifest_list.as_py().startswith("s3://")

     # Append
-    assert df['summary'][0].as_py() == [
-        ('added-files-size', '5459'),
-        ('added-data-files', '1'),
-        ('added-records', '3'),
-        ('total-data-files', '1'),
-        ('total-delete-files', '0'),
-        ('total-records', '3'),
-        ('total-files-size', '5459'),
-        ('total-position-deletes', '0'),
-        ('total-equality-deletes', '0'),
+    assert df["summary"][0].as_py() == [
+        ("added-files-size", "5459"),
+        ("added-data-files", "1"),
+        ("added-records", "3"),
+        ("total-data-files", "1"),
+        ("total-delete-files", "0"),
+        ("total-records", "3"),
+        ("total-files-size", "5459"),
+        ("total-position-deletes", "0"),
+        ("total-equality-deletes", "0"),
     ]

     # Delete
-    assert df['summary'][1].as_py() == [
-        ('removed-files-size', '5459'),
-        ('deleted-data-files', '1'),
-        ('deleted-records', '3'),
-        ('total-data-files', '0'),
-        ('total-delete-files', '0'),
-        ('total-records', '0'),
-        ('total-files-size', '0'),
-        ('total-position-deletes', '0'),
-        ('total-equality-deletes', '0'),
+    assert df["summary"][1].as_py() == [
+        ("removed-files-size", "5459"),
+        ("deleted-data-files", "1"),
+        ("deleted-records", "3"),
+        ("total-data-files", "0"),
+        ("total-delete-files", "0"),
+        ("total-records", "0"),
+        ("total-files-size", "0"),
+        ("total-position-deletes", "0"),
+        ("total-equality-deletes", "0"),
     ]

     lhs = spark.table(f"{identifier}.snapshots").toPandas()
diff --git a/tests/integration/test_writes/test_writes.py b/tests/integration/test_writes/test_writes.py
index b23283a492..bc1bd93231 100644
--- a/tests/integration/test_writes/test_writes.py
+++ b/tests/integration/test_writes/test_writes.py
@@ -189,7 +189,7 @@ def test_summaries(spark: SparkSession, session_catalog: Catalog, arrow_table_wi
     ).collect()

     operations = [row.operation for row in rows]
-    assert operations == ['append', 'append', 'delete', 'append']
+    assert operations == ["append", "append", "delete", "append"]

     summaries = [row.summary for row in rows]
@@ -221,28 +221,28 @@ def test_summaries(spark: SparkSession, session_catalog: Catalog, arrow_table_wi
     # Delete
     assert summaries[2] == {
-        'deleted-data-files': '2',
-        'deleted-records': '6',
-        'removed-files-size': '10918',
-        'total-data-files': '0',
-        'total-delete-files': '0',
-        'total-equality-deletes': '0',
-        'total-files-size': '0',
-        'total-position-deletes': '0',
-        'total-records': '0',
+        "deleted-data-files": "2",
+        "deleted-records": "6",
+        "removed-files-size": "10918",
+        "total-data-files": "0",
+        "total-delete-files": "0",
+        "total-equality-deletes": "0",
+        "total-files-size": "0",
+        "total-position-deletes": "0",
+        "total-records": "0",
     }

     # Overwrite
     assert summaries[3] == {
-        'added-data-files': '1',
-        'added-files-size': '5459',
-        'added-records': '3',
-        'total-data-files': '1',
-        'total-delete-files': '0',
-        'total-equality-deletes': '0',
-        'total-files-size': '5459',
-        'total-position-deletes': '0',
-        'total-records': '3',
+        "added-data-files": "1",
+        "added-files-size": "5459",
+        "added-records": "3",
+        "total-data-files": "1",
+        "total-delete-files": "0",
+        "total-equality-deletes": "0",
+        "total-files-size": "5459",
+        "total-position-deletes": "0",
+        "total-records": "3",
     }
@@ -517,7 +517,7 @@ def test_summaries_with_only_nulls(
     ).collect()

     operations = [row.operation for row in rows]
-    assert operations == ['append', 'append', 'delete', 'append']
+    assert operations == ["append", "append", "delete", "append"]

     summaries = [row.summary for row in rows]
@@ -543,24 +543,24 @@ def test_summaries_with_only_nulls(
     }

     assert summaries[2] == {
-        'deleted-data-files': '1',
-        'deleted-records': '2',
-        'removed-files-size': '4239',
-        'total-data-files': '0',
-        'total-delete-files': '0',
-        'total-equality-deletes': '0',
-        'total-files-size': '0',
-        'total-position-deletes': '0',
-        'total-records': '0',
+        "deleted-data-files": "1",
+        "deleted-records": "2",
+        "removed-files-size": "4239",
+        "total-data-files": "0",
+        "total-delete-files": "0",
+        "total-equality-deletes": "0",
+        "total-files-size": "0",
+        "total-position-deletes": "0",
+        "total-records": "0",
     }

     assert summaries[3] == {
-        'total-data-files': '0',
-        'total-delete-files': '0',
-        'total-equality-deletes': '0',
-        'total-files-size': '0',
-        'total-position-deletes': '0',
-        'total-records': '0',
+        "total-data-files": "0",
+        "total-delete-files": "0",
+        "total-equality-deletes": "0",
+        "total-files-size": "0",
+        "total-position-deletes": "0",
+        "total-records": "0",
     }
@@ -780,38 +780,38 @@ def test_inspect_snapshots(
     for snapshot_id in df["snapshot_id"]:
         assert isinstance(snapshot_id.as_py(), int)

-    assert df['parent_id'][0].as_py() is None
-    assert df['parent_id'][1:].to_pylist() == df['snapshot_id'][:-1].to_pylist()
+    assert df["parent_id"][0].as_py() is None
+    assert df["parent_id"][1:].to_pylist() == df["snapshot_id"][:-1].to_pylist()

-    assert [operation.as_py() for operation in df['operation']] == ['append', 'delete', 'append', 'append']
+    assert [operation.as_py() for operation in df["operation"]] == ["append", "delete", "append", "append"]

     for manifest_list in df["manifest_list"]:
         assert manifest_list.as_py().startswith("s3://")

     # Append
-    assert df['summary'][0].as_py() == [
-        ('added-files-size', '5459'),
-        ('added-data-files', '1'),
-        ('added-records', '3'),
-        ('total-data-files', '1'),
-        ('total-delete-files', '0'),
-        ('total-records', '3'),
-        ('total-files-size', '5459'),
-        ('total-position-deletes', '0'),
-        ('total-equality-deletes', '0'),
+    assert df["summary"][0].as_py() == [
+        ("added-files-size", "5459"),
+        ("added-data-files", "1"),
+        ("added-records", "3"),
+        ("total-data-files", "1"),
+        ("total-delete-files", "0"),
+        ("total-records", "3"),
+        ("total-files-size", "5459"),
+        ("total-position-deletes", "0"),
+        ("total-equality-deletes", "0"),
     ]

     # Delete
-    assert df['summary'][1].as_py() == [
-        ('removed-files-size', '5459'),
-        ('deleted-data-files', '1'),
-        ('deleted-records', '3'),
-        ('total-data-files', '0'),
-        ('total-delete-files', '0'),
-        ('total-records', '0'),
-        ('total-files-size', '0'),
-        ('total-position-deletes', '0'),
-        ('total-equality-deletes', '0'),
+    assert df["summary"][1].as_py() == [
+        ("removed-files-size", "5459"),
+        ("deleted-data-files", "1"),
+        ("deleted-records", "3"),
+        ("total-data-files", "0"),
+        ("total-delete-files", "0"),
+        ("total-records", "0"),
+        ("total-files-size", "0"),
+        ("total-position-deletes", "0"),
+        ("total-equality-deletes", "0"),
     ]

     lhs = spark.table(f"{identifier}.snapshots").toPandas()