From 0ab3262e035867beba246368c2a46d1c9387f65c Mon Sep 17 00:00:00 2001 From: Sung Yun <107272191+syun64@users.noreply.github.com> Date: Mon, 11 Mar 2024 02:02:50 -0600 Subject: [PATCH] Allow Partition data to be nullable in ManifestEntry (#509) * pass each partition field's `required` flag through to the data file partition struct * derive partition field nullability from the source schema field instead of hard-coding `required=False` --- pyiceberg/manifest.py | 1 + pyiceberg/partitioning.py | 3 ++- tests/conftest.py | 2 +- tests/table/test_partitioning.py | 2 +- 4 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pyiceberg/manifest.py b/pyiceberg/manifest.py index 0504626d07..03dc3199bf 100644 --- a/pyiceberg/manifest.py +++ b/pyiceberg/manifest.py @@ -308,6 +308,7 @@ def data_file_with_partition(partition_type: StructType, format_version: Literal field_id=field.field_id, name=field.name, field_type=partition_field_to_data_file_partition_field(field.field_type), + required=field.required, ) for field in partition_type.fields ]) diff --git a/pyiceberg/partitioning.py b/pyiceberg/partitioning.py index 6fa0286282..a6692b325e 100644 --- a/pyiceberg/partitioning.py +++ b/pyiceberg/partitioning.py @@ -218,7 +218,8 @@ def partition_type(self, schema: Schema) -> StructType: for field in self.fields: source_type = schema.find_type(field.source_id) result_type = field.transform.result_type(source_type) - nested_fields.append(NestedField(field.field_id, field.name, result_type, required=False)) + required = schema.find_field(field.source_id).required + nested_fields.append(NestedField(field.field_id, field.name, result_type, required=required)) return StructType(*nested_fields) def partition_to_path(self, data: Record, schema: Schema) -> str: diff --git a/tests/conftest.py b/tests/conftest.py index a005966ea5..e090e7c020 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -892,7 +892,7 @@ def metadata_location_gz(tmp_path_factory: pytest.TempPathFactory) -> str: "data_file": { "file_path": 
"/home/iceberg/warehouse/nyc/taxis_partitioned/data/VendorID=1/00000-633-d8a4223e-dc97-45a1-86e1-adaba6e8abd7-00002.parquet", "file_format": "PARQUET", - "partition": {"VendorID": 1, "tpep_pickup_datetime": 1925}, + "partition": {"VendorID": 1, "tpep_pickup_datetime": None}, "record_count": 95050, "file_size_in_bytes": 1265950, "block_size_in_bytes": 67108864, diff --git a/tests/table/test_partitioning.py b/tests/table/test_partitioning.py index cb60c9a8e5..d7425bc351 100644 --- a/tests/table/test_partitioning.py +++ b/tests/table/test_partitioning.py @@ -127,5 +127,5 @@ def test_partition_type(table_schema_simple: Schema) -> None: assert spec.partition_type(table_schema_simple) == StructType( NestedField(field_id=1000, name="str_truncate", field_type=StringType(), required=False), - NestedField(field_id=1001, name="int_bucket", field_type=IntegerType(), required=False), + NestedField(field_id=1001, name="int_bucket", field_type=IntegerType(), required=True), )