Skip to content

Commit 0519448

Browse files
committed
Merge branch 'main' into manifest_compaction

Conflicts:
    pyiceberg/table/__init__.py
2 parents 7582bd2 + b31922f commit 0519448

File tree

6 files changed

+144
-40
lines changed

6 files changed

+144
-40
lines changed

pyiceberg/io/pyarrow.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -655,6 +655,10 @@ def pyarrow_to_schema(schema: pa.Schema, name_mapping: Optional[NameMapping] = N
655655
return visit_pyarrow(schema, visitor)
656656

657657

658+
def _pyarrow_to_schema_without_ids(schema: pa.Schema) -> Schema:
    """Convert a PyArrow schema to an Iceberg schema without assigning field IDs.

    Thin wrapper around ``visit_pyarrow`` using the ID-less conversion visitor;
    used to inspect a dataframe schema that carries no Iceberg field-ID metadata.
    """
    visitor = _ConvertToIcebergWithoutIDs()
    return visit_pyarrow(schema, visitor)
660+
661+
658662
@singledispatch
659663
def visit_pyarrow(obj: Union[pa.DataType, pa.Schema], visitor: PyArrowSchemaVisitor[T]) -> T:
660664
"""Apply a pyarrow schema visitor to any point within a schema.

pyiceberg/schema.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
from dataclasses import dataclass
2323
from functools import cached_property, partial, singledispatch
2424
from typing import (
25+
TYPE_CHECKING,
2526
Any,
2627
Callable,
2728
Dict,
@@ -62,6 +63,11 @@
6263
UUIDType,
6364
)
6465

66+
if TYPE_CHECKING:
67+
from pyiceberg.table.name_mapping import (
68+
NameMapping,
69+
)
70+
6571
T = TypeVar("T")
6672
P = TypeVar("P")
6773

@@ -221,6 +227,12 @@ def find_type(self, name_or_id: Union[str, int], case_sensitive: bool = True) ->
221227
def highest_field_id(self) -> int:
222228
return max(self._lazy_id_to_name.keys(), default=0)
223229

230+
@cached_property
def name_mapping(self) -> NameMapping:
    """Return the name mapping derived from this schema (computed once, then cached)."""
    # Imported locally to avoid a circular import between schema and name_mapping.
    from pyiceberg.table.name_mapping import create_mapping_from_schema as _build_mapping

    return _build_mapping(self)
235+
224236
def find_column_name(self, column_id: int) -> Optional[str]:
225237
"""Find a column name given a column ID.
226238

pyiceberg/table/__init__.py

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,11 @@
8888
TableMetadata,
8989
TableMetadataUtil,
9090
)
91-
from pyiceberg.table.name_mapping import NameMapping, parse_mapping_from_json, update_mapping
91+
from pyiceberg.table.name_mapping import (
92+
NameMapping,
93+
parse_mapping_from_json,
94+
update_mapping,
95+
)
9296
from pyiceberg.table.refs import MAIN_BRANCH, SnapshotRef
9397
from pyiceberg.table.snapshots import (
9498
Operation,
@@ -134,6 +138,41 @@
134138
_JAVA_LONG_MAX = 9223372036854775807
135139

136140

141+
def _check_schema(table_schema: Schema, other_schema: "pa.Schema") -> None:
    """Validate that a PyArrow schema is write-compatible with the table schema.

    Args:
        table_schema: The Iceberg schema of the target table.
        other_schema: The PyArrow schema of the dataframe being written.

    Raises:
        ValueError: If the dataframe has columns the table does not know about,
            or if any field differs (ID, name, type, or nullability); the latter
            message embeds a field-by-field comparison table.
    """
    from pyiceberg.io.pyarrow import _pyarrow_to_schema_without_ids, pyarrow_to_schema

    try:
        provided_schema = pyarrow_to_schema(other_schema, name_mapping=table_schema.name_mapping)
    except ValueError as conversion_error:
        # Conversion fails when the dataframe carries columns unknown to the
        # table's name mapping; surface the extra column names to the caller.
        converted = _pyarrow_to_schema_without_ids(other_schema)
        extra_columns = set(converted.column_names) - set(table_schema.column_names)
        raise ValueError(
            f"PyArrow table contains more columns: {', '.join(sorted(extra_columns))}. Update the schema first (hint, use union_by_name)."
        ) from conversion_error

    if table_schema.as_struct() == provided_schema.as_struct():
        return

    # Schemas differ: render a side-by-side diff of every table field.
    from rich.console import Console
    from rich.table import Table as RichTable

    console = Console(record=True)

    comparison = RichTable(show_header=True, header_style="bold")
    comparison.add_column("")
    comparison.add_column("Table field")
    comparison.add_column("Dataframe field")

    for table_field in table_schema.fields:
        try:
            df_field = provided_schema.find_field(table_field.field_id)
        except ValueError:
            # Field ID absent from the dataframe schema entirely.
            comparison.add_row("❌", str(table_field), "Missing")
        else:
            comparison.add_row("✅" if table_field == df_field else "❌", str(table_field), str(df_field))

    console.print(comparison)
    raise ValueError(f"Mismatch in fields:\n{console.export_text()}")
174+
175+
137176
class TableProperties:
138177
PARQUET_ROW_GROUP_SIZE_BYTES = "write.parquet.row-group-size-bytes"
139178
PARQUET_ROW_GROUP_SIZE_BYTES_DEFAULT = 128 * 1024 * 1024 # 128 MB
@@ -1035,6 +1074,8 @@ def append(self, df: pa.Table) -> None:
10351074
if len(self.spec().fields) > 0:
10361075
raise ValueError("Cannot write to partitioned tables")
10371076

1077+
_check_schema(self.schema(), other_schema=df.schema)
1078+
10381079
with self.update_snapshot().merge_append() as update_snapshot:
10391080
# skip writing data files if the dataframe is empty
10401081
if df.shape[0] > 0:
@@ -1065,6 +1106,8 @@ def overwrite(self, df: pa.Table, overwrite_filter: BooleanExpression = ALWAYS_T
10651106
if len(self.spec().fields) > 0:
10661107
raise ValueError("Cannot write to partitioned tables")
10671108

1109+
_check_schema(self.schema(), other_schema=df.schema)
1110+
10681111
with self.update_snapshot().overwrite() as update_snapshot:
10691112
# skip writing data files if the dataframe is empty
10701113
if df.shape[0] > 0:

pyiceberg/table/name_mapping.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
from abc import ABC, abstractmethod
2727
from collections import ChainMap
2828
from functools import cached_property, singledispatch
29-
from typing import Any, Dict, Generic, List, Optional, TypeVar, Union
29+
from typing import Any, Dict, Generic, Iterator, List, Optional, TypeVar, Union
3030

3131
from pydantic import Field, conlist, field_validator, model_serializer
3232

@@ -97,6 +97,10 @@ def __len__(self) -> int:
9797
"""Return the number of mappings."""
9898
return len(self.root)
9999

100+
def __iter__(self) -> Iterator[MappedField]:
    """Yield the mapped fields in their stored order."""
    yield from self.root
103+
100104
def __str__(self) -> str:
101105
"""Convert the name-mapping into a nicely formatted string."""
102106
if len(self.root) == 0:

tests/conftest.py

Lines changed: 1 addition & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,6 @@
4242
List,
4343
Optional,
4444
)
45-
from urllib.parse import urlparse
4645

4746
import boto3
4847
import pytest
@@ -57,8 +56,6 @@
5756
GCS_PROJECT_ID,
5857
GCS_TOKEN,
5958
GCS_TOKEN_EXPIRES_AT_MS,
60-
OutputFile,
61-
OutputStream,
6259
fsspec,
6360
load_file_io,
6461
)
@@ -88,7 +85,7 @@
8885
import pyarrow as pa
8986
from moto.server import ThreadedMotoServer # type: ignore
9087

91-
from pyiceberg.io.pyarrow import PyArrowFile, PyArrowFileIO
88+
from pyiceberg.io.pyarrow import PyArrowFileIO
9289

9390

9491
def pytest_collection_modifyitems(items: List[pytest.Item]) -> None:
@@ -1456,40 +1453,6 @@ def simple_map() -> MapType:
14561453
return MapType(key_id=19, key_type=StringType(), value_id=25, value_type=DoubleType(), value_required=False)
14571454

14581455

1459-
class LocalOutputFile(OutputFile):
    """An OutputFile implementation for local files (for test use only)."""

    def __init__(self, location: str) -> None:
        # Split the URI into scheme / netloc / path components.
        parts = urlparse(location)
        # Only `file` URIs (or bare, scheme-less paths) are valid locally.
        if parts.scheme and parts.scheme != "file":
            raise ValueError("LocalOutputFile location must have a scheme of `file`")
        elif parts.netloc:
            raise ValueError(f"Network location is not allowed for LocalOutputFile: {parts.netloc}")

        super().__init__(location=location)
        self._path = parts.path

    def __len__(self) -> int:
        """Return the length of an instance of the LocalOutputFile class."""
        return os.path.getsize(self._path)

    def exists(self) -> bool:
        """Return True when the local path currently exists."""
        return os.path.exists(self._path)

    def to_input_file(self) -> "PyArrowFile":
        """Open the same location for reading through PyArrowFileIO."""
        from pyiceberg.io.pyarrow import PyArrowFileIO

        return PyArrowFileIO().new_input(location=self.location)

    def create(self, overwrite: bool = False) -> OutputStream:
        """Open the path for writing; `xb` mode refuses to clobber unless overwrite=True."""
        mode = "wb" if overwrite else "xb"
        handle = open(self._path, mode)
        if not issubclass(type(handle), OutputStream):
            raise TypeError("Object returned from LocalOutputFile.create(...) does not match the OutputStream protocol.")
        return handle
1491-
1492-
14931456
@pytest.fixture(scope="session")
14941457
def generated_manifest_entry_file(avro_schema_manifest_entry: Dict[str, Any]) -> Generator[str, None, None]:
14951458
from fastavro import parse_schema, writer

tests/table/test_init.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
from copy import copy
2020
from typing import Dict
2121

22+
import pyarrow as pa
2223
import pytest
2324
from sortedcontainers import SortedList
2425

@@ -58,6 +59,7 @@
5859
Table,
5960
UpdateSchema,
6061
_apply_table_update,
62+
_check_schema,
6163
_generate_snapshot_id,
6264
_match_deletes_to_data_file,
6365
_TableMetadataUpdateContext,
@@ -982,3 +984,79 @@ def test_correct_schema() -> None:
982984
_ = t.scan(snapshot_id=-1).projection()
983985

984986
assert "Snapshot not found: -1" in str(exc_info.value)
987+
988+
989+
# Verifies a type mismatch (int vs decimal) is reported field-by-field.
# NOTE(review): scraped diff view — the interleaved "NNN+" markers are page
# artifacts, and the expected rich-table spacing below may have been collapsed
# by the scrape; confirm against the rendered _check_schema output.
def test_schema_mismatch_type(table_schema_simple: Schema) -> None:
990+
other_schema = pa.schema((
991+
pa.field("foo", pa.string(), nullable=True),
992+
pa.field("bar", pa.decimal128(18, 6), nullable=False),
993+
pa.field("baz", pa.bool_(), nullable=True),
994+
))
995+
996+
expected = r"""Mismatch in fields:
997+
┏━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
998+
┃ ┃ Table field ┃ Dataframe field ┃
999+
┡━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
1000+
│ ✅ │ 1: foo: optional string │ 1: foo: optional string │
1001+
│ ❌ │ 2: bar: required int │ 2: bar: required decimal\(18, 6\) │
1002+
│ ✅ │ 3: baz: optional boolean │ 3: baz: optional boolean │
1003+
└────┴──────────────────────────┴─────────────────────────────────┘
1004+
"""
1005+
1006+
with pytest.raises(ValueError, match=expected):
1007+
_check_schema(table_schema_simple, other_schema)
1008+
1009+
1010+
# Verifies a nullability mismatch (required vs optional int) is flagged.
# NOTE(review): scraped diff view — "NNN+" markers are page artifacts and the
# expected table's spacing may be collapsed; confirm against rendered output.
def test_schema_mismatch_nullability(table_schema_simple: Schema) -> None:
1011+
other_schema = pa.schema((
1012+
pa.field("foo", pa.string(), nullable=True),
1013+
pa.field("bar", pa.int32(), nullable=True),
1014+
pa.field("baz", pa.bool_(), nullable=True),
1015+
))
1016+
1017+
expected = """Mismatch in fields:
1018+
┏━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┓
1019+
┃ ┃ Table field ┃ Dataframe field ┃
1020+
┡━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━┩
1021+
│ ✅ │ 1: foo: optional string │ 1: foo: optional string │
1022+
│ ❌ │ 2: bar: required int │ 2: bar: optional int │
1023+
│ ✅ │ 3: baz: optional boolean │ 3: baz: optional boolean │
1024+
└────┴──────────────────────────┴──────────────────────────┘
1025+
"""
1026+
1027+
with pytest.raises(ValueError, match=expected):
1028+
_check_schema(table_schema_simple, other_schema)
1029+
1030+
1031+
# Verifies a column missing from the dataframe is rendered as "Missing".
# NOTE(review): scraped diff view — "NNN+" markers are page artifacts and the
# expected table's spacing may be collapsed; confirm against rendered output.
def test_schema_mismatch_missing_field(table_schema_simple: Schema) -> None:
1032+
other_schema = pa.schema((
1033+
pa.field("foo", pa.string(), nullable=True),
1034+
pa.field("baz", pa.bool_(), nullable=True),
1035+
))
1036+
1037+
expected = """Mismatch in fields:
1038+
┏━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┓
1039+
┃ ┃ Table field ┃ Dataframe field ┃
1040+
┡━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━┩
1041+
│ ✅ │ 1: foo: optional string │ 1: foo: optional string │
1042+
│ ❌ │ 2: bar: required int │ Missing │
1043+
│ ✅ │ 3: baz: optional boolean │ 3: baz: optional boolean │
1044+
└────┴──────────────────────────┴──────────────────────────┘
1045+
"""
1046+
1047+
with pytest.raises(ValueError, match=expected):
1048+
_check_schema(table_schema_simple, other_schema)
1049+
1050+
1051+
def test_schema_mismatch_additional_field(table_schema_simple: Schema) -> None:
    """A dataframe with an extra column is rejected with a union_by_name hint."""
    df_fields = [
        pa.field("foo", pa.string(), nullable=True),
        pa.field("bar", pa.int32(), nullable=True),
        pa.field("baz", pa.bool_(), nullable=True),
        pa.field("new_field", pa.date32(), nullable=True),
    ]
    other_schema = pa.schema(df_fields)

    expected = r"PyArrow table contains more columns: new_field. Update the schema first \(hint, use union_by_name\)."

    with pytest.raises(ValueError, match=expected):
        _check_schema(table_schema_simple, other_schema)

0 commit comments

Comments
 (0)