Skip to content

Commit dfae694

Browse files
authored
Hive: Add DO_NOT_UPDATE_STATS in alter_table (#1966)
<!-- Thanks for opening a pull request! --> <!-- In the case this PR will resolve an issue, please replace ${GITHUB_ISSUE_ID} below with the actual Github issue id. --> <!-- Closes #${GITHUB_ISSUE_ID} --> # Rationale for this change Tell the Hive metastore not to compute Hive stats during alter_table, since they are not used by Iceberg. For details, see the Java implementation (apache/iceberg#4407). # Are these changes tested? Yes, by the existing tests. # Are there any user-facing changes? No <!-- In the case of user-facing changes, please add the changelog label. -->
1 parent 05f07ee commit dfae694

File tree

2 files changed

+34
-8
lines changed

2 files changed

+34
-8
lines changed

pyiceberg/catalog/hive.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
from hive_metastore.ttypes import (
3737
AlreadyExistsException,
3838
CheckLockRequest,
39+
EnvironmentContext,
3940
FieldSchema,
4041
InvalidOperationException,
4142
LockComponent,
@@ -135,6 +136,8 @@
135136
DEFAULT_LOCK_CHECK_MIN_WAIT_TIME = 0.1 # 100 milliseconds
136137
DEFAULT_LOCK_CHECK_MAX_WAIT_TIME = 60 # 1 min
137138
DEFAULT_LOCK_CHECK_RETRIES = 4
139+
DO_NOT_UPDATE_STATS = "DO_NOT_UPDATE_STATS"
140+
DO_NOT_UPDATE_STATS_DEFAULT = "true"
138141

139142
logger = logging.getLogger(__name__)
140143

@@ -539,7 +542,12 @@ def commit_table(
539542
metadata_location=updated_staged_table.metadata_location,
540543
previous_metadata_location=current_table.metadata_location,
541544
)
542-
open_client.alter_table(dbname=database_name, tbl_name=table_name, new_tbl=hive_table)
545+
open_client.alter_table_with_environment_context(
546+
dbname=database_name,
547+
tbl_name=table_name,
548+
new_tbl=hive_table,
549+
environment_context=EnvironmentContext(properties={DO_NOT_UPDATE_STATS: DO_NOT_UPDATE_STATS_DEFAULT}),
550+
)
543551
else:
544552
# Table does not exist, create it.
545553
hive_table = self._convert_iceberg_into_hive(
@@ -626,7 +634,12 @@ def rename_table(self, from_identifier: Union[str, Identifier], to_identifier: U
626634
tbl = open_client.get_table(dbname=from_database_name, tbl_name=from_table_name)
627635
tbl.dbName = to_database_name
628636
tbl.tableName = to_table_name
629-
open_client.alter_table(dbname=from_database_name, tbl_name=from_table_name, new_tbl=tbl)
637+
open_client.alter_table_with_environment_context(
638+
dbname=from_database_name,
639+
tbl_name=from_table_name,
640+
new_tbl=tbl,
641+
environment_context=EnvironmentContext(properties={DO_NOT_UPDATE_STATS: DO_NOT_UPDATE_STATS_DEFAULT}),
642+
)
630643
except NoSuchObjectException as e:
631644
raise NoSuchTableError(f"Table does not exist: {from_table_name}") from e
632645
except InvalidOperationException as e:

tests/catalog/test_hive.py

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
import thrift.transport.TSocket
3030
from hive_metastore.ttypes import (
3131
AlreadyExistsException,
32+
EnvironmentContext,
3233
FieldSchema,
3334
InvalidOperationException,
3435
LockResponse,
@@ -44,6 +45,8 @@
4445

4546
from pyiceberg.catalog import PropertiesUpdateSummary
4647
from pyiceberg.catalog.hive import (
48+
DO_NOT_UPDATE_STATS,
49+
DO_NOT_UPDATE_STATS_DEFAULT,
4750
HIVE_KERBEROS_AUTH,
4851
LOCK_CHECK_MAX_WAIT_TIME,
4952
LOCK_CHECK_MIN_WAIT_TIME,
@@ -874,7 +877,7 @@ def test_rename_table(hive_table: HiveTable) -> None:
874877

875878
catalog._client = MagicMock()
876879
catalog._client.__enter__().get_table.side_effect = [hive_table, renamed_table]
877-
catalog._client.__enter__().alter_table.return_value = None
880+
catalog._client.__enter__().alter_table_with_environment_context.return_value = None
878881

879882
from_identifier = ("default", "new_tabl2e")
880883
to_identifier = ("default", "new_tabl3e")
@@ -884,7 +887,12 @@ def test_rename_table(hive_table: HiveTable) -> None:
884887

885888
calls = [call(dbname="default", tbl_name="new_tabl2e"), call(dbname="default", tbl_name="new_tabl3e")]
886889
catalog._client.__enter__().get_table.assert_has_calls(calls)
887-
catalog._client.__enter__().alter_table.assert_called_with(dbname="default", tbl_name="new_tabl2e", new_tbl=renamed_table)
890+
catalog._client.__enter__().alter_table_with_environment_context.assert_called_with(
891+
dbname="default",
892+
tbl_name="new_tabl2e",
893+
new_tbl=renamed_table,
894+
environment_context=EnvironmentContext(properties={DO_NOT_UPDATE_STATS: DO_NOT_UPDATE_STATS_DEFAULT}),
895+
)
888896

889897

890898
def test_rename_table_from_self_identifier(hive_table: HiveTable) -> None:
@@ -902,22 +910,27 @@ def test_rename_table_from_self_identifier(hive_table: HiveTable) -> None:
902910
renamed_table.tableName = "new_tabl3e"
903911

904912
catalog._client.__enter__().get_table.side_effect = [hive_table, renamed_table]
905-
catalog._client.__enter__().alter_table.return_value = None
913+
catalog._client.__enter__().alter_table_with_environment_context.return_value = None
906914
to_identifier = ("default", "new_tabl3e")
907915
table = catalog.rename_table(from_table.name(), to_identifier)
908916

909917
assert table.name() == to_identifier
910918

911919
calls = [call(dbname="default", tbl_name="new_tabl2e"), call(dbname="default", tbl_name="new_tabl3e")]
912920
catalog._client.__enter__().get_table.assert_has_calls(calls)
913-
catalog._client.__enter__().alter_table.assert_called_with(dbname="default", tbl_name="new_tabl2e", new_tbl=renamed_table)
921+
catalog._client.__enter__().alter_table_with_environment_context.assert_called_with(
922+
dbname="default",
923+
tbl_name="new_tabl2e",
924+
new_tbl=renamed_table,
925+
environment_context=EnvironmentContext(properties={DO_NOT_UPDATE_STATS: DO_NOT_UPDATE_STATS_DEFAULT}),
926+
)
914927

915928

916929
def test_rename_table_from_does_not_exists() -> None:
917930
catalog = HiveCatalog(HIVE_CATALOG_NAME, uri=HIVE_METASTORE_FAKE_URL)
918931

919932
catalog._client = MagicMock()
920-
catalog._client.__enter__().alter_table.side_effect = NoSuchObjectException(
933+
catalog._client.__enter__().alter_table_with_environment_context.side_effect = NoSuchObjectException(
921934
message="hive.default.does_not_exists table not found"
922935
)
923936

@@ -931,7 +944,7 @@ def test_rename_table_to_namespace_does_not_exists() -> None:
931944
catalog = HiveCatalog(HIVE_CATALOG_NAME, uri=HIVE_METASTORE_FAKE_URL)
932945

933946
catalog._client = MagicMock()
934-
catalog._client.__enter__().alter_table.side_effect = InvalidOperationException(
947+
catalog._client.__enter__().alter_table_with_environment_context.side_effect = InvalidOperationException(
935948
message="Unable to change partition or table. Database default does not exist Check metastore logs for detailed stack.does_not_exists"
936949
)
937950

0 commit comments

Comments
 (0)