Merge remote-tracking branch 'origin/main' into kevinjqliu/fix-schema…

…-comparison
apache · Mar 1, 2024 · 672efe2 · 672efe2
2 parents d05a3fb + 36b56eb
commit 672efe2
Show file tree

Hide file tree

Showing 62 changed files with 5,528 additions and 1,400 deletions.
diff --git a/.asf.yaml b/.asf.yaml
@@ -44,6 +44,7 @@ github:
     projects: true
   collaborators:  # Note: the number of collaborators is limited to 10
     - ajantha-bhat
+    - syun64
   ghp_branch: gh-pages
   ghp_path: /
 

diff --git a/.github/workflows/check-md-link.yml b/.github/workflows/check-md-link.yml
@@ -0,0 +1,13 @@
+name: Check Markdown links
+
+on:
+  push:
+    paths:
+      - mkdocs/**
+
+jobs:
+  markdown-link-check:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@master
+    - uses: gaurav-nelson/github-action-markdown-link-check@v1
diff --git a/.github/workflows/python-release.yml b/.github/workflows/python-release.yml
@@ -34,7 +34,7 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [ ubuntu-22.04, windows-2022, macos-11 ]
+        os: [ ubuntu-22.04, windows-2022, macos-11, macos-12, macos-13, macos-14 ]
 
     steps:
       - uses: actions/checkout@v4
@@ -43,7 +43,7 @@ jobs:
 
       - uses: actions/setup-python@v5
         with:
-          python-version: '3.8'
+          python-version: '3.11'
 
       - name: Install poetry
         run: pip install poetry
@@ -59,15 +59,15 @@ jobs:
         if: startsWith(matrix.os, 'ubuntu')
 
       - name: Build wheels
-        uses: pypa/cibuildwheel@v2.16.3
+        uses: pypa/cibuildwheel@v2.16.5
         with:
           output-dir: wheelhouse
           config-file: "pyproject.toml"
         env:
           # Ignore 32 bit architectures
           CIBW_ARCHS: "auto64"
           CIBW_PROJECT_REQUIRES_PYTHON: ">=3.8,<3.12"
-          CIBW_TEST_REQUIRES: "pytest==7.4.2 moto==4.2.2"
+          CIBW_TEST_REQUIRES: "pytest==7.4.2 moto==5.0.1"
           CIBW_TEST_EXTRAS: "s3fs,glue"
           CIBW_TEST_COMMAND: "pytest {project}/tests/avro/test_decoder.py"
           # There is an upstream issue with installing on MacOSX
@@ -80,7 +80,7 @@ jobs:
         if: startsWith(matrix.os, 'ubuntu')
         run: ls -lah dist/* && cp dist/* wheelhouse/
 
-      - uses: actions/upload-artifact@v4
+      - uses: actions/upload-artifact@v3
         with:
           name: "release-${{ github.event.inputs.version }}"
           path: ./wheelhouse/*
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -36,13 +36,13 @@ repos:
       - id: ruff-format
         args: [ --preview ]
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.6.1
+    rev: v1.8.0
     hooks:
       - id: mypy
         args:
           [--install-types, --non-interactive, --config=pyproject.toml]
   - repo: https://github.com/hadialqattan/pycln
-    rev: v2.3.0
+    rev: v2.4.0
     hooks:
       - id: pycln
         args: [--config=pyproject.toml]

diff --git a/Makefile b/Makefile
@@ -19,7 +19,7 @@ install-poetry:
 	pip install poetry==1.7.1
 
 install-dependencies:
-	poetry install -E pyarrow -E hive -E s3fs -E glue -E adlfs -E duckdb -E ray -E sql-postgres -E gcsfs -E sql-sqlite
+	poetry install -E pyarrow -E hive -E s3fs -E glue -E adlfs -E duckdb -E ray -E sql-postgres -E gcsfs -E sql-sqlite -E daft
 
 install: | install-poetry install-dependencies
 

diff --git a/NOTICE b/NOTICE
@@ -1,6 +1,6 @@
 
 Apache Iceberg
-Copyright 2017-2022 The Apache Software Foundation
+Copyright 2017-2024 The Apache Software Foundation
 
 This product includes software developed at
 The Apache Software Foundation (http://www.apache.org/).

diff --git a/mkdocs/docs/SUMMARY.md b/mkdocs/docs/SUMMARY.md
@@ -17,11 +17,12 @@
 
 <!-- prettier-ignore-start -->
 
-- [Home](index.md)
+- [Getting started](index.md)
 - [Configuration](configuration.md)
 - [CLI](cli.md)
 - [API](api.md)
 - [Contributing](contributing.md)
+- [Community](community.md)
 - Releases
     - [Verify a release](verify-release.md)
     - [How to release](how-to-release.md)

diff --git a/mkdocs/docs/api.md b/mkdocs/docs/api.md
@@ -418,6 +418,63 @@ with table.update_schema(allow_incompatible_changes=True) as update:
     update.delete_column("some_field")
 ```
 
+## Partition evolution
+
+PyIceberg supports partition evolution. See the [partition evolution](https://iceberg.apache.org/spec/#partition-evolution)
+section of the Iceberg table spec for more details.
+
+The API to use when evolving partitions is the `update_spec` API on the table.
+
+```python
+with table.update_spec() as update:
+    update.add_field("id", BucketTransform(16), "bucketed_id")
+    update.add_field("event_ts", DayTransform(), "day_ts")
+```
+
+Updating the partition spec can also be done as part of a transaction with other operations.
+
+```python
+with table.transaction() as transaction:
+    with transaction.update_spec() as update_spec:
+        update_spec.add_field("id", BucketTransform(16), "bucketed_id")
+        update_spec.add_field("event_ts", DayTransform(), "day_ts")
+    # ... Update properties etc
+```
+
+### Add fields
+
+New partition fields can be added via the `add_field` API which takes in the field name to partition on,
+the partition transform, and an optional partition name. If the partition name is not specified,
+one will be created.
+
+```python
+with table.update_spec() as update:
+    update.add_field("id", BucketTransform(16), "bucketed_id")
+    update.add_field("event_ts", DayTransform(), "day_ts")
+    # identity is a shortcut API for adding an IdentityTransform
+    update.identity("some_field")
+```
+
+### Remove fields
+
+Partition fields can also be removed via the `remove_field` API if it no longer makes sense to partition on those fields.
+
+```python
+with table.update_spec() as update:
+    # Remove the partition field with the name
+    update.remove_field("some_partition_name")
+```
+
+### Rename fields
+
+Partition fields can also be renamed via the `rename_field` API.
+
+```python
+with table.update_spec() as update:
+    # Rename the partition field with the name bucketed_id to sharded_id
+    update.rename_field("bucketed_id", "sharded_id")
+```
+
 ## Table properties
 
 Set and remove properties through the `Transaction` API:
@@ -636,3 +693,56 @@ print(ray_dataset.take(2))
     },
 ]
 ```
+
+### Daft
+
+PyIceberg interfaces closely with Daft Dataframes (see also: [Daft integration with Iceberg](https://www.getdaft.io/projects/docs/en/latest/user_guide/integrations/iceberg.html)) which provides a full lazily optimized query engine interface on top of PyIceberg tables.
+
+<!-- prettier-ignore-start -->
+
+!!! note "Requirements"
+    This requires [Daft to be installed](index.md).
+
+<!-- prettier-ignore-end -->
+
+A table can be read easily into a Daft Dataframe:
+
+```python
+df = table.to_daft()  # equivalent to `daft.read_iceberg(table)`
+df = df.where(df["trip_distance"] >= 10.0)
+df = df.select("VendorID", "tpep_pickup_datetime", "tpep_dropoff_datetime")
+```
+
+This returns a Daft Dataframe which is lazily materialized. Printing `df` will display the schema:
+
+```
+╭──────────┬───────────────────────────────┬───────────────────────────────╮
+│ VendorID ┆ tpep_pickup_datetime          ┆ tpep_dropoff_datetime         │
+│ ---      ┆ ---                           ┆ ---                           │
+│ Int64    ┆ Timestamp(Microseconds, None) ┆ Timestamp(Microseconds, None) │
+╰──────────┴───────────────────────────────┴───────────────────────────────╯
+
+(No data to display: Dataframe not materialized)
+```
+
+We can execute the Dataframe to preview the first few rows of the query with `df.show()`.
+
+This is correctly optimized to take advantage of Iceberg features such as hidden partitioning and file-level statistics for efficient reads.
+
+```python
+df.show(2)
+```
+
+```
+╭──────────┬───────────────────────────────┬───────────────────────────────╮
+│ VendorID ┆ tpep_pickup_datetime          ┆ tpep_dropoff_datetime         │
+│ ---      ┆ ---                           ┆ ---                           │
+│ Int64    ┆ Timestamp(Microseconds, None) ┆ Timestamp(Microseconds, None) │
+╞══════════╪═══════════════════════════════╪═══════════════════════════════╡
+│ 2        ┆ 2008-12-31T23:23:50.000000    ┆ 2009-01-01T00:34:31.000000    │
+├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+│ 2        ┆ 2008-12-31T23:05:03.000000    ┆ 2009-01-01T16:10:18.000000    │
+╰──────────┴───────────────────────────────┴───────────────────────────────╯
+
+(Showing first 2 rows)
+```
diff --git a/mkdocs/docs/cli.md b/mkdocs/docs/cli.md
@@ -36,6 +36,7 @@ Options:
 --catalog TEXT
 --verbose BOOLEAN
 --output [text|json]
+--ugi TEXT
 --uri TEXT
 --credential TEXT
 --help                Show this message and exit.

diff --git a/mkdocs/docs/community.md b/mkdocs/docs/community.md
@@ -0,0 +1,64 @@
+---
+hide:
+  - navigation
+---
+
+<!--
+  - Licensed to the Apache Software Foundation (ASF) under one
+  - or more contributor license agreements.  See the NOTICE file
+  - distributed with this work for additional information
+  - regarding copyright ownership.  The ASF licenses this file
+  - to you under the Apache License, Version 2.0 (the
+  - "License"); you may not use this file except in compliance
+  - with the License.  You may obtain a copy of the License at
+  -
+  -   http://www.apache.org/licenses/LICENSE-2.0
+  -
+  - Unless required by applicable law or agreed to in writing,
+  - software distributed under the License is distributed on an
+  - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  - KIND, either express or implied.  See the License for the
+  - specific language governing permissions and limitations
+  - under the License.
+  -->
+
+# Join the community
+
+Apache Iceberg tracks issues in GitHub and prefers to receive contributions as pull requests.
+
+Community discussions happen primarily on the [dev mailing list](https://lists.apache.org/list.html?dev@iceberg.apache.org), on [Apache Iceberg Slack workspace](https://join.slack.com/t/apache-iceberg/shared_invite/zt-287g3akar-K9Oe_En5j1UL7Y_Ikpai3A) in the #python channel, and on specific [GitHub issues](https://github.com/apache/iceberg-python/issues).
+
+## Iceberg Community Events
+
+The PyIceberg community sync is on the last Tuesday of every month. To join, make sure to subscribe to the [iceberg-python-sync Google group](https://groups.google.com/g/iceberg-python-sync).
+
+## Community Guidelines
+
+### Apache Iceberg Community Guidelines
+
+The Apache Iceberg community is built on the principles described in the [Apache Way](https://www.apache.org/theapacheway/index.html)
+and all who engage with the community are expected to be respectful, open, come with the best interests of the community in mind,
+and abide by the Apache Foundation [Code of Conduct](https://www.apache.org/foundation/policies/conduct.html).
+
+### Participants with Corporate Interests
+
+A wide range of corporate entities have interests that overlap in both features and frameworks related to Iceberg and while we
+encourage engagement and contributions, the community is not a venue for marketing, solicitation, or recruitment.
+
+Any vendor who wants to participate in the Apache Iceberg community Slack workspace should create a dedicated vendor channel
+for their organization prefixed by `vendor-`.
+
+This space can be used to discuss features and integration with Iceberg related to the vendor offering.  This space should not
+be used to promote competing vendor products/services or disparage other vendor offerings.  Discussion should be focused on
+questions asked by the community and not to expand/introduce/redirect users to alternate offerings.
+
+### Marketing / Solicitation / Recruiting
+
+The Apache Iceberg community is a space for everyone to operate free of influence. The development lists, Slack workspace,
+and GitHub should not be used to market products or services.  Solicitation or overt promotion should not be performed in common
+channels or through direct messages.
+
+Recruitment of community members should not be conducted through direct messages or community channels, but opportunities
+related to contributing to or using Iceberg can be posted to the `#jobs` channel.
+
+For questions regarding any of the guidelines above, please contact a PMC member.
diff --git a/mkdocs/docs/configuration.md b/mkdocs/docs/configuration.md
@@ -46,7 +46,22 @@ The environment variable picked up by Iceberg starts with `PYICEBERG_` and then
 
 For example, `PYICEBERG_CATALOG__DEFAULT__S3__ACCESS_KEY_ID`, sets `s3.access-key-id` on the `default` catalog.
 
-## FileIO
+# Tables
+
+Iceberg tables support table properties to configure table behavior.
+
+## Write options
+
+| Key                               | Options                           | Default | Description                                                                                 |
+| --------------------------------- | --------------------------------- | ------- | ------------------------------------------------------------------------------------------- |
+| `write.parquet.compression-codec` | `{uncompressed,zstd,gzip,snappy}` | zstd    | Sets the Parquet compression codec.                                                         |
+| `write.parquet.compression-level` | Integer                           | null    | Parquet compression level for the codec. If not set, it is up to PyIceberg                  |
+| `write.parquet.page-size-bytes`   | Size in bytes                     | 1MB     | Set a target threshold for the approximate encoded size of data pages within a column chunk |
+| `write.parquet.page-row-limit`    | Number of rows                    | 20000   | Set a target threshold for the number of rows in data pages within a column chunk           |
+| `write.parquet.dict-size-bytes`   | Size in bytes                     | 2MB     | Set the dictionary page size limit per row group                                            |
+| `write.parquet.row-group-limit`   | Number of rows                    | 122880  | The Parquet row group limit                                                                 |
+
+# FileIO
 
 Iceberg works with the concept of a FileIO which is a pluggable module for reading, writing, and deleting files. By default, PyIceberg will try to initialize the FileIO that's suitable for the scheme (`s3://`, `gs://`, etc.) and will use the first one that's installed.
 
@@ -133,13 +148,27 @@ catalog:
 | Key                    | Example                 | Description                                                                                        |
 | ---------------------- | ----------------------- | -------------------------------------------------------------------------------------------------- |
 | uri                    | https://rest-catalog/ws | URI identifying the REST Server                                                                    |
+| ugi                    | t-1234:secret           | Hadoop UGI for Hive client.                                                                        |
 | credential             | t-1234:secret           | Credential to use for OAuth2 credential flow when initializing the catalog                         |
 | token                  | FEW23.DFSDF.FSDF        | Bearer token value to use for `Authorization` header                                               |
 | rest.sigv4-enabled     | true                    | Sign requests to the REST Server using AWS SigV4 protocol                                          |
 | rest.signing-region    | us-east-1               | The region to use when SigV4 signing a request                                                     |
 | rest.signing-name      | execute-api             | The service signing name to use when SigV4 signing a request                                       |
 | rest.authorization-url | https://auth-service/cc | Authentication URL to use for client credentials authentication (default: uri + 'v1/oauth/tokens') |
 
+### Headers in RESTCatalog
+
+To configure custom headers in RESTCatalog, include them in the catalog properties with the prefix `header.`. This
+ensures that all HTTP requests to the REST service include the specified headers.
+
+```yaml
+catalog:
+  default:
+    uri: http://rest-catalog/ws/
+    credential: t-1234:secret
+    header.content-type: application/vnd.api+json
+```
+
 ## SQL Catalog
 
 The SQL catalog requires a database for its backend. PyIceberg supports PostgreSQL and SQLite through psycopg2. The database connection has to be configured using the `uri` property. See SQLAlchemy's [documentation for URL format](https://docs.sqlalchemy.org/en/20/core/engines.html#backend-specific-urls):

diff --git a/mkdocs/docs/contributing.md b/mkdocs/docs/contributing.md
@@ -58,6 +58,22 @@ For IDEA ≤2021 you need to install the [Poetry integration as a plugin](https:
 
 Now you're set using Poetry, and all the tests will run in Poetry, and you'll have syntax highlighting in the pyproject.toml to indicate stale dependencies.
 
+## Installation from source
+
+Clone the repository for local development:
+
+```sh
+git clone https://github.com/apache/iceberg-python.git
+cd iceberg-python
+pip3 install -e ".[s3fs,hive]"
+```
+
+Install it directly from GitHub (not recommended, but sometimes handy):
+
+```
+pip install "git+https://github.com/apache/iceberg-python.git#egg=pyiceberg[s3fs]"
+```
+
 ## Linting
 
 `pre-commit` is used for autoformatting and linting: