try 1.8.1rc1 #11

Status: Open. This pull request wants to merge 52 commits into base branch `main`.

Commits (52):
- 4ff1558 Add `ResidualVisitor` to compute residuals (#1388) (tusharchou, Feb 11, 2025)
- 509713b Remove `_task_to_table` (#1643) (Fokko, Feb 11, 2025)
- e0f9922 Build: Bump cython from 3.0.11 to 3.0.12 (#1646) (dependabot[bot], Feb 11, 2025)
- 59f5866 Build: Bump coverage from 7.6.11 to 7.6.12 (#1648) (dependabot[bot], Feb 11, 2025)
- a0271c1 Build: Bump mkdocs-autorefs from 1.3.0 to 1.3.1 (#1650) (dependabot[bot], Feb 11, 2025)
- 52803ed Build: Bump griffe from 1.5.6 to 1.5.7 (#1647) (dependabot[bot], Feb 11, 2025)
- df2e16a Build: Bump mkdocstrings-python from 1.14.6 to 1.15.0 (#1649) (dependabot[bot], Feb 12, 2025)
- d6dce6d Clean up old metadata (#1607) (kaushiksrini, Feb 12, 2025)
- 86b83e8 feat: search current working directory for config file (#1464) (IndexSeek, Feb 12, 2025)
- addeb2f Upgrade `cryptography` dependency to v44.0.1 (#1651) (kevinjqliu, Feb 12, 2025)
- 8fcdc95 Feat/add support kerberize hivemetastore (#1634) (Fokko, Feb 12, 2025)
- b066180 Build: Bump mkdocs-material from 9.6.3 to 9.6.4 (#1657) (dependabot[bot], Feb 13, 2025)
- 8e56ebb [infra] update poetry if necessary (#1656) (kevinjqliu, Feb 13, 2025)
- 38d57ea Added support for Polars DataFrame and LazyFrame (#1614) (yigal-rozenberg, Feb 13, 2025)
- 6d1c30c Filter rows directly from pa.RecordBatch (#1621) (gabeiglio, Feb 13, 2025)
- 6351066 Add table upsert support (#1660) (kevinjqliu, Feb 13, 2025)
- ee11bb0 Bump to Iceberg Java 1.8.0 (#1633) (Fokko, Feb 14, 2025)
- 8014b6c Upsert: Reuse existing expression to detect rows to be inserted (#1662) (Fokko, Feb 14, 2025)
- 829b7dc Add support for `write.metadata.path` (#1642) (geruh, Feb 14, 2025)
- dd58ac1 Implement update for `remove-snapshot-ref` action (#1598) (grihabor, Feb 14, 2025)
- 300b840 Add upsert docs (#1665) (Fokko, Feb 16, 2025)
- 7200a92 [docs] add docs about nightly build (#1670) (kevinjqliu, Feb 17, 2025)
- 19148d3 Implement update for `remove-snapshots` action (#1561) (grihabor, Feb 17, 2025)
- e927aee fix: SqlCatalog namespace_exists should work with multi-level namespa… (alessandro-nori, Feb 17, 2025)
- 286c2d4 Build: Bump mkdocstrings from 0.28.0 to 0.28.1 (#1676) (dependabot[bot], Feb 18, 2025)
- c4715c0 Build: Bump mkdocstrings-python from 1.15.0 to 1.16.0 (#1675) (dependabot[bot], Feb 18, 2025)
- d26d1e4 Avoid reusing shared metrics evaluator (#1664) (rachel88888, Feb 18, 2025)
- efc8b5a Add instruction to install shell plugin to doc (#1674) (gabeiglio, Feb 18, 2025)
- 9a23279 Build: Bump mkdocstrings-python from 1.16.0 to 1.16.1 (#1686) (dependabot[bot], Feb 19, 2025)
- 83612d6 Build: Bump getdaft from 0.4.3 to 0.4.4 (#1687) (dependabot[bot], Feb 19, 2025)
- d1fea5c Build: Bump pyarrow from 19.0.0 to 19.0.1 (#1688) (dependabot[bot], Feb 19, 2025)
- 8cb8793 AWS: Undeprecate `botocore_session` (#1692) (Fokko, Feb 20, 2025)
- 2ceaf44 REST: Remove deprecated `AUTH_URL` (#1691) (Fokko, Feb 20, 2025)
- 1e363a4 Build: Bump mkdocs-material from 9.6.4 to 9.6.5 (#1696) (dependabot[bot], Feb 21, 2025)
- 5ce8659 Build: Bump cachetools from 5.5.1 to 5.5.2 (#1695) (dependabot[bot], Feb 21, 2025)
- c7fe114 Build: Bump boto3 from 1.36.3 to 1.36.23 (#1694) (dependabot[bot], Feb 21, 2025)
- 4e9c66d Fix TypeError in create_match_filter for Composite Keys (#1693) (omkenge, Feb 21, 2025)
- 948486e Remove parsing table name in `row_filter` (#1689) (geruh, Feb 21, 2025)
- f62f67e Raise explicit error when join columns cannot be found (#1698) (Fokko, Feb 21, 2025)
- f1c1f8f Allow upserting into an empty table (#1699) (Fokko, Feb 21, 2025)
- 68a08b1 Add syntactic sugar for `and` and `or` operation (#1697) (Fokko, Feb 21, 2025)
- b95e792 Use a join for upsert deduplication (#1685) (Fokko, Feb 21, 2025)
- a58f099 Docs: Update new iceberg catalog link (#1701) (kevinjqliu, Feb 21, 2025)
- 06404a5 Remove top-level import of `pyarrow` (#1703) (kevinjqliu, Feb 21, 2025)
- 6388b4d Upsert: Align codestyle (#1710) (Fokko, Feb 24, 2025)
- 5cfb527 Arrow: Suppress warning and cache bucket location (#1709) (Fokko, Feb 24, 2025)
- 579beee Build: Bump mkdocs-autorefs from 1.3.1 to 1.4.0 (#1712) (dependabot[bot], Feb 24, 2025)
- b2cb3a2 Build: Bump moto from 5.0.28 to 5.1.0 (#1713) (dependabot[bot], Feb 24, 2025)
- 998b8cc Build: Bump polars from 1.22.0 to 1.23.0 (#1715) (dependabot[bot], Feb 24, 2025)
- eca7086 Build: Bump mypy-boto3-glue from 1.36.4 to 1.37.0 (#1716) (dependabot[bot], Feb 24, 2025)
- 71129ee Build: Bump datafusion from 44.0.0 to 45.2.0 (#1714) (dependabot[bot], Feb 24, 2025)
- 0167713 use 1.8.1rc1 (kevinjqliu, Feb 25, 2025)
21 changes: 14 additions & 7 deletions Makefile
```diff
@@ -19,13 +19,20 @@
 help: ## Display this help
 	@awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n  make \033[36m<target>\033[0m\n"} /^[a-zA-Z_-]+:.*?##/ { printf "  \033[36m%-20s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST)
 
-install-poetry: ## Install poetry if the user has not done that yet.
-	@if ! command -v poetry &> /dev/null; then \
-		echo "Poetry could not be found. Installing..."; \
-		pip install --user poetry==2.0.1; \
-	else \
-		echo "Poetry is already installed."; \
-	fi
+POETRY_VERSION = 2.0.1
+install-poetry: ## Ensure Poetry is installed and the correct version is being used.
+	@if ! command -v poetry &> /dev/null; then \
+		echo "Poetry could not be found. Installing..."; \
+		pip install --user poetry==$(POETRY_VERSION); \
+	else \
+		INSTALLED_VERSION=$$(pip show poetry | grep Version | awk '{print $$2}'); \
+		if [ "$$INSTALLED_VERSION" != "$(POETRY_VERSION)" ]; then \
+			echo "Poetry version $$INSTALLED_VERSION does not match required version $(POETRY_VERSION). Updating..."; \
+			pip install --user --upgrade poetry==$(POETRY_VERSION); \
+		else \
+			echo "Poetry version $$INSTALLED_VERSION is already installed."; \
+		fi \
+	fi
 
 install-dependencies: ## Install dependencies including dev, docs, and all extras
 	poetry install --all-extras
```
13 changes: 8 additions & 5 deletions dev/Dockerfile
```diff
@@ -37,21 +37,24 @@ RUN mkdir -p ${HADOOP_HOME} && mkdir -p ${SPARK_HOME} && mkdir -p /home/iceberg/
 WORKDIR ${SPARK_HOME}
 
 # Remember to also update `tests/conftest`'s spark setting
-ENV SPARK_VERSION=3.5.3
+ENV SPARK_VERSION=3.5.4
 ENV ICEBERG_SPARK_RUNTIME_VERSION=3.5_2.12
-ENV ICEBERG_VERSION=1.6.0
+ENV ICEBERG_VERSION=1.8.1
 ENV PYICEBERG_VERSION=0.8.1
 
+ENV BASE_ARTIFACT_URL=https://repository.apache.org/content/repositories/orgapacheiceberg-1184
 RUN curl --retry 5 -s -C - https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop3.tgz -o spark-${SPARK_VERSION}-bin-hadoop3.tgz \
     && tar xzf spark-${SPARK_VERSION}-bin-hadoop3.tgz --directory /opt/spark --strip-components 1 \
     && rm -rf spark-${SPARK_VERSION}-bin-hadoop3.tgz
 
 # Download iceberg spark runtime
-RUN curl --retry 5 -s https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar -Lo iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar \
-    && mv iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar /opt/spark/jars
+RUN curl --retry 5 -s ${BASE_ARTIFACT_URL}/org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar \
+    -Lo /opt/spark/jars/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar
 
 # Download AWS bundle
-RUN curl --retry 5 -s https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar -Lo /opt/spark/jars/iceberg-aws-bundle-${ICEBERG_VERSION}.jar
+RUN curl --retry 5 -s ${BASE_ARTIFACT_URL}/org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar \
+    -Lo /opt/spark/jars/iceberg-aws-bundle-${ICEBERG_VERSION}.jar
 
 COPY spark-defaults.conf /opt/spark/conf
 ENV PATH="/opt/spark/sbin:/opt/spark/bin:${PATH}"
```
1 change: 1 addition & 0 deletions mkdocs/docs/SUMMARY.md
```diff
@@ -30,6 +30,7 @@
 - [Verify a release](verify-release.md)
 - [How to release](how-to-release.md)
 - [Release Notes](https://github.com/apache/iceberg-python/releases)
+- [Nightly Build](nightly-build.md)
 - [Code Reference](reference/)
 
 <!-- markdown-link-check-enable-->
```
203 changes: 202 additions & 1 deletion mkdocs/docs/api.md
@@ -49,7 +49,7 @@ catalog:

and loaded in Python by calling `load_catalog(name="hive")` and `load_catalog(name="rest")`.

This information must be placed inside a file called `.pyiceberg.yaml` located either in the `$HOME` or `%USERPROFILE%` directory (depending on whether the operating system is Unix-based or Windows-based, respectively), in the current working directory, or in the `$PYICEBERG_HOME` directory (if the corresponding environment variable is set).

For more details on possible configurations refer to the [specific page](https://py.iceberg.apache.org/configuration/).
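
As a minimal sketch of how this lookup can be steered (the `/etc/pyiceberg` path below is purely illustrative), `PYICEBERG_HOME` can be set before loading a catalog:

```python
import os

from pyiceberg.catalog import load_catalog

# Hypothetical directory containing a .pyiceberg.yaml configuration file;
# PyIceberg searches it because PYICEBERG_HOME is set.
os.environ["PYICEBERG_HOME"] = "/etc/pyiceberg"

catalog = load_catalog(name="rest")
```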

@@ -474,6 +474,71 @@

```
lat: [[48.864716],[52.371807],[53.11254],[37.773972]]
long: [[2.349014],[4.896029],[6.0989],[-122.431297]]
```

### Upsert

PyIceberg supports upsert operations, meaning that it can merge an Arrow table into an Iceberg table. Rows are matched using the [identifier field](https://iceberg.apache.org/spec/?column-projection#identifier-field-ids). If a row with the same identifier already exists in the table, it is updated; if it cannot be found, the new row is inserted.

Consider the following table, with some data:

```python
from pyiceberg.schema import Schema
from pyiceberg.types import IntegerType, NestedField, StringType

import pyarrow as pa

schema = Schema(
    NestedField(1, "city", StringType(), required=True),
    NestedField(2, "inhabitants", IntegerType(), required=True),
    # Mark `city` as the identifier field, also known as the primary key
    identifier_field_ids=[1],
)

tbl = catalog.create_table("default.cities", schema=schema)

arrow_schema = pa.schema(
    [
        pa.field("city", pa.string(), nullable=False),
        pa.field("inhabitants", pa.int32(), nullable=False),
    ]
)

# Write some data
df = pa.Table.from_pylist(
    [
        {"city": "Amsterdam", "inhabitants": 921402},
        {"city": "San Francisco", "inhabitants": 808988},
        {"city": "Drachten", "inhabitants": 45019},
        {"city": "Paris", "inhabitants": 2103000},
    ],
    schema=arrow_schema,
)
tbl.append(df)
```

Next, we'll upsert an Arrow table into the Iceberg table:

```python
df = pa.Table.from_pylist(
    [
        # Will be updated: the number of inhabitants has changed
        {"city": "Drachten", "inhabitants": 45505},
        # New row, will be inserted
        {"city": "Berlin", "inhabitants": 3432000},
        # Ignored, already exists in the table
        {"city": "Paris", "inhabitants": 2103000},
    ],
    schema=arrow_schema,
)
upd = tbl.upsert(df)

assert upd.rows_updated == 1
assert upd.rows_inserted == 1
```

PyIceberg automatically detects which rows need to be updated, which need to be inserted, and which can simply be ignored.
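
As a quick sanity check (a minimal sketch, not part of the original example), scanning the table back into Arrow should now show five cities, with Drachten updated and Berlin inserted:

```python
# Read the merged state back; row order is not guaranteed, so sort for display.
result = tbl.scan().to_arrow()
print(sorted(result["city"].to_pylist()))
# ['Amsterdam', 'Berlin', 'Drachten', 'Paris', 'San Francisco']
```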

## Inspecting tables

To explore the table metadata, tables can be inspected.
@@ -1546,3 +1611,139 @@

```
df.show(2)

(Showing first 2 rows)
```

### Polars

PyIceberg interfaces closely with Polars DataFrames and LazyFrames, which provide a fully lazily optimized query engine interface on top of PyIceberg tables.

<!-- prettier-ignore-start -->

!!! note "Requirements"
This requires [`polars` to be installed](index.md).

```python
pip install pyiceberg['polars']
```
<!-- prettier-ignore-end -->

PyIceberg data can be analyzed and accessed through Polars using either a DataFrame or a LazyFrame.
If your code uses the Apache Iceberg data scanning and retrieval API and then analyzes the resulting DataFrame in Polars, use the `table.scan().to_polars()` API.
If the intent is to use Polars' high-performance filtering and retrieval functionality, use the LazyFrame exported from the Iceberg table with the `table.to_polars()` API.

```python
# Get LazyFrame
iceberg_table.to_polars()

# Get DataFrame
iceberg_table.scan().to_polars()
```

#### Working with Polars DataFrame

PyIceberg makes it easy to filter out data from a huge table and pull it into a Polars DataFrame locally. Only the Parquet files relevant to the query are fetched and the row filter is applied to them, which reduces IO and therefore improves performance and cost.

```python
import pyarrow as pa

from pyiceberg.schema import Schema
from pyiceberg.types import LongType, NestedField, StringType, TimestampType

schema = Schema(
    NestedField(field_id=1, name='ticket_id', field_type=LongType(), required=True),
    NestedField(field_id=2, name='customer_id', field_type=LongType(), required=True),
    NestedField(field_id=3, name='issue', field_type=StringType(), required=False),
    NestedField(field_id=4, name='created_at', field_type=TimestampType(), required=True),
)

iceberg_table = catalog.create_table(
    identifier='default.product_support_issues',
    schema=schema,
)

pa_table_data = pa.Table.from_pylist(
    [
        {'ticket_id': 1, 'customer_id': 546, 'issue': 'User Login issue', 'created_at': 1650020000000000},
        {'ticket_id': 2, 'customer_id': 547, 'issue': 'Payment not going through', 'created_at': 1650028640000000},
        {'ticket_id': 3, 'customer_id': 548, 'issue': 'Error on checkout', 'created_at': 1650037280000000},
        {'ticket_id': 4, 'customer_id': 549, 'issue': 'Unable to reset password', 'created_at': 1650045920000000},
        {'ticket_id': 5, 'customer_id': 550, 'issue': 'Account locked', 'created_at': 1650054560000000},
        {'ticket_id': 6, 'customer_id': 551, 'issue': 'Order not received', 'created_at': 1650063200000000},
        {'ticket_id': 7, 'customer_id': 552, 'issue': 'Refund not processed', 'created_at': 1650071840000000},
        {'ticket_id': 8, 'customer_id': 553, 'issue': 'Shipping address issue', 'created_at': 1650080480000000},
        {'ticket_id': 9, 'customer_id': 554, 'issue': 'Product damaged', 'created_at': 1650089120000000},
        {'ticket_id': 10, 'customer_id': 555, 'issue': 'Unable to apply discount code', 'created_at': 1650097760000000},
        {'ticket_id': 11, 'customer_id': 556, 'issue': 'Website not loading', 'created_at': 1650106400000000},
        {'ticket_id': 12, 'customer_id': 557, 'issue': 'Incorrect order received', 'created_at': 1650115040000000},
        {'ticket_id': 13, 'customer_id': 558, 'issue': 'Unable to track order', 'created_at': 1650123680000000},
        {'ticket_id': 14, 'customer_id': 559, 'issue': 'Order delayed', 'created_at': 1650132320000000},
        {'ticket_id': 15, 'customer_id': 560, 'issue': 'Product not as described', 'created_at': 1650140960000000},
        {'ticket_id': 16, 'customer_id': 561, 'issue': 'Unable to contact support', 'created_at': 1650149600000000},
        {'ticket_id': 17, 'customer_id': 562, 'issue': 'Duplicate charge', 'created_at': 1650158240000000},
        {'ticket_id': 18, 'customer_id': 563, 'issue': 'Unable to update profile', 'created_at': 1650166880000000},
        {'ticket_id': 19, 'customer_id': 564, 'issue': 'App crashing', 'created_at': 1650175520000000},
        {'ticket_id': 20, 'customer_id': 565, 'issue': 'Unable to download invoice', 'created_at': 1650184160000000},
        {'ticket_id': 21, 'customer_id': 566, 'issue': 'Incorrect billing amount', 'created_at': 1650192800000000},
    ],
    schema=iceberg_table.schema().as_arrow(),
)

iceberg_table.append(df=pa_table_data)

iceberg_table.scan(
    row_filter="ticket_id > 10",
).to_polars()
```

This will return a Polars DataFrame:

```
shape: (11, 4)
┌───────────┬─────────────┬────────────────────────────┬─────────────────────┐
│ ticket_id ┆ customer_id ┆ issue ┆ created_at │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ str ┆ datetime[μs] │
╞═══════════╪═════════════╪════════════════════════════╪═════════════════════╡
│ 11 ┆ 556 ┆ Website not loading ┆ 2022-04-16 10:53:20 │
│ 12 ┆ 557 ┆ Incorrect order received ┆ 2022-04-16 13:17:20 │
│ 13 ┆ 558 ┆ Unable to track order ┆ 2022-04-16 15:41:20 │
│ 14 ┆ 559 ┆ Order delayed ┆ 2022-04-16 18:05:20 │
│ 15 ┆ 560 ┆ Product not as described ┆ 2022-04-16 20:29:20 │
│ … ┆ … ┆ … ┆ … │
│ 17 ┆ 562 ┆ Duplicate charge ┆ 2022-04-17 01:17:20 │
│ 18 ┆ 563 ┆ Unable to update profile ┆ 2022-04-17 03:41:20 │
│ 19 ┆ 564 ┆ App crashing ┆ 2022-04-17 06:05:20 │
│ 20 ┆ 565 ┆ Unable to download invoice ┆ 2022-04-17 08:29:20 │
│ 21 ┆ 566 ┆ Incorrect billing amount ┆ 2022-04-17 10:53:20 │
└───────────┴─────────────┴────────────────────────────┴─────────────────────┘
```

#### Working with Polars LazyFrame

PyIceberg supports creation of a Polars LazyFrame based on an Iceberg Table.

Using the table from the example above:

```python
import polars as pl

lf = iceberg_table.to_polars().filter(pl.col("ticket_id") > 10)
print(lf.collect())
```

The snippet above returns a Polars LazyFrame and defines a filter that Polars executes once the frame is collected:

```
shape: (11, 4)
┌───────────┬─────────────┬────────────────────────────┬─────────────────────┐
│ ticket_id ┆ customer_id ┆ issue ┆ created_at │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ str ┆ datetime[μs] │
╞═══════════╪═════════════╪════════════════════════════╪═════════════════════╡
│ 11 ┆ 556 ┆ Website not loading ┆ 2022-04-16 10:53:20 │
│ 12 ┆ 557 ┆ Incorrect order received ┆ 2022-04-16 13:17:20 │
│ 13 ┆ 558 ┆ Unable to track order ┆ 2022-04-16 15:41:20 │
│ 14 ┆ 559 ┆ Order delayed ┆ 2022-04-16 18:05:20 │
│ 15 ┆ 560 ┆ Product not as described ┆ 2022-04-16 20:29:20 │
│ … ┆ … ┆ … ┆ … │
│ 17 ┆ 562 ┆ Duplicate charge ┆ 2022-04-17 01:17:20 │
│ 18 ┆ 563 ┆ Unable to update profile ┆ 2022-04-17 03:41:20 │
│ 19 ┆ 564 ┆ App crashing ┆ 2022-04-17 06:05:20 │
│ 20 ┆ 565 ┆ Unable to download invoice ┆ 2022-04-17 08:29:20 │
│ 21 ┆ 566 ┆ Incorrect billing amount ┆ 2022-04-17 10:53:20 │
└───────────┴─────────────┴────────────────────────────┴─────────────────────┘
```
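
Because `to_polars()` returns a LazyFrame, further transformations can be chained before anything is materialized; Polars executes the whole pipeline, including the Iceberg scan, only when `collect()` is called. The aggregation below is a hypothetical follow-up sketch, not part of the PR:

```python
import polars as pl

# Chain additional lazy operations; nothing runs until collect().
summary = (
    iceberg_table.to_polars()
    .filter(pl.col("ticket_id") > 10)
    .group_by("customer_id")
    .agg(pl.len().alias("open_tickets"))
)
print(summary.collect())
```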