Skip to content

Add ability for Vectorized Scanner in write_pandas #2164

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions DESCRIPTION.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ Source code is also available at: https://github.com/snowflakedb/snowflake-conne
# Release Notes
- v3.16(TBD)
- Added basic arrow support for Interval types.
- Added `use_vectorized_scanner` parameter on `write_pandas` to leverage the vectorized scanner.

- v3.15.0(Apr 29,2025)
- Bumped up min boto and botocore version to 1.24.
Expand Down
7 changes: 6 additions & 1 deletion src/snowflake/connector/pandas_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,7 @@ def write_pandas(
table_type: Literal["", "temp", "temporary", "transient"] = "",
use_logical_type: bool | None = None,
iceberg_config: dict[str, str] | None = None,
use_vectorized_scanner: bool | None = None,
**kwargs: Any,
) -> tuple[
bool,
Expand Down Expand Up @@ -335,7 +336,10 @@ def write_pandas(
* base_location: the base directory that snowflake can write iceberg metadata and files to
* catalog_sync: optionally sets the catalog integration configured for Polaris Catalog
* storage_serialization_policy: specifies the storage serialization policy for the table

use_vectorized_scanner: Boolean that specifies whether to use a vectorized scanner for loading Parquet files.
Using the vectorized scanner can significantly reduce the latency for loading Parquet files. To enable
vectorized scanning of Parquet files, set use_vectorized_scanner to True. Set it to None to use Snowflake's default.
For more information, see: https://docs.snowflake.com/en/sql-reference/sql/copy-into-table#label-use-vectorized-scanner


Returns:
Expand Down Expand Up @@ -579,6 +583,7 @@ def drop_object(name: str, object_type: str) -> None:
f"COMPRESSION={compression_map[compression]}"
f"{' BINARY_AS_TEXT=FALSE' if auto_create_table or overwrite else ''}"
f"{sql_use_logical_type}"
f"{' USE_VECTORIZED_SCANNER=' + str(use_vectorized_scanner).upper() if use_vectorized_scanner is not None else ''}"
f") "
f"PURGE=TRUE ON_ERROR=?"
)
Expand Down
57 changes: 57 additions & 0 deletions test/unit/test_pandas_tools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#
# Copyright (c) 2012-2023 Snowflake Computing Inc. All rights reserved.
#

from typing import Union
from unittest.mock import MagicMock

import pandas as pd
import pytest

from snowflake.connector import pandas_tools

from .mock_utils import mock_connection


@pytest.mark.parametrize(
    ("use_vectorized_scanner", "expected_file_format"),
    [
        (None, "FILE_FORMAT=(TYPE=PARQUET COMPRESSION=auto)"),
        (
            True,
            "FILE_FORMAT=(TYPE=PARQUET COMPRESSION=auto USE_VECTORIZED_SCANNER=TRUE)",
        ),
        (
            False,
            "FILE_FORMAT=(TYPE=PARQUET COMPRESSION=auto USE_VECTORIZED_SCANNER=FALSE)",
        ),
    ],
)
def test_write_pandas_use_vectorized_scanner(
    use_vectorized_scanner: Union[bool, None], expected_file_format: str
):
    """write_pandas must emit (or omit) USE_VECTORIZED_SCANNER in its COPY INTO file format.

    None means the option is left out entirely so Snowflake's server-side
    default applies; True/False must be rendered uppercase in the SQL.
    """
    # Arrange: a trivial frame and a mocked connection whose cursor records
    # every SQL statement passed to execute().
    frame = pd.DataFrame({"col1": [1, 2, 3]})
    connection = mock_connection()
    recording_cursor = MagicMock()
    connection.cursor.return_value = recording_cursor

    # Act
    pandas_tools.write_pandas(
        conn=connection,
        df=frame,
        table_name="test_table",
        schema="test_schema",
        database="test_database",
        use_vectorized_scanner=use_vectorized_scanner,
    )

    # Assert: some executed COPY INTO statement carries the expected
    # FILE_FORMAT clause (first positional argument of each execute call).
    issued_statements = (
        call.args[0] for call in recording_cursor.execute.call_args_list
    )
    assert any(
        "COPY INTO" in statement and expected_file_format in statement
        for statement in issued_statements
    )
Loading