From 2583a412c9f7c56d260cffb5e6fe87b64aca8e76 Mon Sep 17 00:00:00 2001 From: Evert Lammerts Date: Thu, 4 Sep 2025 21:09:53 +0200 Subject: [PATCH 1/4] Make arrow from relation return record batch reader --- duckdb/__init__.pyi | 2 +- duckdb/experimental/spark/sql/dataframe.py | 2 +- src/duckdb_py/pyrelation/initialize.cpp | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/duckdb/__init__.pyi b/duckdb/__init__.pyi index adf142dd..b22daef4 100644 --- a/duckdb/__init__.pyi +++ b/duckdb/__init__.pyi @@ -453,7 +453,7 @@ class DuckDBPyRelation: def set_alias(self, alias: str) -> DuckDBPyRelation: ... def show(self, max_width: Optional[int] = None, max_rows: Optional[int] = None, max_col_width: Optional[int] = None, null_value: Optional[str] = None, render_mode: Optional[RenderMode] = None) -> None: ... def sql_query(self) -> str: ... - def to_arrow_table(self, batch_size: int = ...) -> pyarrow.lib.Table: ... + def to_arrow_table(self, batch_size: int = ...) -> pyarrow.lib.RecordBatchReader: ... def to_csv( self, file_name: str, diff --git a/duckdb/experimental/spark/sql/dataframe.py b/duckdb/experimental/spark/sql/dataframe.py index b8a4698b..a81a423b 100644 --- a/duckdb/experimental/spark/sql/dataframe.py +++ b/duckdb/experimental/spark/sql/dataframe.py @@ -75,7 +75,7 @@ def toArrow(self) -> "pa.Table": age: [[2,5]] name: [["Alice","Bob"]] """ - return self.relation.arrow() + return self.relation.to_arrow_table() def createOrReplaceTempView(self, name: str) -> None: """Creates or replaces a local temporary view with this :class:`DataFrame`. diff --git a/src/duckdb_py/pyrelation/initialize.cpp b/src/duckdb_py/pyrelation/initialize.cpp index a93a54b5..794c420b 100644 --- a/src/duckdb_py/pyrelation/initialize.cpp +++ b/src/duckdb_py/pyrelation/initialize.cpp @@ -61,7 +61,7 @@ static void InitializeConsumers(py::class_ &m) { py::arg("date_as_object") = false) .def("fetch_df_chunk", &DuckDBPyRelation::FetchDFChunk, "Execute and fetch a chunk of the rows", py::arg("vectors_per_chunk") = 1, py::kw_only(), py::arg("date_as_object") = false) - .def("arrow", &DuckDBPyRelation::ToArrowTable, "Execute and fetch all rows as an Arrow Table", + .def("arrow", &DuckDBPyRelation::ToRecordBatch, "Execute and return an Arrow Record Batch Reader that yields all rows", py::arg("batch_size") = 1000000) .def("fetch_arrow_table", &DuckDBPyRelation::ToArrowTable, "Execute and fetch all rows as an Arrow Table", py::arg("batch_size") = 1000000) @@ -78,7 +78,7 @@ static void InitializeConsumers(py::class_ &m) { )"; m.def("__arrow_c_stream__", &DuckDBPyRelation::ToArrowCapsule, capsule_docs, py::arg("requested_schema") = py::none()); - m.def("record_batch", &DuckDBPyRelation::ToRecordBatch, + m.def("fetch_record_batch", &DuckDBPyRelation::ToRecordBatch, "Execute and return an Arrow Record Batch Reader that yields all rows", py::arg("batch_size") = 1000000) .def("fetch_arrow_reader", &DuckDBPyRelation::ToRecordBatch, "Execute and return an Arrow Record Batch Reader that yields all rows", py::arg("batch_size") = 1000000); From 535a5fc869b886d2ad6a16f0d69c29ebf42883b0 Mon Sep 17 00:00:00 2001 From: Evert Lammerts Date: Mon, 8 Sep 2025 16:36:13 +0200 Subject: [PATCH 2/4] deprecate instead of remove --- duckdb/__init__.pyi | 5 +++-- src/duckdb_py/pyrelation/initialize.cpp | 17 ++++++++++++++--- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/duckdb/__init__.pyi b/duckdb/__init__.pyi index b22daef4..8f27e5e3 100644 --- a/duckdb/__init__.pyi +++ b/duckdb/__init__.pyi @@ -415,7 +415,7 @@ class DuckDBPyRelation: def variance(self, column: str, groups: str = ..., window_spec: str = ..., projected_columns: str = ...) -> DuckDBPyRelation: ... def list(self, column: str, groups: str = ..., window_spec: str = ..., projected_columns: str = ...) -> DuckDBPyRelation: ... - def arrow(self, batch_size: int = ...) -> pyarrow.lib.Table: ... + def arrow(self, batch_size: int = ...) -> pyarrow.lib.RecordBatchReader: ... def __arrow_c_stream__(self, requested_schema: Optional[object] = None) -> object: ... def create(self, table_name: str) -> None: ... def create_view(self, view_name: str, replace: bool = ...) -> DuckDBPyRelation: ... @@ -448,12 +448,13 @@ class DuckDBPyRelation: def pl(self, rows_per_batch: int = ..., connection: DuckDBPyConnection = ...) -> polars.DataFrame: ... def query(self, virtual_table_name: str, sql_query: str) -> DuckDBPyRelation: ... def record_batch(self, batch_size: int = ...) -> pyarrow.lib.RecordBatchReader: ... + def fetch_record_batch(self, rows_per_batch: int = 1000000, *, connection: DuckDBPyConnection = ...) -> pyarrow.lib.RecordBatchReader: ... def select_types(self, types: List[Union[str, DuckDBPyType]]) -> DuckDBPyRelation: ... def select_dtypes(self, types: List[Union[str, DuckDBPyType]]) -> DuckDBPyRelation: ... def set_alias(self, alias: str) -> DuckDBPyRelation: ... def show(self, max_width: Optional[int] = None, max_rows: Optional[int] = None, max_col_width: Optional[int] = None, null_value: Optional[str] = None, render_mode: Optional[RenderMode] = None) -> None: ... def sql_query(self) -> str: ... - def to_arrow_table(self, batch_size: int = ...) -> pyarrow.lib.RecordBatchReader: ... + def to_arrow_table(self, batch_size: int = ...) -> pyarrow.lib.Table: ... def to_csv( self, file_name: str, diff --git a/src/duckdb_py/pyrelation/initialize.cpp b/src/duckdb_py/pyrelation/initialize.cpp index 794c420b..6f66c563 100644 --- a/src/duckdb_py/pyrelation/initialize.cpp +++ b/src/duckdb_py/pyrelation/initialize.cpp @@ -79,9 +79,20 @@ static void InitializeConsumers(py::class_ &m) { m.def("__arrow_c_stream__", &DuckDBPyRelation::ToArrowCapsule, capsule_docs, py::arg("requested_schema") = py::none()); m.def("fetch_record_batch", &DuckDBPyRelation::ToRecordBatch, - "Execute and return an Arrow Record Batch Reader that yields all rows", py::arg("batch_size") = 1000000) - .def("fetch_arrow_reader", &DuckDBPyRelation::ToRecordBatch, - "Execute and return an Arrow Record Batch Reader that yields all rows", py::arg("batch_size") = 1000000); + "Execute and return an Arrow Record Batch Reader that yields all rows", py::arg("rows_per_batch") = 1000000) + .def("fetch_arrow_reader", &DuckDBPyRelation::ToRecordBatch, + "Execute and return an Arrow Record Batch Reader that yields all rows", py::arg("rows_per_batch") = 1000000) + .def("record_batch", + [](pybind11::object &self, idx_t rows_per_batch) + { + auto warnings = pybind11::module::import("warnings"); + auto builtins = pybind11::module::import("builtins"); + warnings.attr("warn")( + "record_batch() is deprecated, use fetch_record_batch() instead.", + builtins.attr("DeprecationWarning")); + + return self.attr("fetch_record_batch")(rows_per_batch); + }, py::arg("rows_per_batch") = 1000000); } static void InitializeAggregates(py::class_ &m) { From 03e273bacb6c101cb6246d4b782371751f356403 Mon Sep 17 00:00:00 2001 From: Evert Lammerts Date: Mon, 8 Sep 2025 21:53:54 +0200 Subject: [PATCH 3/4] Ignore deprecationwarnings in tests --- src/duckdb_py/pyrelation/initialize.cpp | 11 ++++------- tests/pytest.ini | 1 + 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/duckdb_py/pyrelation/initialize.cpp b/src/duckdb_py/pyrelation/initialize.cpp index 6f66c563..867cd7a6 100644 --- a/src/duckdb_py/pyrelation/initialize.cpp +++ b/src/duckdb_py/pyrelation/initialize.cpp @@ -85,13 +85,10 @@ static void InitializeConsumers(py::class_ &m) { .def("record_batch", [](pybind11::object &self, idx_t rows_per_batch) { - auto warnings = pybind11::module::import("warnings"); - auto builtins = pybind11::module::import("builtins"); - warnings.attr("warn")( - "record_batch() is deprecated, use fetch_record_batch() instead.", - builtins.attr("DeprecationWarning")); - - return self.attr("fetch_record_batch")(rows_per_batch); + PyErr_WarnEx(PyExc_DeprecationWarning, + "record_batch() is deprecated, use fetch_record_batch() instead.", + 0); + return self.attr("fetch_record_batch")(rows_per_batch); }, py::arg("rows_per_batch") = 1000000); } diff --git a/tests/pytest.ini b/tests/pytest.ini index 5dd3c306..0c17afd5 100644 --- a/tests/pytest.ini +++ b/tests/pytest.ini @@ -3,6 +3,7 @@ filterwarnings = error ignore::UserWarning + ignore::DeprecationWarning # Jupyter is throwing DeprecationWarnings ignore:function ham\(\) is deprecated:DeprecationWarning # Pyspark is throwing these warnings From 6aaf2bc591c0225c1cdb658bda22ae943d9d52ea Mon Sep 17 00:00:00 2001 From: Evert Lammerts Date: Tue, 9 Sep 2025 08:48:37 +0200 Subject: [PATCH 4/4] param name shouldnt change --- src/duckdb_py/pyrelation/initialize.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/duckdb_py/pyrelation/initialize.cpp b/src/duckdb_py/pyrelation/initialize.cpp index 867cd7a6..7992cc17 100644 --- a/src/duckdb_py/pyrelation/initialize.cpp +++ b/src/duckdb_py/pyrelation/initialize.cpp @@ -81,7 +81,7 @@ static void InitializeConsumers(py::class_ &m) { m.def("fetch_record_batch", &DuckDBPyRelation::ToRecordBatch, "Execute and return an Arrow Record Batch Reader that yields all rows", py::arg("rows_per_batch") = 1000000) .def("fetch_arrow_reader", &DuckDBPyRelation::ToRecordBatch, - "Execute and return an Arrow Record Batch Reader that yields all rows", py::arg("rows_per_batch") = 1000000) + "Execute and return an Arrow Record Batch Reader that yields all rows", py::arg("batch_size") = 1000000) .def("record_batch", [](pybind11::object &self, idx_t rows_per_batch) { @@ -89,7 +89,7 @@ static void InitializeConsumers(py::class_ &m) { "record_batch() is deprecated, use fetch_record_batch() instead.", 0); return self.attr("fetch_record_batch")(rows_per_batch); - }, py::arg("rows_per_batch") = 1000000); + }, py::arg("batch_size") = 1000000); } static void InitializeAggregates(py::class_ &m) {