From 3308626e98ae3a3ce2af3653a17ad540ce103850 Mon Sep 17 00:00:00 2001 From: "Sung Yun (CODE SIGNING KEY)" Date: Mon, 19 Aug 2024 14:01:08 -0400 Subject: [PATCH] adopt review feedback --- bindings/python/Cargo.toml | 4 +- bindings/python/src/lib.rs | 16 ++++- bindings/python/src/transform.rs | 83 +++---------------------- bindings/python/tests/test_transform.py | 2 +- 4 files changed, 25 insertions(+), 80 deletions(-) diff --git a/bindings/python/Cargo.toml b/bindings/python/Cargo.toml index 8cf1d7667..a8bed6757 100644 --- a/bindings/python/Cargo.toml +++ b/bindings/python/Cargo.toml @@ -32,6 +32,6 @@ crate-type = ["cdylib"] [dependencies] iceberg = { path = "../../crates/iceberg" } -pyo3 = { version = "0.22", features = ["extension-module"] } -arrow = { version = "52.2.0", features = ["ffi"] } +pyo3 = { version = "0.21.1", features = ["extension-module"] } +arrow = { version = "52.2.0", features = ["pyarrow"] } libc = "0.2" \ No newline at end of file diff --git a/bindings/python/src/lib.rs b/bindings/python/src/lib.rs index c3363e1be..d257760fd 100644 --- a/bindings/python/src/lib.rs +++ b/bindings/python/src/lib.rs @@ -28,9 +28,21 @@ fn hello_world() -> PyResult { } #[pymodule] -fn pyiceberg_core_rust(m: &Bound<'_, PyModule>) -> PyResult<()> { +fn submodule(_py: Python, module: &Bound<'_, PyModule>) -> PyResult<()> { use transform::bucket_transform; + + module.add_wrapped(wrap_pyfunction!(bucket_transform))?; + Ok(()) +} + +#[pymodule] +fn pyiceberg_core_rust(py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_function(wrap_pyfunction!(hello_world, m)?)?; - m.add_function(wrap_pyfunction!(bucket_transform, m)?)?; + + // https://github.com/PyO3/pyo3/issues/759 + let child_module = PyModule::new_bound(py, "pyiceberg_core.transform")?; + submodule(py, &child_module)?; + m.add("transform", child_module.clone())?; + py.import_bound("sys")?.getattr("modules")?.set_item("pyiceberg_core.transform", child_module)?; Ok(()) } diff --git a/bindings/python/src/transform.rs b/bindings/python/src/transform.rs index 610e1aa8d..58cdf0d17 100644 --- a/bindings/python/src/transform.rs +++ b/bindings/python/src/transform.rs @@ -15,54 +15,16 @@ // specific language governing permissions and limitations // under the License. -use std::{error, fmt, sync::Arc}; +use std::{error, fmt}; use iceberg::spec::Transform; use iceberg::transform::create_transform_function; use iceberg::Error; use arrow::{ - array::{make_array, Array, ArrayData, ArrayRef}, - error::ArrowError, - ffi::{from_ffi, to_ffi}, + array::{make_array, Array, ArrayData}, }; -use libc::uintptr_t; -use pyo3::{exceptions::PyOSError, exceptions::PyValueError, prelude::*}; - -#[derive(Debug)] -enum PyO3ArrowError { - ArrowError(ArrowError), -} - -impl fmt::Display for PyO3ArrowError { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match *self { - PyO3ArrowError::ArrowError(ref e) => e.fmt(f), - } - } -} - -impl error::Error for PyO3ArrowError { - fn source(&self) -> Option<&(dyn error::Error + 'static)> { - match *self { - // The cause is the underlying implementation error type. Is implicitly - // cast to the trait object `&error::Error`. This works because the - // underlying type already implements the `Error` trait. - PyO3ArrowError::ArrowError(ref e) => Some(e), - } - } -} - -impl From for PyO3ArrowError { - fn from(err: ArrowError) -> PyO3ArrowError { - PyO3ArrowError::ArrowError(err) - } -} - -impl From for PyErr { - fn from(err: PyO3ArrowError) -> PyErr { - PyOSError::new_err(err.to_string()) - } -} +use arrow::pyarrow::{FromPyArrow, ToPyArrow}; +use pyo3::{exceptions::PyValueError, prelude::*}; #[derive(Debug)] enum PyO3IcebergError { @@ -100,43 +62,14 @@ impl From for PyErr { } } -fn to_rust_arrow_array(ob: PyObject, py: Python) -> PyResult { - // prepare a pointer to receive the Array struct - let (array, schema) = to_ffi(&ArrayData::new_empty(&arrow::datatypes::DataType::Null)) - .map_err(PyO3ArrowError::from)?; - let array_pointer = &array as *const _ as uintptr_t; - let schema_pointer = &schema as *const _ as uintptr_t; - - // make the conversion through PyArrow's private API - // this changes the pointer's memory and is thus unsafe. In particular, `_export_to_c` can go out of bounds - ob.call_method1(py, "_export_to_c", (array_pointer, schema_pointer))?; - - let array = unsafe { from_ffi(array, &schema) }.map_err(PyO3ArrowError::from)?; - let array = make_array(array); - Ok(array) -} - -fn to_pyarrow_array(array: ArrayRef, py: Python) -> PyResult { - let (array, schema) = to_ffi(&array.to_data()).map_err(PyO3ArrowError::from)?; - let array_pointer = &array as *const _ as uintptr_t; - let schema_pointer = &schema as *const _ as uintptr_t; - - let pa = py.import_bound("pyarrow")?; - - let array = pa.getattr("Array")?.call_method1( - "_import_from_c", - (array_pointer as uintptr_t, schema_pointer as uintptr_t), - )?; - Ok(array.to_object(py)) -} - #[pyfunction] pub fn bucket_transform(array: PyObject, num_buckets: u32, py: Python) -> PyResult { // import - let array = to_rust_arrow_array(array, py)?; + let array = ArrayData::from_pyarrow_bound(array.bind(py))?; + let array = make_array(array); let bucket = create_transform_function(&Transform::Bucket(num_buckets)).map_err(PyO3IcebergError::from)?; let array = bucket.transform(array).map_err(PyO3IcebergError::from)?; - let array = Arc::new(array); // export - to_pyarrow_array(array, py) + let array = array.into_data(); + array.to_pyarrow(py) } diff --git a/bindings/python/tests/test_transform.py b/bindings/python/tests/test_transform.py index 2b6d277af..8a160ce12 100644 --- a/bindings/python/tests/test_transform.py +++ b/bindings/python/tests/test_transform.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -from pyiceberg_core import bucket_transform +from pyiceberg_core.transform import bucket_transform import pytest import pyarrow as pa