From fc626e73e026f68b7290115010037bcc6033e1c5 Mon Sep 17 00:00:00 2001 From: Muru Muthusamy Date: Tue, 15 Feb 2022 10:12:01 -0800 Subject: [PATCH] [#19] - Support dataframe union for python binding. --- README.md | 5 ++--- datafusion/tests/test_context.py | 4 ++++ datafusion/tests/test_dataframe.py | 26 ++++++++++++++++++++++++++ src/dataframe.rs | 5 +++++ 4 files changed, 37 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 523df45..a68149d 100644 --- a/README.md +++ b/README.md @@ -133,9 +133,7 @@ Bootstrap: ```bash # fetch this repo -git clone git@github.com:apache/arrow-datafusion.git -# change to python directory -cd arrow-datafusion/python +git clone git@github.com:datafusion-contrib/datafusion-python.git # prepare development environment (used to build wheel / install in development) python3 -m venv venv # activate the venv @@ -150,6 +148,7 @@ Whenever rust code changes (your changes or via `git pull`): ```bash # make sure you activate the venv using "source venv/bin/activate" first +# make sure Cargo version is 1.58 and rustc is 1.58.1 maturin develop python -m pytest ``` diff --git a/datafusion/tests/test_context.py b/datafusion/tests/test_context.py index 60beea4..651cf76 100644 --- a/datafusion/tests/test_context.py +++ b/datafusion/tests/test_context.py @@ -42,6 +42,10 @@ def test_register_record_batches(ctx): assert result[0].column(0) == pa.array([5, 7, 9]) assert result[0].column(1) == pa.array([-3, -3, -3]) + result = ctx.sql("SELECT a from t union SELECT a from t") + result.show() + #print(result) + def test_create_dataframe_registers_unique_table_name(ctx): # create a RecordBatch and register it as memtable diff --git a/datafusion/tests/test_dataframe.py b/datafusion/tests/test_dataframe.py index 9a97c25..d49ff16 100644 --- a/datafusion/tests/test_dataframe.py +++ b/datafusion/tests/test_dataframe.py @@ -179,3 +179,29 @@ def test_struct_select(struct_df): assert result.column(0) == pa.array([5, 7, 9]) assert result.column(1) == pa.array([-3, -3, -3]) + + +def test_union(): + ctx = ExecutionContext() + + batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2, 3]), pa.array([4, 5, 6])], + names=["a", "b"], + ) + df = ctx.create_dataframe([[batch]]) + + batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2, 3]), pa.array([4, 5, 6])], + names=["a", "b"], + ) + df1 = ctx.create_dataframe([[batch]]) + + df = df.union(df1) + table = pa.Table.from_batches(df.collect()) + print(table.to_pydict()) + + assert 1 == 1 + + #expected = {"a": [1, 2], "c": [8, 10], "b": [4, 5]} + #assert table.to_pydict() == expected + diff --git a/src/dataframe.rs b/src/dataframe.rs index 9050df9..96bf9ba 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -127,4 +127,9 @@ impl PyDataFrame { .join(right.df, join_type, &join_keys.0, &join_keys.1)?; Ok(Self::new(df)) } + + fn union(&self, other: PyDataFrame) -> PyResult { + let df = self.df.union(other.df)?; + Ok(Self::new(df)) + } }