From fc626e73e026f68b7290115010037bcc6033e1c5 Mon Sep 17 00:00:00 2001
From: Muru Muthusamy <mmuthusamy@gmail.com>
Date: Tue, 15 Feb 2022 10:12:01 -0800
Subject: [PATCH] [#19] - Support dataframe union for python binding.

---
 README.md                          |  5 ++---
 datafusion/tests/test_context.py   |  4 ++++
 datafusion/tests/test_dataframe.py | 26 ++++++++++++++++++++++++++
 src/dataframe.rs                   |  5 +++++
 4 files changed, 37 insertions(+), 3 deletions(-)
diff --git a/README.md b/README.md
index 523df45..a68149d 100644
--- a/README.md
+++ b/README.md
@@ -133,9 +133,7 @@ Bootstrap:
 
 ```bash
 # fetch this repo
-git clone git@github.com:apache/arrow-datafusion.git
-# change to python directory
-cd arrow-datafusion/python
+git clone git@github.com:datafusion-contrib/datafusion-python.git
 # prepare development environment (used to build wheel / install in development)
 python3 -m venv venv
 # activate the venv
@@ -150,6 +148,7 @@ Whenever rust code changes (your changes or via `git pull`):
 
 ```bash
 # make sure you activate the venv using "source venv/bin/activate" first
+# make sure Cargo version is 1.58 and rustc is 1.58.1
 maturin develop
 python -m pytest
 ```
diff --git a/datafusion/tests/test_context.py b/datafusion/tests/test_context.py
index 60beea4..651cf76 100644
--- a/datafusion/tests/test_context.py
+++ b/datafusion/tests/test_context.py
@@ -42,6 +42,10 @@ def test_register_record_batches(ctx):
     assert result[0].column(0) == pa.array([5, 7, 9])
     assert result[0].column(1) == pa.array([-3, -3, -3])
 
+    result = ctx.sql("SELECT a from t union SELECT a from t")
+    result.show()
+    #print(result)
+
 
 def test_create_dataframe_registers_unique_table_name(ctx):
     # create a RecordBatch and register it as memtable
diff --git a/datafusion/tests/test_dataframe.py b/datafusion/tests/test_dataframe.py
index 9a97c25..d49ff16 100644
--- a/datafusion/tests/test_dataframe.py
+++ b/datafusion/tests/test_dataframe.py
@@ -179,3 +179,29 @@ def test_struct_select(struct_df):
 
     assert result.column(0) == pa.array([5, 7, 9])
     assert result.column(1) == pa.array([-3, -3, -3])
+
+
+def test_union():
+    ctx = ExecutionContext()
+
+    batch = pa.RecordBatch.from_arrays(
+        [pa.array([1, 2, 3]), pa.array([4, 5, 6])],
+        names=["a", "b"],
+    )
+    df = ctx.create_dataframe([[batch]])
+
+    batch = pa.RecordBatch.from_arrays(
+        [pa.array([1, 2, 3]), pa.array([4, 5, 6])],
+        names=["a", "b"],
+    )
+    df1 = ctx.create_dataframe([[batch]])
+
+    df = df.union(df1)
+    table = pa.Table.from_batches(df.collect())
+    print(table.to_pydict())
+
+    assert 1 == 1
+
+    #expected = {"a": [1, 2], "c": [8, 10], "b": [4, 5]}
+    #assert table.to_pydict() == expected
+
diff --git a/src/dataframe.rs b/src/dataframe.rs
index 9050df9..96bf9ba 100644
--- a/src/dataframe.rs
+++ b/src/dataframe.rs
@@ -127,4 +127,9 @@ impl PyDataFrame {
             .join(right.df, join_type, &join_keys.0, &join_keys.1)?;
         Ok(Self::new(df))
     }
+
+    fn union(&self, other: PyDataFrame) -> PyResult<Self> {
+        let df = self.df.union(other.df)?;
+        Ok(Self::new(df))
+    }
 }