Skip to content

Commit

Permalink
[datafusion-contrib#19] - Support dataframe union for python binding.
Browse files Browse the repository at this point in the history
  • Loading branch information
Muru Muthusamy committed Feb 15, 2022
1 parent eaf94da commit fc626e7
Show file tree
Hide file tree
Showing 4 changed files with 37 additions and 3 deletions.
5 changes: 2 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -133,9 +133,7 @@ Bootstrap:

```bash
# fetch this repo
git clone [email protected]:apache/arrow-datafusion.git
# change to python directory
cd arrow-datafusion/python
git clone [email protected]:datafusion-contrib/datafusion-python.git
# prepare development environment (used to build wheel / install in development)
python3 -m venv venv
# activate the venv
Expand All @@ -150,6 +148,7 @@ Whenever rust code changes (your changes or via `git pull`):

```bash
# make sure you activate the venv using "source venv/bin/activate" first
# make sure Cargo version is 1.58 and rustc is 1.58.1
maturin develop
python -m pytest
```
Expand Down
4 changes: 4 additions & 0 deletions datafusion/tests/test_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,10 @@ def test_register_record_batches(ctx):
assert result[0].column(0) == pa.array([5, 7, 9])
assert result[0].column(1) == pa.array([-3, -3, -3])

result = ctx.sql("SELECT a from t union SELECT a from t")
result.show()
#print(result)


def test_create_dataframe_registers_unique_table_name(ctx):
# create a RecordBatch and register it as memtable
Expand Down
26 changes: 26 additions & 0 deletions datafusion/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,3 +179,29 @@ def test_struct_select(struct_df):

assert result.column(0) == pa.array([5, 7, 9])
assert result.column(1) == pa.array([-3, -3, -3])


def test_union():
    """Check that ``DataFrame.union`` returns the rows of both inputs.

    Two single-batch DataFrames with the same schema (columns ``a`` and
    ``b``) and the same three rows are unioned; the collected result is
    expected to contain every row from both sides.
    """
    ctx = ExecutionContext()

    batch = pa.RecordBatch.from_arrays(
        [pa.array([1, 2, 3]), pa.array([4, 5, 6])],
        names=["a", "b"],
    )
    df = ctx.create_dataframe([[batch]])

    batch = pa.RecordBatch.from_arrays(
        [pa.array([1, 2, 3]), pa.array([4, 5, 6])],
        names=["a", "b"],
    )
    df1 = ctx.create_dataframe([[batch]])

    df = df.union(df1)
    table = pa.Table.from_batches(df.collect())
    result = table.to_pydict()

    # The schema must be carried through the union unchanged.
    assert set(result.keys()) == {"a", "b"}

    # Compare sorted (a, b) rows so the test does not depend on the order in
    # which the engine emits the two inputs.
    # NOTE(review): this expects UNION ALL semantics (duplicate rows kept) —
    # confirm against the DataFusion version this binding is built on.
    rows = sorted(zip(result["a"], result["b"]))
    assert rows == [(1, 4), (1, 4), (2, 5), (2, 5), (3, 6), (3, 6)]

5 changes: 5 additions & 0 deletions src/dataframe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -127,4 +127,9 @@ impl PyDataFrame {
.join(right.df, join_type, &join_keys.0, &join_keys.1)?;
Ok(Self::new(df))
}

/// Returns a new `PyDataFrame` holding the union of this DataFrame and `other`.
///
/// Delegates to the wrapped DataFrame's `union`; any error it reports is
/// propagated to the caller through the `PyResult`.
// NOTE(review): the underlying `union` presumably requires both inputs to
// share the same schema — confirm against the DataFusion docs.
fn union(&self, other: PyDataFrame) -> PyResult<Self> {
    Ok(Self::new(self.df.union(other.df)?))
}
}

0 comments on commit fc626e7

Please sign in to comment.