Skip to content

Commit 1e62d38

Browse files
author
=
committed
add basic support for engine=bodo, df.apply
1 parent 8a53447 commit 1e62d38

File tree

10 files changed

+173
-12
lines changed

10 files changed

+173
-12
lines changed

ci/deps/actions-310-minimum_versions.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ dependencies:
3737
- lxml=4.9.2
3838
- matplotlib=3.6.3
3939
- numba=0.56.4
40+
- bodo=2024.11
4041
- numexpr=2.8.4
4142
- odfpy=1.4.1
4243
- qtpy=2.3.0

doc/source/getting_started/install.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,7 @@ Dependency Minimum Version pip ext
186186
`numexpr <https://github.com/pydata/numexpr>`__ 2.8.4 performance Accelerates certain numerical operations by using multiple cores as well as smart chunking and caching to achieve large speedups
187187
`bottleneck <https://github.com/pydata/bottleneck>`__ 1.3.6 performance Accelerates certain types of ``nan`` by using specialized cython routines to achieve large speedup.
188188
`numba <https://github.com/numba/numba>`__ 0.56.4 performance Alternative execution engine for operations that accept ``engine="numba"`` using a JIT compiler that translates Python functions to optimized machine code using the LLVM compiler.
189+
`bodo <https://github.com/bodo-ai/Bodo>`__ 2024.11 performance Alternative execution engine for operations that accept ``engine="bodo"`` using a JIT compiler that translates Python functions to optimized machine code using the LLVM compiler and automatically parallelizes using MPI.
189190
===================================================== ================== ================== ===================================================================================================================================================================================
190191

191192
Visualization

environment.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
name: pandas-dev
33
channels:
44
- conda-forge
5+
- bodo.ai
56
dependencies:
67
- python=3.10
78
- pip
@@ -40,6 +41,7 @@ dependencies:
4041
- lxml>=4.9.2
4142
- matplotlib>=3.6.3
4243
- numba>=0.56.4
44+
- bodo>=2024.11
4345
- numexpr>=2.8.4
4446
- openpyxl>=3.1.0
4547
- odfpy>=1.4.1

pandas/compat/_optional.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@
5757
"tzdata": "2022.7",
5858
"qtpy": "2.3.0",
5959
"pyqt5": "5.15.9",
60+
"bodo": "2024.11",
6061
}
6162

6263
# A mapping from import name to package name (on PyPI) for packages where

pandas/core/apply.py

Lines changed: 38 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -598,9 +598,9 @@ def apply_list_or_dict_like(self) -> DataFrame | Series:
598598
Result when self.func is a list-like or dict-like, None otherwise.
599599
"""
600600

601-
if self.engine == "numba":
601+
if self.engine in ("numba", "bodo"):
602602
raise NotImplementedError(
603-
"The 'numba' engine doesn't support list-like/"
603+
f"The '{self.engine}' engine doesn't support list-like/"
604604
"dict likes of callables yet."
605605
)
606606

@@ -853,9 +853,9 @@ def apply(self) -> DataFrame | Series:
853853

854854
# dispatch to handle list-like or dict-like
855855
if is_list_like(self.func):
856-
if self.engine == "numba":
856+
if self.engine in ("numba", "bodo"):
857857
raise NotImplementedError(
858-
"the 'numba' engine doesn't support lists of callables yet"
858+
f"the '{self.engine}' engine doesn't support lists of callables yet"
859859
)
860860
return self.apply_list_or_dict_like()
861861

@@ -870,13 +870,16 @@ def apply(self) -> DataFrame | Series:
870870
"the 'numba' engine doesn't support using "
871871
"a string as the callable function"
872872
)
873+
if self.engine == "bodo":
874+
return self.apply_series_bodo()
875+
873876
return self.apply_str()
874877

875878
# ufunc
876879
elif isinstance(self.func, np.ufunc):
877-
if self.engine == "numba":
880+
if self.engine in ("numba", "bodo"):
878881
raise NotImplementedError(
879-
"the 'numba' engine doesn't support "
882+
f"the '{self.engine}' engine doesn't support "
880883
"using a numpy ufunc as the callable function"
881884
)
882885
with np.errstate(all="ignore"):
@@ -886,9 +889,10 @@ def apply(self) -> DataFrame | Series:
886889

887890
# broadcasting
888891
if self.result_type == "broadcast":
889-
if self.engine == "numba":
892+
if self.engine in ("numba", "bodo"):
890893
raise NotImplementedError(
891-
"the 'numba' engine doesn't support result_type='broadcast'"
894+
f"the '{self.engine}' engine doesn't support "
895+
"result_type='broadcast'"
892896
)
893897
return self.apply_broadcast(self.obj)
894898

@@ -1007,6 +1011,8 @@ def wrapper(*args, **kwargs):
10071011
result = nb_looper(self.values, self.axis, *args)
10081012
# If we made the result 2-D, squeeze it back to 1-D
10091013
result = np.squeeze(result)
1014+
elif self.engine == "bodo":
1015+
raise NotImplementedError("the 'bodo' engine does not support raw=True.")
10101016
else:
10111017
result = np.apply_along_axis(
10121018
wrap_function(self.func),
@@ -1053,8 +1059,11 @@ def apply_broadcast(self, target: DataFrame) -> DataFrame:
10531059
def apply_standard(self):
    """
    Apply ``self.func`` with the engine named by ``self.engine``.

    The 'python' and 'numba' engines each produce a ``(results, res_index)``
    pair that is passed to ``self.wrap_results``. The 'bodo' engine returns
    the value of ``self.apply_series_bodo()`` directly, without wrapping.

    Raises
    ------
    ValueError
        If ``self.engine`` is not one of 'python', 'numba' or 'bodo'.
    """
    if self.engine == "bodo":
        # bodo hands back the finished object, so there is nothing to wrap
        return self.apply_series_bodo()

    if self.engine == "python":
        results, res_index = self.apply_series_generator()
    elif self.engine == "numba":
        results, res_index = self.apply_series_numba()
    else:
        # A bare ``else`` used to route every unrecognised string (e.g. a
        # typo such as "Numba") to an optional engine, which surfaced as a
        # misleading missing-dependency error. Fail explicitly instead.
        # NOTE(review): assumes the engine is not validated earlier - confirm.
        raise ValueError(
            f"Unknown engine {self.engine!r}; "
            "expected one of 'python', 'numba' or 'bodo'."
        )

    # wrap results
    return self.wrap_results(results, res_index)
@@ -1089,6 +1098,26 @@ def apply_series_numba(self):
10891098
results = self.apply_with_numba()
10901099
return results, self.result_index
10911100

1101+
def apply_series_bodo(self) -> DataFrame | Series:
    """
    Run ``self.obj.apply(self.func, self.axis)`` inside a ``bodo.jit`` function.

    Returns
    -------
    DataFrame or Series
        The value returned by the jitted ``apply`` call, passed through
        unchanged.

    Raises
    ------
    NotImplementedError
        If ``result_type`` is set, if ``axis`` is not 1 for a non-string
        function, or if extra positional/keyword arguments were supplied.
    """
    # Import before validating so that a missing bodo installation is the
    # first thing reported (presumably as ImportError - confirm against
    # import_optional_dependency).
    bodo = import_optional_dependency("bodo")

    if self.result_type is not None:
        raise NotImplementedError(
            "the 'bodo' engine does not support result_type yet."
        )

    if self.axis != 1 and not isinstance(self.func, str):
        raise NotImplementedError(
            "the 'bodo' engine only supports axis=1 for user-defined functions."
        )

    # The jitted helper below forwards only (func, axis); extra arguments
    # would be dropped silently, so reject them up front.
    # NOTE(review): assumes the Apply object stores them as ``self.args`` /
    # ``self.kwargs``; getattr keeps this safe if it does not - confirm.
    if getattr(self, "args", None) or getattr(self, "kwargs", None):
        raise NotImplementedError(
            "the 'bodo' engine does not support passing args or kwargs "
            "to the function yet."
        )

    @bodo.jit
    def do_apply(obj, func, axis):
        return obj.apply(func, axis)

    return do_apply(self.obj, self.func, self.axis)
1120+
10921121
def wrap_results(self, results: ResType, res_index: Index) -> DataFrame | Series:
10931122
from pandas import Series
10941123

pandas/core/frame.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10203,7 +10203,7 @@ def apply(
1020310203
result_type: Literal["expand", "reduce", "broadcast"] | None = None,
1020410204
args=(),
1020510205
by_row: Literal[False, "compat"] = "compat",
10206-
engine: Literal["python", "numba"] = "python",
10206+
engine: Literal["python", "numba", "bodo"] = "python",
1020710207
engine_kwargs: dict[str, bool] | None = None,
1020810208
**kwargs,
1020910209
):
@@ -10265,7 +10265,7 @@ def apply(
1026510265
1026610266
.. versionadded:: 2.1.0
1026710267
10268-
engine : {'python', 'numba'}, default 'python'
10268+
engine : {'python', 'numba', 'bodo'}, default 'python'
1026910269
Choose between the python (default) engine, the numba engine, or the bodo engine in apply.
1027010270
1027110271
The numba engine will attempt to JIT compile the passed function,
@@ -10288,6 +10288,8 @@ def apply(
1028810288
<https://numba.pydata.org/numba-doc/dev/reference/numpysupported.html>`_
1028910289
in numba to learn what you can or cannot use in the passed function.
1029010290
10291+
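The bodo engine will attempt to JIT compile the passed function with Bodo.
It currently requires ``axis=1`` for user-defined functions and does not
support ``raw=True``, ``result_type``, numpy ufuncs, or list-like/dict-like
collections of callables.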
10292+
1029110293
.. versionadded:: 2.2.0
1029210294
1029310295
engine_kwargs : dict

pandas/tests/apply/test_bodo.py

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
import numpy as np
2+
import pytest
3+
4+
import pandas.util._test_decorators as td
5+
6+
import pandas as pd
7+
import pandas._testing as tm
8+
9+
pytestmark = [td.skip_if_no("bodo")]
10+
11+
12+
def test_bodo_vs_python_indexing():
    """Picking one field from each row must agree between bodo and python."""

    def pick_c(row):
        return row["c"]

    columns = {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7.0, 8.0, 9.0]}
    frame = pd.DataFrame(columns)

    bodo_out = frame.apply(pick_c, engine="bodo", axis=1)
    python_out = frame.apply(pick_c, engine="python", axis=1)

    # bodo may hand back its own Series subclass, so only compare contents
    tm.assert_series_equal(bodo_out, python_out, check_series_type=False)
21+
22+
23+
@pytest.mark.parametrize(
    "reduction",
    [
        lambda x: x.mean(),
        lambda x: x.min(),
        lambda x: x.max(),
        lambda x: x.sum(),
    ],
)
def test_bodo_vs_python_reductions(reduction):
    """Row-wise reductions must give the same Series under bodo and python."""
    ones = np.ones((4, 4), dtype=np.float64)
    df = pd.DataFrame(ones)

    # Evaluate bodo first, then python, same as a pair of direct calls
    outputs = {
        engine: df.apply(reduction, engine=engine, axis=1)
        for engine in ("bodo", "python")
    }

    tm.assert_series_equal(
        outputs["bodo"], outputs["python"], check_series_type=False
    )
32+
33+
34+
def test_bodo_vs_python_df_output():
    """A function returning a Series per row must expand to the same frame."""

    def swap_columns(row):
        return pd.Series([row["B"], row["A"]])

    df = pd.DataFrame({"A": np.arange(20), "B": ["hi", "there"] * 10})

    bodo_frame = df.apply(swap_columns, engine="bodo", axis=1)
    python_frame = df.apply(swap_columns, engine="python", axis=1)

    # Frame subclass and dtypes are deliberately left out of the comparison
    tm.assert_frame_equal(
        bodo_frame,
        python_frame,
        check_frame_type=False,
        check_dtype=False,
    )
42+
43+
44+
@pytest.mark.skip(reason="TODO: pass args/kwargs to bodo jitted function")
def test_bodo_vs_python_args_kwargs():
    """Extra positional and keyword arguments should reach the applied function."""

    def f(x, y, z=3):
        return x.A == y + z

    df = pd.DataFrame({"A": np.arange(20)})
    extra_args = (2,)

    bodo_out = df.apply(f, z=2, engine="bodo", axis=1, args=extra_args)
    default_out = df.apply(f, z=2, axis=1, args=extra_args)

    tm.assert_series_equal(bodo_out, default_out, check_series_type=False)
54+
55+
56+
@pytest.mark.parametrize("axis", [0, 1])
def test_bodo_vs_python_str_apply(axis):
    """A string function name must give the same result under bodo and python."""
    df = pd.DataFrame({"A": np.arange(20)})
    func = "mean"

    # Use the parametrized axis (it used to be overwritten with 1) and send
    # one call through bodo; before, both calls ran the default python
    # engine, so the comparison could never fail.
    result = df.apply(func, axis=axis, engine="bodo")
    expected = df.apply(func, axis=axis, engine="python")

    tm.assert_series_equal(result, expected, check_series_type=False)
66+
67+
68+
def test_bodo_unsupported_axis():
    """Tests that NotImplementedError is raised when applying a UDF column-wise"""
    frame = pd.DataFrame(
        {"a": [1, 2, 3]},
    )
    f = lambda x: 1

    # axis=0 with a callable (non-string) function is rejected before any
    # compilation is attempted
    with pytest.raises(
        NotImplementedError,
        match=r"the 'bodo' engine only supports axis=1 for user-defined functions",
    ):
        frame.apply(f, engine="bodo", axis=0)
80+
81+
82+
def test_bodo_raw_unsupported():
    """Tests that error gets raised when using raw=True"""
    frame = pd.DataFrame(
        {"a": [1, 2, 3]},
    )
    f = lambda a: 1

    # ``match`` is a regular expression: escape the trailing period so it
    # matches a literal "." rather than any character
    with pytest.raises(
        NotImplementedError,
        match=r"the 'bodo' engine does not support raw=True\.",
    ):
        frame.apply(f, engine="bodo", raw=True, axis=1)
93+
94+
95+
def test_bodo_result_type_unsupported():
    """Tests that error gets raised when passing any value to result_type"""
    frame = pd.DataFrame(
        {"a": [1, 2, 3]},
    )
    f = lambda a: 1

    # ``match`` is a regular expression: escape the trailing period so it
    # matches a literal "." rather than any character
    with pytest.raises(
        NotImplementedError,
        match=r"the 'bodo' engine does not support result_type yet\.",
    ):
        frame.apply(f, engine="bodo", axis=1, result_type="reduce")

pandas/tests/util/test_bodo.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
import pytest
2+
3+
import pandas.util._test_decorators as td
4+
5+
from pandas import DataFrame
6+
7+
8+
@td.skip_if_installed("bodo")
def test_bodo_not_installed_df_apply():
    """Requesting engine="bodo" without bodo installed must raise ImportError."""

    def constant(x):
        return 1

    frame = DataFrame({"A": [1, 2, 3, 4, 5]})

    with pytest.raises(ImportError, match="Missing optional"):
        frame.apply(constant, engine="bodo")

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ matplotlib = "pandas:plotting._matplotlib"
6060
[project.optional-dependencies]
6161
test = ['hypothesis>=6.84.0', 'pytest>=7.3.2', 'pytest-xdist>=3.4.0']
6262
pyarrow = ['pyarrow>=10.0.1']
63-
performance = ['bottleneck>=1.3.6', 'numba>=0.56.4', 'numexpr>=2.8.4']
63+
performance = ['bottleneck>=1.3.6', 'numba>=0.56.4', 'numexpr>=2.8.4', 'bodo>=2024.11']
6464
computation = ['scipy>=1.10.0', 'xarray>=2022.12.0']
6565
fss = ['fsspec>=2022.11.0']
6666
aws = ['s3fs>=2022.11.0']
@@ -97,6 +97,7 @@ all = ['adbc-driver-postgresql>=0.10.0',
9797
'lxml>=4.9.2',
9898
'matplotlib>=3.6.3',
9999
'numba>=0.56.4',
100+
'bodo>=2024.11',
100101
'numexpr>=2.8.4',
101102
'odfpy>=1.4.1',
102103
'openpyxl>=3.1.0',

requirements-dev.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ jinja2>=3.1.2
2929
lxml>=4.9.2
3030
matplotlib>=3.6.3
3131
numba>=0.56.4
32+
bodo>=2024.11
3233
numexpr>=2.8.4
3334
openpyxl>=3.1.0
3435
odfpy>=1.4.1

0 commit comments

Comments
 (0)