Skip to content

Commit 987c06a

Browse files
mesejo authored and Micah Wylde committed
fix: encode should work with non-UTF-8 binaries (apache#14087)
* fix: encode function should work with strings and binary; closes apache#14055
* chore: address comments, add test
1 parent 069a24f commit 987c06a

File tree

2 files changed

+39
-7
lines changed

2 files changed

+39
-7
lines changed

datafusion/functions/src/encoding/inner.rs

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,21 @@ impl ScalarUDFImpl for EncodeFunc {
     }
 
     fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        Ok(arg_types[0].to_owned())
+        use DataType::*;
+
+        Ok(match arg_types[0] {
+            Utf8 => Utf8,
+            LargeUtf8 => LargeUtf8,
+            Utf8View => Utf8,
+            Binary => Utf8,
+            LargeBinary => LargeUtf8,
+            Null => Null,
+            _ => {
+                return plan_err!(
+                    "The encode function can only accept Utf8 or Binary or Null."
+                );
+            }
+        })
     }
 
     fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
@@ -108,12 +122,12 @@ impl ScalarUDFImpl for EncodeFunc {
         }
 
         match arg_types[0] {
-            DataType::Utf8 | DataType::Binary | DataType::Null => {
+            DataType::Utf8 | DataType::Null => {
                 Ok(vec![DataType::Utf8; 2])
             }
-            DataType::LargeUtf8 | DataType::LargeBinary => {
-                Ok(vec![DataType::LargeUtf8, DataType::Utf8])
-            }
+            DataType::LargeUtf8 => Ok(vec![DataType::LargeUtf8, DataType::Utf8]),
+            DataType::Binary => Ok(vec![DataType::Binary, DataType::Utf8]),
+            DataType::LargeBinary => Ok(vec![DataType::LargeBinary, DataType::Utf8]),
             _ => plan_err!(
                 "1st argument should be Utf8 or Binary or Null, got {:?}",
                 arg_types[0]

datafusion/sqllogictest/test_files/encoding.slt

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,9 @@ CREATE TABLE test(
2323
hex_field TEXT
2424
) as VALUES
2525
(0, 'abc', encode('abc', 'base64'), encode('abc', 'hex')),
26-
(1, 'qweqwe', encode('qweqwe', 'base64'), encode('qweqwe', 'hex')),
27-
(2, NULL, NULL, NULL)
26+
(1, 'qweqwe', encode('qweqwe', 'base64'), encode('qweqwe', 'hex')),
27+
(2, NULL, NULL, NULL),
28+
(3, X'8f50d3f60eae370ddbf85c86219c55108a350165', encode('8f50d3f60eae370ddbf85c86219c55108a350165', 'base64'), encode('8f50d3f60eae370ddbf85c86219c55108a350165', 'hex'))
2829
;
2930

3031
# errors
@@ -43,31 +44,48 @@ select decode(hex_field, 'non_encoding') from test;
 query error
 select to_hex(hex_field) from test;
 
+query error
+select arrow_cast(decode(X'8f50d3f60eae370ddbf85c86219c55108a350165', 'base64'), 'Utf8');
+
 # Arrays tests
 query T
 SELECT encode(bin_field, 'hex') FROM test ORDER BY num;
 ----
 616263
 717765717765
 NULL
+8f50d3f60eae370ddbf85c86219c55108a350165
 
 query T
 SELECT arrow_cast(decode(base64_field, 'base64'), 'Utf8') FROM test ORDER BY num;
 ----
 abc
 qweqwe
 NULL
+8f50d3f60eae370ddbf85c86219c55108a350165
 
 query T
 SELECT arrow_cast(decode(hex_field, 'hex'), 'Utf8') FROM test ORDER BY num;
 ----
 abc
 qweqwe
 NULL
+8f50d3f60eae370ddbf85c86219c55108a350165
 
 query T
 select to_hex(num) from test ORDER BY num;
 ----
 0
 1
 2
+3
+
+query T
+select encode(bin_field, 'base64') FROM test WHERE num = 3;
+----
+j1DT9g6uNw3b+FyGIZxVEIo1AWU
+
+query B
+select decode(encode(bin_field, 'base64'), 'base64') = X'8f50d3f60eae370ddbf85c86219c55108a350165' FROM test WHERE num = 3;
+----
+true

0 commit comments

Comments
 (0)