Skip to content

Commit 6746007

Browse files
anhvdq and Jefffrey authored
feat(spark): Implement Spark functions url_encode, url_decode and try_url_decode (apache#17399)
## Which issue does this PR close?

- Part of apache#15914

## Rationale for this change

## What changes are included in this PR?

Implement Spark functions `url_encode`, `url_decode` and `try_url_decode`

## Are these changes tested?

Yes

## Are there any user-facing changes?

Yes

Co-authored-by: Jeffrey Vo <jeffrey.vo.australia@gmail.com>
1 parent cf9d078 commit 6746007

File tree

9 files changed

+651
-20
lines changed

9 files changed

+651
-20
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

datafusion/spark/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ datafusion-expr = { workspace = true }
5050
datafusion-functions = { workspace = true, features = ["crypto_expressions"] }
5151
datafusion-functions-nested = { workspace = true }
5252
log = { workspace = true }
53+
percent-encoding = "2.3.2"
5354
rand = { workspace = true }
5455
sha1 = "0.10"
5556
url = { workspace = true }

datafusion/spark/src/function/url/mod.rs

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,15 @@ use std::sync::Arc;
2121

2222
pub mod parse_url;
2323
pub mod try_parse_url;
24+
pub mod try_url_decode;
25+
pub mod url_decode;
26+
pub mod url_encode;
2427

2528
make_udf_function!(parse_url::ParseUrl, parse_url);
2629
make_udf_function!(try_parse_url::TryParseUrl, try_parse_url);
30+
make_udf_function!(try_url_decode::TryUrlDecode, try_url_decode);
31+
make_udf_function!(url_decode::UrlDecode, url_decode);
32+
make_udf_function!(url_encode::UrlEncode, url_encode);
2733

2834
pub mod expr_fn {
2935
use datafusion_functions::export_functions;
@@ -38,8 +44,17 @@ pub mod expr_fn {
3844
"Same as parse_url but returns NULL if an invalid URL is provided.",
3945
args
4046
));
47+
export_functions!((url_decode, "Decodes a URL-encoded string in ‘application/x-www-form-urlencoded’ format to its original format.", args));
48+
export_functions!((try_url_decode, "Same as url_decode but returns NULL if an invalid URL-encoded string is provided", args));
49+
export_functions!((url_encode, "Encodes a string into a URL-encoded string in ‘application/x-www-form-urlencoded’ format.", args));
4150
}
4251

4352
pub fn functions() -> Vec<Arc<ScalarUDF>> {
44-
vec![parse_url(), try_parse_url()]
53+
vec![
54+
parse_url(),
55+
try_parse_url(),
56+
try_url_decode(),
57+
url_decode(),
58+
url_encode(),
59+
]
4560
}
Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
use std::any::Any;
19+
20+
use arrow::array::ArrayRef;
21+
use arrow::datatypes::DataType;
22+
23+
use datafusion_common::Result;
24+
use datafusion_expr::{
25+
ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility,
26+
};
27+
use datafusion_functions::utils::make_scalar_function;
28+
29+
use crate::function::url::url_decode::{spark_handled_url_decode, UrlDecode};
30+
31+
/// Scalar UDF implementation exposed under the name `try_url_decode`.
///
/// It carries its own [`Signature`] and an inner [`UrlDecode`] instance; the
/// inner decoder is only consulted by `return_type`, so both functions report
/// the same output type for the same argument types.
#[derive(Debug, PartialEq, Eq, Hash)]
32+
pub struct TryUrlDecode {
33+
// Signature returned by `ScalarUDFImpl::signature`.
signature: Signature,
34+
// Strict decoder UDF that `return_type` delegates to.
url_decoder: UrlDecode,
35+
}
36+
37+
/// `Default` forwards to [`TryUrlDecode::new`], so both construction paths
/// produce the same value.
impl Default for TryUrlDecode {
38+
fn default() -> Self {
39+
Self::new()
40+
}
41+
}
42+
43+
impl TryUrlDecode {
44+
/// Builds the UDF with a signature created by
/// `Signature::string(1, Volatility::Immutable)` and a freshly constructed
/// inner `UrlDecode`.
///
/// NOTE(review): `Signature::string(1, ..)` presumably means "exactly one
/// string-typed argument" -- confirm against the `datafusion_expr` docs.
pub fn new() -> Self {
45+
Self {
46+
signature: Signature::string(1, Volatility::Immutable),
47+
url_decoder: UrlDecode::new(),
48+
}
49+
}
50+
}
51+
52+
/// `ScalarUDFImpl` wiring for `try_url_decode`.
impl ScalarUDFImpl for TryUrlDecode {
53+
/// Returns `self` as `&dyn Any`.
fn as_any(&self) -> &dyn Any {
54+
self
55+
}
56+
57+
/// Name reported for this UDF: `try_url_decode`.
fn name(&self) -> &str {
58+
"try_url_decode"
59+
}
60+
61+
/// Returns the signature stored at construction time.
fn signature(&self) -> &Signature {
62+
&self.signature
63+
}
64+
65+
/// Delegates to the inner `UrlDecode`, so the return type (and any error
/// for unsupported argument types) is whatever `UrlDecode::return_type`
/// produces for `arg_types`.
fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
66+
self.url_decoder.return_type(arg_types)
67+
}
68+
69+
/// Evaluates the function by running `spark_try_url_decode` over the
/// supplied argument values.
fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
70+
// Only the argument values are used; the other fields are ignored.
let ScalarFunctionArgs { args, .. } = args;
71+
// `make_scalar_function` adapts the array-based kernel to `ColumnarValue`
// inputs. NOTE(review): the empty `vec![]` is presumably "no per-argument
// hints" -- confirm against `make_scalar_function`'s documentation.
make_scalar_function(spark_try_url_decode, vec![])(&args)
72+
}
73+
}
74+
75+
/// Array kernel behind `try_url_decode`.
///
/// Calls `spark_handled_url_decode` with a handler that turns every `Err`
/// into `Ok(None)` and passes any other result through unchanged. The unit
/// test in this file shows the effect: an entry with an invalid percent
/// sequence becomes a null in the output instead of failing the call.
fn spark_try_url_decode(args: &[ArrayRef]) -> Result<ArrayRef> {
76+
spark_handled_url_decode(args, |x| match x {
77+
// Swallow the decode error and emit a null for this entry.
Err(_) => Ok(None),
78+
// Successful results (including an already-`None` value) are kept as-is.
result => result,
79+
})
80+
}
81+
82+
// Unit tests for the `try_url_decode` kernel.
#[cfg(test)]
83+
mod tests {
84+
use std::sync::Arc;
85+
86+
use arrow::array::StringArray;
87+
use datafusion_common::{cast::as_string_array, Result};
88+
89+
use super::*;
90+
91+
/// Checks that an undecodable entry yields a null rather than an error,
/// while a valid entry is decoded and a null input stays null.
#[test]
92+
fn test_try_decode_error_handled() -> Result<()> {
93+
let input = Arc::new(StringArray::from(vec![
94+
Some("http%3A%2F%2spark.apache.org"), // '%2s' is not a valid percent encoded character
95+
// Valid cases
96+
Some("https%3A%2F%2Fspark.apache.org"),
97+
None,
98+
]));
99+
100+
// Expected output, in input order: null (invalid), decoded URL, null (null input).
let expected =
101+
StringArray::from(vec![None, Some("https://spark.apache.org"), None]);
102+
103+
// The kernel is called directly, bypassing `invoke_with_args`.
let result = spark_try_url_decode(&[input as ArrayRef])?;
104+
let result = as_string_array(&result)?;
105+
106+
assert_eq!(&expected, result);
107+
Ok(())
108+
}
109+
}

0 commit comments

Comments
 (0)