diff --git a/Cargo.lock b/Cargo.lock index dff803ce08c3..47bbc7fdf686 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2221,7 +2221,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "117725a109d387c937a1533ce01b450cbde6b88abceea8473c4d7a85853cda3c" dependencies = [ "lazy_static", - "windows-sys 0.48.0", + "windows-sys 0.59.0", ] [[package]] @@ -2454,8 +2454,10 @@ dependencies = [ "serde", "serde_json", "session", + "simd-json", "snafu 0.8.6", "sql", + "sql-json-path", "store-api", "table", "tokio", @@ -6326,7 +6328,7 @@ dependencies = [ "js-sys", "log", "wasm-bindgen", - "windows-core 0.57.0", + "windows-core 0.61.2", ] [[package]] @@ -7323,7 +7325,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "07033963ba89ebaf1584d767badaa2e8fcec21aedea6b8c0346d487d49c28667" dependencies = [ "cfg-if", - "windows-targets 0.48.5", + "windows-targets 0.52.6", ] [[package]] @@ -12434,11 +12436,10 @@ checksum = "d66dc143e6b11c1eddc06d5c423cfc97062865baf299914ab64caa38182078fe" [[package]] name = "simd-json" -version = "0.15.1" +version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c962f626b54771990066e5435ec8331d1462576cd2d1e62f24076ae014f92112" +checksum = "90daf33666402178ddbb5d595e6d5e6d7d372da948e23ea26762f5a23e02a04e" dependencies = [ - "getrandom 0.3.3", "halfbrown", "ref-cast", "serde", @@ -12688,6 +12689,19 @@ dependencies = [ "uuid", ] +[[package]] +name = "sql-json-path" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa30c054bdeed860dd36c1e5a467ec4251d925246f7f57b55bfb77decc2e0ed4" +dependencies = [ + "nom 7.1.3", + "regex", + "serde_json", + "simd-json", + "thiserror 2.0.17", +] + [[package]] name = "sqlness" version = "0.6.1" @@ -15279,7 +15293,7 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" dependencies = [ - "windows-sys 0.48.0", + "windows-sys 0.59.0", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index f314d2b14767..39ad5c79a4e5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -220,7 +220,7 @@ sea-query = "0.32" serde = { version = "1.0", features = ["derive"] } serde_json = { version = "1.0", features = ["float_roundtrip"] } serde_with = "3" -simd-json = "0.15" +simd-json = "0.16" similar-asserts = "1.6.0" smallvec = { version = "1", features = ["serde"] } snafu = "0.8" diff --git a/src/common/function/Cargo.toml b/src/common/function/Cargo.toml index d0f7ac685fdc..10fa9f556898 100644 --- a/src/common/function/Cargo.toml +++ b/src/common/function/Cargo.toml @@ -59,8 +59,10 @@ s2 = { version = "0.0.12", optional = true } serde.workspace = true serde_json.workspace = true session.workspace = true +simd-json.workspace = true snafu.workspace = true sql.workspace = true +sql-json-path = { version = "0.1", default-features = false, features = ["simd-json"] } store-api.workspace = true table.workspace = true uddsketch = { git = "https://github.com/GreptimeTeam/timescaledb-toolkit.git", rev = "84828fe8fb494a6a61412a3da96517fc80f7bb20" } diff --git a/src/common/function/src/scalars/json.rs b/src/common/function/src/scalars/json.rs index f84937fa0ff7..796781192e3d 100644 --- a/src/common/function/src/scalars/json.rs +++ b/src/common/function/src/scalars/json.rs @@ -16,6 +16,7 @@ pub mod json_get; mod json_is; mod json_path_exists; mod json_path_match; +mod json_path_query; mod json_to_string; mod parse_json; @@ -51,5 +52,6 @@ impl JsonFunction { registry.register_scalar(json_path_exists::JsonPathExistsFunction::default()); registry.register_scalar(json_path_match::JsonPathMatchFunction::default()); + registry.register_scalar(json_path_query::JsonPathQueryFunction::default()); } } diff --git a/src/common/function/src/scalars/json/json_path_query.rs b/src/common/function/src/scalars/json/json_path_query.rs new file mode 100644 index 000000000000..9c00cd03ab91 --- /dev/null +++ b/src/common/function/src/scalars/json/json_path_query.rs @@ -0,0 +1,127 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::fmt::{self, Display}; +use std::sync::Arc; + +use arrow::array::{Array, AsArray, BinaryBuilder}; +use arrow::compute; +use datafusion_common::DataFusionError; +use datafusion_common::arrow::datatypes::DataType; +use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature}; +use sql_json_path::JsonPath; + +use crate::function::{Function, extract_args}; +use crate::helper; + +/// Query JSON data using the given JSON path. +#[derive(Clone, Debug)] +pub(crate) struct JsonPathQueryFunction { + signature: Signature, +} + +impl Default for JsonPathQueryFunction { + fn default() -> Self { + Self { + signature: helper::one_of_sigs2( + vec![DataType::Binary, DataType::BinaryView], + vec![DataType::Utf8, DataType::Utf8View], + ), + } + } +} + +const NAME: &str = "json_path_query"; + +impl Function for JsonPathQueryFunction { + fn name(&self) -> &str { + NAME + } + + fn return_type(&self, _: &[DataType]) -> datafusion_common::Result { + Ok(DataType::Binary) + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn invoke_with_args( + &self, + args: ScalarFunctionArgs, + ) -> datafusion_common::Result { + let [arg0, arg1] = extract_args(self.name(), &args)?; + let arg0 = compute::cast(&arg0, &DataType::BinaryView)?; + let jsons = arg0.as_binary_view(); + let arg1 = compute::cast(&arg1, &DataType::Utf8View)?; + let paths = arg1.as_string_view(); + + let size = jsons.len(); + let mut builder = BinaryBuilder::with_capacity(size, size * 32); + + for i in 0..size { + let json = jsons.is_valid(i).then(|| jsons.value(i)); + let path = paths.is_valid(i).then(|| paths.value(i)); + + let result = match (json, path) { + (Some(json), Some(path)) => { + if !jsonb::is_null(json) { + let jsonb_value = jsonb::from_slice(json).map_err(|e| { + DataFusionError::Execution(format!("invalid jsonb binary: {e}")) + })?; + let mut json_str = jsonb_value.to_string(); + let json_value: simd_json::OwnedValue = unsafe { + simd_json::from_str(&mut json_str).map_err(|e| { + DataFusionError::Execution(format!("failed to parse json: {e}")) + })? + }; + let json_path = JsonPath::new(path).map_err(|e| { + DataFusionError::Execution(format!("invalid json path '{path}': {e}")) + })?; + let nodes = json_path.query(&json_value).map_err(|e| { + DataFusionError::Execution(format!( + "failed to evaluate json path '{path}': {e}" + )) + })?; + let node_values: Vec = + nodes.into_iter().map(|n| n.into_owned()).collect(); + let result_json = simd_json::OwnedValue::Array(Box::new(node_values)); + let json_bytes = simd_json::to_vec(&result_json).map_err(|e| { + DataFusionError::Execution(format!("failed to serialize json: {e}")) + })?; + let result_jsonb: jsonb::Value = + jsonb::from_slice(&json_bytes).map_err(|e| { + DataFusionError::Execution(format!( + "failed to deserialize json: {e}" + )) + })?; + Some(result_jsonb.to_vec()) + } else { + None + } + } + _ => None, + }; + builder.append_option(result.as_deref()); + } + + Ok(ColumnarValue::Array(Arc::new(builder.finish()))) + } +} + +impl Display for JsonPathQueryFunction { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "JSON_PATH_QUERY") + } +} diff --git a/tests/cases/standalone/common/function/json/json.result b/tests/cases/standalone/common/function/json/json.result index 62c562cb9fbe..727147647c94 100644 --- a/tests/cases/standalone/common/function/json/json.result +++ b/tests/cases/standalone/common/function/json/json.result @@ -104,3 +104,36 @@ SELECT json_path_match(parse_json('null'), '$.a == 1'); | | +------------------------------------------------------------+ +--- json path query --- +SELECT json_to_string(json_path_query(parse_json('{"a": 1, "b": 2}'), '$.a')); + ++-----------------------------------------------------------------------------------+ +| json_to_string(json_path_query(parse_json(Utf8("{"a": 1, "b": 2}")),Utf8("$.a"))) | ++-----------------------------------------------------------------------------------+ +| [1] | ++-----------------------------------------------------------------------------------+ + +SELECT json_to_string(json_path_query(parse_json('{"a": 1, "b": 2}'), '$.*')); + ++-----------------------------------------------------------------------------------+ +| json_to_string(json_path_query(parse_json(Utf8("{"a": 1, "b": 2}")),Utf8("$.*"))) | ++-----------------------------------------------------------------------------------+ +| [1,2] | ++-----------------------------------------------------------------------------------+ + +SELECT json_to_string(json_path_query(parse_json('{"a": 1, "b": 2}'), '$.* ? (@ > 1)')); + ++---------------------------------------------------------------------------------------------+ +| json_to_string(json_path_query(parse_json(Utf8("{"a": 1, "b": 2}")),Utf8("$.* ? (@ > 1)"))) | ++---------------------------------------------------------------------------------------------+ +| [2] | ++---------------------------------------------------------------------------------------------+ + +SELECT json_to_string(json_path_query(parse_json('{"a": 1, "b": 2}'), '$.a.type()')); + ++------------------------------------------------------------------------------------------+ +| json_to_string(json_path_query(parse_json(Utf8("{"a": 1, "b": 2}")),Utf8("$.a.type()"))) | ++------------------------------------------------------------------------------------------+ +| ["number"] | ++------------------------------------------------------------------------------------------+ + diff --git a/tests/cases/standalone/common/function/json/json.sql b/tests/cases/standalone/common/function/json/json.sql index f8d6527ecc96..d30c83086257 100644 --- a/tests/cases/standalone/common/function/json/json.sql +++ b/tests/cases/standalone/common/function/json/json.sql @@ -26,3 +26,13 @@ SELECT json_path_match(parse_json('{"a":1,"b":[1,2,3]}'), '$.b[1 to last] >= 2') SELECT json_path_match(parse_json('{"a":1,"b":[1,2,3]}'), 'null'); SELECT json_path_match(parse_json('null'), '$.a == 1'); + +--- json path query --- + +SELECT json_to_string(json_path_query(parse_json('{"a": 1, "b": 2}'), '$.a')); + +SELECT json_to_string(json_path_query(parse_json('{"a": 1, "b": 2}'), '$.*')); + +SELECT json_to_string(json_path_query(parse_json('{"a": 1, "b": 2}'), '$.* ? (@ > 1)')); + +SELECT json_to_string(json_path_query(parse_json('{"a": 1, "b": 2}'), '$.a.type()'));