Skip to content

Commit c7c8916

Browse files
[feature](function) support hll functions hll_from_base64, hll_to_base64 (#32089)
Issue Number: #31320. Adds support for two HLL functions: - hll_from_base64: converts a base64 string (the result of hll_to_base64) into an HLL. - hll_to_base64: converts an input HLL into a base64 string.
1 parent 8d773a7 commit c7c8916

File tree

12 files changed

+594
-2
lines changed

12 files changed

+594
-2
lines changed
Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
#include <cstddef>
19+
#include <cstdint>
20+
21+
#include "olap/hll.h"
22+
#include "util/url_coding.h"
23+
#include "vec/columns/column_complex.h"
24+
#include "vec/columns/column_nullable.h"
25+
#include "vec/columns/column_string.h"
26+
#include "vec/data_types/data_type.h"
27+
#include "vec/data_types/data_type_hll.h"
28+
#include "vec/functions/simple_function_factory.h"
29+
30+
namespace doris::vectorized {
31+
32+
class FunctionHllFromBase64 : public IFunction {
33+
public:
34+
static constexpr auto name = "hll_from_base64";
35+
36+
String get_name() const override { return name; }
37+
38+
static FunctionPtr create() { return std::make_shared<FunctionHllFromBase64>(); }
39+
40+
DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
41+
return make_nullable(std::make_shared<DataTypeHLL>());
42+
}
43+
44+
size_t get_number_of_arguments() const override { return 1; }
45+
46+
bool use_default_implementation_for_nulls() const override { return true; }
47+
48+
Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
49+
size_t result, size_t input_rows_count) const override {
50+
auto res_null_map = ColumnUInt8::create(input_rows_count, 0);
51+
auto res_data_column = ColumnHLL::create();
52+
auto& null_map = res_null_map->get_data();
53+
auto& res = res_data_column->get_data();
54+
55+
auto& argument_column = block.get_by_position(arguments[0]).column;
56+
const auto& str_column = static_cast<const ColumnString&>(*argument_column);
57+
const ColumnString::Chars& data = str_column.get_chars();
58+
const ColumnString::Offsets& offsets = str_column.get_offsets();
59+
60+
res.reserve(input_rows_count);
61+
62+
std::string decode_buff;
63+
int last_decode_buff_len = 0;
64+
int curr_decode_buff_len = 0;
65+
for (size_t i = 0; i < input_rows_count; ++i) {
66+
const char* src_str = reinterpret_cast<const char*>(&data[offsets[i - 1]]);
67+
int64_t src_size = offsets[i] - offsets[i - 1];
68+
69+
// Base64 encoding has a characteristic where every 4 characters represent 3 bytes of data.
70+
// Here, we check if the length of the input string is a multiple of 4 to ensure it's a valid base64 encoded string.
71+
if (0 != src_size % 4) {
72+
res.emplace_back();
73+
null_map[i] = 1;
74+
continue;
75+
}
76+
77+
// Allocate sufficient space for the decoded data.
78+
// The number 3 here represents the number of bytes in the decoded data for each group of 4 base64 characters.
79+
// We set the size of the decoding buffer to be 'src_size + 3' to ensure there is enough space to store the decoded data.
80+
curr_decode_buff_len = src_size + 3;
81+
if (curr_decode_buff_len > last_decode_buff_len) {
82+
decode_buff.resize(curr_decode_buff_len);
83+
last_decode_buff_len = curr_decode_buff_len;
84+
}
85+
auto outlen = base64_decode(src_str, src_size, decode_buff.data());
86+
if (outlen < 0) {
87+
res.emplace_back();
88+
null_map[i] = 1;
89+
} else {
90+
doris::Slice decoded_slice(decode_buff.data(), outlen);
91+
doris::HyperLogLog hll;
92+
if (!hll.deserialize(decoded_slice)) {
93+
return Status::RuntimeError(
94+
fmt::format("hll_from_base64 decode failed: base64: {}", src_str));
95+
} else {
96+
res.emplace_back(std::move(hll));
97+
}
98+
}
99+
}
100+
101+
block.get_by_position(result).column =
102+
ColumnNullable::create(std::move(res_data_column), std::move(res_null_map));
103+
return Status::OK();
104+
}
105+
};
106+
107+
/// Registers FunctionHllFromBase64 with the given function factory.
void register_function_hll_from_base64(SimpleFunctionFactory& factory) {
    factory.register_function<FunctionHllFromBase64>();
}
110+
111+
} // namespace doris::vectorized
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
#include <cstddef>
19+
#include <cstdint>
20+
21+
#include "olap/hll.h"
22+
#include "util/url_coding.h"
23+
#include "vec/columns/column_complex.h"
24+
#include "vec/columns/column_nullable.h"
25+
#include "vec/columns/column_string.h"
26+
#include "vec/data_types/data_type.h"
27+
#include "vec/data_types/data_type_hll.h"
28+
#include "vec/data_types/data_type_string.h"
29+
#include "vec/functions/function_totype.h"
30+
#include "vec/functions/simple_function_factory.h"
31+
32+
namespace doris::vectorized {
33+
34+
/// Name tag for the `hll_to_base64` function; supplies the SQL-visible
/// function name as a compile-time constant.
struct NameHllToBase64 {
    static constexpr auto name = "hll_to_base64";
};
37+
38+
struct HllToBase64 {
39+
using ReturnType = DataTypeString;
40+
static constexpr auto TYPE_INDEX = TypeIndex::HLL;
41+
using Type = DataTypeHLL::FieldType;
42+
using ReturnColumnType = ColumnString;
43+
using Chars = ColumnString::Chars;
44+
using Offsets = ColumnString::Offsets;
45+
46+
static Status vector(const std::vector<HyperLogLog>& data, Chars& chars, Offsets& offsets) {
47+
size_t size = data.size();
48+
offsets.resize(size);
49+
size_t output_char_size = 0;
50+
for (size_t i = 0; i < size; ++i) {
51+
auto& hll_val = const_cast<HyperLogLog&>(data[i]);
52+
auto ser_size = hll_val.max_serialized_size();
53+
output_char_size += ser_size * (int)(4.0 * ceil((double)ser_size / 3.0));
54+
}
55+
ColumnString::check_chars_length(output_char_size, size);
56+
chars.resize(output_char_size);
57+
auto chars_data = chars.data();
58+
59+
size_t cur_ser_size = 0;
60+
size_t last_ser_size = 0;
61+
std::string ser_buff;
62+
size_t encoded_offset = 0;
63+
for (size_t i = 0; i < size; ++i) {
64+
auto& hll_val = const_cast<HyperLogLog&>(data[i]);
65+
66+
cur_ser_size = hll_val.max_serialized_size();
67+
if (cur_ser_size > last_ser_size) {
68+
last_ser_size = cur_ser_size;
69+
ser_buff.resize(cur_ser_size);
70+
}
71+
hll_val.serialize(reinterpret_cast<uint8_t*>(ser_buff.data()));
72+
auto outlen = base64_encode((const unsigned char*)ser_buff.data(), cur_ser_size,
73+
chars_data + encoded_offset);
74+
DCHECK(outlen > 0);
75+
76+
encoded_offset += outlen;
77+
offsets[i] = encoded_offset;
78+
}
79+
return Status::OK();
80+
}
81+
};
82+
83+
using FunctionHllToBase64 = FunctionUnaryToType<HllToBase64, NameHllToBase64>;
84+
85+
void register_function_hll_to_base64(SimpleFunctionFactory& factory) {
86+
factory.register_function<FunctionHllToBase64>();
87+
}
88+
89+
} // namespace doris::vectorized

be/src/vec/functions/simple_function_factory.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,9 @@ void register_function_comparison(SimpleFunctionFactory& factory);
3535
void register_function_comparison_eq_for_null(SimpleFunctionFactory& factory);
3636
void register_function_hll_cardinality(SimpleFunctionFactory& factory);
3737
void register_function_hll_empty(SimpleFunctionFactory& factory);
38+
void register_function_hll_from_base64(SimpleFunctionFactory& factory);
3839
void register_function_hll_hash(SimpleFunctionFactory& factory);
40+
void register_function_hll_to_base64(SimpleFunctionFactory& factory);
3941
void register_function_logical(SimpleFunctionFactory& factory);
4042
void register_function_case(SimpleFunctionFactory& factory);
4143
void register_function_cast(SimpleFunctionFactory& factory);
@@ -222,7 +224,9 @@ class SimpleFunctionFactory {
222224
register_function_bitmap_variadic(instance);
223225
register_function_hll_cardinality(instance);
224226
register_function_hll_empty(instance);
227+
register_function_hll_from_base64(instance);
225228
register_function_hll_hash(instance);
229+
register_function_hll_to_base64(instance);
226230
register_function_comparison(instance);
227231
register_function_logical(instance);
228232
register_function_case(instance);

fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,9 @@
186186
import org.apache.doris.nereids.trees.expressions.functions.scalar.Hex;
187187
import org.apache.doris.nereids.trees.expressions.functions.scalar.HllCardinality;
188188
import org.apache.doris.nereids.trees.expressions.functions.scalar.HllEmpty;
189+
import org.apache.doris.nereids.trees.expressions.functions.scalar.HllFromBase64;
189190
import org.apache.doris.nereids.trees.expressions.functions.scalar.HllHash;
191+
import org.apache.doris.nereids.trees.expressions.functions.scalar.HllToBase64;
190192
import org.apache.doris.nereids.trees.expressions.functions.scalar.Hour;
191193
import org.apache.doris.nereids.trees.expressions.functions.scalar.HourCeil;
192194
import org.apache.doris.nereids.trees.expressions.functions.scalar.HourFloor;
@@ -617,7 +619,9 @@ public class BuiltinScalarFunctions implements FunctionHelper {
617619
scalar(Hex.class, "hex"),
618620
scalar(HllCardinality.class, "hll_cardinality"),
619621
scalar(HllEmpty.class, "hll_empty"),
622+
scalar(HllFromBase64.class, "hll_from_base64"),
620623
scalar(HllHash.class, "hll_hash"),
624+
scalar(HllToBase64.class, "hll_to_base64"),
621625
scalar(Hour.class, "hour"),
622626
scalar(HourCeil.class, "hour_ceil"),
623627
scalar(HourFloor.class, "hour_floor"),
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
package org.apache.doris.nereids.trees.expressions.functions.scalar;
19+
20+
import org.apache.doris.catalog.FunctionSignature;
21+
import org.apache.doris.nereids.trees.expressions.Expression;
22+
import org.apache.doris.nereids.trees.expressions.functions.AlwaysNullable;
23+
import org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature;
24+
import org.apache.doris.nereids.trees.expressions.shape.UnaryExpression;
25+
import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
26+
import org.apache.doris.nereids.types.HllType;
27+
import org.apache.doris.nereids.types.StringType;
28+
import org.apache.doris.nereids.types.VarcharType;
29+
30+
import com.google.common.base.Preconditions;
31+
import com.google.common.collect.ImmutableList;
32+
33+
import java.util.List;
34+
35+
/**
36+
* ScalarFunction 'hll_from_string'.
37+
*/
38+
public class HllFromBase64 extends ScalarFunction
39+
implements UnaryExpression, ExplicitlyCastableSignature, AlwaysNullable {
40+
41+
public static final List<FunctionSignature> SIGNATURES = ImmutableList.of(
42+
FunctionSignature.ret(HllType.INSTANCE).args(VarcharType.SYSTEM_DEFAULT),
43+
FunctionSignature.ret(HllType.INSTANCE).args(StringType.INSTANCE)
44+
);
45+
46+
/**
47+
* constructor with 1 argument.
48+
*/
49+
public HllFromBase64(Expression arg) {
50+
super("hll_from_base64", arg);
51+
}
52+
53+
/**
54+
* withChildren.
55+
*/
56+
@Override
57+
public HllFromBase64 withChildren(List<Expression> children) {
58+
Preconditions.checkArgument(children.size() == 1);
59+
return new HllFromBase64(children.get(0));
60+
}
61+
62+
@Override
63+
public List<FunctionSignature> getSignatures() {
64+
return SIGNATURES;
65+
}
66+
67+
@Override
68+
public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) {
69+
return visitor.visitHllFromBase64(this, context);
70+
}
71+
}
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
package org.apache.doris.nereids.trees.expressions.functions.scalar;
19+
20+
import org.apache.doris.catalog.FunctionSignature;
21+
import org.apache.doris.nereids.trees.expressions.Expression;
22+
import org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature;
23+
import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable;
24+
import org.apache.doris.nereids.trees.expressions.shape.UnaryExpression;
25+
import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
26+
import org.apache.doris.nereids.types.HllType;
27+
import org.apache.doris.nereids.types.StringType;
28+
29+
import com.google.common.base.Preconditions;
30+
import com.google.common.collect.ImmutableList;
31+
32+
import java.util.List;
33+
34+
/**
35+
* ScalarFunction 'hll_to_base64'.
36+
*/
37+
public class HllToBase64 extends ScalarFunction
38+
implements UnaryExpression, ExplicitlyCastableSignature, PropagateNullable {
39+
40+
public static final List<FunctionSignature> SIGNATURES = ImmutableList.of(
41+
FunctionSignature.ret(StringType.INSTANCE).args(HllType.INSTANCE)
42+
);
43+
44+
/**
45+
* constructor with 1 argument.
46+
*/
47+
public HllToBase64(Expression arg) {
48+
super("hll_to_base64", arg);
49+
}
50+
51+
/**
52+
* withChildren.
53+
*/
54+
@Override
55+
public HllToBase64 withChildren(List<Expression> children) {
56+
Preconditions.checkArgument(children.size() == 1);
57+
return new HllToBase64(children.get(0));
58+
}
59+
60+
@Override
61+
public List<FunctionSignature> getSignatures() {
62+
return SIGNATURES;
63+
}
64+
65+
@Override
66+
public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) {
67+
return visitor.visitHllToBase64(this, context);
68+
}
69+
}

0 commit comments

Comments
 (0)