diff --git a/include/fluent-bit/flb_mem.h b/include/fluent-bit/flb_mem.h index 3c83580db05..ef7931509ab 100644 --- a/include/fluent-bit/flb_mem.h +++ b/include/fluent-bit/flb_mem.h @@ -50,8 +50,8 @@ /* * Return 1 or 0 based on a probability. */ -int flb_malloc_p; -int flb_malloc_mod; +extern int flb_malloc_p; +extern int flb_malloc_mod; static inline int flb_fuzz_get_probability(int val) { flb_malloc_p += 1; diff --git a/plugins/in_tail/tail_file.c b/plugins/in_tail/tail_file.c index 77d86e786a4..b68bb3fc467 100644 --- a/plugins/in_tail/tail_file.c +++ b/plugins/in_tail/tail_file.c @@ -471,6 +471,14 @@ static int process_content(struct flb_tail_file *file, size_t *bytes) } else if (ret == FLB_UNICODE_CONVERT_NOP) { flb_plg_debug(ctx->ins, "nothing to convert encoding '%.*s'", end - data, data); + /* Skip the UTF-8 BOM */ + if (file->buf_len >= 3 && + data[0] == '\xEF' && + data[1] == '\xBB' && + data[2] == '\xBF') { + data += 3; + processed_bytes += 3; + } } else { flb_plg_error(ctx->ins, "encoding failed '%.*s'", end - data, data); diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index ecf1dadd8a2..520d7ff3c20 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -91,6 +91,7 @@ set(src flb_cfl_ra_key.c flb_cfl_record_accessor.c flb_conditionals.c + flb_mem.c ) # Config format diff --git a/src/flb_mem.c b/src/flb_mem.c new file mode 100644 index 00000000000..eb5b50ce4a3 --- /dev/null +++ b/src/flb_mem.c @@ -0,0 +1,25 @@ +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ + +/* Fluent Bit + * ========== + * Copyright (C) 2015-2025 The Fluent Bit Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#ifdef FLB_HAVE_TESTS_OSSFUZZ +int flb_malloc_p; +int flb_malloc_mod; +#endif diff --git a/src/simdutf/CMakeLists.txt b/src/simdutf/CMakeLists.txt index fde1b1bf650..d29e621a68d 100644 --- a/src/simdutf/CMakeLists.txt +++ b/src/simdutf/CMakeLists.txt @@ -7,3 +7,6 @@ set(src add_library(flb-simdutf-connector-static STATIC ${src}) target_link_libraries(flb-simdutf-connector-static simdutf-static) +if(FLB_JEMALLOC) + target_link_libraries(flb-simdutf-connector-static ${JEMALLOC_LIBRARIES}) +endif() diff --git a/src/simdutf/flb_simdutf_connector.cpp b/src/simdutf/flb_simdutf_connector.cpp index b625a91b2fb..cddfbc99cce 100644 --- a/src/simdutf/flb_simdutf_connector.cpp +++ b/src/simdutf/flb_simdutf_connector.cpp @@ -19,8 +19,49 @@ #include #include -#include #include +extern "C" +{ +#include +#include +} + +typedef int (*conversion_function)(const char16_t *buf, size_t len, + char **utf8_output, size_t *out_size); + +static int convert_from_unicode(conversion_function convert, + const char *input, size_t length, + char **output, size_t *out_size) +{ + size_t len; + std::unique_ptr temp_buffer(NULL, flb_free); + const char16_t *aligned_input = NULL; + int status; + + len = length; + if (len % 2) { + len--; + } + if (len < 2) { + return FLB_SIMDUTF_CONNECTOR_CONVERT_NOP; + } + + /* Check alignment to determine whether to copy or not */ + if ((uintptr_t) input % 2 == 0) { + aligned_input = (const char16_t *) input; + } + else { + temp_buffer.reset((char16_t *) flb_malloc(len)); + if (temp_buffer.get() == NULL) { + flb_errno(); + return FLB_SIMDUTF_CONNECTOR_CONVERT_ERROR; + } + memcpy(temp_buffer.get(), input, len); + aligned_input = temp_buffer.get(); + } + + return convert(aligned_input, len / 2, output, out_size); +} int flb_simdutf_connector_utf8_length_from_utf16le(const char16_t *buf, size_t len) { @@ -61,23 +102,24 @@ int flb_simdutf_connector_convert_utf16le_to_utf8(const char16_t *buf, size_t le char **utf8_output, size_t *out_size) { size_t clen = 0; - size_t converted = 0; - simdutf::result result; + simdutf::result result = {}; clen = simdutf::utf8_length_from_utf16le(buf, len); - /* convert_utfXXXX_to_utf8 function needs to pass allocated memory region with C++ style */ - std::unique_ptr output{new char[clen]}; - converted = simdutf::convert_utf16le_to_utf8(buf, len, output.get()); - result = simdutf::validate_utf8_with_errors(output.get(), clen); - if (result.error == simdutf::error_code::SUCCESS && converted > 0) { - std::string result_string(output.get(), clen); + *utf8_output = (char *) flb_malloc(clen + 1); + if (*utf8_output == NULL) { + flb_errno(); + return FLB_SIMDUTF_CONNECTOR_CONVERT_ERROR; + } - *utf8_output = strdup(result_string.c_str()); - *out_size = converted; + result = simdutf::convert_utf16le_to_utf8_with_errors(buf, len, *utf8_output); + if (result.error == simdutf::error_code::SUCCESS && result.count > 0) { + (*utf8_output)[result.count] = '\0'; + *out_size = result.count; return FLB_SIMDUTF_ERROR_CODE_SUCCESS; } else { + flb_free(*utf8_output); *utf8_output = NULL; *out_size = 0; @@ -89,23 +131,24 @@ int flb_simdutf_connector_convert_utf16be_to_utf8(const char16_t *buf, size_t le char **utf8_output, size_t *out_size) { size_t clen = 0; - size_t converted = 0; - simdutf::result result; + simdutf::result result = {}; clen = simdutf::utf8_length_from_utf16be(buf, len); - /* convert_utfXXXX_to_utf8 function needs to pass allocated memory region with C++ style */ - std::unique_ptr output{new char[clen]}; - converted = simdutf::convert_utf16be_to_utf8(buf, len, output.get()); - result = simdutf::validate_utf8_with_errors(output.get(), clen); - if (result.error == simdutf::error_code::SUCCESS && converted > 0) { - std::string result_string(output.get(), clen); + *utf8_output = (char *) flb_malloc(clen + 1); + if (*utf8_output == NULL) { + flb_errno(); + return FLB_SIMDUTF_CONNECTOR_CONVERT_ERROR; + } - *utf8_output = strdup(result_string.c_str()); - *out_size = converted; + result = simdutf::convert_utf16be_to_utf8_with_errors(buf, len, *utf8_output); + if (result.error == simdutf::error_code::SUCCESS && result.count > 0) { + (*utf8_output)[result.count] = '\0'; + *out_size = result.count; return FLB_SIMDUTF_ERROR_CODE_SUCCESS; } else { + flb_free(*utf8_output); *utf8_output = NULL; *out_size = 0; @@ -117,23 +160,24 @@ int flb_simdutf_connector_convert_utf16_to_utf8(const char16_t *buf, size_t len, char **utf8_output, size_t *out_size) { size_t clen = 0; - size_t converted = 0; - simdutf::result result; + simdutf::result result = {}; clen = simdutf::utf8_length_from_utf16(buf, len); - /* convert_utfXXXX_to_utf8 function needs to pass allocated memory region with C++ style */ - std::unique_ptr output{new char[clen]}; - converted = simdutf::convert_utf16_to_utf8(buf, len, output.get()); - result = simdutf::validate_utf8_with_errors(output.get(), clen); - if (result.error == simdutf::error_code::SUCCESS && converted > 0) { - std::string result_string(output.get(), clen); + *utf8_output = (char *) flb_malloc(clen + 1); + if (*utf8_output == NULL) { + flb_errno(); + return FLB_SIMDUTF_CONNECTOR_CONVERT_ERROR; + } - *utf8_output = strdup(result_string.c_str()); - *out_size = converted; + result = simdutf::convert_utf16_to_utf8_with_errors(buf, len, *utf8_output); + if (result.error == simdutf::error_code::SUCCESS && result.count > 0) { + (*utf8_output)[result.count] = '\0'; + *out_size = result.count; return FLB_SIMDUTF_ERROR_CODE_SUCCESS; } else { + flb_free(*utf8_output); *utf8_output = NULL; *out_size = 0; @@ -155,11 +199,7 @@ int flb_simdutf_connector_convert_from_unicode(int preferred_encoding, const char *input, size_t length, char **output, size_t *out_size) { - size_t len = 0; - size_t i = 0; int encoding = 0; - std::u16string str16; - if (preferred_encoding == FLB_SIMDUTF_ENCODING_TYPE_UNICODE_AUTO) { encoding = simdutf::detect_encodings(input, length); } @@ -175,46 +215,22 @@ int flb_simdutf_connector_convert_from_unicode(int preferred_encoding, return FLB_SIMDUTF_CONNECTOR_CONVERT_NOP; } else if ((encoding & simdutf::encoding_type::UTF16_LE) == simdutf::encoding_type::UTF16_LE) { - len = length; - if (len % 2) { - len--; + /* Skip the UTF-16 BOM */ + if (length >= 2 && input[0] == '\xFF' && input[1] == '\xFE') { + input += 2; + length -= 2; } - if (len < 2) { - return FLB_SIMDUTF_CONNECTOR_CONVERT_NOP; - } - for (i = 0 ; i < len;) { - if (i + 2 > len) { - break; - } - /* little-endian */ - int lo = input[i++] & 0xFF; - int hi = input[i++] & 0xFF; - str16.push_back(hi << 8 | lo); - } - - return flb_simdutf_connector_convert_utf16le_to_utf8(str16.c_str(), str16.size(), - output, out_size); + return convert_from_unicode(flb_simdutf_connector_convert_utf16le_to_utf8, + input, length, output, out_size); } else if ((encoding & simdutf::encoding_type::UTF16_BE) == simdutf::encoding_type::UTF16_BE) { - len = length; - if (len % 2) { - len--; + /* Skip the UTF-16 BOM */ + if (length >= 2 && input[0] == '\xFE' && input[1] == '\xFF') { + input += 2; + length -= 2; } - if (len < 2) { - return FLB_SIMDUTF_CONNECTOR_CONVERT_NOP; - } - for (i = 0; i < len;) { - if (i + 2 > len) { - break; - } - /* big-endian */ - int lo = input[i++] & 0xFF; - int hi = input[i++] & 0xFF; - str16.push_back(lo | hi << 8); - } - - return flb_simdutf_connector_convert_utf16be_to_utf8(str16.c_str(), str16.size(), - output, out_size); + return convert_from_unicode(flb_simdutf_connector_convert_utf16be_to_utf8, + input, length, output, out_size); } else { /* Note: UTF-32LE and UTF-32BE are used for internal usages