diff --git a/Cargo.lock b/Cargo.lock index 133aa0aa16202..fcf13dd3cc382 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4340,6 +4340,7 @@ dependencies = [ "databend-storages-common-table-meta", "divan", "enum-as-inner", + "enum_dispatch", "fastrace", "futures", "futures-util", @@ -4351,6 +4352,7 @@ dependencies = [ "opendal", "parking_lot 0.12.3", "parquet", + "paste", "rand 0.8.5", "serde", "serde_json", diff --git a/scripts/selfhost/restore_logs.sh b/scripts/selfhost/restore_logs.sh index 324b369b5ddf0..948216eab4856 100755 --- a/scripts/selfhost/restore_logs.sh +++ b/scripts/selfhost/restore_logs.sh @@ -2,52 +2,52 @@ # Simple logging log() { - echo "[$(date '+%H:%M:%S')] $1" + echo "[$(date '+%H:%M:%S')] $1" } log_error() { - echo "[$(date '+%H:%M:%S')] ERROR: $1" >&2 + echo "[$(date '+%H:%M:%S')] ERROR: $1" >&2 } log_step() { - echo "[$(date '+%H:%M:%S')] [$1/$2] $3" + echo "[$(date '+%H:%M:%S')] [$1/$2] $3" } # Parse arguments while [[ $# -gt 0 ]]; do - case "$1" in - --dsn) - DSN="$2" - shift 2 - ;; - --stage) - STAGE="$2" - shift 2 - ;; - *) - if [[ "$1" =~ ^[0-9]{8}$ ]]; then - DATE_ARG="$1" - shift - else - log_error "Unknown parameter: $1" - exit 1 - fi - ;; - esac + case "$1" in + --dsn) + DSN="$2" + shift 2 + ;; + --stage) + STAGE="$2" + shift 2 + ;; + *) + if [[ "$1" =~ ^[0-9]{8}$ ]]; then + DATE_ARG="$1" + shift + else + log_error "Unknown parameter: $1" + exit 1 + fi + ;; + esac done # Validate parameters if [[ -z "$STAGE" || -z "$DATE_ARG" ]]; then - log_error "Missing required parameters: --stage or yyyymmdd date" - exit 1 + log_error "Missing required parameters: --stage or yyyymmdd date" + exit 1 fi if [[ -z "$DSN" ]]; then - DSN="$BENDSQL_DSN" - if [[ -z "$DSN" ]]; then - log_error "DSN not provided and BENDSQL_DSN not set" - exit 1 - fi + DSN="$BENDSQL_DSN" + if [[ -z "$DSN" ]]; then + log_error "DSN not provided and BENDSQL_DSN not set" + exit 1 + fi fi # Format date @@ -66,8 +66,8 @@ DOWNLOAD_SQL="PRESIGN DOWNLOAD @${STAGE}/${TAR_FILE}" DOWNLOAD_URL=$(bendsql --dsn "${DSN}" --query="${DOWNLOAD_SQL}" | awk '{print $3}') if [[ -z "$DOWNLOAD_URL" ]]; then - log_error "Failed to generate download URL for ${TAR_FILE}" - exit 1 + log_error "Failed to generate download URL for ${TAR_FILE}" + exit 1 fi log "Download URL generated successfully" @@ -76,8 +76,8 @@ log_step "2" "6" "Downloading ${TAR_FILE} from stage @${STAGE}" curl -s -o "${TAR_FILE}" "${DOWNLOAD_URL}" if [[ ! 
-f "${TAR_FILE}" ]]; then - log_error "Failed to download ${TAR_FILE}" - exit 1 + log_error "Failed to download ${TAR_FILE}" + exit 1 fi FILE_SIZE=$(du -h "${TAR_FILE}" | cut -f1) @@ -98,21 +98,21 @@ TARGET_DIRS=("columns" "user_functions" "query_raw_logs" "query_logs" "query_pro PREFIX="" for target_dir in "${TARGET_DIRS[@]}"; do - SAMPLE_FILE=$(find "${TEMP_DIR}" -path "*/${target_dir}/*" -type f | head -1) - if [[ -n "$SAMPLE_FILE" ]]; then - RELATIVE_PATH="${SAMPLE_FILE#${TEMP_DIR}/}" - PREFIX=$(echo "$RELATIVE_PATH" | sed "s|/${target_dir}/.*||" | sed "s|${target_dir}/.*||") - if [[ -n "$PREFIX" ]]; then - PREFIX="${PREFIX}/" - fi - break - fi + SAMPLE_FILE=$(find "${TEMP_DIR}" -path "*/${target_dir}/*" -type f | head -1) + if [[ -n "$SAMPLE_FILE" ]]; then + RELATIVE_PATH="${SAMPLE_FILE#${TEMP_DIR}/}" + PREFIX=$(echo "$RELATIVE_PATH" | sed "s|/${target_dir}/.*||" | sed "s|${target_dir}/.*||") + if [[ -n "$PREFIX" ]]; then + PREFIX="${PREFIX}/" + fi + break + fi done if [[ -n "$PREFIX" ]]; then - log "Path prefix detected: '${PREFIX}' - will be stripped during upload" + log "Path prefix detected: '${PREFIX}' - will be stripped during upload" else - log "No path prefix detected - using original file paths" + log "No path prefix detected - using original file paths" fi # Step 5: Upload files @@ -129,32 +129,32 @@ UPLOAD_SUCCESS=0 UPLOAD_FAILED=0 find "${TEMP_DIR}" -type f | while read -r FILE; do - CURRENT_FILE=$((CURRENT_FILE + 1)) - RELATIVE_PATH="${FILE#${TEMP_DIR}/}" - - if [[ -n "$PREFIX" && "$RELATIVE_PATH" == ${PREFIX}* ]]; then - UPLOAD_PATH="${RELATIVE_PATH#${PREFIX}}" - else - UPLOAD_PATH="$RELATIVE_PATH" - fi - - printf "\rUploading: %d/%d files (Success: %d, Failed: %d)" "$CURRENT_FILE" "$TOTAL_FILES" "$UPLOAD_SUCCESS" "$UPLOAD_FAILED" - - UPLOAD_SQL="PRESIGN UPLOAD @${UPLOAD_STAGE}/${UPLOAD_PATH}" - UPLOAD_URL=$(bendsql --dsn "${DSN}" --query="${UPLOAD_SQL}" | awk '{print $3}') - - if [[ -n "$UPLOAD_URL" ]]; then - if curl -s -X PUT -T "${FILE}" "${UPLOAD_URL}"; then - UPLOAD_SUCCESS=$((UPLOAD_SUCCESS + 1)) - else - UPLOAD_FAILED=$((UPLOAD_FAILED + 1)) - fi - else - UPLOAD_FAILED=$((UPLOAD_FAILED + 1)) - fi + CURRENT_FILE=$((CURRENT_FILE + 1)) + RELATIVE_PATH="${FILE#${TEMP_DIR}/}" + + if [[ -n "$PREFIX" && "$RELATIVE_PATH" == ${PREFIX}* ]]; then + UPLOAD_PATH="${RELATIVE_PATH#${PREFIX}}" + else + UPLOAD_PATH="$RELATIVE_PATH" + fi + + printf "\rUploading: %d/%d files (Success: %d, Failed: %d)" "$CURRENT_FILE" "$TOTAL_FILES" "$UPLOAD_SUCCESS" "$UPLOAD_FAILED" + + UPLOAD_SQL="PRESIGN UPLOAD @${UPLOAD_STAGE}/${UPLOAD_PATH}" + UPLOAD_URL=$(bendsql --dsn "${DSN}" --query="${UPLOAD_SQL}" | awk '{print $3}') + + if [[ -n "$UPLOAD_URL" ]]; then + if curl -s -X PUT -T "${FILE}" "${UPLOAD_URL}"; then + UPLOAD_SUCCESS=$((UPLOAD_SUCCESS + 1)) + else + UPLOAD_FAILED=$((UPLOAD_FAILED + 1)) + fi + else + UPLOAD_FAILED=$((UPLOAD_FAILED + 1)) + fi done -echo # New line after progress +echo # New line after progress log "Upload completed: ${UPLOAD_SUCCESS} successful, ${UPLOAD_FAILED} failed" # Cleanup @@ -171,23 +171,23 @@ log "Created database: ${RESTORE_DATABASE}" # Restore tables declare -A TABLE_MAP=( - ["columns"]="system.columns:columns" - ["user_functions"]="system.user_functions:user_functions" - ["log_history"]="system_history.log_history:query_raw_logs" - ["query_history"]="system_history.query_history:query_logs" - ["profile_history"]="system_history.profile_history:query_profile_logs" + ["columns"]="system.columns:columns" + 
["user_functions"]="system.user_functions:user_functions" + ["log_history"]="system_history.log_history:query_raw_logs" + ["query_history"]="system_history.query_history:query_logs" + ["profile_history"]="system_history.profile_history:query_profile_logs" ) for table_name in "${!TABLE_MAP[@]}"; do - IFS=':' read -r source_table source_path <<< "${TABLE_MAP[$table_name]}" - - log "Restoring table: ${RESTORE_DATABASE}.${table_name} from @${UPLOAD_STAGE}/${source_path}" - - bendsql --dsn "${DSN}" --database "${RESTORE_DATABASE}" --query="CREATE TABLE ${table_name} LIKE ${source_table};" >/dev/null 2>&1 - bendsql --dsn "${DSN}" --database "${RESTORE_DATABASE}" --query="COPY INTO ${table_name} FROM @${UPLOAD_STAGE}/${source_path};" >/dev/null 2>&1 - - ROW_COUNT=$(bendsql --dsn "${DSN}" --database "${RESTORE_DATABASE}" --query="SELECT COUNT(*) FROM ${table_name};" | tail -1) - log "Table ${table_name} restored: ${ROW_COUNT} rows" + IFS=':' read -r source_table source_path <<<"${TABLE_MAP[$table_name]}" + + log "Restoring table: ${RESTORE_DATABASE}.${table_name} from @${UPLOAD_STAGE}/${source_path}" + + bendsql --dsn "${DSN}" --database "${RESTORE_DATABASE}" --query="CREATE TABLE ${table_name} LIKE ${source_table};" >/dev/null 2>&1 + bendsql --dsn "${DSN}" --database "${RESTORE_DATABASE}" --query="COPY INTO ${table_name} FROM @${UPLOAD_STAGE}/${source_path};" >/dev/null 2>&1 + + ROW_COUNT=$(bendsql --dsn "${DSN}" --database "${RESTORE_DATABASE}" --query="SELECT COUNT(*) FROM ${table_name};" | tail -1) + log "Table ${table_name} restored: ${ROW_COUNT} rows" done log "Log restoration completed successfully" diff --git a/src/common/metrics/src/metrics/storage.rs b/src/common/metrics/src/metrics/storage.rs index 8059be0b39645..d7ae34d57f9ec 100644 --- a/src/common/metrics/src/metrics/storage.rs +++ b/src/common/metrics/src/metrics/storage.rs @@ -337,6 +337,14 @@ static BLOCK_VIRTUAL_COLUMN_WRITE_MILLISECONDS: LazyLock = LazyLock:: register_histogram_in_milliseconds("fuse_block_virtual_column_write_milliseconds") }); +// Block statistics metrics. +static BLOCK_STATS_WRITE_NUMS: LazyLock = + LazyLock::new(|| register_counter("fuse_block_stats_write_nums")); +static BLOCK_STATS_WRITE_BYTES: LazyLock = + LazyLock::new(|| register_counter("fuse_block_stats_write_bytes")); +static BLOCK_STATS_WRITE_MILLISECONDS: LazyLock = + LazyLock::new(|| register_histogram_in_milliseconds("fuse_block_stats_write_milliseconds")); + /// Common metrics. pub fn metrics_inc_omit_filter_rowgroups(c: u64) { OMIT_FILTER_ROWGROUPS.inc_by(c); @@ -907,3 +915,16 @@ pub fn metrics_inc_block_virtual_column_write_bytes(c: u64) { pub fn metrics_inc_block_virtual_column_write_milliseconds(c: u64) { BLOCK_VIRTUAL_COLUMN_WRITE_MILLISECONDS.observe(c as f64); } + +/// Block stats metrics. 
+pub fn metrics_inc_block_stats_write_nums(c: u64) { + BLOCK_STATS_WRITE_NUMS.inc_by(c); +} + +pub fn metrics_inc_block_stats_write_bytes(c: u64) { + BLOCK_STATS_WRITE_BYTES.inc_by(c); +} + +pub fn metrics_inc_block_stats_write_milliseconds(c: u64) { + BLOCK_STATS_WRITE_MILLISECONDS.observe(c as f64); +} diff --git a/src/query/ee/src/storages/fuse/operations/vacuum_table.rs b/src/query/ee/src/storages/fuse/operations/vacuum_table.rs index 141f987a8b3e3..19ee34c627efa 100644 --- a/src/query/ee/src/storages/fuse/operations/vacuum_table.rs +++ b/src/query/ee/src/storages/fuse/operations/vacuum_table.rs @@ -40,6 +40,7 @@ pub struct SnapshotReferencedFiles { pub segments: HashSet, pub blocks: HashSet, pub blocks_index: HashSet, + pub blocks_stats: HashSet, } impl SnapshotReferencedFiles { @@ -54,6 +55,9 @@ impl SnapshotReferencedFiles { for file in &self.blocks_index { files.push(file.clone()); } + for file in &self.blocks_stats { + files.push(file.clone()); + } files } } @@ -132,6 +136,7 @@ pub async fn get_snapshot_referenced_files( segments, blocks: locations_referenced.block_location, blocks_index: locations_referenced.bloom_location, + blocks_stats: locations_referenced.stats_location, })) } @@ -164,10 +169,11 @@ pub async fn do_gc_orphan_files( None => return Ok(()), }; let status = format!( - "gc orphan: read referenced files:{},{},{}, cost:{:?}", + "gc orphan: read referenced files:{},{},{},{}, cost:{:?}", referenced_files.segments.len(), referenced_files.blocks.len(), referenced_files.blocks_index.len(), + referenced_files.blocks_stats.len(), start.elapsed() ); ctx.set_status_info(&status); @@ -268,6 +274,36 @@ pub async fn do_gc_orphan_files( ); ctx.set_status_info(&status); + // 5. Purge orphan block stats files. + // 5.1 Get orphan block stats files to be purged + let stats_locations_to_be_purged = get_orphan_files_to_be_purged( + fuse_table, + location_gen.block_statistics_location_prefix(), + referenced_files.blocks_stats, + retention_time, + ) + .await?; + let status = format!( + "gc orphan: read stats_locations_to_be_purged:{}, cost:{:?}", + stats_locations_to_be_purged.len(), + start.elapsed() + ); + ctx.set_status_info(&status); + + // 5.2 Delete all the orphan block stats files to be purged + let purged_file_num = stats_locations_to_be_purged.len(); + fuse_table + .try_purge_location_files( + ctx.clone(), + HashSet::from_iter(stats_locations_to_be_purged.into_iter()), + ) + .await?; + let status = format!( + "gc orphan: purged block stats files:{}, cost:{:?}", + purged_file_num, + start.elapsed() + ); + ctx.set_status_info(&status); Ok(()) } @@ -286,10 +322,11 @@ pub async fn do_dry_run_orphan_files( None => return Ok(()), }; let status = format!( - "dry_run orphan: read referenced files:{},{},{}, cost:{:?}", + "dry_run orphan: read referenced files:{},{},{},{}, cost:{:?}", referenced_files.segments.len(), referenced_files.blocks.len(), referenced_files.blocks_index.len(), + referenced_files.blocks_stats.len(), start.elapsed() ); ctx.set_status_info(&status); @@ -351,6 +388,23 @@ pub async fn do_dry_run_orphan_files( purge_files.extend(index_locations_to_be_purged); + // 5. Get purge orphan block stats files. 
+ let stats_locations_to_be_purged = get_orphan_files_to_be_purged( + fuse_table, + location_gen.block_statistics_location_prefix(), + referenced_files.blocks_stats, + retention_time, + ) + .await?; + let status = format!( + "dry_run orphan: read stats_locations_to_be_purged:{}, cost:{:?}", + stats_locations_to_be_purged.len(), + start.elapsed() + ); + ctx.set_status_info(&status); + + purge_files.extend(stats_locations_to_be_purged); + Ok(()) } diff --git a/src/query/ee/src/storages/fuse/operations/vacuum_table_v2.rs b/src/query/ee/src/storages/fuse/operations/vacuum_table_v2.rs index e03ce27f8466b..da2d43f7f708e 100644 --- a/src/query/ee/src/storages/fuse/operations/vacuum_table_v2.rs +++ b/src/query/ee/src/storages/fuse/operations/vacuum_table_v2.rs @@ -336,6 +336,9 @@ pub async fn do_vacuum2( } indexes_to_gc .push(TableMetaLocationGenerator::gen_bloom_index_location_from_block_location(loc)); + + indexes_to_gc + .push(TableMetaLocationGenerator::gen_block_stats_location_from_block_location(loc)); } ctx.set_status_info(&format!( diff --git a/src/query/ee/tests/it/storages/fuse/operations/vacuum.rs b/src/query/ee/tests/it/storages/fuse/operations/vacuum.rs index 87c2268ceee0d..dfdbbf35266e9 100644 --- a/src/query/ee/tests/it/storages/fuse/operations/vacuum.rs +++ b/src/query/ee/tests/it/storages/fuse/operations/vacuum.rs @@ -66,6 +66,7 @@ async fn test_fuse_do_vacuum_drop_tables() -> Result<()> { 1, 1, 1, + 1, None, None, ) @@ -90,6 +91,7 @@ async fn test_fuse_do_vacuum_drop_tables() -> Result<()> { 1, 1, 1, + 1, None, None, ) @@ -108,6 +110,7 @@ async fn test_fuse_do_vacuum_drop_tables() -> Result<()> { 0, 0, 0, + 0, None, None, ) diff --git a/src/query/expression/src/values.rs b/src/query/expression/src/values.rs index bff7c60ff1703..cd12b0ee8508b 100755 --- a/src/query/expression/src/values.rs +++ b/src/query/expression/src/values.rs @@ -1694,6 +1694,23 @@ impl Column { _ => (false, None), } } + + /// Checks if the average length of a string column exceeds 256 bytes. + /// If it does, the bloom index for the column will not be established. + pub fn check_large_string(&self) -> bool { + let (inner, len) = if let Column::Nullable(c) = self { + (&c.column, c.validity.true_count()) + } else { + (self, self.len()) + }; + if let Column::String(v) = inner { + let bytes_per_row = v.total_bytes_len() / len.max(1); + if bytes_per_row > 256 { + return true; + } + } + false + } } /// Serialize a column to a base64 string. 
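Note on the new Column::check_large_string helper added in the values.rs hunk above: it gates bloom index creation, and for nullable columns it divides the total string bytes by the number of non-null rows (validity.true_count()) rather than the full row count. A minimal sketch of the heuristic; the free-standing function name and signature here are illustrative only, the real method operates on Column/NullableColumn as shown in the hunk.

// Illustrative sketch: skip bloom indexing when strings average more than
// 256 bytes per non-null row. `total_bytes_len` and `non_null_rows` stand in
// for StringColumn::total_bytes_len() and Bitmap::true_count() in the diff.
fn is_large_string(total_bytes_len: usize, non_null_rows: usize) -> bool {
    let bytes_per_row = total_bytes_len / non_null_rows.max(1);
    bytes_per_row > 256
}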
diff --git a/src/query/service/src/interpreters/common/table_option_validation.rs b/src/query/service/src/interpreters/common/table_option_validation.rs index ab332a1dce642..1a967a1c469a2 100644 --- a/src/query/service/src/interpreters/common/table_option_validation.rs +++ b/src/query/service/src/interpreters/common/table_option_validation.rs @@ -24,6 +24,7 @@ use databend_common_exception::ErrorCode; use databend_common_expression::TableSchemaRef; use databend_common_io::constants::DEFAULT_BLOCK_ROW_COUNT; use databend_common_settings::Settings; +use databend_common_sql::ApproxDistinctColumns; use databend_common_sql::BloomIndexColumns; use databend_common_storages_fuse::FUSE_OPT_KEY_BLOCK_IN_MEM_SIZE_THRESHOLD; use databend_common_storages_fuse::FUSE_OPT_KEY_BLOCK_PER_SEGMENT; @@ -35,6 +36,8 @@ use databend_common_storages_fuse::FUSE_OPT_KEY_ROW_AVG_DEPTH_THRESHOLD; use databend_common_storages_fuse::FUSE_OPT_KEY_ROW_PER_BLOCK; use databend_common_storages_fuse::FUSE_OPT_KEY_ROW_PER_PAGE; use databend_storages_common_index::BloomIndex; +use databend_storages_common_index::RangeIndex; +use databend_storages_common_table_meta::table::OPT_KEY_APPROX_DISTINCT_COLUMNS; use databend_storages_common_table_meta::table::OPT_KEY_BLOOM_INDEX_COLUMNS; use databend_storages_common_table_meta::table::OPT_KEY_CHANGE_TRACKING; use databend_storages_common_table_meta::table::OPT_KEY_CLUSTER_TYPE; @@ -68,6 +71,7 @@ pub static CREATE_FUSE_OPTIONS: LazyLock> = LazyLock::new( r.insert(FUSE_OPT_KEY_ENABLE_AUTO_VACUUM); r.insert(OPT_KEY_BLOOM_INDEX_COLUMNS); + r.insert(OPT_KEY_APPROX_DISTINCT_COLUMNS); r.insert(OPT_KEY_TABLE_COMPRESSION); r.insert(OPT_KEY_STORAGE_FORMAT); r.insert(OPT_KEY_DATABASE_ID); @@ -213,6 +217,16 @@ pub fn is_valid_bloom_index_columns( Ok(()) } +pub fn is_valid_approx_distinct_columns( + options: &BTreeMap, + schema: TableSchemaRef, +) -> databend_common_exception::Result<()> { + if let Some(value) = options.get(OPT_KEY_APPROX_DISTINCT_COLUMNS) { + ApproxDistinctColumns::verify_definition(value, schema, RangeIndex::supported_table_type)?; + } + Ok(()) +} + pub fn is_valid_change_tracking( options: &BTreeMap, ) -> databend_common_exception::Result<()> { diff --git a/src/query/service/src/interpreters/interpreter_table_create.rs b/src/query/service/src/interpreters/interpreter_table_create.rs index de7c27cdd1777..faf37f95ee9d5 100644 --- a/src/query/service/src/interpreters/interpreter_table_create.rs +++ b/src/query/service/src/interpreters/interpreter_table_create.rs @@ -66,6 +66,7 @@ use databend_storages_common_table_meta::table::OPT_KEY_TEMP_PREFIX; use log::error; use log::info; +use crate::interpreters::common::table_option_validation::is_valid_approx_distinct_columns; use crate::interpreters::common::table_option_validation::is_valid_block_per_segment; use crate::interpreters::common::table_option_validation::is_valid_bloom_index_columns; use crate::interpreters::common::table_option_validation::is_valid_change_tracking; @@ -467,7 +468,8 @@ impl CreateTableInterpreter { is_valid_block_per_segment(&table_meta.options)?; is_valid_row_per_block(&table_meta.options)?; // check bloom_index_columns. 
- is_valid_bloom_index_columns(&table_meta.options, schema)?; + is_valid_bloom_index_columns(&table_meta.options, schema.clone())?; + is_valid_approx_distinct_columns(&table_meta.options, schema)?; is_valid_change_tracking(&table_meta.options)?; // check random seed is_valid_random_seed(&table_meta.options)?; diff --git a/src/query/service/src/interpreters/interpreter_table_set_options.rs b/src/query/service/src/interpreters/interpreter_table_set_options.rs index d75f94772cddf..93185f25feeb8 100644 --- a/src/query/service/src/interpreters/interpreter_table_set_options.rs +++ b/src/query/service/src/interpreters/interpreter_table_set_options.rs @@ -49,6 +49,7 @@ use databend_storages_common_table_meta::table::OPT_KEY_STORAGE_FORMAT; use databend_storages_common_table_meta::table::OPT_KEY_TEMP_PREFIX; use log::error; +use crate::interpreters::common::table_option_validation::is_valid_approx_distinct_columns; use crate::interpreters::common::table_option_validation::is_valid_block_per_segment; use crate::interpreters::common::table_option_validation::is_valid_bloom_index_columns; use crate::interpreters::common::table_option_validation::is_valid_create_opt; @@ -163,6 +164,7 @@ impl Interpreter for SetOptionsInterpreter { // check bloom_index_columns. is_valid_bloom_index_columns(&self.plan.set_options, table.schema())?; + is_valid_approx_distinct_columns(&self.plan.set_options, table.schema())?; if let Some(new_snapshot_location) = set_segment_format(self.ctx.clone(), table.clone(), &self.plan.set_options).await? diff --git a/src/query/service/src/test_kits/block_writer.rs b/src/query/service/src/test_kits/block_writer.rs index 6f81b9a8f1dbe..75914348ed1f2 100644 --- a/src/query/service/src/test_kits/block_writer.rs +++ b/src/query/service/src/test_kits/block_writer.rs @@ -19,14 +19,17 @@ use databend_common_expression::FunctionContext; use databend_common_expression::TableSchemaRef; use databend_common_io::constants::DEFAULT_BLOCK_BUFFER_SIZE; use databend_common_io::constants::DEFAULT_BLOCK_INDEX_BUFFER_SIZE; +use databend_common_sql::ApproxDistinctColumns; use databend_common_sql::BloomIndexColumns; use databend_common_storages_fuse::io::serialize_block; +use databend_common_storages_fuse::io::BlockStatisticsState; use databend_common_storages_fuse::io::TableMetaLocationGenerator; use databend_common_storages_fuse::io::WriteSettings; use databend_common_storages_fuse::FuseStorageFormat; use databend_storages_common_blocks::blocks_to_parquet; use databend_storages_common_index::BloomIndex; use databend_storages_common_index::BloomIndexBuilder; +use databend_storages_common_index::RangeIndex; use databend_storages_common_table_meta::meta::BlockMeta; use databend_storages_common_table_meta::meta::ClusterStatistics; use databend_storages_common_table_meta::meta::Compression; @@ -85,6 +88,9 @@ impl<'a> BlockWriter<'a> { let (bloom_filter_index_size, bloom_filter_index_location, meta) = self .build_block_index(data_accessor, schema.clone(), &block, block_id) .await?; + let (block_stats_size, block_stats_location) = self + .build_block_stats(data_accessor, schema.clone(), &block, block_id) + .await?; let write_settings = WriteSettings { storage_format, @@ -112,6 +118,8 @@ impl<'a> BlockWriter<'a> { None, None, None, + block_stats_location, + block_stats_size, Compression::Lz4Raw, Some(Utc::now()), ); @@ -154,4 +162,29 @@ impl<'a> BlockWriter<'a> { Ok((0u64, None, None)) } } + + pub async fn build_block_stats( + &self, + data_accessor: &Operator, + schema: TableSchemaRef, + block: &DataBlock, + 
block_id: Uuid, + ) -> Result<(u64, Option)> { + let location = self.location_generator.block_stats_location(&block_id); + + let hll_columns = ApproxDistinctColumns::All; + let ndv_columns_map = + hll_columns.distinct_column_fields(schema.clone(), RangeIndex::supported_table_type)?; + let maybe_block_stats = + BlockStatisticsState::from_data_block(location, block, &ndv_columns_map)?; + if let Some(block_stats) = maybe_block_stats { + let size = block_stats.block_stats_size(); + data_accessor + .write(&block_stats.location.0, block_stats.data) + .await?; + Ok((size, Some(block_stats.location))) + } else { + Ok((0u64, None)) + } + } } diff --git a/src/query/service/src/test_kits/check.rs b/src/query/service/src/test_kits/check.rs index a8a82cf60727a..6943810c7d949 100644 --- a/src/query/service/src/test_kits/check.rs +++ b/src/query/service/src/test_kits/check.rs @@ -24,6 +24,7 @@ use databend_common_meta_app::storage::StorageParams; use databend_common_storages_fuse::operations::load_last_snapshot_hint; use databend_common_storages_fuse::FuseTable; use databend_common_storages_fuse::FUSE_TBL_BLOCK_PREFIX; +use databend_common_storages_fuse::FUSE_TBL_BLOCK_STATISTICS_PREFIX; use databend_common_storages_fuse::FUSE_TBL_SEGMENT_PREFIX; use databend_common_storages_fuse::FUSE_TBL_SNAPSHOT_PREFIX; use databend_common_storages_fuse::FUSE_TBL_SNAPSHOT_STATISTICS_PREFIX; @@ -79,6 +80,7 @@ pub async fn check_data_dir( segment_count: u32, block_count: u32, index_count: u32, + block_stat_count: u32, check_last_snapshot: Option<()>, check_table_statistic_file: Option<()>, ) -> Result<()> { @@ -92,12 +94,14 @@ pub async fn check_data_dir( let mut sg_count = 0; let mut b_count = 0; let mut i_count = 0; + let mut b_stat_count = 0; let mut table_statistic_files = vec![]; let prefix_snapshot = FUSE_TBL_SNAPSHOT_PREFIX; let prefix_snapshot_statistics = FUSE_TBL_SNAPSHOT_STATISTICS_PREFIX; let prefix_segment = FUSE_TBL_SEGMENT_PREFIX; let prefix_block = FUSE_TBL_BLOCK_PREFIX; let prefix_index = FUSE_TBL_XOR_BLOOM_INDEX_PREFIX; + let prefix_block_stats = FUSE_TBL_BLOCK_STATISTICS_PREFIX; for entry in WalkDir::new(root) { let entry = entry.unwrap(); if entry.file_type().is_file() { @@ -109,6 +113,8 @@ pub async fn check_data_dir( ss_count += 1; } else if path.starts_with(prefix_segment) { sg_count += 1; + } else if path.starts_with(prefix_block_stats) { + b_stat_count += 1; } else if path.starts_with(prefix_block) { b_count += 1; } else if path.starts_with(prefix_index) { @@ -142,6 +148,12 @@ pub async fn check_data_dir( case_name ); + assert_eq!( + b_stat_count, block_stat_count, + "case [{}], check block statistics count", + case_name + ); + assert_eq!( i_count, index_count, "case [{}], check index count", diff --git a/src/query/service/tests/it/storages/fuse/bloom_index_meta_size.rs b/src/query/service/tests/it/storages/fuse/bloom_index_meta_size.rs index 520231794d2cb..3f958654bf778 100644 --- a/src/query/service/tests/it/storages/fuse/bloom_index_meta_size.rs +++ b/src/query/service/tests/it/storages/fuse/bloom_index_meta_size.rs @@ -340,6 +340,8 @@ fn build_test_segment_info( vector_index_size: None, vector_index_location: None, virtual_block_meta: None, + block_stats_location: None, + block_stats_size: 0, compression: Compression::Lz4, create_on: Some(Utc::now()), }; diff --git a/src/query/service/tests/it/storages/fuse/meta/column_oriented.rs b/src/query/service/tests/it/storages/fuse/meta/column_oriented.rs index 0a4d46b8c6f40..dc50928f2b4e1 100644 --- 
a/src/query/service/tests/it/storages/fuse/meta/column_oriented.rs +++ b/src/query/service/tests/it/storages/fuse/meta/column_oriented.rs @@ -288,6 +288,39 @@ fn check_block_level_meta( assert!(is_null); } + // check block stats location + let block_stats_location = column_oriented_segment + .col_by_name(&[BLOCK_STATS_LOCATION]) + .unwrap(); + for (block_stats_location, block_meta) in block_stats_location.iter().zip(block_metas.iter()) { + let block_stats_location = block_stats_location.as_tuple(); + if let Some(block_stats_location) = block_stats_location { + assert_eq!( + block_stats_location[0].as_string().unwrap(), + &block_meta.block_stats_location.as_ref().unwrap().0 + ); + assert_eq!( + block_stats_location[1] + .as_number() + .unwrap() + .as_u_int64() + .unwrap(), + &block_meta.block_stats_location.as_ref().unwrap().1 + ); + } else { + assert!(block_meta.block_stats_location.is_none()); + } + } + + // check block stats size + let block_stats_size = column_oriented_segment + .col_by_name(&[BLOCK_STATS_SIZE]) + .unwrap(); + for (block_stats_size, block_meta) in block_stats_size.iter().zip(block_metas.iter()) { + let block_stats_size = block_stats_size.as_number().unwrap().as_u_int64().unwrap(); + assert_eq!(block_stats_size, &block_meta.block_stats_size); + } + // check compression let compression = column_oriented_segment.col_by_name(&[COMPRESSION]).unwrap(); for (compression, block_meta) in compression.iter().zip(block_metas.iter()) { @@ -372,7 +405,7 @@ async fn test_segment_cache() -> Result<()> { ) .await?; let cached = cache.get(&location).unwrap(); - assert_eq!(cached.segment_schema.fields.len(), 10); + assert_eq!(cached.segment_schema.fields.len(), 12); assert_eq!(cached.segment_schema, segment_schema(&TableSchema::empty())); check_summary(&block_metas, &cached); check_block_level_meta(&block_metas, &cached); @@ -385,7 +418,7 @@ async fn test_segment_cache() -> Result<()> { let _column_oriented_segment = read_column_oriented_segment(operator.clone(), &location, &projection, true).await?; let cached = cache.get(&location).unwrap(); - assert_eq!(cached.segment_schema.fields.len(), 12); + assert_eq!(cached.segment_schema.fields.len(), 14); let column_1 = table_schema.field_of_column_id(col_id).unwrap(); let stat_1 = column_oriented_segment @@ -409,7 +442,7 @@ async fn test_segment_cache() -> Result<()> { read_column_oriented_segment(operator.clone(), &location, &projection, true).await?; let cached = cache.get(&location).unwrap(); // column 2 does not have stats - assert_eq!(cached.segment_schema.fields.len(), 13); + assert_eq!(cached.segment_schema.fields.len(), 15); check_summary(&block_metas, &cached); check_block_level_meta(&block_metas, &cached); check_column_stats_and_meta(&block_metas, &cached, &[1, 2]); @@ -423,7 +456,7 @@ async fn test_segment_cache() -> Result<()> { read_column_oriented_segment(operator.clone(), &location, &projection, true).await?; let cached = cache.get(&location).unwrap(); // column 2 does not have stats - assert_eq!(cached.segment_schema.fields.len(), 13); + assert_eq!(cached.segment_schema.fields.len(), 15); check_summary(&block_metas, &cached); check_block_level_meta(&block_metas, &cached); check_column_stats_and_meta(&block_metas, &cached, &[1, 2]); diff --git a/src/query/service/tests/it/storages/fuse/operations/analyze.rs b/src/query/service/tests/it/storages/fuse/operations/analyze.rs index 2a23d26433b5b..0b5ee07035bc5 100644 --- a/src/query/service/tests/it/storages/fuse/operations/analyze.rs +++ 
b/src/query/service/tests/it/storages/fuse/operations/analyze.rs @@ -32,7 +32,7 @@ async fn test_fuse_snapshot_analyze() -> Result<()> { do_insertions(&fixture).await?; analyze_table(&fixture).await?; - check_data_dir(&fixture, case_name, 3, 1, 2, 2, 2, Some(()), None).await?; + check_data_dir(&fixture, case_name, 3, 1, 2, 2, 2, 2, Some(()), None).await?; // Purge will keep at least two snapshots. let table = fixture.latest_default_table().await?; @@ -42,7 +42,7 @@ async fn test_fuse_snapshot_analyze() -> Result<()> { fuse_table .do_purge(&table_ctx, snapshot_files, None, true, false) .await?; - check_data_dir(&fixture, case_name, 1, 1, 1, 1, 1, Some(()), Some(())).await?; + check_data_dir(&fixture, case_name, 1, 1, 1, 1, 1, 1, Some(()), Some(())).await?; Ok(()) } @@ -65,7 +65,7 @@ async fn test_fuse_snapshot_analyze_and_truncate() -> Result<()> { fixture.execute_command(&qry).await?; - check_data_dir(&fixture, case_name, 3, 1, 2, 2, 2, None, Some(())).await?; + check_data_dir(&fixture, case_name, 3, 1, 2, 2, 2, 2, None, Some(())).await?; } // truncate table @@ -103,7 +103,19 @@ async fn test_fuse_snapshot_analyze_purge() -> Result<()> { // optimize statistics three times for i in 0..3 { analyze_table(&fixture).await?; - check_data_dir(&fixture, case_name, 3 + i, 1 + i, 2, 2, 2, Some(()), None).await?; + check_data_dir( + &fixture, + case_name, + 3 + i, + 1 + i, + 2, + 2, + 2, + 2, + Some(()), + None, + ) + .await?; } // Purge will keep at least two snapshots. @@ -114,7 +126,7 @@ async fn test_fuse_snapshot_analyze_purge() -> Result<()> { fuse_table .do_purge(&table_ctx, snapshot_files, None, true, false) .await?; - check_data_dir(&fixture, case_name, 1, 1, 1, 1, 1, Some(()), Some(())).await?; + check_data_dir(&fixture, case_name, 1, 1, 1, 1, 1, 1, Some(()), Some(())).await?; Ok(()) } diff --git a/src/query/service/tests/it/storages/fuse/operations/gc.rs b/src/query/service/tests/it/storages/fuse/operations/gc.rs index 24202424686f6..bb60f0dbf240f 100644 --- a/src/query/service/tests/it/storages/fuse/operations/gc.rs +++ b/src/query/service/tests/it/storages/fuse/operations/gc.rs @@ -59,6 +59,7 @@ async fn test_fuse_purge_normal_case() -> Result<()> { 1, // 1 segments 1, // 1 blocks 1, // 1 index + 1, // 1 block statistic Some(()), None, ) @@ -118,9 +119,10 @@ async fn test_fuse_purge_normal_orphan_snapshot() -> Result<()> { "do_gc: there should be 1 snapshot, 0 segment/block", expected_num_of_snapshot, 0, // 0 snapshot statistic - 1, // 0 segments - 1, // 0 blocks - 1, // 0 index + 1, // 1 segments + 1, // 1 blocks + 1, // 1 index + 1, // 1 block statistic Some(()), None, ) @@ -250,6 +252,7 @@ async fn test_fuse_purge_orphan_retention() -> Result<()> { let expected_num_of_segment = 3; let expected_num_of_blocks = 3; let expected_num_of_index = expected_num_of_blocks; + let expected_num_of_block_stats = expected_num_of_blocks; check_data_dir( &fixture, "do_gc: verify retention period", @@ -258,6 +261,7 @@ async fn test_fuse_purge_orphan_retention() -> Result<()> { expected_num_of_segment, expected_num_of_blocks, expected_num_of_index, + expected_num_of_block_stats, Some(()), None, ) @@ -296,6 +300,7 @@ async fn test_fuse_purge_older_version() -> Result<()> { let expected_num_of_segment = 3; let expected_num_of_blocks = 6; let expected_num_of_index = expected_num_of_blocks; + let expected_num_of_block_stats = expected_num_of_blocks; check_data_dir( &fixture, "do_gc: navigate to time point", @@ -304,6 +309,7 @@ async fn test_fuse_purge_older_version() -> Result<()> { expected_num_of_segment, 
expected_num_of_blocks, expected_num_of_index, + expected_num_of_block_stats, Some(()), None, ) @@ -317,7 +323,7 @@ async fn test_fuse_purge_older_version() -> Result<()> { { let table = fixture.latest_default_table().await?; compact_segment(ctx.clone(), &table).await?; - check_data_dir(&fixture, "", 4, 0, 5, 7, 7, Some(()), None).await?; + check_data_dir(&fixture, "", 4, 0, 5, 7, 7, 7, Some(()), None).await?; } let table = fixture.latest_default_table().await?; @@ -333,6 +339,7 @@ async fn test_fuse_purge_older_version() -> Result<()> { let expected_num_of_segment = 1; let expected_num_of_blocks = 7; let expected_num_of_index = expected_num_of_blocks; + let expected_num_of_block_stats = expected_num_of_blocks; check_data_dir( &fixture, "do_gc: with older version", @@ -341,6 +348,7 @@ async fn test_fuse_purge_older_version() -> Result<()> { expected_num_of_segment, expected_num_of_blocks, expected_num_of_index, + expected_num_of_block_stats, Some(()), None, ) @@ -357,6 +365,7 @@ async fn test_fuse_purge_older_version() -> Result<()> { let expected_num_of_segment = 0; let expected_num_of_blocks = 0; let expected_num_of_index = expected_num_of_blocks; + let expected_num_of_block_stats = expected_num_of_blocks; check_data_dir( &fixture, "do_gc: purge last snapshot", @@ -365,6 +374,7 @@ async fn test_fuse_purge_older_version() -> Result<()> { expected_num_of_segment, expected_num_of_blocks, expected_num_of_index, + expected_num_of_block_stats, Some(()), None, ) diff --git a/src/query/service/tests/it/storages/fuse/operations/mutation/recluster_mutator.rs b/src/query/service/tests/it/storages/fuse/operations/mutation/recluster_mutator.rs index 97c3e9cf85f0d..8d0ec47d7f19e 100644 --- a/src/query/service/tests/it/storages/fuse/operations/mutation/recluster_mutator.rs +++ b/src/query/service/tests/it/storages/fuse/operations/mutation/recluster_mutator.rs @@ -81,6 +81,8 @@ async fn test_recluster_mutator_block_select() -> Result<()> { None, None, None, + None, + 0, meta::Compression::Lz4Raw, Some(Utc::now()), )); diff --git a/src/query/service/tests/it/storages/fuse/operations/mutation/segments_compact_mutator.rs b/src/query/service/tests/it/storages/fuse/operations/mutation/segments_compact_mutator.rs index ebdcabfd13662..8387d3c7a2ace 100644 --- a/src/query/service/tests/it/storages/fuse/operations/mutation/segments_compact_mutator.rs +++ b/src/query/service/tests/it/storages/fuse/operations/mutation/segments_compact_mutator.rs @@ -781,6 +781,8 @@ impl CompactSegmentTestFixture { None, None, None, + None, + 0, Compression::Lz4Raw, Some(Utc::now()), ); diff --git a/src/query/service/tests/it/storages/fuse/operations/optimize.rs b/src/query/service/tests/it/storages/fuse/operations/optimize.rs index 02f300cf3680d..5470aa4853b8d 100644 --- a/src/query/service/tests/it/storages/fuse/operations/optimize.rs +++ b/src/query/service/tests/it/storages/fuse/operations/optimize.rs @@ -24,12 +24,12 @@ use crate::storages::fuse::utils::do_purge_test; #[tokio::test(flavor = "multi_thread")] async fn test_fuse_snapshot_optimize_purge() -> Result<()> { - do_purge_test("test_fuse_snapshot_optimize_purge", 1, 0, 1, 1, 1).await + do_purge_test("test_fuse_snapshot_optimize_purge", 1, 0, 1, 1, 1, 1).await } #[tokio::test(flavor = "multi_thread")] async fn test_fuse_snapshot_optimize_all() -> Result<()> { - do_purge_test("test_fuse_snapshot_optimize_all", 1, 0, 1, 1, 1).await + do_purge_test("test_fuse_snapshot_optimize_all", 1, 0, 1, 1, 1, 1).await } #[tokio::test(flavor = "multi_thread")] diff --git 
a/src/query/service/tests/it/storages/fuse/operations/purge_drop.rs b/src/query/service/tests/it/storages/fuse/operations/purge_drop.rs index 75c8bb405a844..54b4a535c9946 100644 --- a/src/query/service/tests/it/storages/fuse/operations/purge_drop.rs +++ b/src/query/service/tests/it/storages/fuse/operations/purge_drop.rs @@ -46,6 +46,20 @@ async fn test_fuse_snapshot_truncate_in_drop_all_stmt() -> Result<()> { // ingests some test data append_sample_data(1, &fixture).await?; + check_data_dir( + &fixture, + "drop table: there should be 1 snapshot, 1 segment/block", + 1, // 1 snapshot + 0, // 0 snapshot statistic + 1, // 1 segments + 1, // 1 blocks + 1, // 1 index + 1, // 1 block statistic + None, + None, + ) + .await?; + // let's Drop let qry = format!("drop table {}.{} all", db, tbl); fixture.execute_command(qry.as_str()).await?; @@ -58,6 +72,7 @@ async fn test_fuse_snapshot_truncate_in_drop_all_stmt() -> Result<()> { 0, // 0 segments 0, // 0 blocks 0, // 0 index + 0, // 0 block statistic None, None, ) diff --git a/src/query/service/tests/it/storages/fuse/operations/read_plan.rs b/src/query/service/tests/it/storages/fuse/operations/read_plan.rs index 09dccdb11816e..eb2d64ad67465 100644 --- a/src/query/service/tests/it/storages/fuse/operations/read_plan.rs +++ b/src/query/service/tests/it/storages/fuse/operations/read_plan.rs @@ -107,6 +107,8 @@ fn test_to_partitions() -> Result<()> { None, None, None, + None, + 0, meta::Compression::Lz4Raw, Some(Utc::now()), )); diff --git a/src/query/service/tests/it/storages/fuse/statistics.rs b/src/query/service/tests/it/storages/fuse/statistics.rs index c87b524f9217f..4bcfc5bf9cf6d 100644 --- a/src/query/service/tests/it/storages/fuse/statistics.rs +++ b/src/query/service/tests/it/storages/fuse/statistics.rs @@ -637,6 +637,8 @@ fn test_reduce_block_meta() -> databend_common_exception::Result<()> { None, None, None, + None, + 0, Compression::Lz4Raw, Some(Utc::now()), ); diff --git a/src/query/service/tests/it/storages/fuse/utils.rs b/src/query/service/tests/it/storages/fuse/utils.rs index 4e29ed6594705..2dbf9d36de9dc 100644 --- a/src/query/service/tests/it/storages/fuse/utils.rs +++ b/src/query/service/tests/it/storages/fuse/utils.rs @@ -54,6 +54,7 @@ pub async fn do_purge_test( segment_count: u32, block_count: u32, index_count: u32, + block_stat_count: u32, ) -> Result<()> { let fixture = TestFixture::setup().await?; fixture.create_default_database().await?; @@ -81,6 +82,7 @@ pub async fn do_purge_test( segment_count, block_count, index_count, + block_stat_count, Some(()), None, ) diff --git a/src/query/settings/src/settings_default.rs b/src/query/settings/src/settings_default.rs index 9b32bbbc233fd..cc029a648ce33 100644 --- a/src/query/settings/src/settings_default.rs +++ b/src/query/settings/src/settings_default.rs @@ -1320,7 +1320,7 @@ impl DefaultSettings { range: None, }), ("enable_block_stream_write", DefaultSettingValue { - value: UserSettingValue::UInt64(0), + value: UserSettingValue::UInt64(1), desc: "Enables block stream write", mode: SettingMode::Both, scope: SettingScope::Both, diff --git a/src/query/sql/src/planner/metadata/hll_columns.rs b/src/query/sql/src/planner/metadata/hll_columns.rs new file mode 100644 index 0000000000000..ec708a5744153 --- /dev/null +++ b/src/query/sql/src/planner/metadata/hll_columns.rs @@ -0,0 +1,148 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::BTreeMap; +use std::str::FromStr; + +use databend_common_ast::parser::parse_comma_separated_idents; +use databend_common_ast::parser::tokenize_sql; +use databend_common_ast::parser::Dialect; +use databend_common_exception::ErrorCode; +use databend_common_exception::Result; +use databend_common_expression::ComputedExpr; +use databend_common_expression::FieldIndex; +use databend_common_expression::TableDataType; +use databend_common_expression::TableField; +use databend_common_expression::TableSchemaRef; +use databend_common_meta_app::tenant::Tenant; +use databend_common_settings::Settings; + +use crate::normalize_identifier; +use crate::NameResolutionContext; + +#[derive(Clone)] +pub enum ApproxDistinctColumns { + /// Default, all columns that support distinct columns. + All, + /// Specify with column names. + Specify(Vec), + /// The column of distinct is empty. + None, +} + +impl FromStr for ApproxDistinctColumns { + type Err = ErrorCode; + + fn from_str(s: &str) -> std::result::Result { + let s = s.trim(); + if s.is_empty() { + return Ok(ApproxDistinctColumns::None); + } + + let sql_dialect = Dialect::default(); + let tokens = tokenize_sql(s)?; + let idents = parse_comma_separated_idents(&tokens, sql_dialect)?; + + let settings = Settings::create(Tenant::new_literal("dummy")); + let name_resolution_ctx = NameResolutionContext::try_from(settings.as_ref())?; + + let mut cols = Vec::with_capacity(idents.len()); + idents + .into_iter() + .for_each(|ident| cols.push(normalize_identifier(&ident, &name_resolution_ctx).name)); + + Ok(ApproxDistinctColumns::Specify(cols)) + } +} + +impl ApproxDistinctColumns { + /// Verify the definition based on schema. 
+ pub fn verify_definition( + definition: &str, + schema: TableSchemaRef, + verify_type: F, + ) -> Result<()> + where + F: Fn(&TableDataType) -> bool, + { + let definition = definition.trim(); + if definition.is_empty() { + return Ok(()); + } + + let settings = Settings::create(Tenant::new_literal("dummy")); + let name_resolution_ctx = NameResolutionContext::try_from(settings.as_ref())?; + + let sql_dialect = Dialect::default(); + let tokens = tokenize_sql(definition)?; + let idents = parse_comma_separated_idents(&tokens, sql_dialect)?; + for ident in idents.iter() { + let name = &normalize_identifier(ident, &name_resolution_ctx).name; + let field = schema.field_with_name(name)?; + + if matches!(field.computed_expr(), Some(ComputedExpr::Virtual(_))) { + return Err(ErrorCode::TableOptionInvalid(format!( + "The value specified for computed column '{}' is not allowed for distinct columns", + name + ))); + } + + let data_type = field.data_type(); + if !verify_type(data_type) { + return Err(ErrorCode::TableOptionInvalid(format!( + "Unsupported data type '{}' for distinct columns", + data_type + ))); + } + } + Ok(()) + } + + pub fn distinct_column_fields( + &self, + schema: TableSchemaRef, + verify_type: F, + ) -> Result> + where + F: Fn(&TableDataType) -> bool, + { + let source_schema = schema.remove_virtual_computed_fields(); + let mut fields_map = BTreeMap::new(); + match self { + ApproxDistinctColumns::All => { + for (i, field) in source_schema.fields.into_iter().enumerate() { + if verify_type(field.data_type()) { + fields_map.insert(i, field); + } + } + } + ApproxDistinctColumns::Specify(cols) => { + for col in cols { + let field_index = source_schema.index_of(col)?; + let field = source_schema.fields[field_index].clone(); + let data_type = field.data_type(); + if !verify_type(data_type) { + return Err(ErrorCode::BadArguments(format!( + "Unsupported data type for distinct columns: {:?}", + data_type + ))); + } + fields_map.insert(field_index, field); + } + } + ApproxDistinctColumns::None => (), + } + Ok(fields_map) + } +} diff --git a/src/query/sql/src/planner/metadata/mod.rs b/src/query/sql/src/planner/metadata/mod.rs index f3de756113350..e37aa7b078812 100644 --- a/src/query/sql/src/planner/metadata/mod.rs +++ b/src/query/sql/src/planner/metadata/mod.rs @@ -13,8 +13,10 @@ // limitations under the License. 
mod bloom_index; +mod hll_columns; #[allow(clippy::module_inception)] mod metadata; pub use bloom_index::BloomIndexColumns; +pub use hll_columns::ApproxDistinctColumns; pub use metadata::*; diff --git a/src/query/storages/common/cache/src/manager.rs b/src/query/storages/common/cache/src/manager.rs index 2e4aeb6b0fb6b..1b8432a52ce98 100644 --- a/src/query/storages/common/cache/src/manager.rs +++ b/src/query/storages/common/cache/src/manager.rs @@ -1033,6 +1033,8 @@ mod tests { vector_index_location: None, vector_index_size: None, virtual_block_meta: None, + block_stats_location: None, + block_stats_size: 0, compression: Compression::Lz4, create_on: None, }); diff --git a/src/query/storages/common/index/src/bloom_index.rs b/src/query/storages/common/index/src/bloom_index.rs index bf5ec35035e64..9af4d2a3734fa 100644 --- a/src/query/storages/common/index/src/bloom_index.rs +++ b/src/query/storages/common/index/src/bloom_index.rs @@ -14,6 +14,7 @@ use std::collections::BTreeMap; use std::collections::HashMap; +use std::hash::DefaultHasher; use std::hash::Hasher; use std::ops::ControlFlow; use std::ops::Deref; @@ -35,12 +36,18 @@ use databend_common_expression::types::BinaryType; use databend_common_expression::types::Bitmap; use databend_common_expression::types::Buffer; use databend_common_expression::types::DataType; +use databend_common_expression::types::DateType; use databend_common_expression::types::MapType; use databend_common_expression::types::NullableType; use databend_common_expression::types::Number; use databend_common_expression::types::NumberDataType; +use databend_common_expression::types::NumberType; +use databend_common_expression::types::StringType; +use databend_common_expression::types::TimestampType; use databend_common_expression::types::UInt64Type; +use databend_common_expression::types::ValueType; use databend_common_expression::visit_expr; +use databend_common_expression::with_number_mapped_type; use databend_common_expression::BlockEntry; use databend_common_expression::Column; use databend_common_expression::ColumnBuilder; @@ -349,6 +356,68 @@ impl BloomIndex { Ok(column) } + pub fn calculate_digest_by_type(data_type: &DataType, column: &Column) -> Result> { + let inner_type = data_type.remove_nullable(); + with_number_mapped_type!(|NUM_TYPE| match inner_type { + DataType::Number(NumberDataType::NUM_TYPE) => { + Self::calculate_nullable_column_digests::>(column) + } + DataType::String => { + Self::calculate_nullable_column_digests::(column) + } + DataType::Date => { + Self::calculate_nullable_column_digests::(column) + } + DataType::Timestamp => { + Self::calculate_nullable_column_digests::(column) + } + _ => Err(ErrorCode::Internal(format!( + "Unsupported data type: {:?}", + data_type + ))), + }) + } + + #[inline(always)] + fn hash_one(v: &T) -> u64 { + let mut hasher = DefaultHasher::default(); + DFHash::hash(v, &mut hasher); + hasher.finish() + } + + fn calculate_nullable_column_digests(column: &Column) -> Result> + where for<'a> T::ScalarRef<'a>: DFHash { + let (column, validity) = if let Column::Nullable(box inner) = column { + let validity = if inner.validity.null_count() == 0 { + None + } else { + Some(&inner.validity) + }; + (&inner.column, validity) + } else { + (column, None) + }; + + let capacity = validity.map_or(column.len(), |v| v.true_count()); + let mut result = Vec::with_capacity(capacity); + let column = T::try_downcast_column(column).unwrap(); + if let Some(validity) = validity { + let column_iter = T::iter_column(&column); + let value_iter = 
column_iter + .zip(validity.iter()) + .filter(|(_, v)| *v) + .map(|(v, _)| v); + for value in value_iter { + result.push(Self::hash_one(&value)); + } + } else { + for value in T::iter_column(&column) { + result.push(Self::hash_one(&value)); + } + } + Ok(result) + } + /// calculate digest for column that may have null values /// /// returns (column, validity) where column is the digest of the column @@ -556,18 +625,6 @@ impl BloomIndex { let data_type = DataType::from(data_type); Xor8Filter::supported_type(&data_type) } - - /// Checks if the average length of a string column exceeds 256 bytes. - /// If it does, the bloom index for the column will not be established. - pub fn check_large_string(column: &Column) -> bool { - if let Column::String(v) = &column { - let bytes_per_row = v.total_bytes_len() / v.len().max(1); - if bytes_per_row > 256 { - return true; - } - } - false - } } pub struct BloomIndexBuilder { @@ -711,14 +768,14 @@ impl BloomIndexBuilder { builder.push_default(); } let str_column = builder.build(); - if BloomIndex::check_large_string(&str_column) { + if str_column.check_large_string() { bloom_keys_to_remove.push(index); continue; } let str_type = DataType::Nullable(Box::new(DataType::String)); (str_column, str_type) } else { - if BloomIndex::check_large_string(&column) { + if column.check_large_string() { bloom_keys_to_remove.push(index); continue; } @@ -726,7 +783,7 @@ impl BloomIndexBuilder { } } _ => { - if BloomIndex::check_large_string(&column) { + if column.check_large_string() { bloom_keys_to_remove.push(index); continue; } @@ -734,24 +791,8 @@ impl BloomIndexBuilder { } }; - let (column, validity) = - BloomIndex::calculate_nullable_column_digest(&self.func_ctx, &column, &data_type)?; - // create filter per column - if validity.as_ref().map(|v| v.null_count()).unwrap_or(0) > 0 { - let validity = validity.unwrap(); - let it = column.deref().iter().zip(validity.iter()).map( - |(v, b)| { - if !b { - &0 - } else { - v - } - }, - ); - index_column.builder.add_digests(it); - } else { - index_column.builder.add_digests(column.deref()); - } + let column = BloomIndex::calculate_digest_by_type(&data_type, &column)?; + index_column.builder.add_digests(column.deref()); } for index_column in self.ngram_columns.iter_mut() { let field_type = &block.data_type(index_column.index); @@ -771,6 +812,8 @@ impl BloomIndexBuilder { index_column.builder.add_digests(digests.iter()) } } + // reverse sorting. 
+ bloom_keys_to_remove.sort_by(|a, b| b.cmp(a)); for k in bloom_keys_to_remove { self.bloom_columns.remove(k); } diff --git a/src/query/storages/common/index/src/range_index.rs b/src/query/storages/common/index/src/range_index.rs index 4f23fcef61bee..60d4da9655cc4 100644 --- a/src/query/storages/common/index/src/range_index.rs +++ b/src/query/storages/common/index/src/range_index.rs @@ -42,6 +42,7 @@ use databend_common_expression::Domain; use databend_common_expression::Expr; use databend_common_expression::FunctionContext; use databend_common_expression::Scalar; +use databend_common_expression::TableDataType; use databend_common_expression::TableSchemaRef; use databend_common_functions::BUILTIN_FUNCTIONS; use databend_storages_common_table_meta::meta::ColumnStatistics; @@ -169,6 +170,11 @@ impl RangeIndex { } .apply(stats, |_| false) } + + pub fn supported_table_type(data_type: &TableDataType) -> bool { + let data_type = DataType::from(data_type); + Self::supported_type(&data_type) + } } pub fn statistics_to_domain(mut stats: Vec<&ColumnStatistics>, data_type: &DataType) -> Domain { diff --git a/src/query/storages/common/table_meta/src/meta/column_oriented_segment/mod.rs b/src/query/storages/common/table_meta/src/meta/column_oriented_segment/mod.rs index 86453a71543fc..21787350577ee 100644 --- a/src/query/storages/common/table_meta/src/meta/column_oriented_segment/mod.rs +++ b/src/query/storages/common/table_meta/src/meta/column_oriented_segment/mod.rs @@ -26,6 +26,8 @@ pub use schema::meta_name; pub use schema::segment_schema; pub use schema::stat_name; pub use schema::BLOCK_SIZE; +pub use schema::BLOCK_STATS_LOCATION; +pub use schema::BLOCK_STATS_SIZE; pub use schema::BLOOM_FILTER_INDEX_LOCATION; pub use schema::BLOOM_FILTER_INDEX_SIZE; pub use schema::CLUSTER_STATS; diff --git a/src/query/storages/common/table_meta/src/meta/column_oriented_segment/schema.rs b/src/query/storages/common/table_meta/src/meta/column_oriented_segment/schema.rs index dbba08ce7c622..daea76a037897 100644 --- a/src/query/storages/common/table_meta/src/meta/column_oriented_segment/schema.rs +++ b/src/query/storages/common/table_meta/src/meta/column_oriented_segment/schema.rs @@ -34,6 +34,8 @@ pub const COMPRESSION: &str = "compression"; pub const CREATE_ON: &str = "create_on"; pub const LOCATION_PATH: &str = "path"; pub const LOCATION_FORMAT_VERSION: &str = "format_version"; +pub const BLOCK_STATS_LOCATION: &str = "block_stats_location"; +pub const BLOCK_STATS_SIZE: &str = "block_stats_size"; pub fn block_level_field_names() -> HashSet { let mut set = HashSet::new(); @@ -45,6 +47,8 @@ pub fn block_level_field_names() -> HashSet { set.insert(BLOOM_FILTER_INDEX_LOCATION.to_string()); set.insert(BLOOM_FILTER_INDEX_SIZE.to_string()); set.insert(INVERTED_INDEX_SIZE.to_string()); + set.insert(BLOCK_STATS_LOCATION.to_string()); + set.insert(BLOCK_STATS_SIZE.to_string()); set.insert(COMPRESSION.to_string()); set.insert(CREATE_ON.to_string()); set @@ -130,6 +134,8 @@ pub fn segment_schema(table_schema: &TableSchema) -> TableSchema { TableField::new(BLOOM_FILTER_INDEX_LOCATION, nullable_location_type()), TableField::new(BLOOM_FILTER_INDEX_SIZE, u64_t.clone()), TableField::new(INVERTED_INDEX_SIZE, nullable_u64_t.clone()), + TableField::new(BLOCK_STATS_LOCATION, nullable_location_type()), + TableField::new(BLOCK_STATS_SIZE, u64_t.clone()), TableField::new(COMPRESSION, u8_t.clone()), TableField::new(CREATE_ON, i64_t.clone()), ]; diff --git 
a/src/query/storages/common/table_meta/src/meta/column_oriented_segment/segment_builder.rs b/src/query/storages/common/table_meta/src/meta/column_oriented_segment/segment_builder.rs index eeb6c2305a8ca..4ce10f20ed867 100644 --- a/src/query/storages/common/table_meta/src/meta/column_oriented_segment/segment_builder.rs +++ b/src/query/storages/common/table_meta/src/meta/column_oriented_segment/segment_builder.rs @@ -45,6 +45,7 @@ use crate::meta::supported_stat_type; use crate::meta::BlockMeta; use crate::meta::ClusterStatistics; use crate::meta::ColumnStatistics; +use crate::meta::Location; use crate::meta::MetaEncoding; use crate::meta::Statistics; use crate::meta::VirtualBlockMeta; @@ -67,10 +68,12 @@ pub struct ColumnOrientedSegmentBuilder { file_size: Vec, cluster_stats: Vec>, location: (Vec, Vec), - bloom_filter_index_location: (Vec, Vec, MutableBitmap), + bloom_filter_index_location: LocationsWithOption, bloom_filter_index_size: Vec, inverted_index_size: Vec>, virtual_block_meta: Vec>, + block_stats_location: LocationsWithOption, + block_stats_size: Vec, compression: Vec, create_on: Vec>, column_stats: HashMap, @@ -127,28 +130,16 @@ impl SegmentBuilder for ColumnOrientedSegmentBuilder { self.cluster_stats.push(block_meta.cluster_stats); self.location.0.push(block_meta.location.0); self.location.1.push(block_meta.location.1); - self.bloom_filter_index_location.0.push( - block_meta - .bloom_filter_index_location - .as_ref() - .map(|l| l.0.clone()) - .unwrap_or_default(), - ); - self.bloom_filter_index_location.1.push( - block_meta - .bloom_filter_index_location - .as_ref() - .map(|l| l.1) - .unwrap_or_default(), - ); self.bloom_filter_index_location - .2 - .push(block_meta.bloom_filter_index_location.is_some()); + .add_location(block_meta.bloom_filter_index_location.as_ref()); self.bloom_filter_index_size .push(block_meta.bloom_filter_index_size); self.inverted_index_size .push(block_meta.inverted_index_size); self.virtual_block_meta.push(block_meta.virtual_block_meta); + self.block_stats_location + .add_location(block_meta.block_stats_location.as_ref()); + self.block_stats_size.push(block_meta.block_stats_size); self.compression.push(block_meta.compression.to_u8()); self.create_on .push(block_meta.create_on.map(|t| t.timestamp())); @@ -199,13 +190,21 @@ impl SegmentBuilder for ColumnOrientedSegmentBuilder { ]), Column::Nullable(Box::new(NullableColumn::new( Column::Tuple(vec![ - StringType::from_data(this.bloom_filter_index_location.0), - UInt64Type::from_data(this.bloom_filter_index_location.1), + StringType::from_data(this.bloom_filter_index_location.locations), + UInt64Type::from_data(this.bloom_filter_index_location.versions), ]), - this.bloom_filter_index_location.2.into(), + this.bloom_filter_index_location.validity.into(), ))), UInt64Type::from_data(this.bloom_filter_index_size), UInt64Type::from_opt_data(this.inverted_index_size), + Column::Nullable(Box::new(NullableColumn::new( + Column::Tuple(vec![ + StringType::from_data(this.block_stats_location.locations), + UInt64Type::from_data(this.block_stats_location.versions), + ]), + this.block_stats_location.validity.into(), + ))), + UInt64Type::from_data(this.block_stats_size), UInt8Type::from_data(this.compression), Int64Type::from_opt_data(this.create_on), ]; @@ -264,14 +263,12 @@ impl SegmentBuilder for ColumnOrientedSegmentBuilder { Vec::with_capacity(block_per_segment), Vec::with_capacity(block_per_segment), ), - bloom_filter_index_location: ( - Vec::with_capacity(block_per_segment), - Vec::with_capacity(block_per_segment), - 
MutableBitmap::with_capacity(block_per_segment), - ), + bloom_filter_index_location: LocationsWithOption::new_with_capacity(block_per_segment), bloom_filter_index_size: Vec::with_capacity(block_per_segment), inverted_index_size: Vec::with_capacity(block_per_segment), virtual_block_meta: Vec::with_capacity(block_per_segment), + block_stats_location: LocationsWithOption::new_with_capacity(block_per_segment), + block_stats_size: Vec::with_capacity(block_per_segment), compression: Vec::with_capacity(block_per_segment), create_on: Vec::with_capacity(block_per_segment), column_stats, @@ -434,3 +431,31 @@ fn cmp_with_null(v1: &Scalar, v2: &Scalar) -> Ordering { (false, false) => v1.cmp(v2), } } + +struct LocationsWithOption { + locations: Vec, + versions: Vec, + validity: MutableBitmap, +} + +impl LocationsWithOption { + fn new_with_capacity(capacity: usize) -> Self { + Self { + locations: Vec::with_capacity(capacity), + versions: Vec::with_capacity(capacity), + validity: MutableBitmap::with_capacity(capacity), + } + } + + fn add_location(&mut self, location: Option<&Location>) { + if let Some(location) = location { + self.locations.push(location.0.clone()); + self.versions.push(location.1); + self.validity.push(true); + } else { + self.locations.push(String::new()); + self.versions.push(0); + self.validity.push(false); + } + } +} diff --git a/src/query/storages/common/table_meta/src/meta/current/mod.rs b/src/query/storages/common/table_meta/src/meta/current/mod.rs index 2bb5544e0ace9..cc254a566e671 100644 --- a/src/query/storages/common/table_meta/src/meta/current/mod.rs +++ b/src/query/storages/common/table_meta/src/meta/current/mod.rs @@ -25,6 +25,7 @@ pub use v2::Statistics; pub use v2::VirtualBlockMeta; pub use v2::VirtualColumnMeta; pub use v3::TableSnapshotStatistics; +pub use v4::BlockStatistics; pub use v4::CompactSegmentInfo; pub use v4::RawBlockMeta; pub use v4::SegmentInfo; diff --git a/src/query/storages/common/table_meta/src/meta/mod.rs b/src/query/storages/common/table_meta/src/meta/mod.rs index 68e3bfd3e0bce..3c9133ec4fdca 100644 --- a/src/query/storages/common/table_meta/src/meta/mod.rs +++ b/src/query/storages/common/table_meta/src/meta/mod.rs @@ -49,6 +49,7 @@ pub use utils::VACUUM2_OBJECT_KEY_PREFIX; pub(crate) use utils::*; pub use v0::ColumnMeta as ColumnMetaV0; pub use versions::testify_version; +pub use versions::BlockStatisticsVersion; pub use versions::SegmentInfoVersion; pub use versions::SnapshotVersion; pub use versions::TableSnapshotStatisticsVersion; diff --git a/src/query/storages/common/table_meta/src/meta/statistics.rs b/src/query/storages/common/table_meta/src/meta/statistics.rs index b63d53cafa5af..0506f04df1e5e 100644 --- a/src/query/storages/common/table_meta/src/meta/statistics.rs +++ b/src/query/storages/common/table_meta/src/meta/statistics.rs @@ -25,7 +25,6 @@ pub type SnapshotId = Uuid; pub type Location = (String, FormatVersion); pub type ClusterKey = (u32, String); pub type StatisticsOfColumns = HashMap; -pub type ColumnDistinctHLL = simple_hll::HyperLogLog<10>; // Assigned to executors, describes that which blocks of given segment, an executor should take care of #[derive(Clone, Debug, serde::Serialize, serde::Deserialize, PartialEq)] diff --git a/src/query/storages/common/table_meta/src/meta/v2/segment.rs b/src/query/storages/common/table_meta/src/meta/v2/segment.rs index 47c2127145d60..f9b0bcbd403bd 100644 --- a/src/query/storages/common/table_meta/src/meta/v2/segment.rs +++ b/src/query/storages/common/table_meta/src/meta/v2/segment.rs @@ -176,6 
+176,10 @@ pub struct BlockMeta { pub virtual_block_meta: Option, pub compression: Compression, + pub block_stats_location: Option, + #[serde(default)] + pub block_stats_size: u64, + // block create_on pub create_on: Option>, } @@ -197,6 +201,8 @@ impl BlockMeta { vector_index_size: Option, vector_index_location: Option, virtual_block_meta: Option, + block_stats_location: Option, + block_stats_size: u64, compression: Compression, create_on: Option>, ) -> Self { @@ -215,6 +221,8 @@ impl BlockMeta { vector_index_size, vector_index_location, virtual_block_meta, + block_stats_location, + block_stats_size, compression, create_on, } @@ -377,6 +385,8 @@ impl BlockMeta { vector_index_size: None, vector_index_location: None, virtual_block_meta: None, + block_stats_location: None, + block_stats_size: 0, create_on: None, ngram_filter_index_size: None, } @@ -405,6 +415,8 @@ impl BlockMeta { vector_index_size: None, vector_index_location: None, virtual_block_meta: None, + block_stats_location: None, + block_stats_size: 0, create_on: None, ngram_filter_index_size: None, } diff --git a/src/query/storages/common/table_meta/src/meta/v3/frozen/block_meta.rs b/src/query/storages/common/table_meta/src/meta/v3/frozen/block_meta.rs index ccc22fdbdb600..a02b2761848f6 100644 --- a/src/query/storages/common/table_meta/src/meta/v3/frozen/block_meta.rs +++ b/src/query/storages/common/table_meta/src/meta/v3/frozen/block_meta.rs @@ -66,6 +66,8 @@ impl From for crate::meta::BlockMeta { vector_index_size: None, vector_index_location: None, virtual_block_meta: None, + block_stats_location: None, + block_stats_size: 0, compression: value.compression.into(), create_on: None, } diff --git a/src/query/storages/common/table_meta/src/meta/v3/table_snapshot_statistics.rs b/src/query/storages/common/table_meta/src/meta/v3/table_snapshot_statistics.rs index 55841a886e03c..0689e4a687dfd 100644 --- a/src/query/storages/common/table_meta/src/meta/v3/table_snapshot_statistics.rs +++ b/src/query/storages/common/table_meta/src/meta/v3/table_snapshot_statistics.rs @@ -29,7 +29,7 @@ pub type MetaHLL = simple_hll::HyperLogLog<12>; #[derive(Serialize, Deserialize, Clone, Debug)] pub struct TableSnapshotStatistics { - /// format version of snapshot + /// format version of statistics pub format_version: FormatVersion, /// id of snapshot diff --git a/src/query/storages/common/table_meta/src/meta/v4/block_statistics.rs b/src/query/storages/common/table_meta/src/meta/v4/block_statistics.rs new file mode 100644 index 0000000000000..e453f47f12630 --- /dev/null +++ b/src/query/storages/common/table_meta/src/meta/v4/block_statistics.rs @@ -0,0 +1,84 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
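For reference, a minimal standalone sketch of the LocationsWithOption pattern introduced in segment_builder.rs above: three parallel buffers padded for absent locations, with a plain Vec<bool> standing in for MutableBitmap. Names and types here are illustrative only, not the crate's actual API.

// Standalone sketch (illustrative types only).
type Location = (String, u64);

struct OptionalLocations {
    locations: Vec<String>,
    versions: Vec<u64>,
    validity: Vec<bool>, // stand-in for MutableBitmap
}

impl OptionalLocations {
    fn with_capacity(cap: usize) -> Self {
        Self {
            locations: Vec::with_capacity(cap),
            versions: Vec::with_capacity(cap),
            validity: Vec::with_capacity(cap),
        }
    }

    // Absent locations are padded with defaults so all three buffers stay the same
    // length; the validity flags later become the null bitmap of the nullable tuple column.
    fn add_location(&mut self, location: Option<&Location>) {
        match location {
            Some((path, version)) => {
                self.locations.push(path.clone());
                self.versions.push(*version);
                self.validity.push(true);
            }
            None => {
                self.locations.push(String::new());
                self.versions.push(0);
                self.validity.push(false);
            }
        }
    }
}

fn main() {
    let mut locs = OptionalLocations::with_capacity(2);
    locs.add_location(Some(&("_bs/abc_v0.mpk".to_string(), 0)));
    locs.add_location(None);
    assert_eq!(locs.validity, vec![true, false]);
}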
+ +use std::collections::HashMap; +use std::io::Cursor; +use std::io::Read; + +use databend_common_exception::Result; +use databend_common_expression::ColumnId; +use databend_common_io::prelude::BinaryRead; +use serde::Deserialize; +use serde::Serialize; + +use crate::meta::format::compress; +use crate::meta::format::encode; +use crate::meta::format::read_and_deserialize; +use crate::meta::versions::Versioned; +use crate::meta::FormatVersion; +use crate::meta::MetaCompression; +use crate::meta::MetaEncoding; +use crate::meta::MetaHLL; + +#[derive(Serialize, Deserialize, Clone, Debug)] +pub struct BlockStatistics { + pub format_version: FormatVersion, + + pub hll: HashMap, +} + +impl BlockStatistics { + pub fn new(hll: HashMap) -> Self { + Self { + format_version: BlockStatistics::VERSION, + hll, + } + } + + pub fn to_bytes(&self) -> Result> { + let encoding = MetaEncoding::MessagePack; + let compression = MetaCompression::default(); + + let data = encode(&encoding, &self)?; + let data_compress = compress(&compression, data)?; + + let data_size = self.format_version.to_le_bytes().len() + + 2 + + data_compress.len().to_le_bytes().len() + + data_compress.len(); + let mut buf = Vec::with_capacity(data_size); + + buf.extend_from_slice(&self.format_version.to_le_bytes()); + buf.push(encoding as u8); + buf.push(compression as u8); + buf.extend_from_slice(&data_compress.len().to_le_bytes()); + + buf.extend(data_compress); + + Ok(buf) + } + + pub fn from_slice(buffer: &[u8]) -> Result { + Self::from_read(Cursor::new(buffer)) + } + + pub fn from_read(mut r: impl Read) -> Result { + let version = r.read_scalar::()?; + assert_eq!(version, BlockStatistics::VERSION); + let encoding = MetaEncoding::try_from(r.read_scalar::()?)?; + let compression = MetaCompression::try_from(r.read_scalar::()?)?; + let statistics_size: u64 = r.read_scalar::()?; + read_and_deserialize(&mut r, statistics_size, &encoding, &compression) + } +} diff --git a/src/query/storages/common/table_meta/src/meta/v4/mod.rs b/src/query/storages/common/table_meta/src/meta/v4/mod.rs index 6a596b9ec8807..cf73718ea7c29 100644 --- a/src/query/storages/common/table_meta/src/meta/v4/mod.rs +++ b/src/query/storages/common/table_meta/src/meta/v4/mod.rs @@ -12,9 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. 
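For reference, a minimal standalone sketch of the byte framing that BlockStatistics::to_bytes writes and from_read expects: version, encoding, compression, payload length, payload. It assumes the length field round-trips as a little-endian u64 and uses an opaque byte payload in place of the compressed MessagePack-encoded HLL map.

// Standalone sketch of the on-disk framing (assumptions noted above).
use std::io::{Cursor, Read};

fn frame(format_version: u64, encoding: u8, compression: u8, payload: &[u8]) -> Vec<u8> {
    let mut buf = Vec::with_capacity(8 + 2 + 8 + payload.len());
    buf.extend_from_slice(&format_version.to_le_bytes());
    buf.push(encoding);
    buf.push(compression);
    buf.extend_from_slice(&(payload.len() as u64).to_le_bytes());
    buf.extend_from_slice(payload);
    buf
}

fn unframe(mut r: impl Read) -> std::io::Result<(u64, u8, u8, Vec<u8>)> {
    let mut u64_buf = [0u8; 8];
    let mut u8_buf = [0u8; 1];
    r.read_exact(&mut u64_buf)?;
    let version = u64::from_le_bytes(u64_buf);
    r.read_exact(&mut u8_buf)?;
    let encoding = u8_buf[0];
    r.read_exact(&mut u8_buf)?;
    let compression = u8_buf[0];
    r.read_exact(&mut u64_buf)?;
    let len = u64::from_le_bytes(u64_buf) as usize;
    let mut payload = vec![0u8; len];
    r.read_exact(&mut payload)?;
    Ok((version, encoding, compression, payload))
}

fn main() {
    let bytes = frame(0, 2, 1, b"hll-payload");
    let (version, _, _, payload) = unframe(Cursor::new(bytes)).unwrap();
    assert_eq!(version, 0);
    assert_eq!(payload, b"hll-payload".to_vec());
}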
+mod block_statistics; mod segment; mod snapshot; +pub use block_statistics::BlockStatistics; pub use segment::CompactSegmentInfo; pub use segment::RawBlockMeta; pub use segment::SegmentInfo; diff --git a/src/query/storages/common/table_meta/src/meta/versions.rs b/src/query/storages/common/table_meta/src/meta/versions.rs index 71493750481b3..670ce5c73805c 100644 --- a/src/query/storages/common/table_meta/src/meta/versions.rs +++ b/src/query/storages/common/table_meta/src/meta/versions.rs @@ -128,6 +128,24 @@ impl TableSnapshotStatisticsVersion { } } +impl Versioned<0> for v4::BlockStatistics {} + +pub enum BlockStatisticsVersion { + V0(PhantomData), +} + +impl BlockStatisticsVersion { + pub fn version(&self) -> u64 { + match self { + BlockStatisticsVersion::V0(a) => Self::ver(a), + } + } + + fn ver>(_v: &PhantomData) -> u64 { + V + } +} + /// Statically check that if T implements Versioned where U equals V #[inline] pub fn testify_version(t: PhantomData) -> PhantomData @@ -189,7 +207,21 @@ mod converters { PhantomData, ))), _ => Err(ErrorCode::Internal(format!( - "unknown table snapshot statistics version {value}, versions supported: 0" + "unknown table snapshot statistics version {value}, versions supported: 0, 2, 3" + ))), + } + } + } + + impl TryFrom for BlockStatisticsVersion { + type Error = ErrorCode; + fn try_from(value: u64) -> Result { + match value { + 0 => Ok(BlockStatisticsVersion::V0(testify_version::<_, 0>( + PhantomData, + ))), + _ => Err(ErrorCode::Internal(format!( + "unknown block statistics version {value}, versions supported: 0" ))), } } diff --git a/src/query/storages/common/table_meta/src/readers/versioned_reader.rs b/src/query/storages/common/table_meta/src/readers/versioned_reader.rs index a2ef7efec9558..a67c796d7eda1 100644 --- a/src/query/storages/common/table_meta/src/readers/versioned_reader.rs +++ b/src/query/storages/common/table_meta/src/readers/versioned_reader.rs @@ -17,6 +17,8 @@ use std::io::Read; use databend_common_exception::Result; use crate::meta::load_json; +use crate::meta::BlockStatistics; +use crate::meta::BlockStatisticsVersion; use crate::meta::TableSnapshotStatistics; use crate::meta::TableSnapshotStatisticsVersion; @@ -46,3 +48,15 @@ impl VersionedReader for TableSnapshotStatisticsVersion Ok(r) } } + +impl VersionedReader for BlockStatisticsVersion { + type TargetType = BlockStatistics; + + fn read(&self, reader: R) -> Result + where R: Read + Unpin + Send { + let r = match self { + BlockStatisticsVersion::V0(_) => BlockStatistics::from_read(reader)?, + }; + Ok(r) + } +} diff --git a/src/query/storages/common/table_meta/src/table/table_keys.rs b/src/query/storages/common/table_meta/src/table/table_keys.rs index 9a75763f47080..fddc120058b74 100644 --- a/src/query/storages/common/table_meta/src/table/table_keys.rs +++ b/src/query/storages/common/table_meta/src/table/table_keys.rs @@ -27,6 +27,7 @@ pub const OPT_KEY_TABLE_COMPRESSION: &str = "compression"; pub const OPT_KEY_COMMENT: &str = "comment"; pub const OPT_KEY_ENGINE: &str = "engine"; pub const OPT_KEY_BLOOM_INDEX_COLUMNS: &str = "bloom_index_columns"; +pub const OPT_KEY_APPROX_DISTINCT_COLUMNS: &str = "approx_distinct_columns"; pub const OPT_KEY_CHANGE_TRACKING: &str = "change_tracking"; pub const OPT_KEY_CHANGE_TRACKING_BEGIN_VER: &str = "begin_version"; diff --git a/src/query/storages/fuse/Cargo.toml b/src/query/storages/fuse/Cargo.toml index 811821ec8088d..0601548e6e4bf 100644 --- a/src/query/storages/fuse/Cargo.toml +++ b/src/query/storages/fuse/Cargo.toml @@ -50,6 +50,7 @@ backoff 
= { workspace = true, features = ["futures", "tokio"] } bytes = { workspace = true } chrono = { workspace = true } enum-as-inner = { workspace = true } +enum_dispatch = { workspace = true } fastrace = { workspace = true } futures = { workspace = true } futures-util = { workspace = true } @@ -61,6 +62,7 @@ match-template = { workspace = true } opendal = { workspace = true } parking_lot = { workspace = true } parquet = { workspace = true } +paste = { workspace = true } rand = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } diff --git a/src/query/storages/fuse/src/constants.rs b/src/query/storages/fuse/src/constants.rs index f16b4975939ad..bc95d3fe14a48 100644 --- a/src/query/storages/fuse/src/constants.rs +++ b/src/query/storages/fuse/src/constants.rs @@ -31,6 +31,7 @@ pub const FUSE_TBL_XOR_BLOOM_INDEX_PREFIX: &str = "_i_b_v2"; pub const FUSE_TBL_SEGMENT_PREFIX: &str = "_sg"; pub const FUSE_TBL_SNAPSHOT_PREFIX: &str = "_ss"; pub const FUSE_TBL_SNAPSHOT_STATISTICS_PREFIX: &str = "_ts"; +pub const FUSE_TBL_BLOCK_STATISTICS_PREFIX: &str = "_bs"; pub const FUSE_TBL_LAST_SNAPSHOT_HINT: &str = "last_snapshot_location_hint"; pub const FUSE_TBL_LAST_SNAPSHOT_HINT_V2: &str = "last_snapshot_location_hint_v2"; pub const FUSE_TBL_VIRTUAL_BLOCK_PREFIX: &str = "_vb"; diff --git a/src/query/storages/fuse/src/fuse_table.rs b/src/query/storages/fuse/src/fuse_table.rs index f832b97be638c..51a096d43c0d4 100644 --- a/src/query/storages/fuse/src/fuse_table.rs +++ b/src/query/storages/fuse/src/fuse_table.rs @@ -69,6 +69,7 @@ use databend_common_pipeline_core::Pipeline; use databend_common_sql::binder::STREAM_COLUMN_FACTORY; use databend_common_sql::parse_cluster_keys; use databend_common_sql::plans::TruncateMode; +use databend_common_sql::ApproxDistinctColumns; use databend_common_sql::BloomIndexColumns; use databend_common_storage::init_operator; use databend_common_storage::DataOperator; @@ -87,6 +88,7 @@ use databend_storages_common_table_meta::meta::Versioned; use databend_storages_common_table_meta::table::ChangeType; use databend_storages_common_table_meta::table::ClusterType; use databend_storages_common_table_meta::table::TableCompression; +use databend_storages_common_table_meta::table::OPT_KEY_APPROX_DISTINCT_COLUMNS; use databend_storages_common_table_meta::table::OPT_KEY_BLOOM_INDEX_COLUMNS; use databend_storages_common_table_meta::table::OPT_KEY_CHANGE_TRACKING; use databend_storages_common_table_meta::table::OPT_KEY_CLUSTER_TYPE; @@ -141,6 +143,7 @@ pub struct FuseTable { pub(crate) segment_format: FuseSegmentFormat, pub(crate) table_compression: TableCompression, pub(crate) bloom_index_cols: BloomIndexColumns, + pub(crate) approx_distinct_cols: ApproxDistinctColumns, pub(crate) operator: Operator, pub(crate) data_metrics: Arc, @@ -234,6 +237,12 @@ impl FuseTable { .and_then(|s| s.parse::().ok()) .unwrap_or(BloomIndexColumns::All); + let approx_distinct_cols = table_info + .options() + .get(OPT_KEY_APPROX_DISTINCT_COLUMNS) + .and_then(|s| s.parse::().ok()) + .unwrap_or(ApproxDistinctColumns::All); + let meta_location_generator = TableMetaLocationGenerator::new(storage_prefix); if !table_info.meta.part_prefix.is_empty() { return Err(ErrorCode::StorageOther( @@ -246,6 +255,7 @@ impl FuseTable { meta_location_generator, cluster_key_meta, bloom_index_cols, + approx_distinct_cols, operator, data_metrics, storage_format: FuseStorageFormat::from_str(storage_format.as_str())?, @@ -460,6 +470,10 @@ impl FuseTable { self.bloom_index_cols.clone() } + pub fn 
approx_distinct_cols(&self) -> ApproxDistinctColumns { + self.approx_distinct_cols.clone() + } + // Check if table is attached. pub fn is_table_attached(table_meta_options: &BTreeMap) -> bool { table_meta_options @@ -745,6 +759,14 @@ impl FuseTable { ) }) } + + pub fn enable_stream_block_write(&self, ctx: Arc) -> Result { + Ok(ctx.get_settings().get_enable_block_stream_write()? + && matches!(self.storage_format, FuseStorageFormat::Parquet) + && self + .cluster_type() + .is_none_or(|v| matches!(v, ClusterType::Hilbert))) + } } #[async_trait::async_trait] diff --git a/src/query/storages/fuse/src/io/locations.rs b/src/query/storages/fuse/src/io/locations.rs index 13272ee77706c..95ce57493f5b0 100644 --- a/src/query/storages/fuse/src/io/locations.rs +++ b/src/query/storages/fuse/src/io/locations.rs @@ -18,6 +18,7 @@ use databend_common_exception::Result; use databend_common_expression::DataBlock; use databend_storages_common_table_meta::meta::trim_object_prefix; use databend_storages_common_table_meta::meta::uuid_from_date_time; +use databend_storages_common_table_meta::meta::BlockStatistics; use databend_storages_common_table_meta::meta::Location; use databend_storages_common_table_meta::meta::SegmentInfo; use databend_storages_common_table_meta::meta::SnapshotVersion; @@ -36,6 +37,7 @@ use crate::constants::FUSE_TBL_VIRTUAL_BLOCK_PREFIX; use crate::index::filters::BlockFilter; use crate::index::InvertedIndexFile; use crate::FUSE_TBL_AGG_INDEX_PREFIX; +use crate::FUSE_TBL_BLOCK_STATISTICS_PREFIX; use crate::FUSE_TBL_INVERTED_INDEX_PREFIX; use crate::FUSE_TBL_LAST_SNAPSHOT_HINT_V2; use crate::FUSE_TBL_VECTOR_INDEX_PREFIX; @@ -65,6 +67,7 @@ pub struct TableMetaLocationGenerator { agg_index_location_prefix: String, inverted_index_location_prefix: String, vector_index_location_prefix: String, + block_statistics_location_prefix: String, } impl TableMetaLocationGenerator { @@ -78,6 +81,8 @@ impl TableMetaLocationGenerator { let inverted_index_location_prefix = format!("{}/{}/", &prefix, FUSE_TBL_INVERTED_INDEX_PREFIX); let vector_index_location_prefix = format!("{}/{}/", &prefix, FUSE_TBL_VECTOR_INDEX_PREFIX); + let block_statistics_location_prefix = + format!("{}/{}/", &prefix, FUSE_TBL_BLOCK_STATISTICS_PREFIX); Self { prefix, block_location_prefix, @@ -87,6 +92,7 @@ impl TableMetaLocationGenerator { agg_index_location_prefix, inverted_index_location_prefix, vector_index_location_prefix, + block_statistics_location_prefix, } } @@ -114,6 +120,10 @@ impl TableMetaLocationGenerator { &self.snapshot_location_prefix } + pub fn block_statistics_location_prefix(&self) -> &str { + &self.block_statistics_location_prefix + } + pub fn gen_block_location( &self, table_meta_timestamps: TableMetaTimestamps, @@ -142,6 +152,18 @@ impl TableMetaLocationGenerator { ) } + pub fn block_stats_location(&self, block_id: &Uuid) -> Location { + ( + format!( + "{}{}_v{}.mpk", + self.block_statistics_location_prefix(), + block_id.as_simple(), + BlockStatistics::VERSION, + ), + BlockStatistics::VERSION, + ) + } + pub fn block_vector_index_location(&self) -> Location { let uuid = Uuid::now_v7(); ( @@ -296,6 +318,21 @@ impl TableMetaLocationGenerator { BlockFilter::VERSION, ) } + + pub fn gen_block_stats_location_from_block_location(loc: &str) -> String { + let splits = loc.split('/').collect::>(); + let len = splits.len(); + let prefix = splits[..len - 2].join("/"); + let block_name = trim_object_prefix(splits[len - 1]); + let id: String = block_name.chars().take(32).collect(); + format!( + "{}/{}/{}_v{}.mpk", + prefix, + 
FUSE_TBL_BLOCK_STATISTICS_PREFIX, + id, + BlockStatistics::VERSION, + ) + } } trait SnapshotLocationCreator { diff --git a/src/query/storages/fuse/src/io/mod.rs b/src/query/storages/fuse/src/io/mod.rs index 94d22a40bbb90..24fc70297a0cd 100644 --- a/src/query/storages/fuse/src/io/mod.rs +++ b/src/query/storages/fuse/src/io/mod.rs @@ -43,6 +43,7 @@ pub use write::serialize_block; pub use write::write_data; pub use write::BlockBuilder; pub use write::BlockSerialization; +pub use write::BlockStatisticsState; pub use write::BlockWriter; pub use write::BloomIndexRebuilder; pub use write::BloomIndexState; diff --git a/src/query/storages/fuse/src/io/write/block_statistics_writer.rs b/src/query/storages/fuse/src/io/write/block_statistics_writer.rs new file mode 100644 index 0000000000000..ad54bbb972efa --- /dev/null +++ b/src/query/storages/fuse/src/io/write/block_statistics_writer.rs @@ -0,0 +1,127 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::BTreeMap; +use std::collections::HashMap; + +use databend_common_exception::Result; +use databend_common_expression::BlockEntry; +use databend_common_expression::ColumnId; +use databend_common_expression::DataBlock; +use databend_common_expression::FieldIndex; +use databend_common_expression::TableField; +use databend_storages_common_table_meta::meta::BlockStatistics; +use databend_storages_common_table_meta::meta::Location; + +use crate::io::write::stream::create_column_ndv_estimator; +use crate::io::write::stream::ColumnNDVEstimator; +use crate::io::write::stream::ColumnNDVEstimatorOps; + +#[derive(Debug)] +pub struct BlockStatisticsState { + pub data: Vec, + pub location: Location, + pub column_distinct_count: HashMap, +} + +impl BlockStatisticsState { + pub fn from_data_block( + location: Location, + block: &DataBlock, + ndv_columns_map: &BTreeMap, + ) -> Result> { + let mut builder = BlockStatsBuilder::new(ndv_columns_map); + builder.add_block(block)?; + builder.finalize(location) + } + + pub fn block_stats_size(&self) -> u64 { + self.data.len() as u64 + } +} + +pub struct BlockStatsBuilder { + builders: Vec, +} + +pub struct ColumnNDVBuilder { + index: FieldIndex, + field: TableField, + builder: ColumnNDVEstimator, +} + +impl BlockStatsBuilder { + pub fn new(ndv_columns_map: &BTreeMap) -> BlockStatsBuilder { + let mut builders = Vec::with_capacity(ndv_columns_map.len()); + for (index, field) in ndv_columns_map { + let builder = create_column_ndv_estimator(&field.data_type().into()); + builders.push(ColumnNDVBuilder { + index: *index, + field: field.clone(), + builder, + }); + } + BlockStatsBuilder { builders } + } + + pub fn add_block(&mut self, block: &DataBlock) -> Result<()> { + let mut keys_to_remove = vec![]; + for (index, column_builder) in self.builders.iter_mut().enumerate() { + let entry = block.get_by_offset(column_builder.index); + match entry { + BlockEntry::Const(s, ..) 
=> { + column_builder.builder.update_scalar(&s.as_ref()); + } + BlockEntry::Column(col) => { + if col.check_large_string() { + keys_to_remove.push(index); + continue; + } + column_builder.builder.update_column(col); + } + } + } + + // reverse sorting. + keys_to_remove.sort_by(|a, b| b.cmp(a)); + for k in keys_to_remove { + self.builders.remove(k); + } + Ok(()) + } + + pub fn finalize(self, location: Location) -> Result> { + if self.builders.is_empty() { + return Ok(None); + } + + let mut hlls = HashMap::with_capacity(self.builders.len()); + let mut column_distinct_count = HashMap::with_capacity(self.builders.len()); + for column_builder in self.builders { + let column_id = column_builder.field.column_id(); + let distinct_count = column_builder.builder.finalize(); + let hll = column_builder.builder.hll(); + hlls.insert(column_id, hll); + column_distinct_count.insert(column_id, distinct_count); + } + + let block_stats = BlockStatistics::new(hlls); + let data = block_stats.to_bytes()?; + Ok(Some(BlockStatisticsState { + data, + location, + column_distinct_count, + })) + } +} diff --git a/src/query/storages/fuse/src/io/write/block_writer.rs b/src/query/storages/fuse/src/io/write/block_writer.rs index 1f1b6a5579e25..342c455204b36 100644 --- a/src/query/storages/fuse/src/io/write/block_writer.rs +++ b/src/query/storages/fuse/src/io/write/block_writer.rs @@ -20,6 +20,8 @@ use std::time::Instant; use chrono::Utc; use databend_common_catalog::table_context::TableContext; use databend_common_exception::Result; +use databend_common_expression::local_block_meta_serde; +use databend_common_expression::BlockMetaInfo; use databend_common_expression::Column; use databend_common_expression::ColumnId; use databend_common_expression::DataBlock; @@ -32,6 +34,9 @@ use databend_common_metrics::storage::metrics_inc_block_index_write_nums; use databend_common_metrics::storage::metrics_inc_block_inverted_index_write_bytes; use databend_common_metrics::storage::metrics_inc_block_inverted_index_write_milliseconds; use databend_common_metrics::storage::metrics_inc_block_inverted_index_write_nums; +use databend_common_metrics::storage::metrics_inc_block_stats_write_bytes; +use databend_common_metrics::storage::metrics_inc_block_stats_write_milliseconds; +use databend_common_metrics::storage::metrics_inc_block_stats_write_nums; use databend_common_metrics::storage::metrics_inc_block_vector_index_write_bytes; use databend_common_metrics::storage::metrics_inc_block_vector_index_write_milliseconds; use databend_common_metrics::storage::metrics_inc_block_vector_index_write_nums; @@ -53,6 +58,7 @@ use opendal::Operator; use crate::io::write::virtual_column_builder::VirtualColumnBuilder; use crate::io::write::virtual_column_builder::VirtualColumnState; +use crate::io::write::BlockStatisticsState; use crate::io::write::InvertedIndexBuilder; use crate::io::write::InvertedIndexState; use crate::io::write::VectorIndexBuilder; @@ -129,6 +135,7 @@ pub async fn write_data(data: Vec, data_accessor: &Operator, location: &str) Ok(()) } +#[derive(Debug)] pub struct BlockSerialization { pub block_raw_data: Vec, pub block_meta: BlockMeta, @@ -136,8 +143,14 @@ pub struct BlockSerialization { pub inverted_index_states: Vec, pub virtual_column_state: Option, pub vector_index_state: Option, + pub block_stats_state: Option, } +local_block_meta_serde!(BlockSerialization); + +#[typetag::serde(name = "block_serialization_meta")] +impl BlockMetaInfo for BlockSerialization {} + #[derive(Clone)] pub struct BlockBuilder { pub ctx: Arc, @@ -146,6 
+159,7 @@ pub struct BlockBuilder { pub write_settings: WriteSettings, pub cluster_stats_gen: ClusterStatsGenerator, pub bloom_columns_map: BTreeMap, + pub ndv_columns_map: BTreeMap, pub ngram_args: Vec, pub inverted_index_builders: Vec, pub virtual_column_builder: Option, @@ -170,9 +184,22 @@ impl BlockBuilder { self.bloom_columns_map.clone(), &self.ngram_args, )?; - let column_distinct_count = bloom_index_state + let mut column_distinct_count = bloom_index_state .as_ref() - .map(|i| i.column_distinct_count.clone()); + .map(|i| i.column_distinct_count.clone()) + .unwrap_or_default(); + + let block_stats_location = self.meta_locations.block_stats_location(&block_id); + let block_stats_state = BlockStatisticsState::from_data_block( + block_stats_location, + &data_block, + &self.ndv_columns_map, + )?; + if let Some(block_stats_state) = &block_stats_state { + for (key, val) in &block_stats_state.column_distinct_count { + column_distinct_count.entry(*key).or_insert(*val); + } + } let mut inverted_index_states = Vec::with_capacity(self.inverted_index_builders.len()); for inverted_index_builder in &self.inverted_index_builders { @@ -206,8 +233,11 @@ impl BlockBuilder { }; let row_count = data_block.num_rows() as u64; - let col_stats = - gen_columns_statistics(&data_block, column_distinct_count, &self.source_schema)?; + let col_stats = gen_columns_statistics( + &data_block, + Some(column_distinct_count), + &self.source_schema, + )?; let mut buffer = Vec::with_capacity(DEFAULT_BLOCK_BUFFER_SIZE); let block_size = data_block.estimate_block_size() as u64; @@ -246,6 +276,10 @@ impl BlockBuilder { compression: self.write_settings.table_compression.into(), inverted_index_size, virtual_block_meta: None, + block_stats_location: block_stats_state.as_ref().map(|v| v.location.clone()), + block_stats_size: block_stats_state + .as_ref() + .map_or(0, |v| v.block_stats_size()), create_on: Some(Utc::now()), }; @@ -256,6 +290,7 @@ impl BlockBuilder { inverted_index_states, virtual_column_state, vector_index_state, + block_stats_state, }; Ok(serialized) } @@ -290,6 +325,7 @@ impl BlockWriter { Self::write_down_vector_index_state(dal, serialized.vector_index_state).await?; Self::write_down_inverted_index_state(dal, serialized.inverted_index_states).await?; Self::write_down_virtual_column_state(dal, serialized.virtual_column_state).await?; + Self::write_down_block_stats_state(dal, serialized.block_stats_state).await?; Ok(extended_block_meta) } @@ -391,4 +427,22 @@ impl BlockWriter { } Ok(()) } + + pub async fn write_down_block_stats_state( + dal: &Operator, + block_stats_state: Option, + ) -> Result<()> { + if let Some(block_stats_state) = block_stats_state { + let start = Instant::now(); + + let stats_size = block_stats_state.block_stats_size(); + let location = &block_stats_state.location.0; + write_data(block_stats_state.data, dal, location).await?; + + metrics_inc_block_stats_write_nums(1); + metrics_inc_block_stats_write_bytes(stats_size); + metrics_inc_block_stats_write_milliseconds(start.elapsed().as_millis() as u64); + } + Ok(()) + } } diff --git a/src/query/storages/fuse/src/io/write/bloom_index_writer.rs b/src/query/storages/fuse/src/io/write/bloom_index_writer.rs index ec49070a6f08f..738c33ac2f2c3 100644 --- a/src/query/storages/fuse/src/io/write/bloom_index_writer.rs +++ b/src/query/storages/fuse/src/io/write/bloom_index_writer.rs @@ -40,6 +40,7 @@ use opendal::Operator; use crate::io::BlockReader; use crate::FuseStorageFormat; +#[derive(Debug)] pub struct BloomIndexState { pub(crate) data: Vec, 
pub(crate) size: u64, diff --git a/src/query/storages/fuse/src/io/write/inverted_index_writer.rs b/src/query/storages/fuse/src/io/write/inverted_index_writer.rs index 74377a86108cb..8cf0b5f2355f0 100644 --- a/src/query/storages/fuse/src/io/write/inverted_index_writer.rs +++ b/src/query/storages/fuse/src/io/write/inverted_index_writer.rs @@ -121,6 +121,7 @@ pub fn create_inverted_index_builders(table_meta: &TableMeta) -> Vec, pub(crate) size: u64, diff --git a/src/query/storages/fuse/src/io/write/mod.rs b/src/query/storages/fuse/src/io/write/mod.rs index e7f3bfbe82c2f..544d32d988988 100644 --- a/src/query/storages/fuse/src/io/write/mod.rs +++ b/src/query/storages/fuse/src/io/write/mod.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +mod block_statistics_writer; mod block_writer; mod bloom_index_writer; mod inverted_index_writer; @@ -21,6 +22,8 @@ mod vector_index_writer; mod virtual_column_builder; mod write_settings; +pub use block_statistics_writer::BlockStatisticsState; +pub use block_statistics_writer::BlockStatsBuilder; pub use block_writer::serialize_block; pub use block_writer::write_data; pub use block_writer::BlockBuilder; diff --git a/src/query/storages/fuse/src/io/write/stream/block_builder.rs b/src/query/storages/fuse/src/io/write/stream/block_builder.rs index 73ee3c1ded29e..66114228a0082 100644 --- a/src/query/storages/fuse/src/io/write/stream/block_builder.rs +++ b/src/query/storages/fuse/src/io/write/stream/block_builder.rs @@ -32,10 +32,11 @@ use databend_common_expression::FieldIndex; use databend_common_expression::TableField; use databend_common_expression::TableSchema; use databend_common_expression::TableSchemaRef; -use databend_common_expression::ORIGIN_BLOCK_ROW_NUM_COLUMN_ID; use databend_common_io::constants::DEFAULT_BLOCK_BUFFER_SIZE; use databend_common_meta_app::schema::TableIndex; use databend_common_native::write::NativeWriter; +use databend_common_native::write::WriteOptions; +use databend_common_sql::executor::physical_plans::MutationKind; use databend_storages_common_index::BloomIndex; use databend_storages_common_index::BloomIndexBuilder; use databend_storages_common_index::Index; @@ -53,7 +54,8 @@ use parquet::file::properties::WriterProperties; use crate::io::create_inverted_index_builders; use crate::io::write::stream::cluster_statistics::ClusterStatisticsBuilder; use crate::io::write::stream::cluster_statistics::ClusterStatisticsState; -use crate::io::write::stream::column_statistics::ColumnStatisticsState; +use crate::io::write::stream::ColumnStatisticsState; +use crate::io::write::BlockStatsBuilder; use crate::io::write::InvertedIndexState; use crate::io::BlockSerialization; use crate::io::BloomIndexState; @@ -69,6 +71,7 @@ use crate::FuseTable; pub enum BlockWriterImpl { Arrow(ArrowWriter>), + // Native format doesnot support stream write. 
Native(NativeWriter>), } @@ -153,6 +156,7 @@ pub struct StreamBlockBuilder { bloom_index_builder: BloomIndexBuilder, virtual_column_builder: Option, vector_index_builder: Option, + block_stats_builder: BlockStatsBuilder, cluster_stats_state: ClusterStatisticsState, column_stats_state: ColumnStatisticsState, @@ -191,7 +195,7 @@ impl StreamBlockBuilder { let writer = NativeWriter::new( buffer, properties.source_schema.as_ref().clone(), - databend_common_native::write::WriteOptions { + WriteOptions { default_compression: properties.write_settings.table_compression.into(), max_page_size: Some(properties.write_settings.max_page_size), default_compress_ratio, @@ -216,27 +220,13 @@ impl StreamBlockBuilder { &properties.ngram_args, )?; - let virtual_column_builder = if properties - .ctx - .get_settings() - .get_enable_refresh_virtual_column_after_write() - .unwrap_or_default() - && properties.support_virtual_columns - { - VirtualColumnBuilder::try_create( - properties.ctx.clone(), - properties.source_schema.clone(), - ) - .ok() - } else { - None - }; + let virtual_column_builder = properties.virtual_column_builder.clone(); let vector_index_builder = VectorIndexBuilder::try_create( properties.ctx.clone(), &properties.table_indexes, properties.source_schema.clone(), ); - + let block_stats_builder = BlockStatsBuilder::new(&properties.ndv_columns_map); let cluster_stats_state = ClusterStatisticsState::new(properties.cluster_stats_builder.clone()); let column_stats_state = @@ -249,6 +239,7 @@ impl StreamBlockBuilder { bloom_index_builder, virtual_column_builder, vector_index_builder, + block_stats_builder, row_count: 0, block_size: 0, column_stats_state, @@ -263,7 +254,7 @@ impl StreamBlockBuilder { pub fn need_flush(&self) -> bool { let file_size = self.block_writer.compressed_size(); self.row_count >= self.properties.block_thresholds.min_rows_per_block - || self.block_size >= self.properties.block_thresholds.max_bytes_per_block + || self.block_size >= self.properties.block_thresholds.min_bytes_per_block * 2 || (file_size >= self.properties.block_thresholds.min_compressed_per_block && self.block_size >= self.properties.block_thresholds.min_bytes_per_block) } @@ -281,6 +272,7 @@ impl StreamBlockBuilder { self.column_stats_state .add_block(&self.properties.source_schema, &block)?; self.bloom_index_builder.add_block(&block)?; + self.block_stats_builder.add_block(&block)?; for writer in self.inverted_index_writers.iter_mut() { writer.add_block(&self.properties.source_schema, &block)?; } @@ -316,10 +308,20 @@ impl StreamBlockBuilder { } else { None }; - let column_distinct_count = bloom_index_state + let mut column_distinct_count = bloom_index_state .as_ref() .map(|i| i.column_distinct_count.clone()) .unwrap_or_default(); + let block_stats_location = self + .properties + .meta_locations + .block_stats_location(&block_id); + let block_stats_state = self.block_stats_builder.finalize(block_stats_location)?; + if let Some(state) = &block_stats_state { + for (key, val) in &state.column_distinct_count { + column_distinct_count.entry(*key).or_insert(*val); + } + } let col_stats = self.column_stats_state.finalize(column_distinct_count)?; let mut inverted_index_states = Vec::with_capacity(self.inverted_index_writers.len()); @@ -386,8 +388,15 @@ impl StreamBlockBuilder { vector_index_size, vector_index_location, create_on: Some(Utc::now()), - ngram_filter_index_size: None, + ngram_filter_index_size: bloom_index_state + .as_ref() + .map(|v| v.ngram_size) + .unwrap_or_default(), virtual_block_meta: None, + 
block_stats_location: block_stats_state.as_ref().map(|v| v.location.clone()), + block_stats_size: block_stats_state + .as_ref() + .map_or(0, |v| v.block_stats_size()), }; let serialized = BlockSerialization { block_raw_data, @@ -396,6 +405,7 @@ impl StreamBlockBuilder { inverted_index_states, virtual_column_state, vector_index_state, + block_stats_state, }; Ok(serialized) } @@ -410,13 +420,14 @@ pub struct StreamBlockProperties { source_schema: TableSchemaRef, cluster_stats_builder: Arc, - stats_columns: Vec, - distinct_columns: Vec, + stats_columns: Vec<(ColumnId, DataType)>, + distinct_columns: Vec<(ColumnId, DataType)>, bloom_columns_map: BTreeMap, + ndv_columns_map: BTreeMap, ngram_args: Vec, inverted_index_builders: Vec, + virtual_column_builder: Option, table_meta_timestamps: TableMetaTimestamps, - support_virtual_columns: bool, table_indexes: BTreeMap, } @@ -424,16 +435,23 @@ impl StreamBlockProperties { pub fn try_create( ctx: Arc, table: &FuseTable, + kind: MutationKind, table_meta_timestamps: TableMetaTimestamps, ) -> Result> { // remove virtual computed fields. - let fields = table + let mut fields = table .schema() .fields() .iter() .filter(|f| !matches!(f.computed_expr(), Some(ComputedExpr::Virtual(_)))) .cloned() .collect::>(); + if !matches!(kind, MutationKind::Insert | MutationKind::Replace) { + // add stream fields. + for stream_column in table.stream_columns().iter() { + fields.push(stream_column.table_field()); + } + } let source_schema = Arc::new(TableSchema { fields, @@ -449,12 +467,26 @@ impl StreamBlockProperties { &table.table_info.meta, &table.table_info.meta.schema, )?; - let bloom_column_ids = bloom_columns_map + let ndv_columns_map = table + .approx_distinct_cols + .distinct_column_fields(source_schema.clone(), RangeIndex::supported_table_type)?; + let bloom_ndv_columns = bloom_columns_map .values() + .chain(ndv_columns_map.values()) .map(|v| v.column_id()) .collect::>(); let inverted_index_builders = create_inverted_index_builders(&table.table_info.meta); + let virtual_column_builder = if ctx + .get_settings() + .get_enable_refresh_virtual_column_after_write() + .unwrap_or_default() + && table.support_virtual_columns() + { + VirtualColumnBuilder::try_create(ctx.clone(), source_schema.clone()).ok() + } else { + None + }; let cluster_stats_builder = ClusterStatisticsBuilder::try_create(table, ctx.clone(), &source_schema)?; @@ -464,16 +496,14 @@ impl StreamBlockProperties { let leaf_fields = source_schema.leaf_fields(); for field in leaf_fields.iter() { let column_id = field.column_id(); - if RangeIndex::supported_type(&DataType::from(field.data_type())) - && column_id != ORIGIN_BLOCK_ROW_NUM_COLUMN_ID - { - stats_columns.push(column_id); - if !bloom_column_ids.contains(&column_id) { - distinct_columns.push(column_id); + let data_type = DataType::from(field.data_type()); + if RangeIndex::supported_type(&data_type) { + stats_columns.push((column_id, data_type.clone())); + if !bloom_ndv_columns.contains(&column_id) { + distinct_columns.push((column_id, data_type)); } } } - let support_virtual_columns = table.support_virtual_columns(); let table_indexes = table.table_info.meta.indexes.clone(); Ok(Arc::new(StreamBlockProperties { ctx, @@ -482,14 +512,15 @@ impl StreamBlockProperties { source_schema, write_settings, cluster_stats_builder, + virtual_column_builder, stats_columns, distinct_columns, bloom_columns_map, ngram_args, inverted_index_builders, table_meta_timestamps, - support_virtual_columns, table_indexes, + ndv_columns_map, })) } } diff --git 
a/src/query/storages/fuse/src/io/write/stream/column_ndv_estimator.rs b/src/query/storages/fuse/src/io/write/stream/column_ndv_estimator.rs new file mode 100644 index 0000000000000..23d2f2475546a --- /dev/null +++ b/src/query/storages/fuse/src/io/write/stream/column_ndv_estimator.rs @@ -0,0 +1,187 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::hash::Hash; +use std::marker::PhantomData; + +use databend_common_expression::types::boolean::TrueIdxIter; +use databend_common_expression::types::DataType; +use databend_common_expression::types::DateType; +use databend_common_expression::types::Decimal128Type; +use databend_common_expression::types::Decimal256Type; +use databend_common_expression::types::Decimal64Type; +use databend_common_expression::types::Float32Type; +use databend_common_expression::types::Float64Type; +use databend_common_expression::types::Int16Type; +use databend_common_expression::types::Int32Type; +use databend_common_expression::types::Int64Type; +use databend_common_expression::types::Int8Type; +use databend_common_expression::types::NumberDataType; +use databend_common_expression::types::StringType; +use databend_common_expression::types::TimestampType; +use databend_common_expression::types::UInt16Type; +use databend_common_expression::types::UInt32Type; +use databend_common_expression::types::UInt64Type; +use databend_common_expression::types::UInt8Type; +use databend_common_expression::types::ValueType; +use databend_common_expression::with_number_type; +use databend_common_expression::Column; +use databend_common_expression::ScalarRef; +use databend_common_expression::SELECTIVITY_THRESHOLD; +use databend_storages_common_table_meta::meta::MetaHLL; +use enum_dispatch::enum_dispatch; + +#[enum_dispatch] +pub trait ColumnNDVEstimatorOps: Send + Sync { + fn update_column(&mut self, column: &Column); + fn update_scalar(&mut self, scalar: &ScalarRef); + fn finalize(&self) -> usize; + fn hll(self) -> MetaHLL; +} + +#[enum_dispatch(ColumnNDVEstimatorOps)] +pub enum ColumnNDVEstimator { + Int8(ColumnNDVEstimatorImpl), + Int16(ColumnNDVEstimatorImpl), + Int32(ColumnNDVEstimatorImpl), + Int64(ColumnNDVEstimatorImpl), + UInt8(ColumnNDVEstimatorImpl), + UInt16(ColumnNDVEstimatorImpl), + UInt32(ColumnNDVEstimatorImpl), + UInt64(ColumnNDVEstimatorImpl), + Float32(ColumnNDVEstimatorImpl), + Float64(ColumnNDVEstimatorImpl), + String(ColumnNDVEstimatorImpl), + Date(ColumnNDVEstimatorImpl), + Timestamp(ColumnNDVEstimatorImpl), + Decimal64(ColumnNDVEstimatorImpl), + Decimal128(ColumnNDVEstimatorImpl), + Decimal256(ColumnNDVEstimatorImpl), +} + +pub fn create_column_ndv_estimator(data_type: &DataType) -> ColumnNDVEstimator { + macro_rules! match_number_type_create { + ($inner_type:expr) => {{ + with_number_type!(|NUM_TYPE| match $inner_type { + NumberDataType::NUM_TYPE => { + paste::paste! 
{ + ColumnNDVEstimator::NUM_TYPE(ColumnNDVEstimatorImpl::<[]>::new()) + } + } + }) + }}; + } + + let inner_type = data_type.remove_nullable(); + match inner_type { + DataType::Number(num_type) => { + match_number_type_create!(num_type) + } + DataType::String => ColumnNDVEstimator::String(ColumnNDVEstimatorImpl::::new()), + DataType::Date => ColumnNDVEstimator::Date(ColumnNDVEstimatorImpl::::new()), + DataType::Timestamp => { + ColumnNDVEstimator::Timestamp(ColumnNDVEstimatorImpl::::new()) + } + DataType::Decimal(size) => { + if size.can_carried_by_64() { + ColumnNDVEstimator::Decimal64(ColumnNDVEstimatorImpl::::new()) + } else if size.can_carried_by_128() { + ColumnNDVEstimator::Decimal128(ColumnNDVEstimatorImpl::::new()) + } else { + ColumnNDVEstimator::Decimal256(ColumnNDVEstimatorImpl::::new()) + } + } + _ => unreachable!("Unsupported data type: {:?}", data_type), + } +} + +pub struct ColumnNDVEstimatorImpl +where + T: ValueType + Send + Sync, + for<'a> T::ScalarRef<'a>: Hash, +{ + hll: MetaHLL, + _phantom: PhantomData, +} + +impl ColumnNDVEstimatorImpl +where + T: ValueType + Send + Sync, + for<'a> T::ScalarRef<'a>: Hash, +{ + pub fn new() -> Self { + Self { + hll: MetaHLL::new(), + _phantom: Default::default(), + } + } +} + +impl ColumnNDVEstimatorOps for ColumnNDVEstimatorImpl +where + T: ValueType + Send + Sync, + for<'a> T::ScalarRef<'a>: Hash, +{ + fn update_column(&mut self, column: &Column) { + let (column, validity) = match column { + Column::Nullable(box inner) => { + let validity = if inner.validity.null_count() == 0 { + None + } else { + Some(&inner.validity) + }; + (&inner.column, validity) + } + Column::Null { .. } => return, + column => (column, None), + }; + + let column = T::try_downcast_column(column).unwrap(); + if let Some(v) = validity { + if v.true_count() as f64 / v.len() as f64 >= SELECTIVITY_THRESHOLD { + for (data, valid) in T::iter_column(&column).zip(v.iter()) { + if valid { + self.hll.add_object(&data); + } + } + } else { + TrueIdxIter::new(v.len(), Some(v)).for_each(|idx| { + let val = unsafe { T::index_column_unchecked(&column, idx) }; + self.hll.add_object(&val); + }) + } + } else { + for value in T::iter_column(&column) { + self.hll.add_object(&value); + } + } + } + + fn update_scalar(&mut self, scalar: &ScalarRef) { + if matches!(scalar, ScalarRef::Null) { + return; + } + + let val = T::try_downcast_scalar(scalar).unwrap(); + self.hll.add_object(&val); + } + + fn finalize(&self) -> usize { + self.hll.count() + } + + fn hll(self) -> MetaHLL { + self.hll + } +} diff --git a/src/query/storages/fuse/src/io/write/stream/column_statistics.rs b/src/query/storages/fuse/src/io/write/stream/column_statistics.rs deleted file mode 100644 index 402504e3a207e..0000000000000 --- a/src/query/storages/fuse/src/io/write/stream/column_statistics.rs +++ /dev/null @@ -1,253 +0,0 @@ -// Copyright 2021 Datafuse Labs -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
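For reference, a minimal standalone sketch of the per-column NDV estimation flow implemented by ColumnNDVEstimator above, with a HashSet standing in for MetaHLL (the real estimator trades exactness for bounded memory) and plain slices standing in for columns and validity bitmaps. Names here are illustrative only.

// Standalone sketch (HashSet stands in for the HLL sketch).
use std::collections::HashSet;
use std::hash::Hash;

struct NdvEstimator<T: Eq + Hash> {
    seen: HashSet<T>,
}

impl<T: Eq + Hash + Clone> NdvEstimator<T> {
    fn new() -> Self {
        Self { seen: HashSet::new() }
    }

    // Equivalent of update_column: fold a (values, validity) column into the sketch,
    // skipping rows marked invalid (nulls).
    fn update_column(&mut self, values: &[T], validity: Option<&[bool]>) {
        match validity {
            Some(v) => {
                for (val, ok) in values.iter().zip(v) {
                    if *ok {
                        self.seen.insert(val.clone());
                    }
                }
            }
            None => self.seen.extend(values.iter().cloned()),
        }
    }

    // Equivalent of finalize: report the estimated number of distinct values.
    fn finalize(&self) -> usize {
        self.seen.len()
    }
}

fn main() {
    let mut ndv = NdvEstimator::new();
    ndv.update_column(&[1i64, 2, 2, 3], Some(&[true, true, true, false][..]));
    assert_eq!(ndv.finalize(), 2);
}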
- -use std::collections::HashMap; - -use databend_common_exception::Result; -use databend_common_expression::types::AccessType; -use databend_common_expression::types::DataType; -use databend_common_expression::types::DateType; -use databend_common_expression::types::DecimalColumn; -use databend_common_expression::types::DecimalScalar; -use databend_common_expression::types::NumberDataType; -use databend_common_expression::types::NumberType; -use databend_common_expression::types::StringType; -use databend_common_expression::types::TimestampType; -use databend_common_expression::with_number_mapped_type; -use databend_common_expression::Column; -use databend_common_expression::ColumnId; -use databend_common_expression::DataBlock; -use databend_common_expression::Scalar; -use databend_common_expression::ScalarRef; -use databend_common_expression::TableSchemaRef; -use databend_common_expression::Value; -use databend_common_functions::aggregates::eval_aggr; -use databend_storages_common_table_meta::meta::ColumnDistinctHLL; -use databend_storages_common_table_meta::meta::ColumnStatistics; -use databend_storages_common_table_meta::meta::StatisticsOfColumns; - -use crate::statistics::reducers::reduce_column_statistics; -use crate::statistics::traverse_values_dfs; -use crate::statistics::Trim; - -pub struct ColumnStatisticsState { - col_stats: HashMap>, - distinct_columns: HashMap, -} - -impl ColumnStatisticsState { - pub fn new(stats_columns: &[ColumnId], distinct_columns: &[ColumnId]) -> Self { - let col_stats = stats_columns - .iter() - .map(|&col_id| (col_id, Vec::new())) - .collect(); - - let distinct_columns = distinct_columns - .iter() - .map(|&col_id| (col_id, ColumnDistinctHLL::default())) - .collect(); - - Self { - col_stats, - distinct_columns, - } - } - - pub fn add_block(&mut self, schema: &TableSchemaRef, data_block: &DataBlock) -> Result<()> { - let rows = data_block.num_rows(); - let leaves = traverse_values_dfs(data_block.columns(), schema.fields())?; - for (column_id, col, data_type) in leaves { - match col { - Value::Scalar(s) => { - let unset_bits = if s == Scalar::Null { rows } else { 0 }; - // when we read it back from parquet, it is a Column instead of Scalar - let in_memory_size = s.as_ref().estimated_scalar_repeat_size(rows, &data_type); - let col_stats = ColumnStatistics::new( - s.clone(), - s.clone(), - unset_bits as u64, - in_memory_size as u64, - None, - ); - if let Some(hll) = self.distinct_columns.get_mut(&column_id) { - scalar_update_hll_cardinality(&s.as_ref(), &data_type, hll); - } - self.col_stats.get_mut(&column_id).unwrap().push(col_stats); - } - Value::Column(col) => { - // later, during the evaluation of expressions, name of field does not matter - let mut min = Scalar::Null; - let mut max = Scalar::Null; - - let (mins, _) = eval_aggr("min", vec![], &[col.clone().into()], rows, vec![])?; - if mins.len() > 0 { - min = if let Some(v) = mins.index(0) { - // safe upwrap. 
- v.to_owned().trim_min().unwrap() - } else { - self.col_stats.remove(&column_id); - continue; - } - } - - let (maxs, _) = eval_aggr("max", vec![], &[col.clone().into()], rows, vec![])?; - if maxs.len() > 0 { - max = if let Some(v) = maxs.index(0) { - if let Some(v) = v.to_owned().trim_max() { - v - } else { - self.col_stats.remove(&column_id); - continue; - } - } else { - self.col_stats.remove(&column_id); - continue; - } - } - - let (is_all_null, bitmap) = col.validity(); - let unset_bits = match (is_all_null, bitmap) { - (true, _) => rows, - (false, Some(bitmap)) => bitmap.null_count(), - (false, None) => 0, - }; - let in_memory_size = col.memory_size() as u64; - let col_stats = - ColumnStatistics::new(min, max, unset_bits as u64, in_memory_size, None); - self.col_stats.get_mut(&column_id).unwrap().push(col_stats); - - // use distinct count calculated by the xor hash function to avoid repetitive operation. - if let Some(hll) = self.distinct_columns.get_mut(&column_id) { - column_update_hll_cardinality(&col, &data_type, hll); - } - } - } - } - Ok(()) - } - - pub fn finalize( - self, - column_distinct_count: HashMap, - ) -> Result { - let mut statistics = StatisticsOfColumns::with_capacity(self.col_stats.len()); - for (id, stats) in &self.col_stats { - let mut col_stats = reduce_column_statistics(stats); - if let Some(count) = column_distinct_count.get(id) { - col_stats.distinct_of_values = Some(*count as u64); - } else if let Some(hll) = self.distinct_columns.get(id) { - col_stats.distinct_of_values = Some(hll.count() as u64); - } - statistics.insert(*id, col_stats); - } - Ok(statistics) - } -} - -fn column_update_hll_cardinality(col: &Column, ty: &DataType, hll: &mut ColumnDistinctHLL) { - if let DataType::Nullable(inner) = ty { - let col = col.as_nullable().unwrap(); - for (i, v) in col.validity.iter().enumerate() { - if v { - let scalar = col.column.index(i).unwrap(); - scalar_update_hll_cardinality(&scalar, inner, hll); - } - } - return; - } - - with_number_mapped_type!(|NUM_TYPE| match ty { - DataType::Number(NumberDataType::NUM_TYPE) => { - let col = NumberType::::try_downcast_column(col).unwrap(); - for v in col.iter() { - hll.add_object(v); - } - } - DataType::String => { - let col = StringType::try_downcast_column(col).unwrap(); - for v in col.iter() { - hll.add_object(&v); - } - } - DataType::Date => { - let col = DateType::try_downcast_column(col).unwrap(); - for v in col.iter() { - hll.add_object(v); - } - } - DataType::Timestamp => { - let col = TimestampType::try_downcast_column(col).unwrap(); - for v in col.iter() { - hll.add_object(v); - } - } - DataType::Decimal(_) => { - match col { - Column::Decimal(DecimalColumn::Decimal64(col, _)) => { - for v in col.iter() { - hll.add_object(v); - } - } - Column::Decimal(DecimalColumn::Decimal128(col, _)) => { - for v in col.iter() { - hll.add_object(v); - } - } - Column::Decimal(DecimalColumn::Decimal256(col, _)) => { - for v in col.iter() { - hll.add_object(v); - } - } - _ => unreachable!(), - }; - } - _ => unreachable!("Unsupported data type: {:?}", ty), - }); -} - -fn scalar_update_hll_cardinality(scalar: &ScalarRef, ty: &DataType, hll: &mut ColumnDistinctHLL) { - if matches!(scalar, ScalarRef::Null) { - return; - } - - let ty = ty.remove_nullable(); - - with_number_mapped_type!(|NUM_TYPE| match ty { - DataType::Number(NumberDataType::NUM_TYPE) => { - let val = NumberType::::try_downcast_scalar(scalar).unwrap(); - hll.add_object(&val); - } - DataType::String => { - let val = StringType::try_downcast_scalar(scalar).unwrap(); - 
hll.add_object(&val); - } - DataType::Date => { - let val = DateType::try_downcast_scalar(scalar).unwrap(); - hll.add_object(&val); - } - DataType::Timestamp => { - let val = TimestampType::try_downcast_scalar(scalar).unwrap(); - hll.add_object(&val); - } - DataType::Decimal(_) => { - match scalar { - ScalarRef::Decimal(DecimalScalar::Decimal64(v, _)) => hll.add_object(&v), - ScalarRef::Decimal(DecimalScalar::Decimal128(v, _)) => hll.add_object(&v), - ScalarRef::Decimal(DecimalScalar::Decimal256(v, _)) => hll.add_object(&v), - _ => unreachable!(), - } - } - _ => unreachable!("Unsupported data type: {:?}", ty), - }); -} diff --git a/src/query/storages/fuse/src/io/write/stream/column_statistics_builder.rs b/src/query/storages/fuse/src/io/write/stream/column_statistics_builder.rs new file mode 100644 index 0000000000000..e9278b4b1e71a --- /dev/null +++ b/src/query/storages/fuse/src/io/write/stream/column_statistics_builder.rs @@ -0,0 +1,357 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::cmp::Ordering; +use std::marker::PhantomData; + +use databend_common_exception::ErrorCode; +use databend_common_exception::Result; +use databend_common_expression::types::boolean::TrueIdxIter; +use databend_common_expression::types::DataType; +use databend_common_expression::types::DateType; +use databend_common_expression::types::Decimal; +use databend_common_expression::types::Decimal128Type; +use databend_common_expression::types::Decimal256Type; +use databend_common_expression::types::Decimal64Type; +use databend_common_expression::types::Float32Type; +use databend_common_expression::types::Float64Type; +use databend_common_expression::types::Int16Type; +use databend_common_expression::types::Int32Type; +use databend_common_expression::types::Int64Type; +use databend_common_expression::types::Int8Type; +use databend_common_expression::types::NumberDataType; +use databend_common_expression::types::StringType; +use databend_common_expression::types::TimestampType; +use databend_common_expression::types::UInt16Type; +use databend_common_expression::types::UInt32Type; +use databend_common_expression::types::UInt64Type; +use databend_common_expression::types::UInt8Type; +use databend_common_expression::types::ValueType; +use databend_common_expression::with_number_type; +use databend_common_expression::Column; +use databend_common_expression::Scalar; +use databend_common_expression::ScalarRef; +use databend_common_expression::SELECTIVITY_THRESHOLD; +use databend_storages_common_table_meta::meta::ColumnStatistics; +use enum_dispatch::enum_dispatch; + +use crate::statistics::Trim; + +pub type CommonBuilder = GenericColumnStatisticsBuilder; +pub type DecimalBuilder = GenericColumnStatisticsBuilder; + +#[enum_dispatch(ColumnStatsOps)] +pub enum ColumnStatisticsBuilder { + Int8(CommonBuilder), + Int16(CommonBuilder), + Int32(CommonBuilder), + Int64(CommonBuilder), + UInt8(CommonBuilder), + UInt16(CommonBuilder), + UInt32(CommonBuilder), + 
UInt64(CommonBuilder), + Float32(CommonBuilder), + Float64(CommonBuilder), + String(CommonBuilder), + Date(CommonBuilder), + Timestamp(CommonBuilder), + Decimal64(DecimalBuilder), + Decimal128(DecimalBuilder), + Decimal256(DecimalBuilder), +} + +#[enum_dispatch] +pub trait ColumnStatsOps { + fn update_column(&mut self, column: &Column); + fn update_scalar(&mut self, scalar: &ScalarRef, num_rows: usize, data_type: &DataType); + fn finalize(self) -> Result; +} + +impl ColumnStatsOps for GenericColumnStatisticsBuilder +where + T: ValueType + Send + Sync, + T::Scalar: Send + Sync, + A: ColumnStatisticsAdapter + 'static, + for<'a, 'b> T::ScalarRef<'a>: PartialOrd>, +{ + fn update_column(&mut self, column: &Column) { + GenericColumnStatisticsBuilder::update_column(self, column); + } + + fn update_scalar(&mut self, scalar: &ScalarRef, num_rows: usize, data_type: &DataType) { + GenericColumnStatisticsBuilder::update_scalar(self, scalar, num_rows, data_type); + } + + fn finalize(self) -> Result { + GenericColumnStatisticsBuilder::finalize(self) + } +} + +pub fn create_column_stats_builder(data_type: &DataType) -> ColumnStatisticsBuilder { + let inner_type = data_type.remove_nullable(); + macro_rules! match_number_type_create { + ($inner_type:expr) => {{ + with_number_type!(|NUM_TYPE| match $inner_type { + NumberDataType::NUM_TYPE => { + paste::paste! { + ColumnStatisticsBuilder::NUM_TYPE(CommonBuilder::<[]>::create(inner_type)) + } + } + }) + }}; + } + + match inner_type { + DataType::Number(num_type) => { + match_number_type_create!(num_type) + } + DataType::String => { + ColumnStatisticsBuilder::String(CommonBuilder::::create(inner_type)) + } + DataType::Date => { + ColumnStatisticsBuilder::Date(CommonBuilder::::create(inner_type)) + } + DataType::Timestamp => { + ColumnStatisticsBuilder::Timestamp(CommonBuilder::::create(inner_type)) + } + DataType::Decimal(size) => { + if size.can_carried_by_64() { + ColumnStatisticsBuilder::Decimal64(DecimalBuilder::::create( + inner_type, + )) + } else if size.can_carried_by_128() { + ColumnStatisticsBuilder::Decimal128(DecimalBuilder::::create( + inner_type, + )) + } else { + ColumnStatisticsBuilder::Decimal256(DecimalBuilder::::create( + inner_type, + )) + } + } + _ => unreachable!("Unsupported data type: {:?}", data_type), + } +} + +pub trait ColumnStatisticsAdapter: Send + Sync { + type Value: Clone + Send + Sync; + + fn scalar_to_value(val: T::ScalarRef<'_>) -> Self::Value; + + fn value_to_scalar(val: Self::Value) -> T::Scalar; + + fn update_value(value: &mut Self::Value, scalar: T::ScalarRef<'_>, ordering: Ordering); +} + +pub struct CommonAdapter; + +impl ColumnStatisticsAdapter for CommonAdapter +where + T: ValueType, + T::Scalar: Send + Sync, + for<'a, 'b> T::ScalarRef<'a>: PartialOrd>, +{ + type Value = T::Scalar; + + fn scalar_to_value(val: T::ScalarRef<'_>) -> Self::Value { + T::to_owned_scalar(val) + } + + fn value_to_scalar(val: Self::Value) -> T::Scalar { + val + } + + fn update_value(value: &mut Self::Value, scalar: T::ScalarRef<'_>, ordering: Ordering) { + if scalar.partial_cmp(&T::to_scalar_ref(value)) == Some(ordering) { + *value = T::to_owned_scalar(scalar); + } + } +} + +pub struct DecimalAdapter; + +impl ColumnStatisticsAdapter for DecimalAdapter +where + T: ValueType, + T::Scalar: Decimal + Send + Sync, + for<'a, 'b> T::ScalarRef<'a>: PartialOrd>, +{ + type Value = ::U64Array; + + fn scalar_to_value(val: T::ScalarRef<'_>) -> Self::Value { + T::Scalar::to_u64_array(T::to_owned_scalar(val)) + } + + fn value_to_scalar(val: Self::Value) -> 
T::Scalar { + T::Scalar::from_u64_array(val) + } + + fn update_value(value: &mut Self::Value, scalar: T::ScalarRef<'_>, ordering: Ordering) { + let val = T::Scalar::from_u64_array(*value); + if scalar.partial_cmp(&T::to_scalar_ref(&val)) == Some(ordering) { + *value = T::Scalar::to_u64_array(T::to_owned_scalar(scalar)); + } + } +} + +pub struct GenericColumnStatisticsBuilder<T, A> +where + T: ValueType, + A: ColumnStatisticsAdapter<T>, +{ + min: Option<A::Value>, + max: Option<A::Value>, + null_count: usize, + in_memory_size: usize, + data_type: DataType, + + _phantom: PhantomData<(T, A)>, +} + +impl<T, A> GenericColumnStatisticsBuilder<T, A> +where + T: ValueType + Send + Sync, + T::Scalar: Send + Sync, + A: ColumnStatisticsAdapter<T> + 'static, + for<'a, 'b> T::ScalarRef<'a>: PartialOrd<T::ScalarRef<'b>>, +{ + fn create(data_type: DataType) -> Self { + Self { + min: None, + max: None, + null_count: 0, + in_memory_size: 0, + data_type, + _phantom: PhantomData, + } + } + + fn add_batch<'a, I>(&mut self, mut iter: I) + where I: Iterator<Item = T::ScalarRef<'a>> { + let first = iter.next().unwrap(); + let mut min = first.clone(); + let mut max = first; + for v in iter { + if matches!(min.partial_cmp(&v), Some(Ordering::Greater)) { + min = v; + continue; + } + + if matches!(max.partial_cmp(&v), Some(Ordering::Less)) { + max = v; + } + } + + self.add(min, max); + } + + fn add(&mut self, min: T::ScalarRef<'_>, max: T::ScalarRef<'_>) { + if let Some(val) = self.min.as_mut() { + A::update_value(val, min, Ordering::Less); + } else { + self.min = Some(A::scalar_to_value(min)); + } + + if let Some(val) = self.max.as_mut() { + A::update_value(val, max, Ordering::Greater); + } else { + self.max = Some(A::scalar_to_value(max)); + } + } + + fn update_column(&mut self, column: &Column) { + self.in_memory_size += column.memory_size(); + if column.len() == 0 { + return; + } + let (column, validity) = match column { + Column::Nullable(box inner) => { + let validity = if inner.validity.null_count() == 0 { + None + } else { + Some(&inner.validity) + }; + (&inner.column, validity) + } + Column::Null { len } => { + self.null_count += *len; + return; + } + col => (col, None), + }; + self.null_count += validity.map_or(0, |v| v.null_count()); + + let column = T::try_downcast_column(column).unwrap(); + if let Some(v) = validity { + if v.true_count() as f64 / v.len() as f64 >= SELECTIVITY_THRESHOLD { + let column_iter = T::iter_column(&column); + let value_iter = column_iter + .zip(v.iter()) + .filter(|(_, v)| *v) + .map(|(v, _)| v); + self.add_batch(value_iter); + } else { + for idx in TrueIdxIter::new(v.len(), Some(v)) { + let v = unsafe { T::index_column_unchecked(&column, idx) }; + self.add(v.clone(), v); + } + } + } else { + let column_iter = T::iter_column(&column); + self.add_batch(column_iter); + } + } + + fn update_scalar(&mut self, scalar: &ScalarRef, num_rows: usize, data_type: &DataType) { + // when we read it back from parquet, it is a Column instead of Scalar + self.in_memory_size += scalar.estimated_scalar_repeat_size(num_rows, data_type); + if scalar.is_null() { + self.null_count += num_rows; + return; + } + + let val = T::try_downcast_scalar(scalar).unwrap(); + self.add(val.clone(), val); + } + + fn finalize(self) -> Result<ColumnStatistics> { + let min = if let Some(v) = self.min { + let v = A::value_to_scalar(v); + // safe unwrap.
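// For intuition: the unwrap below is expected to be safe because trimming a *minimum* bound
// can only shorten the value, and a prefix is never greater than the original string, so it
// remains a valid lower bound; trim_max, by contrast, may have no valid shortened upper bound
// and therefore gets the explicit error path further down. A tiny sketch of that asymmetry
// with plain strings (hypothetical helper, not taken from this patch):
//
//     fn trim_min_sketch(s: &str, limit: usize) -> String {
//         // a prefix of `s` is always <= `s`, so it is still a correct lower bound
//         s.chars().take(limit).collect()
//     }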
+ T::upcast_scalar_with_type(v, &self.data_type) + .trim_min() + .unwrap() + } else { + Scalar::Null + }; + let max = if let Some(v) = self.max { + let v = A::value_to_scalar(v); + if let Some(v) = T::upcast_scalar_with_type(v, &self.data_type).trim_max() { + v + } else { + return Err(ErrorCode::Internal("Unable to trim string")); + } + } else { + Scalar::Null + }; + + Ok(ColumnStatistics::new( + min, + max, + self.null_count as u64, + self.in_memory_size as u64, + None, + )) + } +} diff --git a/src/query/storages/fuse/src/io/write/stream/column_statistics_state.rs b/src/query/storages/fuse/src/io/write/stream/column_statistics_state.rs new file mode 100644 index 0000000000000..ac65378b20d22 --- /dev/null +++ b/src/query/storages/fuse/src/io/write/stream/column_statistics_state.rs @@ -0,0 +1,184 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; + +use databend_common_exception::Result; +use databend_common_expression::types::DataType; +use databend_common_expression::ColumnId; +use databend_common_expression::DataBlock; +use databend_common_expression::TableSchemaRef; +use databend_common_expression::Value; +use databend_storages_common_table_meta::meta::StatisticsOfColumns; + +use crate::io::write::stream::create_column_ndv_estimator; +use crate::io::write::stream::create_column_stats_builder; +use crate::io::write::stream::ColumnNDVEstimator; +use crate::io::write::stream::ColumnNDVEstimatorOps; +use crate::io::write::stream::ColumnStatisticsBuilder; +use crate::io::write::stream::ColumnStatsOps; +use crate::statistics::traverse_values_dfs; + +pub struct ColumnStatisticsState { + col_stats: HashMap, + distinct_columns: HashMap, +} + +impl ColumnStatisticsState { + pub fn new( + stats_columns: &[(ColumnId, DataType)], + distinct_columns: &[(ColumnId, DataType)], + ) -> Self { + let col_stats = stats_columns + .iter() + .map(|(col_id, data_type)| (*col_id, create_column_stats_builder(data_type))) + .collect(); + + let distinct_columns = distinct_columns + .iter() + .map(|(col_id, data_type)| (*col_id, create_column_ndv_estimator(data_type))) + .collect(); + + Self { + col_stats, + distinct_columns, + } + } + + pub fn add_block(&mut self, schema: &TableSchemaRef, data_block: &DataBlock) -> Result<()> { + let rows = data_block.num_rows(); + let leaves = traverse_values_dfs(data_block.columns(), schema.fields())?; + for (column_id, col, data_type) in leaves { + match col { + Value::Scalar(s) => { + self.col_stats.get_mut(&column_id).unwrap().update_scalar( + &s.as_ref(), + rows, + &data_type, + ); + if let Some(estimator) = self.distinct_columns.get_mut(&column_id) { + estimator.update_scalar(&s.as_ref()); + } + } + Value::Column(col) => { + self.col_stats + .get_mut(&column_id) + .unwrap() + .update_column(&col); + // use distinct count calculated by the xor hash function to avoid repetitive operation. 
+ if let Some(estimator) = self.distinct_columns.get_mut(&column_id) { + estimator.update_column(&col); + } + } + } + } + Ok(()) + } + + pub fn finalize( + self, + mut column_distinct_count: HashMap, + ) -> Result { + for (column_id, estimator) in &self.distinct_columns { + column_distinct_count.insert(*column_id, estimator.finalize()); + } + + let mut statistics = StatisticsOfColumns::with_capacity(self.col_stats.len()); + for (id, stats) in self.col_stats { + let mut col_stats = stats.finalize()?; + if let Some(count) = column_distinct_count.get(&id) { + col_stats.distinct_of_values = Some(*count as u64); + } else if col_stats.min == col_stats.max { + // Bloom index will skip the large string column, it also no need to calc distinct values. + if col_stats.min.is_null() { + col_stats.distinct_of_values = Some(0); + } else { + col_stats.distinct_of_values = Some(1); + } + } + statistics.insert(id, col_stats); + } + Ok(statistics) + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use databend_common_expression::types::BinaryType; + use databend_common_expression::types::Int64Type; + use databend_common_expression::types::NumberDataType; + use databend_common_expression::types::StringType; + use databend_common_expression::types::UInt64Type; + use databend_common_expression::Column; + use databend_common_expression::FromData; + use databend_common_expression::TableDataType; + use databend_common_expression::TableField; + use databend_common_expression::TableSchema; + use databend_storages_common_index::Index; + use databend_storages_common_index::RangeIndex; + + use super::*; + use crate::statistics::gen_columns_statistics; + + #[test] + fn test_column_stats_state() -> Result<()> { + let field1 = TableField::new( + "a", + TableDataType::Nullable(Box::new(TableDataType::Number(NumberDataType::Int64))), + ); + let field2 = TableField::new("b", TableDataType::String); + let field3 = TableField::new("c", TableDataType::Tuple { + fields_name: vec!["d".to_string(), "e".to_string()], + fields_type: vec![ + TableDataType::Number(NumberDataType::UInt64), + TableDataType::Binary, + ], + }); + let schema = Arc::new(TableSchema::new(vec![field1, field2, field3])); + let block = DataBlock::new_from_columns(vec![ + Int64Type::from_opt_data(vec![Some(1), Some(2), None, Some(4), Some(5)]), + StringType::from_data(vec!["a", "b", "c", "d", "e"]), + Column::Tuple(vec![ + UInt64Type::from_data(vec![11, 12, 13, 14, 15]), + BinaryType::from_data(vec![ + "hello".as_bytes().to_vec(), + "world".as_bytes().to_vec(), + "".as_bytes().to_vec(), + "foo".as_bytes().to_vec(), + "bar".as_bytes().to_vec(), + ]), + ]), + ]); + + let stats_0 = gen_columns_statistics(&block, None, &schema)?; + + let mut stats_columns = vec![]; + let leaf_fields = schema.leaf_fields(); + for field in leaf_fields.iter() { + let column_id = field.column_id(); + let data_type = DataType::from(field.data_type()); + if RangeIndex::supported_type(&data_type) { + stats_columns.push((column_id, data_type.clone())); + } + } + let mut column_stats_state = ColumnStatisticsState::new(&stats_columns, &stats_columns); + column_stats_state.add_block(&schema, &block)?; + let stats_1 = column_stats_state.finalize(HashMap::new())?; + + assert_eq!(stats_0, stats_1); + Ok(()) + } +} diff --git a/src/query/storages/fuse/src/io/write/stream/mod.rs b/src/query/storages/fuse/src/io/write/stream/mod.rs index 26d32ee679582..f0c7365b5ba01 100644 --- a/src/query/storages/fuse/src/io/write/stream/mod.rs +++ b/src/query/storages/fuse/src/io/write/stream/mod.rs @@ 
-14,7 +14,16 @@ mod block_builder; mod cluster_statistics; -mod column_statistics; +mod column_ndv_estimator; +mod column_statistics_builder; +mod column_statistics_state; pub(crate) use block_builder::StreamBlockBuilder; pub(crate) use block_builder::StreamBlockProperties; +pub(crate) use column_ndv_estimator::create_column_ndv_estimator; +pub(crate) use column_ndv_estimator::ColumnNDVEstimator; +pub(crate) use column_ndv_estimator::ColumnNDVEstimatorOps; +pub(crate) use column_statistics_builder::create_column_stats_builder; +pub(crate) use column_statistics_builder::ColumnStatisticsBuilder; +pub(crate) use column_statistics_builder::ColumnStatsOps; +pub(crate) use column_statistics_state::ColumnStatisticsState; diff --git a/src/query/storages/fuse/src/operations/append.rs b/src/query/storages/fuse/src/operations/append.rs index 9316374128528..b6a10fb38e06e 100644 --- a/src/query/storages/fuse/src/operations/append.rs +++ b/src/query/storages/fuse/src/operations/append.rs @@ -37,10 +37,11 @@ use databend_common_sql::executor::physical_plans::MutationKind; use databend_storages_common_table_meta::meta::TableMetaTimestamps; use databend_storages_common_table_meta::table::ClusterType; +use crate::io::StreamBlockProperties; +use crate::operations::TransformBlockBuilder; use crate::operations::TransformBlockWriter; use crate::operations::TransformSerializeBlock; use crate::statistics::ClusterStatsGenerator; -use crate::FuseStorageFormat; use crate::FuseTable; impl FuseTable { @@ -50,19 +51,22 @@ impl FuseTable { pipeline: &mut Pipeline, table_meta_timestamps: TableMetaTimestamps, ) -> Result<()> { - let enable_stream_block_write = ctx.get_settings().get_enable_block_stream_write()? - && matches!(self.storage_format, FuseStorageFormat::Parquet); + let enable_stream_block_write = self.enable_stream_block_write(ctx.clone())?; if enable_stream_block_write { + let properties = StreamBlockProperties::try_create( + ctx.clone(), + self, + MutationKind::Insert, + table_meta_timestamps, + )?; + pipeline.add_transform(|input, output| { - TransformBlockWriter::try_create( - ctx.clone(), - input, - output, - self, - table_meta_timestamps, - false, - ) + TransformBlockBuilder::try_create(input, output, properties.clone()) })?; + + pipeline.add_async_accumulating_transformer(|| { + TransformBlockWriter::create(ctx.clone(), MutationKind::Insert, self, false) + }); } else { let block_thresholds = self.get_block_thresholds(); build_compact_block_pipeline(pipeline, block_thresholds)?; diff --git a/src/query/storages/fuse/src/operations/common/processors/mod.rs b/src/query/storages/fuse/src/operations/common/processors/mod.rs index e0e3d3b25f25a..d43c569c14016 100644 --- a/src/query/storages/fuse/src/operations/common/processors/mod.rs +++ b/src/query/storages/fuse/src/operations/common/processors/mod.rs @@ -22,6 +22,7 @@ mod transform_serialize_segment; pub use multi_table_insert_commit::CommitMultiTableInsert; pub use sink_commit::CommitSink; +pub use transform_block_writer::TransformBlockBuilder; pub use transform_block_writer::TransformBlockWriter; pub use transform_merge_commit_meta::TransformMergeCommitMeta; pub use transform_mutation_aggregator::TableMutationAggregator; diff --git a/src/query/storages/fuse/src/operations/common/processors/transform_block_writer.rs b/src/query/storages/fuse/src/operations/common/processors/transform_block_writer.rs index ce11a4834c7fb..895f5c7a0ebc5 100644 --- a/src/query/storages/fuse/src/operations/common/processors/transform_block_writer.rs +++ 
b/src/query/storages/fuse/src/operations/common/processors/transform_block_writer.rs @@ -22,35 +22,36 @@ use databend_common_catalog::table::Table; use databend_common_catalog::table_context::TableContext; use databend_common_exception::ErrorCode; use databend_common_exception::Result; +use databend_common_expression::BlockMetaInfoDowncast; use databend_common_expression::DataBlock; -use databend_common_io::constants::DEFAULT_BLOCK_ROW_COUNT; +use databend_common_metrics::storage::metrics_inc_recluster_write_block_nums; use databend_common_pipeline_core::processors::Event; use databend_common_pipeline_core::processors::InputPort; use databend_common_pipeline_core::processors::OutputPort; use databend_common_pipeline_core::processors::Processor; use databend_common_pipeline_core::processors::ProcessorPtr; +use databend_common_pipeline_transforms::AsyncAccumulatingTransform; +use databend_common_sql::executor::physical_plans::MutationKind; use databend_common_storage::MutationStatus; -use databend_storages_common_table_meta::meta::TableMetaTimestamps; use opendal::Operator; use crate::io::BlockSerialization; use crate::io::BlockWriter; use crate::io::StreamBlockBuilder; use crate::io::StreamBlockProperties; +use crate::operations::MutationLogEntry; +use crate::operations::MutationLogs; use crate::FuseTable; -use crate::FUSE_OPT_KEY_ROW_PER_BLOCK; -#[allow(clippy::large_enum_variant)] enum State { Consume, Collect(DataBlock), Serialize, Finalize, Flush, - Write(BlockSerialization), } -pub struct TransformBlockWriter { +pub struct TransformBlockBuilder { state: State, input: Arc, output: Arc, @@ -62,43 +63,27 @@ pub struct TransformBlockWriter { input_data_size: usize, input_num_rows: usize, - dal: Operator, - // Only used in multi table insert - table_id: Option, - - max_block_rows: usize, - input_data: VecDeque, + input_data: VecDeque<(usize, DataBlock)>, output_data: Option, } -impl TransformBlockWriter { +impl TransformBlockBuilder { pub fn try_create( - ctx: Arc, input: Arc, output: Arc, - table: &FuseTable, - table_meta_timestamps: TableMetaTimestamps, - with_tid: bool, + properties: Arc, ) -> Result { - let max_block_rows = std::cmp::min( - ctx.get_settings().get_max_block_size()? 
as usize, - table.get_option(FUSE_OPT_KEY_ROW_PER_BLOCK, DEFAULT_BLOCK_ROW_COUNT), - ); - let properties = StreamBlockProperties::try_create(ctx, table, table_meta_timestamps)?; - Ok(ProcessorPtr::create(Box::new(TransformBlockWriter { + Ok(ProcessorPtr::create(Box::new(TransformBlockBuilder { state: State::Consume, input, output, properties, builder: None, - dal: table.get_operator(), need_flush: false, - table_id: if with_tid { Some(table.get_id()) } else { None }, input_data: VecDeque::new(), input_data_size: 0, input_num_rows: 0, output_data: None, - max_block_rows, }))) } @@ -111,23 +96,24 @@ impl TransformBlockWriter { Ok(self.builder.as_mut().unwrap()) } - fn calc_max_block_rows(&self, block: &DataBlock) -> usize { - let min_bytes_per_block = self.properties.block_thresholds.min_bytes_per_block; - let block_size = block.estimate_block_size(); - if block_size < min_bytes_per_block { - return self.max_block_rows; - } - let num_rows = block.num_rows(); + fn split_input(&self, input: DataBlock) -> Vec { + let block_size = input.estimate_block_size(); + let num_rows = input.num_rows(); let average_row_size = block_size.div_ceil(num_rows); - let max_rows = min_bytes_per_block.div_ceil(average_row_size); - self.max_block_rows.min(max_rows) + let max_rows = self + .properties + .block_thresholds + .min_bytes_per_block + .div_ceil(average_row_size) + .min(self.properties.block_thresholds.max_rows_per_block); + input.split_by_rows_no_tail(max_rows) } } #[async_trait] -impl Processor for TransformBlockWriter { +impl Processor for TransformBlockBuilder { fn name(&self) -> String { - "TransformBlockWriter".to_string() + "TransformBlockBuilder".to_string() } fn as_any(&mut self) -> &mut dyn Any { @@ -135,15 +121,15 @@ impl Processor for TransformBlockWriter { } fn event(&mut self) -> Result { - match &self.state { - State::Collect(_) | State::Serialize | State::Flush | State::Finalize => { - return Ok(Event::Sync) - } - State::Write(_) => return Ok(Event::Async), - _ => {} + if matches!( + self.state, + State::Collect(_) | State::Serialize | State::Flush | State::Finalize + ) { + return Ok(Event::Sync); } if self.output.is_finished() { + self.input.finish(); return Ok(Event::Finished); } @@ -196,15 +182,16 @@ impl Processor for TransformBlockWriter { State::Collect(block) => { // Check if the datablock is valid, this is needed to ensure data is correct block.check_valid()?; - self.input_data_size += block.estimate_block_size(); self.input_num_rows += block.num_rows(); - let max_rows_per_block = self.calc_max_block_rows(&block); - let blocks = block.split_by_rows_no_tail(max_rows_per_block); - self.input_data.extend(blocks); + for block in self.split_input(block) { + let block_size = block.estimate_block_size(); + self.input_data_size += block_size; + self.input_data.push_back((block_size, block)); + } } State::Serialize => { - while let Some(b) = self.input_data.pop_front() { - self.input_data_size -= b.estimate_block_size(); + while let Some((block_size, b)) = self.input_data.pop_front() { + self.input_data_size -= block_size; self.input_num_rows -= b.num_rows(); let builder = self.get_or_create_builder()?; @@ -217,7 +204,7 @@ impl Processor for TransformBlockWriter { } } State::Finalize => { - while let Some(b) = self.input_data.pop_front() { + while let Some((_, b)) = self.input_data.pop_front() { let builder = self.get_or_create_builder()?; builder.write(b)?; } @@ -227,7 +214,7 @@ impl Processor for TransformBlockWriter { let builder = self.builder.take().unwrap(); if !builder.is_empty() { 
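// A note on the handoff at this point: after this refactor the builder no longer performs any
// I/O itself; the finished BlockSerialization is passed downstream as block meta on an empty
// DataBlock, and the async TransformBlockWriter (now an AsyncAccumulatingTransform, see
// append.rs above) downcasts it and writes the block. Minimal sketch of the pattern, assuming
// some hypothetical `MyMeta: BlockMetaInfo` type (not taken from this patch):
//
//     let carrier = DataBlock::empty_with_meta(Box::new(my_meta));
//     // ...in the downstream processor...
//     if let Some(meta) = carrier.get_owned_meta().and_then(MyMeta::downcast_from) {
//         // handle the typed meta here
//     }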
let serialized = builder.finish()?; - self.state = State::Write(serialized); + self.output_data = Some(DataBlock::empty_with_meta(Box::new(serialized))); } self.need_flush = false; } @@ -235,11 +222,41 @@ impl Processor for TransformBlockWriter { } Ok(()) } +} - #[async_backtrace::framed] - async fn async_process(&mut self) -> Result<()> { - match std::mem::replace(&mut self.state, State::Consume) { - State::Write(serialized) => { +pub struct TransformBlockWriter { + kind: MutationKind, + dal: Operator, + ctx: Arc, + // Only used in multi table insert + table_id: Option, +} + +impl TransformBlockWriter { + pub fn create( + ctx: Arc, + kind: MutationKind, + table: &FuseTable, + with_tid: bool, + ) -> Self { + Self { + ctx, + dal: table.get_operator(), + table_id: if with_tid { Some(table.get_id()) } else { None }, + kind, + } + } +} + +#[async_trait::async_trait] +impl AsyncAccumulatingTransform for TransformBlockWriter { + const NAME: &'static str = "TransformBlockWriter"; + + async fn transform(&mut self, data: DataBlock) -> Result> { + debug_assert!(data.is_empty()); + + if let Some(ptr) = data.get_owned_meta() { + if let Some(serialized) = BlockSerialization::downcast_from(ptr) { let extended_block_meta = BlockWriter::write_down(&self.dal, serialized).await?; let bytes = if let Some(draft_virtual_block_meta) = @@ -251,32 +268,45 @@ impl Processor for TransformBlockWriter { extended_block_meta.block_meta.block_size as usize }; - self.properties - .ctx - .get_write_progress() - .incr(&ProgressValues { - rows: extended_block_meta.block_meta.row_count as usize, - bytes, - }); + self.ctx.get_write_progress().incr(&ProgressValues { + rows: extended_block_meta.block_meta.row_count as usize, + bytes, + }); // appending new data block if let Some(tid) = self.table_id { - self.properties.ctx.update_multi_table_insert_status( + self.ctx.update_multi_table_insert_status( tid, extended_block_meta.block_meta.row_count, ); } else { - self.properties.ctx.add_mutation_status(MutationStatus { + self.ctx.add_mutation_status(MutationStatus { insert_rows: extended_block_meta.block_meta.row_count, update_rows: 0, deleted_rows: 0, }); } - self.output_data = Some(DataBlock::empty_with_meta(Box::new(extended_block_meta))); + let output = if matches!(self.kind, MutationKind::Insert) { + DataBlock::empty_with_meta(Box::new(extended_block_meta)) + } else { + if matches!(self.kind, MutationKind::Recluster) { + metrics_inc_recluster_write_block_nums(); + } + + DataBlock::empty_with_meta(Box::new(MutationLogs { + entries: vec![MutationLogEntry::AppendBlock { + block_meta: Arc::new(extended_block_meta), + }], + })) + }; + + return Ok(Some(output)); } - _ => return Err(ErrorCode::Internal("It's a bug.")), } - Ok(()) + + Err(ErrorCode::Internal( + "Cannot downcast meta to BlockSerialization", + )) } } diff --git a/src/query/storages/fuse/src/operations/common/processors/transform_serialize_block.rs b/src/query/storages/fuse/src/operations/common/processors/transform_serialize_block.rs index f13ed6701482d..9997b7dd5a893 100644 --- a/src/query/storages/fuse/src/operations/common/processors/transform_serialize_block.rs +++ b/src/query/storages/fuse/src/operations/common/processors/transform_serialize_block.rs @@ -34,6 +34,7 @@ use databend_common_pipeline_core::PipeItem; use databend_common_sql::executor::physical_plans::MutationKind; use databend_common_storage::MutationStatus; use databend_storages_common_index::BloomIndex; +use databend_storages_common_index::RangeIndex; use 
databend_storages_common_table_meta::meta::TableMetaTimestamps; use opendal::Operator; @@ -152,6 +153,9 @@ impl TransformSerializeBlock { let bloom_columns_map = table .bloom_index_cols .bloom_index_fields(source_schema.clone(), BloomIndex::supported_type)?; + let ndv_columns_map = table + .approx_distinct_cols + .distinct_column_fields(source_schema.clone(), RangeIndex::supported_table_type)?; let ngram_args = FuseTable::create_ngram_index_args( &table.table_info.meta, &table.table_info.meta.schema, @@ -182,6 +186,7 @@ impl TransformSerializeBlock { write_settings: table.get_write_settings(), cluster_stats_gen, bloom_columns_map, + ndv_columns_map, ngram_args, inverted_index_builders, virtual_column_builder, diff --git a/src/query/storages/fuse/src/operations/gc.rs b/src/query/storages/fuse/src/operations/gc.rs index 9ddc1da9d1133..7e3bfee8e566e 100644 --- a/src/query/storages/fuse/src/operations/gc.rs +++ b/src/query/storages/fuse/src/operations/gc.rs @@ -32,6 +32,7 @@ use databend_storages_common_index::InvertedIndexFile; use databend_storages_common_index::InvertedIndexMeta; use databend_storages_common_io::Files; use databend_storages_common_table_meta::meta::column_oriented_segment::ColumnOrientedSegment; +use databend_storages_common_table_meta::meta::column_oriented_segment::BLOCK_STATS_LOCATION; use databend_storages_common_table_meta::meta::column_oriented_segment::BLOOM_FILTER_INDEX_LOCATION; use databend_storages_common_table_meta::meta::column_oriented_segment::LOCATION; use databend_storages_common_table_meta::meta::CompactSegmentInfo; @@ -390,6 +391,13 @@ impl FuseTable { purge_files.push(loc.to_string()) } + for loc in &locations.stats_location { + if locations_referenced_by_root.stats_location.contains(loc) { + continue; + } + purge_files.push(loc.to_string()) + } + purge_files.extend(chunk.iter().map(|loc| loc.0.clone())); } purge_files.extend(ts_to_be_purged.iter().map(|loc| loc.to_string())); @@ -455,6 +463,14 @@ impl FuseTable { blooms_to_be_purged.insert(loc.to_string()); } + let mut stats_to_be_purged = HashSet::new(); + for loc in &locations.stats_location { + if locations_referenced_by_root.stats_location.contains(loc) { + continue; + } + stats_to_be_purged.insert(loc.to_string()); + } + let segment_locations_to_be_purged = HashSet::from_iter( chunk .iter() @@ -481,6 +497,7 @@ impl FuseTable { agg_indexes_to_be_purged, inverted_indexes_to_be_purged, blooms_to_be_purged, + stats_to_be_purged, segment_locations_to_be_purged, ) .await?; @@ -541,6 +558,7 @@ impl FuseTable { agg_indexes_to_be_purged, inverted_indexes_to_be_purged, root_location_tuple.bloom_location, + root_location_tuple.stats_location, segment_locations_to_be_purged, ) .await?; @@ -566,6 +584,7 @@ impl FuseTable { agg_indexes_to_be_purged: HashSet, inverted_indexes_to_be_purged: HashSet, blooms_to_be_purged: HashSet, + stats_to_be_purged: HashSet, segments_to_be_purged: HashSet, ) -> Result<()> { // 1. Try to purge block file chunks. @@ -616,7 +635,15 @@ impl FuseTable { .await?; } - // 3. Try to purge segment file chunks. + // 3. Try to purge block statistic file chunks. + let stats_count = stats_to_be_purged.len(); + if stats_count > 0 { + counter.block_stats += stats_count; + self.try_purge_location_files(ctx.clone(), stats_to_be_purged) + .await?; + } + + // 4. Try to purge segment file chunks. let segments_count = segments_to_be_purged.len(); if segments_count > 0 { counter.segments += segments_count; @@ -661,9 +688,10 @@ impl FuseTable { // 5. Refresh status. 
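// For reference: block-statistics files now follow the same purge path as bloom index files:
// locations are collected per segment into `stats_location`, compared against what the root
// snapshot still references, and removed via try_purge_location_files; the status line below
// reports them alongside the other purged artifacts. The comparison is equivalent to a plain
// set difference (standalone sketch, not taken from this patch):
//
//     use std::collections::HashSet;
//     let referenced: HashSet<String> = HashSet::from(["a_stats.bin".to_string()]);
//     let candidates: HashSet<String> =
//         HashSet::from(["a_stats.bin".to_string(), "b_stats.bin".to_string()]);
//     let to_purge: Vec<&String> = candidates.difference(&referenced).collect();
//     assert_eq!(to_purge, vec![&"b_stats.bin".to_string()]);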
{ let status = format!( - "gc: block files purged:{}, bloom files purged:{}, segment files purged:{}, table statistic files purged:{}, snapshots purged:{}, take:{:?}", + "gc: block files purged:{}, bloom files purged:{}, block stats files purged:{}, segment files purged:{}, table statistic files purged:{}, snapshots purged:{}, take:{:?}", counter.blocks, counter.blooms, + counter.block_stats, counter.segments, counter.table_statistics, counter.snapshots, @@ -714,6 +742,7 @@ impl FuseTable { ) -> Result { let mut blocks = HashSet::new(); let mut blooms = HashSet::new(); + let mut stats = HashSet::new(); let fuse_segments = SegmentsIO::create(ctx.clone(), self.operator.clone(), self.schema()); let chunk_size = ctx.get_settings().get_max_threads()? as usize * 4; @@ -779,12 +808,14 @@ impl FuseTable { }; blocks.extend(location_tuple.block_location.into_iter()); blooms.extend(location_tuple.bloom_location.into_iter()); + stats.extend(location_tuple.stats_location.into_iter()); } } Ok(LocationTuple { block_location: blocks, bloom_location: blooms, + stats_location: stats, }) } @@ -808,6 +839,7 @@ struct RootSnapshotInfo { pub struct LocationTuple { pub block_location: HashSet, pub bloom_location: HashSet, + pub stats_location: HashSet, } impl TryFrom> for LocationTuple { @@ -815,16 +847,21 @@ impl TryFrom> for LocationTuple { fn try_from(value: Arc) -> Result { let mut block_location = HashSet::new(); let mut bloom_location = HashSet::new(); + let mut stats_location = HashSet::new(); let block_metas = value.block_metas()?; for block_meta in block_metas.into_iter() { block_location.insert(block_meta.location.0.clone()); if let Some(bloom_loc) = &block_meta.bloom_filter_index_location { bloom_location.insert(bloom_loc.0.clone()); } + if let Some(stats_loc) = &block_meta.block_stats_location { + stats_location.insert(stats_loc.0.clone()); + } } Ok(Self { block_location, bloom_location, + stats_location, }) } } @@ -834,6 +871,7 @@ impl TryFrom> for LocationTuple { fn try_from(value: Arc) -> Result { let mut block_location = HashSet::new(); let mut bloom_location = HashSet::new(); + let mut stats_location = HashSet::new(); let location_path = value.location_path_col(); for path in location_path.iter() { @@ -846,19 +884,28 @@ impl TryFrom> for LocationTuple { .unwrap(); let column = value.block_metas.get_by_offset(index).to_column(); for value in column.iter() { - match value { - ScalarRef::Null => {} - ScalarRef::Tuple(values) => { - let path = values[0].as_string().unwrap(); - bloom_location.insert(path.to_string()); - } - _ => unreachable!(), + if let ScalarRef::Tuple(values) = value { + let path = values[0].as_string().unwrap(); + bloom_location.insert(path.to_string()); + } + } + + let (index, _) = value + .segment_schema + .column_with_name(BLOCK_STATS_LOCATION) + .unwrap(); + let column = value.block_metas.get_by_offset(index).to_column(); + for value in column.iter() { + if let ScalarRef::Tuple(values) = value { + let path = values[0].as_string().unwrap(); + stats_location.insert(path.to_string()); } } Ok(Self { block_location, bloom_location, + stats_location, }) } } @@ -870,6 +917,7 @@ struct PurgeCounter { agg_indexes: usize, inverted_indexes: usize, blooms: usize, + block_stats: usize, segments: usize, table_statistics: usize, snapshots: usize, @@ -883,6 +931,7 @@ impl PurgeCounter { agg_indexes: 0, inverted_indexes: 0, blooms: 0, + block_stats: 0, segments: 0, table_statistics: 0, snapshots: 0, diff --git a/src/query/storages/fuse/src/operations/merge.rs 
b/src/query/storages/fuse/src/operations/merge.rs index e149196075dcd..6ec54cd993899 100644 --- a/src/query/storages/fuse/src/operations/merge.rs +++ b/src/query/storages/fuse/src/operations/merge.rs @@ -21,6 +21,7 @@ use databend_common_exception::Result; use databend_common_expression::TableSchemaRef; use databend_common_pipeline_core::PipeItem; use databend_storages_common_index::BloomIndex; +use databend_storages_common_index::RangeIndex; use databend_storages_common_table_meta::meta::Location; use databend_storages_common_table_meta::meta::TableMetaTimestamps; @@ -93,6 +94,9 @@ impl FuseTable { let bloom_columns_map = self .bloom_index_cols() .bloom_index_fields(new_schema.clone(), BloomIndex::supported_type)?; + let ndv_columns_map = self + .approx_distinct_cols() + .distinct_column_fields(new_schema.clone(), RangeIndex::supported_table_type)?; let ngram_args = FuseTable::create_ngram_index_args( &self.table_info.meta, &self.table_info.meta.schema, @@ -111,6 +115,7 @@ impl FuseTable { write_settings: self.get_write_settings(), cluster_stats_gen, bloom_columns_map, + ndv_columns_map, ngram_args, inverted_index_builders, vector_index_builder, diff --git a/src/query/storages/fuse/src/statistics/column_statistic.rs b/src/query/storages/fuse/src/statistics/column_statistic.rs index 36737dd9e7a62..0f980a57a97de 100644 --- a/src/query/storages/fuse/src/statistics/column_statistic.rs +++ b/src/query/storages/fuse/src/statistics/column_statistic.rs @@ -83,51 +83,48 @@ pub fn gen_columns_statistics( let mut min = Scalar::Null; let mut max = Scalar::Null; - let (mins, _) = eval_aggr("min", vec![], &[col.clone().into()], rows, vec![])?; - let (maxs, _) = eval_aggr("max", vec![], &[col.clone().into()], rows, vec![])?; - - if mins.len() > 0 { - min = if let Some(v) = mins.index(0) { - if let Some(v) = v.to_owned().trim_min() { - v + if col.len() > 0 { + let (mins, _) = eval_aggr("min", vec![], &[col.clone().into()], rows, vec![])?; + let (maxs, _) = eval_aggr("max", vec![], &[col.clone().into()], rows, vec![])?; + + if mins.len() > 0 { + min = if let Some(v) = mins.index(0) { + if let Some(v) = v.to_owned().trim_min() { + v + } else { + continue; + } } else { continue; } - } else { - continue; } - } - if maxs.len() > 0 { - max = if let Some(v) = maxs.index(0) { - if let Some(v) = v.to_owned().trim_max() { - v + if maxs.len() > 0 { + max = if let Some(v) = maxs.index(0) { + if let Some(v) = v.to_owned().trim_max() { + v + } else { + continue; + } } else { continue; } - } else { - continue; } } let (is_all_null, bitmap) = col.validity(); let unset_bits = match (is_all_null, bitmap) { - (true, _) => rows, - (false, Some(bitmap)) => bitmap.null_count(), + (_, Some(bitmap)) => bitmap.null_count(), + (true, None) => rows, (false, None) => 0, }; // use distinct count calculated by the xor hash function to avoid repetitive operation. - let distinct_of_values = if let Some(value) = column_distinct_count + let distinct_of_values = if let Some(&value) = column_distinct_count .as_ref() .and_then(|v| v.get(&column_id)) { - // value calculated by xor hash function include NULL, need to subtract one. - if unset_bits > 0 { - *value as u64 - 1 - } else { - *value as u64 - } + value as u64 } else { calc_column_distinct_of_values(&col, rows)? 
}; diff --git a/src/query/storages/fuse/src/statistics/traverse.rs b/src/query/storages/fuse/src/statistics/traverse.rs index b42375eff5030..5681c281df8ae 100644 --- a/src/query/storages/fuse/src/statistics/traverse.rs +++ b/src/query/storages/fuse/src/statistics/traverse.rs @@ -23,7 +23,6 @@ use databend_common_expression::ColumnId; use databend_common_expression::Scalar; use databend_common_expression::TableField; use databend_common_expression::Value; -use databend_common_expression::ORIGIN_BLOCK_ROW_NUM_COLUMN_ID; use databend_storages_common_index::Index; use databend_storages_common_index::RangeIndex; @@ -107,9 +106,7 @@ fn traverse_column_recursive( _ => unreachable!(), }, _ => { - if RangeIndex::supported_type(data_type) - && *next_column_id != ORIGIN_BLOCK_ROW_NUM_COLUMN_ID - { + if RangeIndex::supported_type(data_type) { leaves.push(( *next_column_id, Value::Column(column.clone()), @@ -169,9 +166,7 @@ fn traverse_scalar_recursive( }, _ => { // Ignore the range index does not supported type. - if RangeIndex::supported_type(data_type) - && *next_column_id != ORIGIN_BLOCK_ROW_NUM_COLUMN_ID - { + if RangeIndex::supported_type(data_type) { leaves.push(( *next_column_id, Value::Scalar(scalar.clone()), diff --git a/src/query/storages/fuse/src/table_functions/fuse_block.rs b/src/query/storages/fuse/src/table_functions/fuse_block.rs index eaecb5425c809..11c31d1227e0b 100644 --- a/src/query/storages/fuse/src/table_functions/fuse_block.rs +++ b/src/query/storages/fuse/src/table_functions/fuse_block.rs @@ -75,6 +75,10 @@ impl TableMetaFunc for FuseBlock { "virtual_column_size", TableDataType::Nullable(Box::new(TableDataType::Number(NumberDataType::UInt64))), ), + TableField::new( + "block_stats_size", + TableDataType::Number(NumberDataType::UInt64), + ), ]) } @@ -99,6 +103,7 @@ impl TableMetaFunc for FuseBlock { let mut ngram_index_size = Vec::with_capacity(len); let mut vector_index_size = Vec::with_capacity(len); let mut virtual_column_size = Vec::with_capacity(len); + let mut block_stats_size = Vec::with_capacity(len); let segments_io = SegmentsIO::create(ctx.clone(), tbl.operator.clone(), tbl.schema()); @@ -134,6 +139,7 @@ impl TableMetaFunc for FuseBlock { .as_ref() .map(|m| m.virtual_column_size), ); + block_stats_size.push(block.block_stats_size); num_rows += 1; if num_rows >= limit { @@ -157,6 +163,7 @@ impl TableMetaFunc for FuseBlock { UInt64Type::from_opt_data(ngram_index_size).into(), UInt64Type::from_opt_data(vector_index_size).into(), UInt64Type::from_opt_data(virtual_column_size).into(), + UInt64Type::from_data(block_stats_size).into(), ], num_rows, )) diff --git a/tests/sqllogictests/suites/base/09_fuse_engine/09_0004_remote_insert_into_select.test b/tests/sqllogictests/suites/base/09_fuse_engine/09_0004_remote_insert_into_select.test index 5e399dc1b406f..5ad4e316896d0 100644 --- a/tests/sqllogictests/suites/base/09_fuse_engine/09_0004_remote_insert_into_select.test +++ b/tests/sqllogictests/suites/base/09_fuse_engine/09_0004_remote_insert_into_select.test @@ -7,9 +7,6 @@ CREATE DATABASE db_09_004 statement ok USE db_09_004 -statement ok -set enable_block_stream_write = 1 - statement ok CREATE TABLE IF NOT EXISTS t1(a UInt8 not null, b UInt64 not null, c Int8 not null, d Int64 not null, e Date not null, f Date not null, g DateTime not null, h String not null) Engine = Fuse diff --git a/tests/sqllogictests/suites/base/09_fuse_engine/09_0006_func_fuse_history.test b/tests/sqllogictests/suites/base/09_fuse_engine/09_0006_func_fuse_history.test index 6f55970c81103..cad96b3804842 
100644 --- a/tests/sqllogictests/suites/base/09_fuse_engine/09_0006_func_fuse_history.test +++ b/tests/sqllogictests/suites/base/09_fuse_engine/09_0006_func_fuse_history.test @@ -122,12 +122,42 @@ create table t_in_memory(a uint64 not null) engine = Memory statement error 4000 select * from fuse_snapshot('db_09_0006', 't_in_memory') +statement ok +CREATE TABLE t2(a int, b string) approx_distinct_columns = 'a,b'; + +query I +insert into t2 values(1, 'a'), (2, 'b'); +---- +2 + +query I +select block_stats_size > 0 from fuse_block('db_09_0006', 't2'); +---- +1 + +statement ok +alter table t2 set options(approx_distinct_columns = ''); + +query I +insert into t2 values(3, 'c'), (4, 'd'); +---- +2 + +query I +select block_stats_size > 0 from fuse_block('db_09_0006', 't2') order by block_location; +---- +1 +0 + statement ok DROP TABLE t statement ok DROP TABLE t1 +statement ok +DROP TABLE t2 + statement ok DROP TABLE t_in_memory diff --git a/tests/sqllogictests/suites/base/09_fuse_engine/09_0008_fuse_optimize_table.test b/tests/sqllogictests/suites/base/09_fuse_engine/09_0008_fuse_optimize_table.test index 43a2b262ca2f9..c19a27a9e8890 100644 --- a/tests/sqllogictests/suites/base/09_fuse_engine/09_0008_fuse_optimize_table.test +++ b/tests/sqllogictests/suites/base/09_fuse_engine/09_0008_fuse_optimize_table.test @@ -7,9 +7,6 @@ CREATE DATABASE db_09_0008 statement ok USE db_09_0008 -statement ok -set enable_block_stream_write = 1 - statement ok create table t(a uint64 not null) diff --git a/tests/sqllogictests/suites/base/issues/issue_18275.test b/tests/sqllogictests/suites/base/issues/issue_18275.test index 36217cda4b3ca..ce895d228329e 100644 --- a/tests/sqllogictests/suites/base/issues/issue_18275.test +++ b/tests/sqllogictests/suites/base/issues/issue_18275.test @@ -14,9 +14,6 @@ CREATE OR REPLACE TABLE product_test ( stock INT ); -statement ok -set enable_block_stream_write = 1; - statement ok INSERT INTO product_test (id, name, category, price, stock) VALUES(6, 'Keyboard', 'Electronics', 79.99, 25), diff --git a/tests/sqllogictests/suites/ee/01_ee_system/01_0002_virtual_column.test b/tests/sqllogictests/suites/ee/01_ee_system/01_0002_virtual_column.test index 0d86d3d55b737..cd693939daffa 100644 --- a/tests/sqllogictests/suites/ee/01_ee_system/01_0002_virtual_column.test +++ b/tests/sqllogictests/suites/ee/01_ee_system/01_0002_virtual_column.test @@ -613,9 +613,6 @@ S001 ST001 A Excellent Y S002 ST002 B Good Y S003 ST003 C Average N -statement ok -set enable_block_stream_write = 1 - statement ok CREATE OR REPLACE TABLE test_stream ( id INT, @@ -670,9 +667,6 @@ FROM test_stream; 9 "Richard" 33 "Austin" "hiking" "cycling" 10 "Lisa" 26 "Chicago" "gaming" "reading" -statement ok -set enable_block_stream_write = 0 - statement ok set enable_experimental_virtual_column = 0; diff --git a/tests/suites/1_stateful/05_formats/05_01_compact/05_01_02_load_compact_copy_row_per_block.sh b/tests/suites/1_stateful/05_formats/05_01_compact/05_01_02_load_compact_copy_row_per_block.sh index 4d6cafcb184f9..862680eab447d 100755 --- a/tests/suites/1_stateful/05_formats/05_01_compact/05_01_02_load_compact_copy_row_per_block.sh +++ b/tests/suites/1_stateful/05_formats/05_01_compact/05_01_02_load_compact_copy_row_per_block.sh @@ -15,7 +15,7 @@ echo "drop table if exists t1 all" | $BENDSQL_CLIENT_CONNECT echo "CREATE TABLE t1 ( c0 string -) engine=fuse row_per_block=800; +) engine=fuse row_per_block=500; " | $BENDSQL_CLIENT_CONNECT diff --git a/tests/suites/1_stateful/09_http_handler/09_0007_session.py 
b/tests/suites/1_stateful/09_http_handler/09_0007_session.py index 3f3c96959d59a..5860548afa3ea 100755 --- a/tests/suites/1_stateful/09_http_handler/09_0007_session.py +++ b/tests/suites/1_stateful/09_http_handler/09_0007_session.py @@ -16,21 +16,31 @@ logout_url = "http://localhost:8000/v1/session/logout" auth = ("root", "") + def check(func): def wrapper(self, *args, **kwargs): print(f"---- {func.__name__}{args[:1]}") - resp : Response = func(self, *args, **kwargs) + resp: Response = func(self, *args, **kwargs) self.session_header = resp.headers.get(HEADER_SESSION) last = self.session_header_json - self.session_header_json = json.loads(base64.urlsafe_b64decode(self.session_header)) + self.session_header_json = json.loads( + base64.urlsafe_b64decode(self.session_header) + ) if last: if last["id"] != self.session_header_json["id"]: - print("error: session id should not change", last, self.session_header_json) + print( + "error: session id should not change", + last, + self.session_header_json, + ) if last["last_refresh_time"] < time.time() - 100: if last["last_refresh_time"] > time.time() - 2: print("error: last_refresh_time should not change") else: - if last["last_refresh_time"] != self.session_header_json["last_refresh_time"]: + if ( + last["last_refresh_time"] + != self.session_header_json["last_refresh_time"] + ): print("error: last_refresh_time should not change") # print("get header: ", self.session_header_json) @@ -44,11 +54,11 @@ def wrapper(self, *args, **kwargs): if err: pprint(err) return resp + return wrapper class Client(object): - def __init__(self): self.client = requests.session() self.session_header = "" @@ -82,7 +92,7 @@ def do_query(self, query, url=query_url): auth=auth, headers={ "Content-Type": "application/json", - HEADER_SESSION: self.session_header + HEADER_SESSION: self.session_header, }, json=query_payload, ) @@ -91,7 +101,9 @@ def do_query(self, query, url=query_url): def set_fake_last_refresh_time(self): j = self.session_header_json j["last_refresh_time"] = int(time.time()) - 10 * 60 - self.session_header = base64.urlsafe_b64encode(json.dumps(j).encode('utf-8')).decode('ascii') + self.session_header = base64.urlsafe_b64encode( + json.dumps(j).encode("utf-8") + ).decode("ascii") def main():