From 2fda373f49b17269464afe41b275fcf0ca2cc497 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=86=99=E5=87=AF?=
Date: Thu, 26 May 2022 16:06:22 +0800
Subject: [PATCH] initial commit

---
 .gitignore | 4 +
 Cargo.lock | 4363 +++++++++++++++++
 Cargo.toml | 74 +
 Dockerfile | 39 +
 LICENSE | 202 +
 Makefile | 64 +
 README.md | 90 +
 analytic_engine/Cargo.toml | 43 +
 analytic_engine/src/compaction/metrics.rs | 15 +
 analytic_engine/src/compaction/mod.rs | 494 ++
 analytic_engine/src/compaction/picker.rs | 740 +++
 analytic_engine/src/compaction/scheduler.rs | 595 +++
 analytic_engine/src/context.rs | 38 +
 analytic_engine/src/engine.rs | 163 +
 analytic_engine/src/instance/alter.rs | 289 ++
 analytic_engine/src/instance/close.rs | 93 +
 analytic_engine/src/instance/create.rs | 131 +
 analytic_engine/src/instance/drop.rs | 152 +
 analytic_engine/src/instance/engine.rs | 230 +
 .../src/instance/flush_compaction.rs | 1037 ++++
 analytic_engine/src/instance/mem_collector.rs | 118 +
 analytic_engine/src/instance/mod.rs | 271 +
 analytic_engine/src/instance/open.rs | 415 ++
 analytic_engine/src/instance/read.rs | 388 ++
 analytic_engine/src/instance/write.rs | 464 ++
 analytic_engine/src/instance/write_worker.rs | 970 ++++
 analytic_engine/src/lib.rs | 98 +
 analytic_engine/src/memtable/factory.rs | 38 +
 analytic_engine/src/memtable/key.rs | 249 +
 analytic_engine/src/memtable/mod.rs | 198 +
 .../src/memtable/skiplist/factory.rs | 32 +
 analytic_engine/src/memtable/skiplist/iter.rs | 346 ++
 analytic_engine/src/memtable/skiplist/mod.rs | 363 ++
 analytic_engine/src/meta/details.rs | 1282 +++++
 analytic_engine/src/meta/meta_data.rs | 193 +
 analytic_engine/src/meta/meta_update.rs | 463 ++
 analytic_engine/src/meta/mod.rs | 29 +
 analytic_engine/src/payload.rs | 174 +
 analytic_engine/src/row_iter/chain.rs | 373 ++
 analytic_engine/src/row_iter/dedup.rs | 243 +
 analytic_engine/src/row_iter/merge.rs | 957 ++++
 analytic_engine/src/row_iter/mod.rs | 87 +
 .../src/row_iter/record_batch_stream.rs | 287 ++
 analytic_engine/src/row_iter/tests.rs | 93 +
 analytic_engine/src/sampler.rs | 448 ++
 analytic_engine/src/setup.rs | 103 +
 analytic_engine/src/space.rs | 305 ++
 analytic_engine/src/sst/builder.rs | 76 +
 analytic_engine/src/sst/factory.rs | 87 +
 analytic_engine/src/sst/file.rs | 699 +++
 analytic_engine/src/sst/manager.rs | 159 +
 analytic_engine/src/sst/mod.rs | 10 +
 analytic_engine/src/sst/parquet/builder.rs | 560 +++
 analytic_engine/src/sst/parquet/encoding.rs | 152 +
 analytic_engine/src/sst/parquet/mod.rs | 7 +
 analytic_engine/src/sst/parquet/reader.rs | 371 ++
 analytic_engine/src/sst/reader.rs | 90 +
 analytic_engine/src/table/data.rs | 713 +++
 analytic_engine/src/table/metrics.rs | 229 +
 analytic_engine/src/table/mod.rs | 270 +
 analytic_engine/src/table/sst_util.rs | 27 +
 analytic_engine/src/table/version.rs | 1096 +++++
 analytic_engine/src/table/version_edit.rs | 176 +
 analytic_engine/src/table_options.rs | 553 +++
 analytic_engine/src/tests/alter_test.rs | 449 ++
 analytic_engine/src/tests/compaction_test.rs | 90 +
 analytic_engine/src/tests/drop_test.rs | 231 +
 analytic_engine/src/tests/mod.rs | 17 +
 analytic_engine/src/tests/open_test.rs | 18 +
 analytic_engine/src/tests/read_write_test.rs | 735 +++
 analytic_engine/src/tests/row_util.rs | 93 +
 analytic_engine/src/tests/table.rs | 331 ++
 analytic_engine/src/tests/util.rs | 404 ++
 arrow_deps/Cargo.toml | 19 +
 arrow_deps/src/display.rs | 428 ++
 arrow_deps/src/lib.rs | 14 +
 arrow_deps/src/util.rs | 133 +
 benchmarks/Cargo.toml | 34 +
 benchmarks/README.md | 25 +
 benchmarks/bench.toml | 45 +
benchmarks/benches/bench.rs | 208 + benchmarks/config/bench.toml | 50 + benchmarks/config/sst.toml | 33 + benchmarks/src/arrow2_bench.rs | 81 + benchmarks/src/bin/sst-tools.rs | 70 + benchmarks/src/config.rs | 123 + benchmarks/src/lib.rs | 17 + benchmarks/src/merge_memtable_bench.rs | 209 + benchmarks/src/merge_sst_bench.rs | 225 + benchmarks/src/parquet_bench.rs | 137 + benchmarks/src/scan_memtable_bench.rs | 111 + benchmarks/src/sst_bench.rs | 123 + benchmarks/src/sst_tools.rs | 257 + benchmarks/src/util.rs | 146 + build.rs | 26 + catalog/Cargo.toml | 15 + catalog/src/consts.rs | 12 + catalog/src/lib.rs | 59 + catalog/src/manager.rs | 32 + catalog/src/schema.rs | 169 + catalog_impls/Cargo.toml | 23 + catalog_impls/src/lib.rs | 52 + catalog_impls/src/memory.rs | 260 + catalog_impls/src/system_tables.rs | 131 + catalog_impls/src/table_based.rs | 1126 +++++ cluster/Cargo.toml | 21 + cluster/src/config.rs | 18 + cluster/src/lib.rs | 263 + cluster/src/table_manager.rs | 163 + cluster/src/util.rs | 0 common_types/Cargo.toml | 25 + common_types/src/bytes.rs | 5 + common_types/src/column.rs | 868 ++++ common_types/src/column_schema.rs | 477 ++ common_types/src/datum.rs | 887 ++++ common_types/src/hash.rs | 39 + common_types/src/lib.rs | 24 + common_types/src/projected_schema.rs | 292 ++ common_types/src/record_batch.rs | 695 +++ common_types/src/request_id.rs | 43 + common_types/src/row/contiguous.rs | 501 ++ common_types/src/row/mod.rs | 590 +++ common_types/src/schema.rs | 1554 ++++++ common_types/src/string.rs | 107 + common_types/src/tests.rs | 139 + common_types/src/time.rs | 363 ++ common_util/Cargo.toml | 44 + common_util/src/alloc_tracker.rs | 159 + common_util/src/codec/compact/bytes.rs | 130 + common_util/src/codec/compact/datum.rs | 264 + common_util/src/codec/compact/float.rs | 101 + common_util/src/codec/compact/mod.rs | 92 + common_util/src/codec/compact/number.rs | 160 + common_util/src/codec/consts.rs | 21 + common_util/src/codec/memcomparable/bytes.rs | 279 ++ common_util/src/codec/memcomparable/datum.rs | 290 ++ common_util/src/codec/memcomparable/mod.rs | 98 + common_util/src/codec/memcomparable/number.rs | 333 ++ common_util/src/codec/mod.rs | 42 + common_util/src/codec/row.rs | 234 + common_util/src/codec/varint.rs | 209 + common_util/src/config.rs | 711 +++ common_util/src/lib.rs | 31 + common_util/src/macros.rs | 25 + common_util/src/metric.rs | 267 + common_util/src/panic.rs | 159 + common_util/src/runtime/metrics.rs | 57 + common_util/src/runtime/mod.rs | 277 ++ common_util/src/time.rs | 68 + common_util/src/toml.rs | 104 + components/arena/Cargo.toml | 10 + components/arena/src/arena_trait.rs | 70 + components/arena/src/fixed_size.rs | 107 + components/arena/src/lib.rs | 11 + components/arena/src/mono_inc.rs | 347 ++ components/bytes/Cargo.toml | 10 + components/bytes/src/lib.rs | 368 ++ components/logger/Cargo.toml | 21 + components/logger/src/lib.rs | 422 ++ components/object_store/Cargo.toml | 21 + components/object_store/src/disk.rs | 389 ++ components/object_store/src/lib.rs | 329 ++ components/object_store/src/path/file.rs | 518 ++ components/object_store/src/path/mod.rs | 35 + components/object_store/src/path/parsed.rs | 389 ++ components/object_store/src/path/parts.rs | 142 + components/parquet/Cargo.toml | 12 + components/parquet/src/cache.rs | 67 + components/parquet/src/lib.rs | 17 + components/parquet/src/reverse_reader.rs | 231 + components/parquet/src/serialized_reader.rs | 738 +++ components/parquet/src/tests.rs | 118 + components/profile/Cargo.toml | 16 
+ components/profile/src/lib.rs | 142 + .../rust-hyperloglog/.github/dependabot.yml | 10 + components/rust-hyperloglog/.gitignore | 7 + components/rust-hyperloglog/.travis.yml | 6 + components/rust-hyperloglog/Cargo.toml | 20 + components/rust-hyperloglog/LICENSE | 23 + components/rust-hyperloglog/README.md | 27 + components/rust-hyperloglog/THANKS | 3 + .../rust-hyperloglog/src/hyperloglog/lib.rs | 4264 ++++++++++++++++ components/skiplist/Cargo.toml | 21 + components/skiplist/benches/bench.rs | 181 + components/skiplist/src/key.rs | 55 + components/skiplist/src/lib.rs | 21 + components/skiplist/src/list.rs | 698 +++ components/skiplist/src/slice.rs | 74 + components/skiplist/tests/tests.rs | 261 + components/tracing/Cargo.toml | 9 + components/tracing/src/lib.rs | 5 + components/tracing_examples/Cargo.toml | 10 + .../examples/init_tracing_with_file.rs | 41 + components/tracing_util/Cargo.toml | 13 + components/tracing_util/src/lib.rs | 22 + components/tracing_util/src/logging.rs | 147 + configs/ceresdb.toml | 23 + docker/entrypoint.py | 88 + docker/supervisor/conf.d/ceresdb.conf | 17 + docker/supervisor/supervisord.conf | 24 + docker/tini | Bin 0 -> 24064 bytes docs/crate-deps.dot | 93 + docs/crate-deps.svg | 433 ++ docs/example.toml | 20 + etc/license.template | 1 + grpcio/Cargo.toml | 16 + grpcio/src/lib.rs | 3 + interpreters/Cargo.toml | 27 + interpreters/src/alter_table.rs | 132 + interpreters/src/context.rs | 79 + interpreters/src/create.rs | 137 + interpreters/src/describe.rs | 89 + interpreters/src/drop.rs | 126 + interpreters/src/exists.rs | 62 + interpreters/src/factory.rs | 49 + interpreters/src/insert.rs | 138 + interpreters/src/interpreter.rs | 56 + interpreters/src/lib.rs | 23 + interpreters/src/select.rs | 75 + interpreters/src/show_create.rs | 136 + interpreters/src/tests.rs | 236 + meta_client/Cargo.toml | 26 + meta_client/src/lib.rs | 705 +++ meta_client/src/load_balance.rs | 65 + meta_client/src/static_client.rs | 86 + meta_client_v2/Cargo.toml | 26 + meta_client_v2/src/lib.rs | 676 +++ meta_client_v2/src/load_balance.rs | 65 + meta_client_v2/src/types.rs | 458 ++ proto/.gitignore | 1 + proto/Cargo.toml | 14 + proto/build.rs | 11 + proto/protos/analytic_common.proto | 62 + proto/protos/common.proto | 63 + proto/protos/meta_update.proto | 101 + proto/protos/sst.proto | 21 + proto/protos/sys_catalog.proto | 55 + proto/protos/table_requests.proto | 19 + proto/src/lib.rs | 10 + query_engine/Cargo.toml | 20 + query_engine/src/context.rs | 121 + .../src/df_execution_extension/mod.rs | 4 + .../src/df_execution_extension/prom_align.rs | 931 ++++ query_engine/src/df_planner_extension/mod.rs | 40 + .../src/df_planner_extension/prom_align.rs | 53 + .../table_scan_by_primary_key.rs | 141 + query_engine/src/executor.rs | 138 + query_engine/src/lib.rs | 19 + query_engine/src/logical_optimizer/mod.rs | 61 + .../logical_optimizer/order_by_primary_key.rs | 413 ++ query_engine/src/logical_optimizer/tests.rs | 159 + .../src/logical_optimizer/type_conversion.rs | 506 ++ .../physical_optimizer/coalesce_batches.rs | 70 + query_engine/src/physical_optimizer/mod.rs | 87 + .../src/physical_optimizer/repartition.rs | 59 + query_engine/src/physical_plan.rs | 101 + rust-toolchain | 1 + rustfmt.toml | 14 + server/Cargo.toml | 44 + server/src/avro_util.rs | 166 + server/src/config.rs | 88 + server/src/consts.rs | 8 + server/src/context.rs | 81 + server/src/error.rs | 67 + server/src/grpc/metrics.rs | 42 + server/src/grpc/mod.rs | 1034 ++++ server/src/grpc/prom_query.rs | 467 ++ 
server/src/grpc/query.rs | 224 + server/src/grpc/route.rs | 35 + server/src/grpc/write.rs | 586 +++ server/src/handlers/admin.rs | 71 + server/src/handlers/error.rs | 52 + server/src/handlers/mod.rs | 21 + server/src/handlers/sql.rs | 148 + server/src/http.rs | 341 ++ server/src/instance.rs | 26 + server/src/lib.rs | 25 + server/src/limiter.rs | 194 + server/src/logger.rs | 32 + server/src/metrics.rs | 19 + server/src/router.rs | 196 + server/src/server.rs | 180 + server/src/table_engine.rs | 97 + sql/Cargo.toml | 29 + sql/src/ast.rs | 80 + sql/src/container.rs | 175 + sql/src/frontend.rs | 108 + sql/src/lib.rs | 19 + sql/src/parser.rs | 814 +++ sql/src/plan.rs | 158 + sql/src/planner.rs | 1277 +++++ sql/src/promql.rs | 10 + sql/src/promql/convert.rs | 673 +++ sql/src/promql/datafusion_util.rs | 105 + sql/src/promql/pushdown.rs | 50 + sql/src/promql/udf.rs | 300 ++ sql/src/provider.rs | 345 ++ sql/src/tests.rs | 69 + src/bin/ceresdb-server.rs | 83 + src/docs/config.toml | 27 + src/lib.rs | 6 + src/setup.rs | 127 + src/signal_handler.rs | 31 + system_catalog/Cargo.toml | 22 + system_catalog/src/lib.rs | 168 + system_catalog/src/sys_catalog_table.rs | 1017 ++++ system_catalog/src/tables.rs | 179 + table_engine/Cargo.toml | 23 + table_engine/src/engine.rs | 261 + table_engine/src/lib.rs | 20 + table_engine/src/memory.rs | 252 + table_engine/src/partition/expression.rs | 71 + table_engine/src/partition/mod.rs | 27 + table_engine/src/partition/rule.rs | 108 + .../src/predicate/filter_record_batch.rs | 249 + table_engine/src/predicate/mod.rs | 540 ++ table_engine/src/provider.rs | 275 ++ table_engine/src/stream.rs | 128 + table_engine/src/table.rs | 608 +++ udf/Cargo.toml | 16 + udf/src/aggregate.rs | 164 + udf/src/functions.rs | 326 ++ udf/src/lib.rs | 10 + udf/src/registry.rs | 92 + udf/src/scalar.rs | 39 + udf/src/udaf.rs | 45 + udf/src/udfs/mod.rs | 16 + udf/src/udfs/thetasketch_distinct.rs | 166 + udf/src/udfs/time_bucket.rs | 324 ++ wal/Cargo.toml | 24 + wal/src/lib.rs | 10 + wal/src/log_batch.rs | 89 + wal/src/manager.rs | 237 + wal/src/rocks_impl/encoding.rs | 533 ++ wal/src/rocks_impl/manager.rs | 621 +++ wal/src/rocks_impl/mod.rs | 6 + wal/src/tests/mod.rs | 6 + wal/src/tests/read_write.rs | 449 ++ wal/src/tests/util.rs | 158 + 339 files changed, 75050 insertions(+) create mode 100644 .gitignore create mode 100644 Cargo.lock create mode 100644 Cargo.toml create mode 100644 Dockerfile create mode 100644 LICENSE create mode 100644 Makefile create mode 100644 README.md create mode 100644 analytic_engine/Cargo.toml create mode 100644 analytic_engine/src/compaction/metrics.rs create mode 100644 analytic_engine/src/compaction/mod.rs create mode 100644 analytic_engine/src/compaction/picker.rs create mode 100644 analytic_engine/src/compaction/scheduler.rs create mode 100644 analytic_engine/src/context.rs create mode 100644 analytic_engine/src/engine.rs create mode 100644 analytic_engine/src/instance/alter.rs create mode 100644 analytic_engine/src/instance/close.rs create mode 100644 analytic_engine/src/instance/create.rs create mode 100644 analytic_engine/src/instance/drop.rs create mode 100644 analytic_engine/src/instance/engine.rs create mode 100644 analytic_engine/src/instance/flush_compaction.rs create mode 100644 analytic_engine/src/instance/mem_collector.rs create mode 100644 analytic_engine/src/instance/mod.rs create mode 100644 analytic_engine/src/instance/open.rs create mode 100644 analytic_engine/src/instance/read.rs create mode 100644 analytic_engine/src/instance/write.rs create 
mode 100644 analytic_engine/src/instance/write_worker.rs create mode 100644 analytic_engine/src/lib.rs create mode 100644 analytic_engine/src/memtable/factory.rs create mode 100644 analytic_engine/src/memtable/key.rs create mode 100644 analytic_engine/src/memtable/mod.rs create mode 100644 analytic_engine/src/memtable/skiplist/factory.rs create mode 100644 analytic_engine/src/memtable/skiplist/iter.rs create mode 100644 analytic_engine/src/memtable/skiplist/mod.rs create mode 100644 analytic_engine/src/meta/details.rs create mode 100644 analytic_engine/src/meta/meta_data.rs create mode 100644 analytic_engine/src/meta/meta_update.rs create mode 100644 analytic_engine/src/meta/mod.rs create mode 100644 analytic_engine/src/payload.rs create mode 100644 analytic_engine/src/row_iter/chain.rs create mode 100644 analytic_engine/src/row_iter/dedup.rs create mode 100644 analytic_engine/src/row_iter/merge.rs create mode 100644 analytic_engine/src/row_iter/mod.rs create mode 100644 analytic_engine/src/row_iter/record_batch_stream.rs create mode 100644 analytic_engine/src/row_iter/tests.rs create mode 100644 analytic_engine/src/sampler.rs create mode 100644 analytic_engine/src/setup.rs create mode 100644 analytic_engine/src/space.rs create mode 100644 analytic_engine/src/sst/builder.rs create mode 100644 analytic_engine/src/sst/factory.rs create mode 100644 analytic_engine/src/sst/file.rs create mode 100644 analytic_engine/src/sst/manager.rs create mode 100644 analytic_engine/src/sst/mod.rs create mode 100644 analytic_engine/src/sst/parquet/builder.rs create mode 100644 analytic_engine/src/sst/parquet/encoding.rs create mode 100644 analytic_engine/src/sst/parquet/mod.rs create mode 100644 analytic_engine/src/sst/parquet/reader.rs create mode 100644 analytic_engine/src/sst/reader.rs create mode 100644 analytic_engine/src/table/data.rs create mode 100644 analytic_engine/src/table/metrics.rs create mode 100644 analytic_engine/src/table/mod.rs create mode 100644 analytic_engine/src/table/sst_util.rs create mode 100644 analytic_engine/src/table/version.rs create mode 100644 analytic_engine/src/table/version_edit.rs create mode 100644 analytic_engine/src/table_options.rs create mode 100644 analytic_engine/src/tests/alter_test.rs create mode 100644 analytic_engine/src/tests/compaction_test.rs create mode 100644 analytic_engine/src/tests/drop_test.rs create mode 100644 analytic_engine/src/tests/mod.rs create mode 100644 analytic_engine/src/tests/open_test.rs create mode 100644 analytic_engine/src/tests/read_write_test.rs create mode 100644 analytic_engine/src/tests/row_util.rs create mode 100644 analytic_engine/src/tests/table.rs create mode 100644 analytic_engine/src/tests/util.rs create mode 100644 arrow_deps/Cargo.toml create mode 100644 arrow_deps/src/display.rs create mode 100644 arrow_deps/src/lib.rs create mode 100644 arrow_deps/src/util.rs create mode 100644 benchmarks/Cargo.toml create mode 100644 benchmarks/README.md create mode 100644 benchmarks/bench.toml create mode 100644 benchmarks/benches/bench.rs create mode 100644 benchmarks/config/bench.toml create mode 100644 benchmarks/config/sst.toml create mode 100644 benchmarks/src/arrow2_bench.rs create mode 100644 benchmarks/src/bin/sst-tools.rs create mode 100644 benchmarks/src/config.rs create mode 100644 benchmarks/src/lib.rs create mode 100644 benchmarks/src/merge_memtable_bench.rs create mode 100644 benchmarks/src/merge_sst_bench.rs create mode 100644 benchmarks/src/parquet_bench.rs create mode 100644 benchmarks/src/scan_memtable_bench.rs create 
mode 100644 benchmarks/src/sst_bench.rs create mode 100644 benchmarks/src/sst_tools.rs create mode 100644 benchmarks/src/util.rs create mode 100644 build.rs create mode 100644 catalog/Cargo.toml create mode 100644 catalog/src/consts.rs create mode 100644 catalog/src/lib.rs create mode 100644 catalog/src/manager.rs create mode 100644 catalog/src/schema.rs create mode 100644 catalog_impls/Cargo.toml create mode 100644 catalog_impls/src/lib.rs create mode 100644 catalog_impls/src/memory.rs create mode 100644 catalog_impls/src/system_tables.rs create mode 100644 catalog_impls/src/table_based.rs create mode 100644 cluster/Cargo.toml create mode 100644 cluster/src/config.rs create mode 100644 cluster/src/lib.rs create mode 100644 cluster/src/table_manager.rs create mode 100644 cluster/src/util.rs create mode 100644 common_types/Cargo.toml create mode 100644 common_types/src/bytes.rs create mode 100644 common_types/src/column.rs create mode 100644 common_types/src/column_schema.rs create mode 100644 common_types/src/datum.rs create mode 100644 common_types/src/hash.rs create mode 100644 common_types/src/lib.rs create mode 100644 common_types/src/projected_schema.rs create mode 100644 common_types/src/record_batch.rs create mode 100644 common_types/src/request_id.rs create mode 100644 common_types/src/row/contiguous.rs create mode 100644 common_types/src/row/mod.rs create mode 100644 common_types/src/schema.rs create mode 100644 common_types/src/string.rs create mode 100644 common_types/src/tests.rs create mode 100644 common_types/src/time.rs create mode 100644 common_util/Cargo.toml create mode 100644 common_util/src/alloc_tracker.rs create mode 100644 common_util/src/codec/compact/bytes.rs create mode 100644 common_util/src/codec/compact/datum.rs create mode 100644 common_util/src/codec/compact/float.rs create mode 100644 common_util/src/codec/compact/mod.rs create mode 100644 common_util/src/codec/compact/number.rs create mode 100644 common_util/src/codec/consts.rs create mode 100644 common_util/src/codec/memcomparable/bytes.rs create mode 100644 common_util/src/codec/memcomparable/datum.rs create mode 100644 common_util/src/codec/memcomparable/mod.rs create mode 100644 common_util/src/codec/memcomparable/number.rs create mode 100644 common_util/src/codec/mod.rs create mode 100644 common_util/src/codec/row.rs create mode 100644 common_util/src/codec/varint.rs create mode 100644 common_util/src/config.rs create mode 100644 common_util/src/lib.rs create mode 100644 common_util/src/macros.rs create mode 100644 common_util/src/metric.rs create mode 100644 common_util/src/panic.rs create mode 100644 common_util/src/runtime/metrics.rs create mode 100644 common_util/src/runtime/mod.rs create mode 100644 common_util/src/time.rs create mode 100644 common_util/src/toml.rs create mode 100644 components/arena/Cargo.toml create mode 100644 components/arena/src/arena_trait.rs create mode 100644 components/arena/src/fixed_size.rs create mode 100644 components/arena/src/lib.rs create mode 100644 components/arena/src/mono_inc.rs create mode 100644 components/bytes/Cargo.toml create mode 100644 components/bytes/src/lib.rs create mode 100644 components/logger/Cargo.toml create mode 100644 components/logger/src/lib.rs create mode 100644 components/object_store/Cargo.toml create mode 100644 components/object_store/src/disk.rs create mode 100644 components/object_store/src/lib.rs create mode 100644 components/object_store/src/path/file.rs create mode 100644 components/object_store/src/path/mod.rs create mode 100644 
components/object_store/src/path/parsed.rs create mode 100644 components/object_store/src/path/parts.rs create mode 100644 components/parquet/Cargo.toml create mode 100644 components/parquet/src/cache.rs create mode 100644 components/parquet/src/lib.rs create mode 100644 components/parquet/src/reverse_reader.rs create mode 100644 components/parquet/src/serialized_reader.rs create mode 100644 components/parquet/src/tests.rs create mode 100644 components/profile/Cargo.toml create mode 100644 components/profile/src/lib.rs create mode 100644 components/rust-hyperloglog/.github/dependabot.yml create mode 100644 components/rust-hyperloglog/.gitignore create mode 100644 components/rust-hyperloglog/.travis.yml create mode 100644 components/rust-hyperloglog/Cargo.toml create mode 100644 components/rust-hyperloglog/LICENSE create mode 100644 components/rust-hyperloglog/README.md create mode 100644 components/rust-hyperloglog/THANKS create mode 100644 components/rust-hyperloglog/src/hyperloglog/lib.rs create mode 100644 components/skiplist/Cargo.toml create mode 100644 components/skiplist/benches/bench.rs create mode 100644 components/skiplist/src/key.rs create mode 100644 components/skiplist/src/lib.rs create mode 100644 components/skiplist/src/list.rs create mode 100644 components/skiplist/src/slice.rs create mode 100644 components/skiplist/tests/tests.rs create mode 100644 components/tracing/Cargo.toml create mode 100644 components/tracing/src/lib.rs create mode 100644 components/tracing_examples/Cargo.toml create mode 100644 components/tracing_examples/examples/init_tracing_with_file.rs create mode 100644 components/tracing_util/Cargo.toml create mode 100644 components/tracing_util/src/lib.rs create mode 100644 components/tracing_util/src/logging.rs create mode 100644 configs/ceresdb.toml create mode 100755 docker/entrypoint.py create mode 100644 docker/supervisor/conf.d/ceresdb.conf create mode 100644 docker/supervisor/supervisord.conf create mode 100644 docker/tini create mode 100644 docs/crate-deps.dot create mode 100644 docs/crate-deps.svg create mode 100644 docs/example.toml create mode 100644 etc/license.template create mode 100644 grpcio/Cargo.toml create mode 100644 grpcio/src/lib.rs create mode 100644 interpreters/Cargo.toml create mode 100644 interpreters/src/alter_table.rs create mode 100644 interpreters/src/context.rs create mode 100644 interpreters/src/create.rs create mode 100644 interpreters/src/describe.rs create mode 100644 interpreters/src/drop.rs create mode 100644 interpreters/src/exists.rs create mode 100644 interpreters/src/factory.rs create mode 100644 interpreters/src/insert.rs create mode 100644 interpreters/src/interpreter.rs create mode 100644 interpreters/src/lib.rs create mode 100644 interpreters/src/select.rs create mode 100644 interpreters/src/show_create.rs create mode 100644 interpreters/src/tests.rs create mode 100644 meta_client/Cargo.toml create mode 100644 meta_client/src/lib.rs create mode 100644 meta_client/src/load_balance.rs create mode 100644 meta_client/src/static_client.rs create mode 100644 meta_client_v2/Cargo.toml create mode 100644 meta_client_v2/src/lib.rs create mode 100644 meta_client_v2/src/load_balance.rs create mode 100644 meta_client_v2/src/types.rs create mode 100644 proto/.gitignore create mode 100644 proto/Cargo.toml create mode 100644 proto/build.rs create mode 100644 proto/protos/analytic_common.proto create mode 100644 proto/protos/common.proto create mode 100644 proto/protos/meta_update.proto create mode 100644 proto/protos/sst.proto 
create mode 100644 proto/protos/sys_catalog.proto create mode 100644 proto/protos/table_requests.proto create mode 100644 proto/src/lib.rs create mode 100644 query_engine/Cargo.toml create mode 100644 query_engine/src/context.rs create mode 100644 query_engine/src/df_execution_extension/mod.rs create mode 100644 query_engine/src/df_execution_extension/prom_align.rs create mode 100644 query_engine/src/df_planner_extension/mod.rs create mode 100644 query_engine/src/df_planner_extension/prom_align.rs create mode 100644 query_engine/src/df_planner_extension/table_scan_by_primary_key.rs create mode 100644 query_engine/src/executor.rs create mode 100644 query_engine/src/lib.rs create mode 100644 query_engine/src/logical_optimizer/mod.rs create mode 100644 query_engine/src/logical_optimizer/order_by_primary_key.rs create mode 100644 query_engine/src/logical_optimizer/tests.rs create mode 100644 query_engine/src/logical_optimizer/type_conversion.rs create mode 100644 query_engine/src/physical_optimizer/coalesce_batches.rs create mode 100644 query_engine/src/physical_optimizer/mod.rs create mode 100644 query_engine/src/physical_optimizer/repartition.rs create mode 100644 query_engine/src/physical_plan.rs create mode 100644 rust-toolchain create mode 100644 rustfmt.toml create mode 100644 server/Cargo.toml create mode 100644 server/src/avro_util.rs create mode 100644 server/src/config.rs create mode 100644 server/src/consts.rs create mode 100644 server/src/context.rs create mode 100644 server/src/error.rs create mode 100644 server/src/grpc/metrics.rs create mode 100644 server/src/grpc/mod.rs create mode 100644 server/src/grpc/prom_query.rs create mode 100644 server/src/grpc/query.rs create mode 100644 server/src/grpc/route.rs create mode 100644 server/src/grpc/write.rs create mode 100644 server/src/handlers/admin.rs create mode 100644 server/src/handlers/error.rs create mode 100644 server/src/handlers/mod.rs create mode 100644 server/src/handlers/sql.rs create mode 100644 server/src/http.rs create mode 100644 server/src/instance.rs create mode 100644 server/src/lib.rs create mode 100644 server/src/limiter.rs create mode 100644 server/src/logger.rs create mode 100644 server/src/metrics.rs create mode 100644 server/src/router.rs create mode 100644 server/src/server.rs create mode 100644 server/src/table_engine.rs create mode 100644 sql/Cargo.toml create mode 100644 sql/src/ast.rs create mode 100644 sql/src/container.rs create mode 100644 sql/src/frontend.rs create mode 100644 sql/src/lib.rs create mode 100644 sql/src/parser.rs create mode 100644 sql/src/plan.rs create mode 100644 sql/src/planner.rs create mode 100644 sql/src/promql.rs create mode 100644 sql/src/promql/convert.rs create mode 100644 sql/src/promql/datafusion_util.rs create mode 100644 sql/src/promql/pushdown.rs create mode 100644 sql/src/promql/udf.rs create mode 100644 sql/src/provider.rs create mode 100644 sql/src/tests.rs create mode 100644 src/bin/ceresdb-server.rs create mode 100644 src/docs/config.toml create mode 100644 src/lib.rs create mode 100644 src/setup.rs create mode 100644 src/signal_handler.rs create mode 100644 system_catalog/Cargo.toml create mode 100644 system_catalog/src/lib.rs create mode 100644 system_catalog/src/sys_catalog_table.rs create mode 100644 system_catalog/src/tables.rs create mode 100644 table_engine/Cargo.toml create mode 100644 table_engine/src/engine.rs create mode 100644 table_engine/src/lib.rs create mode 100644 table_engine/src/memory.rs create mode 100644 table_engine/src/partition/expression.rs 
create mode 100644 table_engine/src/partition/mod.rs create mode 100644 table_engine/src/partition/rule.rs create mode 100644 table_engine/src/predicate/filter_record_batch.rs create mode 100644 table_engine/src/predicate/mod.rs create mode 100644 table_engine/src/provider.rs create mode 100644 table_engine/src/stream.rs create mode 100644 table_engine/src/table.rs create mode 100644 udf/Cargo.toml create mode 100644 udf/src/aggregate.rs create mode 100644 udf/src/functions.rs create mode 100644 udf/src/lib.rs create mode 100644 udf/src/registry.rs create mode 100644 udf/src/scalar.rs create mode 100644 udf/src/udaf.rs create mode 100644 udf/src/udfs/mod.rs create mode 100644 udf/src/udfs/thetasketch_distinct.rs create mode 100644 udf/src/udfs/time_bucket.rs create mode 100644 wal/Cargo.toml create mode 100644 wal/src/lib.rs create mode 100644 wal/src/log_batch.rs create mode 100644 wal/src/manager.rs create mode 100644 wal/src/rocks_impl/encoding.rs create mode 100644 wal/src/rocks_impl/manager.rs create mode 100644 wal/src/rocks_impl/mod.rs create mode 100644 wal/src/tests/mod.rs create mode 100644 wal/src/tests/read_write.rs create mode 100644 wal/src/tests/util.rs diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000..7f220c5e8a --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +target +.DS_Store +.idea/ +.vscode diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000000..6958411537 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,4363 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "addr2line" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e61f2b7f93d2c7d2b08263acaa4a363b3e276806c68af6134c44f523bf1aacd" +dependencies = [ + "gimli", +] + +[[package]] +name = "adler" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" + +[[package]] +name = "adler32" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aae1277d39aeec15cb388266ecc24b11c80469deae6067e17a1a7aa9e5c1f234" + +[[package]] +name = "ahash" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8fd72866655d1904d6b0997d0b07ba561047d070fbe29de039031c641b61217" +dependencies = [ + "const-random", +] + +[[package]] +name = "ahash" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43bb833f0bf979d8475d38fbf09ed3b8a55e1885fe93ad3f93239fc6a4f17b98" +dependencies = [ + "getrandom 0.2.3", + "once_cell", + "version_check", +] + +[[package]] +name = "aho-corasick" +version = "0.7.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f" +dependencies = [ + "memchr", +] + +[[package]] +name = "alloc-no-stdlib" +version = "2.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35ef4730490ad1c4eae5c4325b2a95f521d023e5c885853ff7aca0a6a1631db3" + +[[package]] +name = "alloc-stdlib" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "697ed7edc0f1711de49ce108c541623a0af97c6c60b2f6e2b65229847ac843c2" +dependencies = [ + "alloc-no-stdlib", +] + +[[package]] +name = "analytic_engine" +version = "0.1.0" +dependencies = [ + "arc-swap 1.4.0", + "arena", + "arrow_deps", + 
"async-trait", + "base64", + "common_types", + "common_util", + "env_logger", + "futures", + "lazy_static", + "log", + "object_store", + "parquet 0.1.0", + "prometheus 0.12.0", + "proto", + "protobuf", + "serde", + "serde_derive", + "skiplist", + "smallvec", + "snafu", + "table_engine", + "tempfile", + "tokio", + "wal", +] + +[[package]] +name = "ansi_term" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee49baf6cb617b853aa8d93bf420db2383fab46d314482ca2803b40d5fde979b" +dependencies = [ + "winapi", +] + +[[package]] +name = "ansi_term" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2" +dependencies = [ + "winapi", +] + +[[package]] +name = "anyhow" +version = "1.0.43" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28ae2b3dec75a406790005a200b1bd89785afc02517a00ca99ecfe093ee9e6cf" + +[[package]] +name = "arc-swap" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc4662175ead9cd84451d5c35070517777949a2ed84551764129cedb88384841" + +[[package]] +name = "arc-swap" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6df5aef5c5830360ce5218cecb8f018af3438af5686ae945094affc86fdec63" + +[[package]] +name = "arena" +version = "0.1.0" +dependencies = [ + "parking_lot", +] + +[[package]] +name = "arrayref" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4c527152e37cf757a3f78aae5a06fbeefdb07ccc535c980a3208ee3060dd544" + +[[package]] +name = "arrayvec" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8da52d66c7071e2e3fa2a1e5c6d088fec47b593032b254f5e980de8ea54454d6" + +[[package]] +name = "arrow" +version = "7.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "66ec0a5964feebf378e2fc6db9530e712657b8edf72aa17b1b277b0f52a48e2d" +dependencies = [ + "bitflags", + "chrono", + "comfy-table", + "csv", + "flatbuffers", + "half", + "hex", + "indexmap", + "lazy_static", + "lexical-core", + "multiversion", + "num", + "rand 0.8.4", + "regex", + "serde", + "serde_derive", + "serde_json", +] + +[[package]] +name = "arrow-format" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f7da2d9660bfaebbdb0a44a33b3bd1dcb5a952fafa02c0dfc6a51ea471fef2a" +dependencies = [ + "flatbuffers", +] + +[[package]] +name = "arrow2" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d873e2775c3d87a4e8d77aa544cbd43f34a0779d5164c59e7c6a1dd0678eb395" +dependencies = [ + "arrow-format", + "base64", + "chrono", + "futures", + "hash_hasher", + "num-traits", + "parquet2", + "simdutf8", +] + +[[package]] +name = "arrow_deps" +version = "0.1.0" +dependencies = [ + "arrow", + "datafusion", + "parquet 7.0.0", + "uncover", +] + +[[package]] +name = "async-stream" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "171374e7e3b2504e0e5236e3b59260560f9fe94bfe9ac39ba5e4e929c5590625" +dependencies = [ + "async-stream-impl", + "futures-core", +] + +[[package]] +name = "async-stream-impl" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "648ed8c8d2ce5409ccd57453d9d1b214b342a0d69376a6feda1fd6cae3299308" +dependencies = [ + "proc-macro2", + "quote", + "syn", 
+] + +[[package]] +name = "async-trait" +version = "0.1.51" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44318e776df68115a881de9a8fd1b9e53368d7a4a5ce4cc48517da3393233a5e" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "atty" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" +dependencies = [ + "hermit-abi", + "libc", + "winapi", +] + +[[package]] +name = "autocfg" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a" + +[[package]] +name = "avro-rs" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ece550dd6710221de9bcdc1697424d8eee4fc4ca7e017479ea9d50c348465e37" +dependencies = [ + "byteorder", + "digest 0.9.0", + "lazy_static", + "libflate", + "num-bigint 0.2.6", + "rand 0.7.3", + "serde", + "serde_json", + "strum 0.18.0", + "strum_macros 0.18.0", + "thiserror", + "typed-builder", + "uuid", + "zerocopy", +] + +[[package]] +name = "backtrace" +version = "0.3.61" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7a905d892734eea339e896738c14b9afce22b5318f64b951e70bf3844419b01" +dependencies = [ + "addr2line", + "cc", + "cfg-if 1.0.0", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", +] + +[[package]] +name = "base64" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "904dfeac50f3cdaba28fc6f57fdcddb75f49ed61346676a78c4ffe55877802fd" + +[[package]] +name = "benchmarks" +version = "0.1.0" +dependencies = [ + "analytic_engine", + "arena", + "arrow2", + "arrow_deps", + "clap", + "common_types", + "common_util", + "criterion", + "env_logger", + "futures", + "log", + "object_store", + "parquet 0.1.0", + "serde", + "serde_derive", + "table_engine", + "tokio", +] + +[[package]] +name = "bindgen" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd4865004a46a0aafb2a0a5eb19d3c9fc46ee5f063a6cfc605c69ac9ecf5263d" +dependencies = [ + "bitflags", + "cexpr", + "clang-sys", + "lazy_static", + "lazycell", + "peeking_take_while", + "proc-macro2", + "quote", + "regex", + "rustc-hash", + "shlex", +] + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bitpacking" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8c7d2ac73c167c06af4a5f37e6e59d84148d57ccbe4480b76f0273eefea82d7" +dependencies = [ + "crunchy", +] + +[[package]] +name = "blake2" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a4e37d16930f5459780f5621038b6382b9bb37c19016f39fb6b5808d831f174" +dependencies = [ + "crypto-mac", + "digest 0.9.0", + "opaque-debug", +] + +[[package]] +name = "blake3" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "882e99e4a0cb2ae6cb6e442102e8e6b7131718d94110e64c3e6a34ea9b106f37" +dependencies = [ + "arrayref", + "arrayvec", + "cc", + "cfg-if 1.0.0", + "constant_time_eq", + "digest 0.10.1", +] + +[[package]] +name = "block-buffer" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"4152116fd6e9dadb291ae18fc1ec3575ed6d84c29642d97890f4b4a3417297e4" +dependencies = [ + "generic-array", +] + +[[package]] +name = "block-buffer" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1d36a02058e76b040de25a4464ba1c80935655595b661505c8b39b664828b95" +dependencies = [ + "generic-array", +] + +[[package]] +name = "boringssl-src" +version = "0.3.0+688fc5c" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f901accdf830d2ea2f4e27f923a5e1125cd8b1a39ab578b9db1a42d578a6922b" +dependencies = [ + "cmake", +] + +[[package]] +name = "brotli" +version = "3.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "71cb90ade945043d3d53597b2fc359bb063db8ade2bcffe7997351d0756e9d50" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", + "brotli-decompressor", +] + +[[package]] +name = "brotli-decompressor" +version = "2.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59ad2d4653bf5ca36ae797b1f4bb4dbddb60ce49ca4aed8a2ce4829f60425b80" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", +] + +[[package]] +name = "bstr" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90682c8d613ad3373e66de8c6411e0ae2ab2571e879d2efbf73558cc66f21279" +dependencies = [ + "lazy_static", + "memchr", + "regex-automata", + "serde", +] + +[[package]] +name = "buf_redux" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b953a6887648bb07a535631f2bc00fbdb2a2216f135552cb3f534ed136b9c07f" +dependencies = [ + "memchr", + "safemem", +] + +[[package]] +name = "bumpalo" +version = "3.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c59e7af012c713f529e7a3ee57ce9b31ddd858d4b512923602f74608b009631" + +[[package]] +name = "bytecount" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72feb31ffc86498dacdbd0fcebb56138e7177a8cc5cea4516031d15ae85a742e" + +[[package]] +name = "byteorder" +version = "1.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" + +[[package]] +name = "bytes" +version = "0.1.0" +dependencies = [ + "bytes 1.1.0", + "snafu", +] + +[[package]] +name = "bytes" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4872d67bab6358e59559027aa3b9157c53d9358c51423c17554809a8858e0f8" + +[[package]] +name = "bzip2-sys" +version = "0.1.11+1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + +[[package]] +name = "cast" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c24dab4283a142afa2fdca129b80ad2c6284e073930f964c3a1293c225ee39a" +dependencies = [ + "rustc_version", +] + +[[package]] +name = "catalog" +version = "0.1.0" +dependencies = [ + "async-trait", + "common_types", + "common_util", + "snafu", + "table_engine", +] + +[[package]] +name = "catalog_impls" +version = "0.1.0" +dependencies = [ + "analytic_engine", + "async-trait", + "catalog", + "common_types", + "common_util", + "log", + "server", + "snafu", + "system_catalog", + "table_engine", + "tokio", +] + +[[package]] +name = "cc" +version = "1.0.69" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "e70cc2f62c6ce1868963827bd677764c62d07c3d9a3e1fb1177ee1a9ab199eb2" +dependencies = [ + "jobserver", +] + +[[package]] +name = "ceresdbproto" +version = "0.1.0" +source = "git+https://github.com/CeresDB/ceresdbproto.git#dc8eb387ca66347c2ea9d5b00924ae63e7360be3" +dependencies = [ + "futures", + "grpcio 0.9.1", + "protobuf", + "protobuf-builder", +] + +[[package]] +name = "ceresdbx" +version = "0.1.0" +dependencies = [ + "analytic_engine", + "catalog", + "catalog_impls", + "clap", + "common_util", + "log", + "logger", + "query_engine", + "server", + "signal-hook", + "table_engine", + "tracing_util", + "udf", + "vergen", +] + +[[package]] +name = "cexpr" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4aedb84272dbe89af497cf81375129abda4fc0a9e7c5d317498c15cc30c0d27" +dependencies = [ + "nom", +] + +[[package]] +name = "cfg-if" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "chrono" +version = "0.4.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "670ad68c9088c2a963aaa298cb369688cf3f9465ce5e2d4ca10e6e0098a1ce73" +dependencies = [ + "libc", + "num-integer", + "num-traits", + "time", + "winapi", +] + +[[package]] +name = "clang-sys" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa66045b9cb23c2e9c1520732030608b02ee07e5cfaa5a521ec15ded7fa24c90" +dependencies = [ + "glob", + "libc", + "libloading", +] + +[[package]] +name = "clap" +version = "2.33.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37e58ac78573c40708d45522f0d80fa2f01cc4f9b4e2bf749807255454312002" +dependencies = [ + "ansi_term 0.11.0", + "atty", + "bitflags", + "strsim", + "textwrap", + "unicode-width", + "vec_map", +] + +[[package]] +name = "cmake" +version = "0.1.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb6210b637171dfba4cda12e579ac6dc73f5165ad56133e5d72ef3131f320855" +dependencies = [ + "cc", +] + +[[package]] +name = "comfy-table" +version = "5.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c42350b81f044f576ff88ac750419f914abb46a03831bb1747134344ee7a4e64" +dependencies = [ + "strum 0.22.0", + "strum_macros 0.22.0", + "unicode-width", +] + +[[package]] +name = "common_types" +version = "0.1.0" +dependencies = [ + "arrow_deps", + "byteorder", + "bytes 0.1.0", + "chrono", + "murmur3", + "paste 1.0.5", + "proto", + "serde", + "serde_derive", + "snafu", + "sqlparser", +] + +[[package]] +name = "common_util" +version = "0.1.0" +dependencies = [ + "backtrace", + "chrono", + "common_types", + "crossbeam-utils 0.8.5", + "env_logger", + "gag", + "lazy_static", + "libc", + "log", + "logger", + "nix", + "pin-project-lite", + "prometheus 0.12.0", + "proto", + "serde", + "serde_derive", + "slog", + "slog-global 0.1.0 (git+https://github.com/breezewish/slog-global.git?rev=0e23a5baff302a9d7bccd85f8f31e43339c2f2c1)", + "snafu", + "tempfile", + "time", + "tokio", + "tokio-test", + "toml", +] + +[[package]] +name = "const-random" +version = "0.1.13" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "f590d95d011aa80b063ffe3253422ed5aa462af4e9867d43ce8337562bac77c4" +dependencies = [ + "const-random-macro", + "proc-macro-hack", +] + +[[package]] +name = "const-random-macro" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "615f6e27d000a2bffbc7f2f6a8669179378fa27ee4d0a509e985dfc0a7defb40" +dependencies = [ + "getrandom 0.2.3", + "lazy_static", + "proc-macro-hack", + "tiny-keccak", +] + +[[package]] +name = "constant_time_eq" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "245097e9a4535ee1e3e3931fcfcd55a796a44c643e8596ff6566d68f09b87bbc" + +[[package]] +name = "core-foundation" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a89e2ae426ea83155dccf10c0fa6b1463ef6d5fcb44cee0b224a408fa640a62" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "core-foundation-sys" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea221b5284a47e40033bf9b66f35f984ec0ea2931eb03505246cd27a963f981b" + +[[package]] +name = "cpufeatures" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95059428f66df56b63431fdb4e1947ed2190586af5c5a8a8b71122bdf5a7f469" +dependencies = [ + "libc", +] + +[[package]] +name = "crc32fast" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81156fece84ab6a9f2afdb109ce3ae577e42b1228441eded99bd77f627953b1a" +dependencies = [ + "cfg-if 1.0.0", +] + +[[package]] +name = "criterion" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1604dafd25fba2fe2d5895a9da139f8dc9b319a5fe5354ca137cbbce4e178d10" +dependencies = [ + "atty", + "cast", + "clap", + "criterion-plot", + "csv", + "itertools", + "lazy_static", + "num-traits", + "oorandom", + "plotters", + "rayon", + "regex", + "serde", + "serde_cbor", + "serde_derive", + "serde_json", + "tinytemplate", + "walkdir", +] + +[[package]] +name = "criterion-plot" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d00996de9f2f7559f7f4dc286073197f83e92256a59ed395f9aac01fe717da57" +dependencies = [ + "cast", + "itertools", +] + +[[package]] +name = "crossbeam-channel" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06ed27e177f16d65f0f0c22a213e17c696ace5dd64b14258b52f9417ccb52db4" +dependencies = [ + "cfg-if 1.0.0", + "crossbeam-utils 0.8.5", +] + +[[package]] +name = "crossbeam-deque" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c20ff29ded3204c5106278a81a38f4b482636ed4fa1e6cfbeef193291beb29ed" +dependencies = [ + "crossbeam-epoch 0.8.2", + "crossbeam-utils 0.7.2", + "maybe-uninit", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6455c0ca19f0d2fbf751b908d5c55c1f5cbc65e03c4225427254b46890bdde1e" +dependencies = [ + "cfg-if 1.0.0", + "crossbeam-epoch 0.9.5", + "crossbeam-utils 0.8.5", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "058ed274caafc1f60c4997b5fc07bf7dc7cca454af7c6e81edffe5f33f70dace" +dependencies = [ + "autocfg", + "cfg-if 0.1.10", + "crossbeam-utils 0.7.2", + "lazy_static", + 
"maybe-uninit", + "memoffset 0.5.6", + "scopeguard", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ec02e091aa634e2c3ada4a392989e7c3116673ef0ac5b72232439094d73b7fd" +dependencies = [ + "cfg-if 1.0.0", + "crossbeam-utils 0.8.5", + "lazy_static", + "memoffset 0.6.4", + "scopeguard", +] + +[[package]] +name = "crossbeam-utils" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3c7c73a2d1e9fc0886a08b93e98eb643461230d5f1925e4036204d5f2e261a8" +dependencies = [ + "autocfg", + "cfg-if 0.1.10", + "lazy_static", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d82cfc11ce7f2c3faef78d8a684447b40d503d9681acebed6cb728d45940c4db" +dependencies = [ + "cfg-if 1.0.0", + "lazy_static", +] + +[[package]] +name = "crunchy" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" + +[[package]] +name = "crypto-common" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "683d6b536309245c849479fba3da410962a43ed8e51c26b729208ec0ac2798d0" +dependencies = [ + "generic-array", +] + +[[package]] +name = "crypto-mac" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b584a330336237c1eecd3e94266efb216c56ed91225d634cb2991c5f3fd1aeab" +dependencies = [ + "generic-array", + "subtle", +] + +[[package]] +name = "csv" +version = "1.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1" +dependencies = [ + "bstr", + "csv-core", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "csv-core" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90" +dependencies = [ + "memchr", +] + +[[package]] +name = "dashmap" +version = "3.11.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f260e2fc850179ef410018660006951c1b55b79e8087e87111a2c388994b9b5" +dependencies = [ + "ahash 0.3.8", + "cfg-if 0.1.10", + "num_cpus", +] + +[[package]] +name = "datafusion" +version = "6.0.0" +source = "git+https://github.com/apache/arrow-datafusion.git?rev=444c153863520072ea22d4f8c498dee39437516d#444c153863520072ea22d4f8c498dee39437516d" +dependencies = [ + "ahash 0.7.4", + "arrow", + "async-trait", + "blake2", + "blake3", + "chrono", + "futures", + "hashbrown", + "lazy_static", + "log", + "md-5", + "num_cpus", + "ordered-float 2.10.0", + "parquet 7.0.0", + "paste 1.0.5", + "pin-project-lite", + "rand 0.8.4", + "regex", + "sha2", + "smallvec", + "sqlparser", + "tempfile", + "tokio", + "tokio-stream", + "unicode-segmentation", +] + +[[package]] +name = "digest" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3dd60d1080a57a05ab032377049e0591415d2b31afd7028356dbf3cc6dcb066" +dependencies = [ + "generic-array", +] + +[[package]] +name = "digest" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b697d66081d42af4fba142d56918a3cb21dc8eb63372c6b85d14f44fb9c5979b" +dependencies = [ + "block-buffer 0.10.0", + "crypto-common", + "generic-array", + "subtle", +] + +[[package]] +name = "dirs-next" +version 
= "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b98cf8ebf19c3d1b223e151f99a4f9f0690dca41414773390fc824184ac833e1" +dependencies = [ + "cfg-if 1.0.0", + "dirs-sys-next", +] + +[[package]] +name = "dirs-sys-next" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ebda144c4fe02d1f7ea1a7d9641b6fc6b580adcfa024ae48797ecdeb6825b4d" +dependencies = [ + "libc", + "redox_users", + "winapi", +] + +[[package]] +name = "doc-comment" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" + +[[package]] +name = "either" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" + +[[package]] +name = "encoding_rs" +version = "0.8.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "80df024fbc5ac80f87dfef0d9f5209a252f2a497f7f42944cff24d8253cac065" +dependencies = [ + "cfg-if 1.0.0", +] + +[[package]] +name = "enum-iterator" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4eeac5c5edb79e4e39fe8439ef35207780a11f69c52cbe424ce3dfad4cb78de6" +dependencies = [ + "enum-iterator-derive", +] + +[[package]] +name = "enum-iterator-derive" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c134c37760b27a871ba422106eedbb8247da973a09e82558bf26d619c882b159" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "env_logger" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aafcde04e90a5226a6443b7aabdb016ba2f8307c847d524724bd9b346dd1a2d3" +dependencies = [ + "atty", + "humantime", + "log", + "regex", + "termcolor", +] + +[[package]] +name = "fail" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3be3c61c59fdc91f5dbc3ea31ee8623122ce80057058be560654c5d410d181a6" +dependencies = [ + "lazy_static", + "log", + "rand 0.7.3", +] + +[[package]] +name = "failure" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d32e9bd16cc02eae7db7ef620b392808b89f6a5e16bb3497d159c6b92a0f4f86" +dependencies = [ + "backtrace", + "failure_derive", +] + +[[package]] +name = "failure_derive" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa4da3c766cd7a0db8242e326e9e4e081edd567072893ed320008189715366a4" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + +[[package]] +name = "fallible-streaming-iterator" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a" + +[[package]] +name = "filedescriptor" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed3d8a5e20435ff00469e51a0d82049bae66504b5c429920dadf9bb54d47b3f" +dependencies = [ + "libc", + "thiserror", + "winapi", +] + +[[package]] +name = "flatbuffers" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef4c5738bcd7fad10315029c50026f83c9da5e4a21f8ed66826f43e0e2bde5f6" +dependencies = [ + "bitflags", + "smallvec", + "thiserror", +] + +[[package]] +name = "flate2" +version = "1.0.20" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd3aec53de10fe96d7d8c565eb17f2c687bb5518a2ec453b5b1252964526abe0" +dependencies = [ + "cfg-if 1.0.0", + "crc32fast", + "libc", + "miniz_oxide", +] + +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = "foreign-types" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +dependencies = [ + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-shared" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" + +[[package]] +name = "form_urlencoded" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fc25a87fa4fd2094bffb06925852034d90a17f0d1e05197d4956d3555752191" +dependencies = [ + "matches", + "percent-encoding", +] + +[[package]] +name = "fs_extra" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2022715d62ab30faffd124d40b76f4134a550a87792276512b18d63272333394" + +[[package]] +name = "futures" +version = "0.3.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1adc00f486adfc9ce99f77d717836f0c5aa84965eb0b4f051f4e83f7cab53f8b" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-channel" +version = "0.3.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74ed2411805f6e4e3d9bc904c95d5d423b89b3b25dc0250aa74729de20629ff9" +dependencies = [ + "futures-core", + "futures-sink", +] + +[[package]] +name = "futures-core" +version = "0.3.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af51b1b4a7fdff033703db39de8802c673eb91855f2e0d47dcf3bf2c0ef01f99" + +[[package]] +name = "futures-executor" +version = "0.3.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4d0d535a57b87e1ae31437b892713aee90cd2d7b0ee48727cd11fc72ef54761c" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-io" +version = "0.3.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b0e06c393068f3a6ef246c75cdca793d6a46347e75286933e5e75fd2fd11582" + +[[package]] +name = "futures-macro" +version = "0.3.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c54913bae956fb8df7f4dc6fc90362aa72e69148e3f39041fbe8742d21e0ac57" +dependencies = [ + "autocfg", + "proc-macro-hack", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "futures-sink" +version = "0.3.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0f30aaa67363d119812743aa5f33c201a7a66329f97d1a887022971feea4b53" + +[[package]] +name = "futures-task" +version = "0.3.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbe54a98670017f3be909561f6ad13e810d9a51f3f061b902062ca3da80799f2" + +[[package]] +name = "futures-util" +version = "0.3.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67eb846bfd58e44a8481a00049e82c43e0ccb5d61f8dc071057cb19249dd4d78" +dependencies = [ + 
"autocfg", + "futures-channel", + "futures-core", + "futures-io", + "futures-macro", + "futures-sink", + "futures-task", + "memchr", + "pin-project-lite", + "pin-utils", + "proc-macro-hack", + "proc-macro-nested", + "slab", +] + +[[package]] +name = "gag" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a713bee13966e9fbffdf7193af71d54a6b35a0bb34997cd6c9519ebeb5005972" +dependencies = [ + "filedescriptor", + "tempfile", +] + +[[package]] +name = "generic-array" +version = "0.14.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "501466ecc8a30d1d3b7fc9229b122b2ce8ed6e9d9223f1138d4babb253e51817" +dependencies = [ + "typenum", + "version_check", +] + +[[package]] +name = "getrandom" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce" +dependencies = [ + "cfg-if 1.0.0", + "libc", + "wasi 0.9.0+wasi-snapshot-preview1", +] + +[[package]] +name = "getrandom" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fcd999463524c52659517fe2cea98493cfe485d10565e7b0fb07dbba7ad2753" +dependencies = [ + "cfg-if 1.0.0", + "libc", + "wasi 0.10.2+wasi-snapshot-preview1", +] + +[[package]] +name = "getset" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24b328c01a4d71d2d8173daa93562a73ab0fe85616876f02500f53d82948c504" +dependencies = [ + "proc-macro-error", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "gimli" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0a01e0497841a3b2db4f8afa483cce65f7e96a3498bd6c541734792aeac8fe7" + +[[package]] +name = "git2" +version = "0.13.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "659cd14835e75b64d9dba5b660463506763cf0aa6cb640aeeb0e98d841093490" +dependencies = [ + "bitflags", + "libc", + "libgit2-sys", + "log", + "url", +] + +[[package]] +name = "glob" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574" + +[[package]] +name = "grpcio" +version = "0.1.0" +dependencies = [ + "grpcio 0.9.1", +] + +[[package]] +name = "grpcio" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d99e00eed7e0a04ee2705112e7cfdbe1a3cc771147f22f016a8cd2d002187b" +dependencies = [ + "futures", + "grpcio-sys", + "libc", + "log", + "parking_lot", + "protobuf", +] + +[[package]] +name = "grpcio-compiler" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1f1abac9f330ac9ee0950220c10eea84d66479cede4836f0b924407fecf093c" +dependencies = [ + "protobuf", +] + +[[package]] +name = "grpcio-sys" +version = "0.9.1+1.38.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9447d1a926beeef466606cc45717f80897998b548e7dc622873d453e1ecb4be4" +dependencies = [ + "bindgen", + "boringssl-src", + "cc", + "cmake", + "libc", + "libz-sys", + "pkg-config", + "walkdir", +] + +[[package]] +name = "h2" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7f3675cfef6a30c8031cf9e6493ebdc3bb3272a3fea3923c4210d1830e6a472" +dependencies = [ + "bytes 1.1.0", + "fnv", + "futures-core", + "futures-sink", + "futures-util", + "http", + "indexmap", + "slab", + "tokio", + 
"tokio-util", + "tracing 0.1.26", +] + +[[package]] +name = "half" +version = "1.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" + +[[package]] +name = "hash_hasher" +version = "2.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74721d007512d0cb3338cd20f0654ac913920061a4c4d0d8708edb3f2a698c0c" + +[[package]] +name = "hashbrown" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e" +dependencies = [ + "ahash 0.7.4", +] + +[[package]] +name = "headers" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0b7591fb62902706ae8e7aaff416b1b0fa2c0fd0878b46dc13baa3712d8a855" +dependencies = [ + "base64", + "bitflags", + "bytes 1.1.0", + "headers-core", + "http", + "mime", + "sha-1", + "time", +] + +[[package]] +name = "headers-core" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7f66481bfee273957b1f20485a4ff3362987f85b2c236580d81b4eb7a326429" +dependencies = [ + "http", +] + +[[package]] +name = "heck" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c" +dependencies = [ + "unicode-segmentation", +] + +[[package]] +name = "hermit-abi" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" +dependencies = [ + "libc", +] + +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + +[[package]] +name = "http" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "527e8c9ac747e28542699a951517aa9a6945af506cd1f2e1b53a576c17b6cc11" +dependencies = [ + "bytes 1.1.0", + "fnv", + "itoa", +] + +[[package]] +name = "http-body" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "399c583b2979440c60be0821a6199eca73bc3c8dcd9d070d75ac726e2c6186e5" +dependencies = [ + "bytes 1.1.0", + "http", + "pin-project-lite", +] + +[[package]] +name = "httparse" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acd94fdbe1d4ff688b67b04eee2e17bd50995534a61539e45adfefb45e5e5503" + +[[package]] +name = "httpdate" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6456b8a6c8f33fee7d958fcd1b60d55b11940a79e63ae87013e6d22e26034440" + +[[package]] +name = "humantime" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df004cfca50ef23c36850aaaa59ad52cc70d0e90243c3c7737a4dd32dc7a3c4f" +dependencies = [ + "quick-error", +] + +[[package]] +name = "hyper" +version = "0.14.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13f67199e765030fa08fe0bd581af683f0d5bc04ea09c2b1102012c5fb90e7fd" +dependencies = [ + "bytes 1.1.0", + "futures-channel", + "futures-core", + "futures-util", + "h2", + "http", + "http-body", + "httparse", + "httpdate", + "itoa", + "pin-project-lite", + "socket2", + "tokio", + "tower-service", + "tracing 0.1.26", + "want", +] + +[[package]] +name = "hyper-tls" +version = "0.5.0" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905" +dependencies = [ + "bytes 1.1.0", + "hyper", + "native-tls", + "tokio", + "tokio-native-tls", +] + +[[package]] +name = "hyperloglog" +version = "1.0.0" +dependencies = [ + "bytecount", + "bytes 0.1.0", + "rand 0.8.4", + "siphasher", + "snafu", +] + +[[package]] +name = "idna" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "418a0a6fab821475f634efe3ccc45c013f742efe03d853e8d3355d5cb850ecf8" +dependencies = [ + "matches", + "unicode-bidi", + "unicode-normalization", +] + +[[package]] +name = "indexmap" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc633605454125dec4b66843673f01c7df2b89479b32e0ed634e43a91cff62a5" +dependencies = [ + "autocfg", + "hashbrown", +] + +[[package]] +name = "input_buffer" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f97967975f448f1a7ddb12b0bc41069d09ed6a1c161a92687e057325db35d413" +dependencies = [ + "bytes 1.1.0", +] + +[[package]] +name = "instant" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bee0328b1209d157ef001c94dd85b4f8f64139adb0eac2659f4b08382b2f474d" +dependencies = [ + "cfg-if 1.0.0", +] + +[[package]] +name = "integer-encoding" +version = "1.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48dc51180a9b377fd75814d0cc02199c20f8e99433d6762f650d39cdbbd3b56f" + +[[package]] +name = "integer-encoding" +version = "3.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90c11140ffea82edce8dcd74137ce9324ec24b3cf0175fc9d7e29164da9915b8" +dependencies = [ + "async-trait", + "futures-util", +] + +[[package]] +name = "interpreters" +version = "0.1.0" +dependencies = [ + "analytic_engine", + "arrow_deps", + "async-trait", + "catalog", + "catalog_impls", + "common_types", + "common_util", + "log", + "query_engine", + "snafu", + "sql", + "table_engine", + "tokio", + "udf", +] + +[[package]] +name = "ipnet" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68f2d64f2edebec4ce84ad108148e67e1064789bee435edc5b60ad398714a3a9" + +[[package]] +name = "itertools" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69ddb889f9d0d08a67338271fa9b62996bc788c7796a5c18cf057420aaed5eaf" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "0.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4" + +[[package]] +name = "jemalloc-ctl" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c502a5ff9dd2924f1ed32ba96e3b65735d837b4bfd978d3161b1702e66aca4b7" +dependencies = [ + "jemalloc-sys", + "libc", + "paste 0.1.18", +] + +[[package]] +name = "jemalloc-sys" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d3b9f3f5c9b31aa0f5ed3260385ac205db665baa41d49bb8338008ae94ede45" +dependencies = [ + "cc", + "fs_extra", + "libc", +] + +[[package]] +name = "jemallocator" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43ae63fcfc45e99ab3d1b29a46782ad679e98436c3169d15a167a1108a724b69" +dependencies = [ + "jemalloc-sys", + "libc", +] + +[[package]] 
+name = "jobserver" +version = "0.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af25a77299a7f711a01975c35a6a424eb6862092cc2d6c72c4ed6cbc56dfc1fa" +dependencies = [ + "libc", +] + +[[package]] +name = "js-sys" +version = "0.3.53" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4bf49d50e2961077d9c99f4b7997d770a1114f087c3c2e0069b36c13fc2979d" +dependencies = [ + "wasm-bindgen", +] + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "lazycell" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" + +[[package]] +name = "lexical-core" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a3926d8f156019890be4abe5fd3785e0cff1001e06f59c597641fd513a5a284" +dependencies = [ + "lexical-parse-float", + "lexical-parse-integer", + "lexical-util", + "lexical-write-float", + "lexical-write-integer", +] + +[[package]] +name = "lexical-parse-float" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4d066d004fa762d9da995ed21aa8845bb9f6e4265f540d716fb4b315197bf0e" +dependencies = [ + "lexical-parse-integer", + "lexical-util", + "static_assertions", +] + +[[package]] +name = "lexical-parse-integer" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2c92badda8cc0fc4f3d3cc1c30aaefafb830510c8781ce4e8669881f3ed53ac" +dependencies = [ + "lexical-util", + "static_assertions", +] + +[[package]] +name = "lexical-util" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ff669ccaae16ee33af90dc51125755efed17f1309626ba5c12052512b11e291" +dependencies = [ + "static_assertions", +] + +[[package]] +name = "lexical-write-float" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b5186948c7b297abaaa51560f2581dae625e5ce7dfc2d8fdc56345adb6dc576" +dependencies = [ + "lexical-util", + "lexical-write-integer", + "static_assertions", +] + +[[package]] +name = "lexical-write-integer" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ece956492e0e40fd95ef8658a34d53a3b8c2015762fdcaaff2167b28de1f56ef" +dependencies = [ + "lexical-util", + "static_assertions", +] + +[[package]] +name = "libc" +version = "0.2.101" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3cb00336871be5ed2c8ed44b60ae9959dc5b9f08539422ed43f09e34ecaeba21" + +[[package]] +name = "libflate" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16364af76ebb39b5869bb32c81fa93573267cd8c62bb3474e28d78fac3fb141e" +dependencies = [ + "adler32", + "crc32fast", + "libflate_lz77", +] + +[[package]] +name = "libflate_lz77" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39a734c0493409afcd49deee13c006a04e3586b9761a03543c6272c9c51f2f5a" +dependencies = [ + "rle-decode-fast", +] + +[[package]] +name = "libgit2-sys" +version = "0.12.22+1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89c53ac117c44f7042ad8d8f5681378dfbc6010e49ec2c0d1f11dfedc7a4a1c3" +dependencies = [ + "cc", + "libc", + "libz-sys", + 
"pkg-config", +] + +[[package]] +name = "libloading" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "afe203d669ec979b7128619bae5a63b7b42e9203c1b29146079ee05e2f604b52" +dependencies = [ + "cfg-if 1.0.0", + "winapi", +] + +[[package]] +name = "librocksdb_sys" +version = "0.1.0" +source = "git+https://github.com/tikv/rust-rocksdb.git?branch=tikv-5.2#23bd00d50c79b40b6a32c11446c86f0714fa7844" +dependencies = [ + "bindgen", + "bzip2-sys", + "cc", + "cmake", + "libc", + "libtitan_sys", + "libz-sys", + "lz4-sys", + "snappy-sys", + "zstd-sys", +] + +[[package]] +name = "libtitan_sys" +version = "0.0.1" +source = "git+https://github.com/tikv/rust-rocksdb.git?branch=tikv-5.2#23bd00d50c79b40b6a32c11446c86f0714fa7844" +dependencies = [ + "bzip2-sys", + "cc", + "cmake", + "libc", + "libz-sys", + "lz4-sys", + "snappy-sys", + "zstd-sys", +] + +[[package]] +name = "libz-sys" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de5435b8549c16d423ed0c03dbaafe57cf6c3344744f1242520d59c9d8ecec66" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] + +[[package]] +name = "lock_api" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712a4d093c9976e24e7dbca41db895dabcbac38eb5f4045393d17a95bdfb1109" +dependencies = [ + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710" +dependencies = [ + "cfg-if 1.0.0", +] + +[[package]] +name = "logger" +version = "0.1.0" +dependencies = [ + "chrono", + "grpcio 0.1.0", + "log", + "slog", + "slog-async", + "slog-global 0.1.0 (git+https://github.com/breeswish/slog-global.git?rev=0e23a5baff302a9d7bccd85f8f31e43339c2f2c1)", + "slog-term", + "slog_derive", +] + +[[package]] +name = "lru" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c748cfe47cb8da225c37595b3108bea1c198c84aaae8ea0ba76d01dda9fc803" +dependencies = [ + "hashbrown", +] + +[[package]] +name = "lz4" +version = "1.23.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aac20ed6991e01bf6a2e68cc73df2b389707403662a8ba89f68511fb340f724c" +dependencies = [ + "libc", + "lz4-sys", +] + +[[package]] +name = "lz4-sys" +version = "1.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dca79aa95d8b3226213ad454d328369853be3a1382d89532a854f4d69640acae" +dependencies = [ + "cc", + "libc", +] + +[[package]] +name = "matchers" +version = "0.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f099785f7595cc4b4553a174ce30dd7589ef93391ff414dbb67f62392b9e0ce1" +dependencies = [ + "regex-automata", +] + +[[package]] +name = "matches" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a3e378b66a060d48947b590737b30a1be76706c8dd7b8ba0f2fe3989c68a853f" + +[[package]] +name = "maybe-uninit" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60302e4db3a61da70c0cb7991976248362f30319e88850c487b9b95bbf059e00" + +[[package]] +name = "md-5" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b5a279bb9607f9f53c22d496eade00d138d1bdcccd07d74650387cf94942a15" +dependencies = [ + "block-buffer 0.9.0", + "digest 0.9.0", + "opaque-debug", +] + +[[package]] +name = "memchr" 
+version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "308cc39be01b73d0d18f82a0e7b2a3df85245f84af96fdddc5d202d27e47b86a" + +[[package]] +name = "memoffset" +version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "043175f069eda7b85febe4a74abbaeff828d9f8b448515d3151a14a3542811aa" +dependencies = [ + "autocfg", +] + +[[package]] +name = "memoffset" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59accc507f1338036a0477ef61afdae33cde60840f4dfe481319ce3ad116ddf9" +dependencies = [ + "autocfg", +] + +[[package]] +name = "meta_client" +version = "0.1.0" +dependencies = [ + "async-trait", + "catalog", + "ceresdbproto", + "common_types", + "common_util", + "futures", + "grpcio 0.1.0", + "log", + "rand 0.7.3", + "reqwest", + "serde", + "serde_derive", + "serde_json", + "snafu", + "table_engine", + "tokio", + "url", +] + +[[package]] +name = "mime" +version = "0.3.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d" + +[[package]] +name = "mime_guess" +version = "2.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2684d4c2e97d99848d30b324b00c8fcc7e5c897b7cbb5819b09e7c90e8baf212" +dependencies = [ + "mime", + "unicase", +] + +[[package]] +name = "miniz_oxide" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a92518e98c078586bc6c934028adcca4c92a53d6a958196de835170a01d84e4b" +dependencies = [ + "adler", + "autocfg", +] + +[[package]] +name = "mio" +version = "0.7.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c2bdb6314ec10835cd3293dd268473a835c02b7b352e788be788b3c6ca6bb16" +dependencies = [ + "libc", + "log", + "miow", + "ntapi", + "winapi", +] + +[[package]] +name = "miow" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9f1c5b025cda876f66ef43a113f91ebc9f4ccef34843000e0adf6ebbab84e21" +dependencies = [ + "winapi", +] + +[[package]] +name = "multipart" +version = "0.17.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d050aeedc89243f5347c3e237e3e13dc76fbe4ae3742a57b94dc14f69acf76d4" +dependencies = [ + "buf_redux", + "httparse", + "log", + "mime", + "mime_guess", + "quick-error", + "rand 0.7.3", + "safemem", + "tempfile", + "twoway", +] + +[[package]] +name = "multiversion" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "025c962a3dd3cc5e0e520aa9c612201d127dcdf28616974961a649dca64f5373" +dependencies = [ + "multiversion-macros", +] + +[[package]] +name = "multiversion-macros" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8a3e2bde382ebf960c1f3e79689fa5941625fe9bf694a1cb64af3e85faff3af" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "murmur3" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a198f9589efc03f544388dfc4a19fe8af4323662b62f598b8dcfdac62c14771c" +dependencies = [ + "byteorder", +] + +[[package]] +name = "native-tls" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48ba9f7719b5a0f42f338907614285fb5fd70e53858141f69898a1fb7203b24d" +dependencies = [ + "lazy_static", + "libc", + "log", + "openssl", + "openssl-probe", + "openssl-sys", + "schannel", 
+ "security-framework", + "security-framework-sys", + "tempfile", +] + +[[package]] +name = "nix" +version = "0.19.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2ccba0cfe4fdf15982d1674c69b1fd80bad427d293849982668dfe454bd61f2" +dependencies = [ + "bitflags", + "cc", + "cfg-if 1.0.0", + "libc", +] + +[[package]] +name = "nom" +version = "5.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffb4262d26ed83a1c0a33a38fe2bb15797329c85770da05e6b828ddb782627af" +dependencies = [ + "memchr", + "version_check", +] + +[[package]] +name = "ntapi" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f6bb902e437b6d86e03cce10a7e2af662292c5dfef23b65899ea3ac9354ad44" +dependencies = [ + "winapi", +] + +[[package]] +name = "num" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43db66d1170d347f9a065114077f7dccb00c1b9478c89384490a3425279a4606" +dependencies = [ + "num-bigint 0.4.1", + "num-complex", + "num-integer", + "num-iter", + "num-rational", + "num-traits", +] + +[[package]] +name = "num-bigint" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "090c7f9998ee0ff65aa5b723e4009f7b217707f1fb5ea551329cc4d6231fb304" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-bigint" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76e97c412795abf6c24ba30055a8f20642ea57ca12875220b854cfa501bf1e48" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-complex" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26873667bbbb7c5182d4a37c1add32cdf09f841af72da53318fdb81543c15085" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-integer" +version = "0.1.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2cc698a63b549a70bc047073d2949cce27cd1c7b0a4a862d08a8031bc2801db" +dependencies = [ + "autocfg", + "num-traits", +] + +[[package]] +name = "num-iter" +version = "0.1.42" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2021c8337a54d21aca0d59a92577a029af9431cb59b909b03252b9c164fad59" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-rational" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d41702bd167c2df5520b384281bc111a4b5efcf7fbc4c9c222c815b07e0a6a6a" +dependencies = [ + "autocfg", + "num-bigint 0.4.1", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-traits" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290" +dependencies = [ + "autocfg", +] + +[[package]] +name = "num_cpus" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05499f3756671c15885fee9034446956fff3f243d6077b91e5767df161f766b3" +dependencies = [ + "hermit-abi", + "libc", +] + +[[package]] +name = "object" +version = "0.26.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39f37e50073ccad23b6d09bcb5b263f4e76d3bb6038e4a3c08e52162ffa8abc2" +dependencies = [ + "memchr", +] + +[[package]] +name = "object_store" +version = "0.1.0" +dependencies = [ + "async-trait", + "bytes 1.1.0", + "common_util", + 
"futures", + "itertools", + "percent-encoding", + "snafu", + "tempfile", + "tokio", + "tokio-util", + "walkdir", +] + +[[package]] +name = "once_cell" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "692fcb63b64b1758029e0a96ee63e049ce8c5948587f2f7208df04625e5f6b56" + +[[package]] +name = "oorandom" +version = "11.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" + +[[package]] +name = "opaque-debug" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "624a8340c38c1b80fd549087862da4ba43e08858af025b236e509b6649fc13d5" + +[[package]] +name = "openssl" +version = "0.10.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d9facdb76fec0b73c406f125d44d86fdad818d66fef0531eec9233ca425ff4a" +dependencies = [ + "bitflags", + "cfg-if 1.0.0", + "foreign-types", + "libc", + "once_cell", + "openssl-sys", +] + +[[package]] +name = "openssl-probe" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28988d872ab76095a6e6ac88d99b54fd267702734fd7ffe610ca27f533ddb95a" + +[[package]] +name = "openssl-sys" +version = "0.9.66" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1996d2d305e561b70d1ee0c53f1542833f4e1ac6ce9a6708b6ff2738ca67dc82" +dependencies = [ + "autocfg", + "cc", + "libc", + "pkg-config", + "vcpkg", +] + +[[package]] +name = "ordered-float" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3305af35278dd29f46fcdd139e0b1fbfae2153f0e5928b39b035542dd31e37b7" +dependencies = [ + "num-traits", +] + +[[package]] +name = "ordered-float" +version = "2.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7940cf2ca942593318d07fcf2596cdca60a85c9e7fab408a5e21a4f9dcd40d87" +dependencies = [ + "num-traits", +] + +[[package]] +name = "parking_lot" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d17b78036a60663b797adeaee46f5c9dfebb86948d1255007a1d6be0271ff99" +dependencies = [ + "instant", + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d76e8e1493bcac0d2766c42737f34458f1c8c50c0d23bcb24ea953affb273216" +dependencies = [ + "cfg-if 1.0.0", + "instant", + "libc", + "redox_syscall", + "smallvec", + "winapi", +] + +[[package]] +name = "parquet" +version = "0.1.0" +dependencies = [ + "arrow_deps", + "lru", + "parquet-format", + "thrift", +] + +[[package]] +name = "parquet" +version = "7.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c718575b34e488fa78d4f0286356abb8466573cb17ae8faa96ffd871ca6e8c6" +dependencies = [ + "arrow", + "base64", + "brotli", + "byteorder", + "chrono", + "flate2", + "lz4", + "num-bigint 0.4.1", + "parquet-format", + "rand 0.8.4", + "snap", + "thrift", + "zstd", +] + +[[package]] +name = "parquet-format" +version = "4.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f0c06cdcd5460967c485f9c40a821746f5955ad81990533c7fae95dbd9bc0b5" +dependencies = [ + "thrift", +] + +[[package]] +name = "parquet-format-async-temp" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03abc2f9c83fe9ceec83f47c76cc071bfd56caba33794340330f35623ab1f544" 
+dependencies = [ + "async-trait", + "byteorder", + "futures", + "integer-encoding 3.0.2", + "ordered-float 1.1.1", +] + +[[package]] +name = "parquet2" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db82df54cdd88931d29b850190915b9069bb93fba8e1aefc0d59d8ca81603d6d" +dependencies = [ + "async-stream", + "bitpacking", + "futures", + "parquet-format-async-temp", + "streaming-decompression", +] + +[[package]] +name = "paste" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45ca20c77d80be666aef2b45486da86238fabe33e38306bd3118fe4af33fa880" +dependencies = [ + "paste-impl", + "proc-macro-hack", +] + +[[package]] +name = "paste" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acbf547ad0c65e31259204bd90935776d1c693cec2f4ff7abb7a1bbbd40dfe58" + +[[package]] +name = "paste-impl" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d95a7db200b97ef370c8e6de0088252f7e0dfff7d047a28528e47456c0fc98b6" +dependencies = [ + "proc-macro-hack", +] + +[[package]] +name = "peeking_take_while" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" + +[[package]] +name = "percent-encoding" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e" + +[[package]] +name = "pin-project" +version = "0.4.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "918192b5c59119d51e0cd221f4d49dde9112824ba717369e903c97d076083d0f" +dependencies = [ + "pin-project-internal 0.4.28", +] + +[[package]] +name = "pin-project" +version = "1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "576bc800220cc65dac09e99e97b08b358cfab6e17078de8dc5fee223bd2d0c08" +dependencies = [ + "pin-project-internal 1.0.8", +] + +[[package]] +name = "pin-project-internal" +version = "0.4.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3be26700300be6d9d23264c73211d8190e755b6b5ca7a1b28230025511b52a5e" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "pin-project-internal" +version = "1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e8fe8163d14ce7f0cdac2e040116f22eac817edabff0be91e8aff7e9accf389" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "pin-project-lite" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d31d11c69a6b52a174b42bdc0c30e5e11670f90788b2c471c31c1d17d449443" + +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + +[[package]] +name = "pkg-config" +version = "0.3.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3831453b3449ceb48b6d9c7ad7c96d5ea673e9b470a1dc578c2ce6521230884c" + +[[package]] +name = "plotters" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a3fd9ec30b9749ce28cd91f255d569591cdf937fe280c312143e3c4bad6f2a" +dependencies = [ + "num-traits", + "plotters-backend", + "plotters-svg", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "plotters-backend" 
+version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d88417318da0eaf0fdcdb51a0ee6c3bed624333bff8f946733049380be67ac1c" + +[[package]] +name = "plotters-svg" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "521fa9638fa597e1dc53e9412a4f9cefb01187ee1f7413076f9e6749e2885ba9" +dependencies = [ + "plotters-backend", +] + +[[package]] +name = "ppv-lite86" +version = "0.2.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac74c624d6b2d21f425f752262f42188365d7b8ff1aff74c82e45136510a4857" + +[[package]] +name = "proc-macro-error" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" +dependencies = [ + "proc-macro-error-attr", + "proc-macro2", + "quote", + "syn", + "version_check", +] + +[[package]] +name = "proc-macro-error-attr" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" +dependencies = [ + "proc-macro2", + "quote", + "version_check", +] + +[[package]] +name = "proc-macro-hack" +version = "0.5.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5" + +[[package]] +name = "proc-macro-nested" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc881b2c22681370c6a780e47af9840ef841837bc98118431d4e1868bd0c1086" + +[[package]] +name = "proc-macro2" +version = "1.0.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c7ed8b8c7b886ea3ed7dde405212185f423ab44682667c8c6dd14aa1d9f6612" +dependencies = [ + "unicode-xid", +] + +[[package]] +name = "profile" +version = "0.1.0" +dependencies = [ + "jemalloc-ctl", + "jemalloc-sys", + "jemallocator", + "log", + "tempfile", +] + +[[package]] +name = "prometheus" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30d70cf4412832bcac9cffe27906f4a66e450d323525e977168c70d1b36120ae" +dependencies = [ + "cfg-if 0.1.10", + "fnv", + "lazy_static", + "parking_lot", + "regex", + "thiserror", +] + +[[package]] +name = "prometheus" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5986aa8d62380092d2f50f8b1cdba9cb9b6731ffd4b25b51fd126b6c3e05b99c" +dependencies = [ + "cfg-if 1.0.0", + "fnv", + "lazy_static", + "memchr", + "parking_lot", + "protobuf", + "thiserror", +] + +[[package]] +name = "prometheus-static-metric" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8f30cdb09c39930b8fa5e0f23cbb895ab3f766b187403a0ba0956fc1ef4f0e5" +dependencies = [ + "lazy_static", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "proto" +version = "0.1.0" +dependencies = [ + "protobuf", + "protobuf-builder", +] + +[[package]] +name = "protobuf" +version = "2.25.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23129d50f2c9355ced935fce8a08bd706ee2e7ce2b3b33bf61dace0e379ac63a" + +[[package]] +name = "protobuf-builder" +version = "0.1.0" +source = "git+https://github.com/CeresDB/protobuf-builder.git?rev=745cc8527d1c5eb48745f5ce74b2b5bdb75c3bf2#745cc8527d1c5eb48745f5ce74b2b5bdb75c3bf2" +dependencies = [ + "protobuf", + "protoc", + "protoc-bin-vendored", + "protoc-grpcio", +] + +[[package]] +name 
= "protobuf-codegen" +version = "2.25.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ba98ce0dadaa6de1e7f1b6d82a0a73b03e0c049169a167c919d906b0875026c" +dependencies = [ + "protobuf", +] + +[[package]] +name = "protoc" +version = "2.25.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ace5c4ea0e4b0381eb37837e070182b7ab491445e2d5ea2201d861f2b2f94f82" +dependencies = [ + "log", + "which", +] + +[[package]] +name = "protoc-bin-vendored" +version = "2.25.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2a56d817108caebed2cfb20931270a6d95dc6e36a0801999eacfbf35c21a5dd" + +[[package]] +name = "protoc-grpcio" +version = "3.0.0" +source = "git+https://github.com/CeresDB/protoc-grpcio.git?rev=fe9664cf003c908528f940d003a9c3e90e522658#fe9664cf003c908528f940d003a9c3e90e522658" +dependencies = [ + "failure", + "grpcio-compiler", + "protobuf", + "protobuf-codegen", + "protoc", + "tempfile", +] + +[[package]] +name = "query_engine" +version = "0.1.0" +dependencies = [ + "arrow_deps", + "async-trait", + "common_types", + "common_util", + "futures", + "log", + "snafu", + "sql", + "table_engine", + "udf", +] + +[[package]] +name = "quick-error" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" + +[[package]] +name = "quote" +version = "1.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3d0b9745dc2debf507c8422de05d7226cc1f0644216dfdfead988f9b1ab32a7" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rand" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" +dependencies = [ + "getrandom 0.1.16", + "libc", + "rand_chacha 0.2.2", + "rand_core 0.5.1", + "rand_hc 0.2.0", +] + +[[package]] +name = "rand" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e7573632e6454cf6b99d7aac4ccca54be06da05aca2ef7423d22d27d4d4bcd8" +dependencies = [ + "libc", + "rand_chacha 0.3.1", + "rand_core 0.6.3", + "rand_hc 0.3.1", +] + +[[package]] +name = "rand_chacha" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402" +dependencies = [ + "ppv-lite86", + "rand_core 0.5.1", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core 0.6.3", +] + +[[package]] +name = "rand_core" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" +dependencies = [ + "getrandom 0.1.16", +] + +[[package]] +name = "rand_core" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d34f1408f55294453790c48b2f1ebbb1c5b4b7563eb1f418bcfcfdbb06ebb4e7" +dependencies = [ + "getrandom 0.2.3", +] + +[[package]] +name = "rand_hc" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c" +dependencies = [ + "rand_core 0.5.1", +] + +[[package]] +name = "rand_hc" +version = "0.3.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "d51e9f596de227fda2ea6c84607f5558e196eeaf43c986b724ba4fb8fdf497e7" +dependencies = [ + "rand_core 0.6.3", +] + +[[package]] +name = "rayon" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c06aca804d41dbc8ba42dfd964f0d01334eceb64314b9ecf7c5fad5188a06d90" +dependencies = [ + "autocfg", + "crossbeam-deque 0.8.1", + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d78120e2c850279833f1dd3582f730c4ab53ed95aeaaaa862a2a5c71b1656d8e" +dependencies = [ + "crossbeam-channel", + "crossbeam-deque 0.8.1", + "crossbeam-utils 0.8.5", + "lazy_static", + "num_cpus", +] + +[[package]] +name = "redox_syscall" +version = "0.2.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8383f39639269cde97d255a32bdb68c047337295414940c68bdd30c2e13203ff" +dependencies = [ + "bitflags", +] + +[[package]] +name = "redox_users" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "528532f3d801c87aec9def2add9ca802fe569e44a544afe633765267840abe64" +dependencies = [ + "getrandom 0.2.3", + "redox_syscall", +] + +[[package]] +name = "regex" +version = "1.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" +dependencies = [ + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.6.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b" + +[[package]] +name = "remove_dir_all" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7" +dependencies = [ + "winapi", +] + +[[package]] +name = "reqwest" +version = "0.11.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "246e9f61b9bb77df069a947682be06e31ac43ea37862e244a69f177694ea6d22" +dependencies = [ + "base64", + "bytes 1.1.0", + "encoding_rs", + "futures-core", + "futures-util", + "http", + "http-body", + "hyper", + "hyper-tls", + "ipnet", + "js-sys", + "lazy_static", + "log", + "mime", + "native-tls", + "percent-encoding", + "pin-project-lite", + "serde", + "serde_urlencoded", + "tokio", + "tokio-native-tls", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", + "winreg", +] + +[[package]] +name = "rle-decode-fast" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cabe4fa914dec5870285fa7f71f602645da47c486e68486d2b4ceb4a343e90ac" + +[[package]] +name = "rocksdb" +version = "0.3.0" +source = "git+https://github.com/tikv/rust-rocksdb.git?branch=tikv-5.2#23bd00d50c79b40b6a32c11446c86f0714fa7844" +dependencies = [ + "libc", + "librocksdb_sys", +] + +[[package]] +name = "rustc-demangle" +version = "0.1.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ef03e0a2b150c7a90d01faf6254c9c48a41e95fb2a8c2ac1c6f0d2b9aefc342" + +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + +[[package]] +name = "rustc_version" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" +dependencies = [ + "semver", +] + +[[package]] +name = "rustversion" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61b3909d758bb75c79f23d4736fac9433868679d3ad2ea7a61e3c25cfda9a088" + +[[package]] +name = "ryu" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "71d301d4193d031abdd79ff7e3dd721168a9572ef3fe51a1517aba235bd8f86e" + +[[package]] +name = "safemem" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef703b7cb59335eae2eb93ceb664c0eb7ea6bf567079d843e09420219668e072" + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "schannel" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f05ba609c234e60bee0d547fe94a4c7e9da733d1c962cf6e59efa4cd9c8bc75" +dependencies = [ + "lazy_static", + "winapi", +] + +[[package]] +name = "scoped-tls" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea6a9290e3c9cf0f18145ef7ffa62d68ee0bf5fcd651017e586dc7fd5da448c2" + +[[package]] +name = "scopeguard" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" + +[[package]] +name = "security-framework" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c1016a0b396a0e68d6c541a54370e0db49524aead4c9e6aa263d6855d978d78" +dependencies = [ + "bitflags", + "core-foundation", + "core-foundation-sys", + "libc", + "num", + "security-framework-sys", +] + +[[package]] +name = "security-framework-sys" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f6f179cd85a30f8652b3f8830f73861c76e87e70b939773e72daf38be3afc02" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "semver" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "568a8e6258aa33c13358f81fd834adb854c6f7c9468520910a9b1e8fac068012" + +[[package]] +name = "serde" +version = "1.0.130" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f12d06de37cf59146fbdecab66aa99f9fe4f78722e3607577a5375d66bd0c913" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_cbor" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2bef2ebfde456fb76bbcf9f59315333decc4fda0b2b44b420243c11e0f5ec1f5" +dependencies = [ + "half", + "serde", +] + +[[package]] +name = "serde_derive" +version = "1.0.130" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7bc1a1ab1961464eae040d96713baa5a724a8152c1222492465b54322ec508b" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.67" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"a7f9e390c27c3c0ce8bc5d725f6e4d30a29d26659494aa4b17535f7522c5c950" +dependencies = [ + "indexmap", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "serde_urlencoded" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edfa57a7f8d9c1d260a549e7224100f6c43d43f9103e06dd8b4095a9b2b43ce9" +dependencies = [ + "form_urlencoded", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "server" +version = "0.1.0" +dependencies = [ + "analytic_engine", + "arrow_deps", + "async-trait", + "avro-rs", + "catalog", + "ceresdbproto", + "common_types", + "common_util", + "futures", + "grpcio 0.1.0", + "http", + "interpreters", + "lazy_static", + "log", + "logger", + "meta_client", + "profile", + "prometheus 0.12.0", + "prometheus-static-metric", + "protobuf", + "query_engine", + "serde", + "serde_derive", + "serde_json", + "snafu", + "sql", + "system_catalog", + "table_engine", + "tokio", + "twox-hash", + "udf", + "warp", +] + +[[package]] +name = "sha-1" +version = "0.9.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "99cd6713db3cf16b6c84e06321e049a9b9f699826e16096d23bbcc44d15d51a6" +dependencies = [ + "block-buffer 0.9.0", + "cfg-if 1.0.0", + "cpufeatures", + "digest 0.9.0", + "opaque-debug", +] + +[[package]] +name = "sha2" +version = "0.9.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4d58a1e1bf39749807d89cf2d98ac2dfa0ff1cb3faa38fbb64dd88ac8013d800" +dependencies = [ + "block-buffer 0.9.0", + "cfg-if 1.0.0", + "cpufeatures", + "digest 0.9.0", + "opaque-debug", +] + +[[package]] +name = "sharded-slab" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "740223c51853f3145fe7c90360d2d4232f2b62e3449489c207eccde818979982" +dependencies = [ + "lazy_static", +] + +[[package]] +name = "shlex" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fdf1b9db47230893d76faad238fd6097fd6d6a9245cd7a4d90dbd639536bbd2" + +[[package]] +name = "signal-hook" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "470c5a6397076fae0094aaf06a08e6ba6f37acb77d3b1b91ea92b4d6c8650c39" +dependencies = [ + "libc", + "signal-hook-registry", +] + +[[package]] +name = "signal-hook-registry" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e51e73328dc4ac0c7ccbda3a494dfa03df1de2f46018127f60c693f2648455b0" +dependencies = [ + "libc", +] + +[[package]] +name = "simdutf8" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c970da16e7c682fa90a261cf0724dee241c9f7831635ecc4e988ae8f3b505559" + +[[package]] +name = "siphasher" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "729a25c17d72b06c68cb47955d44fda88ad2d3e7d77e025663fdd69b93dd71a1" + +[[package]] +name = "skiplist" +version = "0.1.0" +dependencies = [ + "arena", + "bytes 1.1.0", + "criterion", + "rand 0.7.3", + "yatp", +] + +[[package]] +name = "slab" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c307a32c1c5c437f38c7fd45d753050587732ba8628319fbdf12a7e289ccc590" + +[[package]] +name = "slog" +version = "2.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8347046d4ebd943127157b94d63abb990fcf729dc4e9978927fdf4ac3c998d06" + +[[package]] +name = "slog-async" +version = "2.7.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "766c59b252e62a34651412870ff55d8c4e6d04df19b43eecb2703e417b097ffe" +dependencies = [ + "crossbeam-channel", + "slog", + "take_mut", + "thread_local", +] + +[[package]] +name = "slog-global" +version = "0.1.0" +source = "git+https://github.com/breeswish/slog-global.git?rev=0e23a5baff302a9d7bccd85f8f31e43339c2f2c1#0e23a5baff302a9d7bccd85f8f31e43339c2f2c1" +dependencies = [ + "arc-swap 0.3.11", + "lazy_static", + "log", + "slog", +] + +[[package]] +name = "slog-global" +version = "0.1.0" +source = "git+https://github.com/breezewish/slog-global.git?rev=0e23a5baff302a9d7bccd85f8f31e43339c2f2c1#0e23a5baff302a9d7bccd85f8f31e43339c2f2c1" +dependencies = [ + "arc-swap 0.3.11", + "lazy_static", + "log", + "slog", +] + +[[package]] +name = "slog-term" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95c1e7e5aab61ced6006149ea772770b84a0d16ce0f7885def313e4829946d76" +dependencies = [ + "atty", + "chrono", + "slog", + "term", + "thread_local", +] + +[[package]] +name = "slog_derive" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a945ec7f7ce853e89ffa36be1e27dce9a43e82ff9093bf3461c30d5da74ed11b" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "smallvec" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe0f37c9e8f3c5a4a66ad655a93c74daac4ad00c441533bf5c6e7990bb42604e" + +[[package]] +name = "snafu" +version = "0.6.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eab12d3c261b2308b0d80c26fffb58d17eba81a4be97890101f416b478c79ca7" +dependencies = [ + "backtrace", + "doc-comment", + "futures-core", + "pin-project 0.4.28", + "snafu-derive", +] + +[[package]] +name = "snafu-derive" +version = "0.6.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1508efa03c362e23817f96cde18abed596a25219a8b2c66e8db33c03543d315b" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "snap" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45456094d1983e2ee2a18fdfebce3189fa451699d0502cb8e3b49dba5ba41451" + +[[package]] +name = "snappy-sys" +version = "0.1.0" +source = "git+https://github.com/busyjay/rust-snappy.git?branch=static-link#8c12738bad811397600455d6982aff754ea2ac44" +dependencies = [ + "cmake", + "libc", + "pkg-config", +] + +[[package]] +name = "socket2" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "765f090f0e423d2b55843402a07915add955e7d60657db13707a159727326cad" +dependencies = [ + "libc", + "winapi", +] + +[[package]] +name = "sql" +version = "0.1.0" +dependencies = [ + "arrow_deps", + "catalog", + "ceresdbproto", + "common_types", + "common_util", + "log", + "paste 1.0.5", + "regex", + "snafu", + "sqlparser", + "table_engine", + "tokio", + "udf", +] + +[[package]] +name = "sqlparser" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9907f54bd0f7b6ce72c2be1e570a614819ee08e3deb66d90480df341d8a12a8" +dependencies = [ + "log", +] + +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + +[[package]] +name = "streaming-decompression" +version = "0.1.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "9bc687acd5dc742c4a7094f2927a8614a68e4743ef682e7a2f9f0f711656cc92" +dependencies = [ + "fallible-streaming-iterator", +] + +[[package]] +name = "strsim" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" + +[[package]] +name = "strum" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57bd81eb48f4c437cadc685403cad539345bf703d78e63707418431cecd4522b" + +[[package]] +name = "strum" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7ac893c7d471c8a21f31cfe213ec4f6d9afeed25537c772e08ef3f005f8729e" + +[[package]] +name = "strum_macros" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87c85aa3f8ea653bfd3ddf25f7ee357ee4d204731f6aa9ad04002306f6e2774c" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "strum_macros" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "339f799d8b549e3744c7ac7feb216383e4005d94bdb22561b3ab8f3b808ae9fb" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "subtle" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6bdef32e8150c2a081110b42772ffe7d7c9032b606bc226c8260fd97e0976601" + +[[package]] +name = "syn" +version = "1.0.75" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7f58f7e8eaa0009c5fec437aabf511bd9933e4b2d7407bd05273c01a8906ea7" +dependencies = [ + "proc-macro2", + "quote", + "unicode-xid", +] + +[[package]] +name = "synstructure" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "474aaa926faa1603c40b7885a9eaea29b444d1cb2850cb7c0e37bb1a4182f4fa" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "unicode-xid", +] + +[[package]] +name = "system_catalog" +version = "0.1.0" +dependencies = [ + "arrow_deps", + "async-trait", + "catalog", + "common_types", + "common_util", + "futures", + "log", + "proto", + "protobuf", + "snafu", + "table_engine", + "tokio", +] + +[[package]] +name = "table_engine" +version = "0.1.0" +dependencies = [ + "arrow_deps", + "async-trait", + "common_types", + "common_util", + "futures", + "log", + "proto", + "protobuf", + "serde", + "serde_derive", + "smallvec", + "snafu", + "tokio", +] + +[[package]] +name = "take_mut" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f764005d11ee5f36500a149ace24e00e3da98b0158b3e2d53a7495660d3f4d60" + +[[package]] +name = "tempfile" +version = "3.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dac1c663cfc93810f88aed9b8941d48cabf856a1b111c29a40439018d870eb22" +dependencies = [ + "cfg-if 1.0.0", + "libc", + "rand 0.8.4", + "redox_syscall", + "remove_dir_all", + "winapi", +] + +[[package]] +name = "term" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c59df8ac95d96ff9bede18eb7300b0fda5e5d8d90960e76f8e14ae765eedbf1f" +dependencies = [ + "dirs-next", + "rustversion", + "winapi", +] + +[[package]] +name = "termcolor" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dfed899f0eb03f32ee8c6a0aabdb8a7949659e3466561fc0adf54e26d88c5f4" +dependencies = [ + 
"winapi-util", +] + +[[package]] +name = "textwrap" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" +dependencies = [ + "unicode-width", +] + +[[package]] +name = "thiserror" +version = "1.0.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "283d5230e63df9608ac7d9691adc1dfb6e701225436eb64d0b9a7f0a5a04f6ec" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa3884228611f5cd3608e2d409bf7dce832e4eb3135e3f11addbd7e41bd68e71" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "thread_local" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8018d24e04c95ac8790716a5987d0fec4f8b27249ffa0f7d33f1369bdfb88cbd" +dependencies = [ + "once_cell", +] + +[[package]] +name = "threadpool" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d050e60b33d41c19108b32cea32164033a9013fe3b46cbd4457559bfbf77afaa" +dependencies = [ + "num_cpus", +] + +[[package]] +name = "thrift" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c6d965454947cc7266d22716ebfd07b18d84ebaf35eec558586bbb2a8cb6b5b" +dependencies = [ + "byteorder", + "integer-encoding 1.1.7", + "log", + "ordered-float 1.1.1", + "threadpool", +] + +[[package]] +name = "time" +version = "0.1.43" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca8a50ef2360fbd1eeb0ecd46795a87a19024eb4b53c5dc916ca1fd95fe62438" +dependencies = [ + "libc", + "winapi", +] + +[[package]] +name = "tiny-keccak" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237" +dependencies = [ + "crunchy", +] + +[[package]] +name = "tinytemplate" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +dependencies = [ + "serde", + "serde_json", +] + +[[package]] +name = "tinyvec" +version = "1.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "848a1e1181b9f6753b5e96a092749e29b11d19ede67dfbbd6c7dc7e0f49b5338" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" + +[[package]] +name = "tokio" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbbf1c778ec206785635ce8ad57fe52b3009ae9e0c9f574a728f3049d3e55838" +dependencies = [ + "bytes 1.1.0", + "libc", + "memchr", + "mio", + "num_cpus", + "once_cell", + "parking_lot", + "pin-project-lite", + "signal-hook-registry", + "tokio-macros", + "winapi", +] + +[[package]] +name = "tokio-macros" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b557f72f448c511a979e2564e55d74e6c4432fc96ff4f6241bc6bded342643b7" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tokio-native-tls" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"f7d995660bd2b7f8c1568414c1126076c13fbb725c40112dc0120b78eb9b717b" +dependencies = [ + "native-tls", + "tokio", +] + +[[package]] +name = "tokio-stream" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b2f3f698253f03119ac0102beaa64f67a67e08074d03a22d18784104543727f" +dependencies = [ + "futures-core", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tokio-test" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53474327ae5e166530d17f2d956afcb4f8a004de581b3cae10f12006bc8163e3" +dependencies = [ + "async-stream", + "bytes 1.1.0", + "futures-core", + "tokio", + "tokio-stream", +] + +[[package]] +name = "tokio-tungstenite" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1a5f475f1b9d077ea1017ecbc60890fda8e54942d680ca0b1d2b47cfa2d861b" +dependencies = [ + "futures-util", + "log", + "pin-project 1.0.8", + "tokio", + "tungstenite", +] + +[[package]] +name = "tokio-util" +version = "0.6.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1caa0b0c8d94a049db56b5acf8cba99dc0623aab1b26d5b5f5e2d945846b3592" +dependencies = [ + "bytes 1.1.0", + "futures-core", + "futures-io", + "futures-sink", + "log", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "toml" +version = "0.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a31142970826733df8241ef35dc040ef98c679ab14d7c3e54d827099b3acecaa" +dependencies = [ + "serde", +] + +[[package]] +name = "tower-service" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "360dfd1d6d30e05fda32ace2c8c70e9c0a9da713275777f5a4dbb8a1893930c6" + +[[package]] +name = "trace_examples" +version = "0.1.0" +dependencies = [ + "tracing 0.1.0", + "tracing_util", +] + +[[package]] +name = "tracing" +version = "0.1.0" +dependencies = [ + "tracing 0.1.26", +] + +[[package]] +name = "tracing" +version = "0.1.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09adeb8c97449311ccd28a427f96fb563e7fd31aabf994189879d9da2394b89d" +dependencies = [ + "cfg-if 1.0.0", + "log", + "pin-project-lite", + "tracing-attributes", + "tracing-core", +] + +[[package]] +name = "tracing-appender" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9965507e507f12c8901432a33e31131222abac31edd90cabbcf85cf544b7127a" +dependencies = [ + "chrono", + "crossbeam-channel", + "tracing-subscriber", +] + +[[package]] +name = "tracing-attributes" +version = "0.1.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c42e6fa53307c8a17e4ccd4dc81cf5ec38db9209f59b222210375b54ee40d1e2" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tracing-core" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2ca517f43f0fb96e0c3072ed5c275fe5eece87e8cb52f4a77b69226d3b1c9df8" +dependencies = [ + "lazy_static", +] + +[[package]] +name = "tracing-log" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6923477a48e41c1951f1999ef8bb5a3023eb723ceadafe78ffb65dc366761e3" +dependencies = [ + "lazy_static", + "log", + "tracing-core", +] + +[[package]] +name = "tracing-serde" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb65ea441fbb84f9f6748fd496cf7f63ec9af5bca94dd86456978d055e8eb28b" 
+dependencies = [ + "serde", + "tracing-core", +] + +[[package]] +name = "tracing-subscriber" +version = "0.2.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9cbe87a2fa7e35900ce5de20220a582a9483a7063811defce79d7cbd59d4cfe" +dependencies = [ + "ansi_term 0.12.1", + "chrono", + "lazy_static", + "matchers", + "regex", + "serde", + "serde_json", + "sharded-slab", + "smallvec", + "thread_local", + "tracing 0.1.26", + "tracing-core", + "tracing-log", + "tracing-serde", +] + +[[package]] +name = "tracing_util" +version = "0.1.0" +dependencies = [ + "lazy_static", + "tracing 0.1.26", + "tracing-appender", + "tracing-subscriber", +] + +[[package]] +name = "try-lock" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59547bce71d9c38b83d9c0e92b6066c4253371f15005def0c30d9657f50c7642" + +[[package]] +name = "tungstenite" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ada8297e8d70872fa9a551d93250a9f407beb9f37ef86494eb20012a2ff7c24" +dependencies = [ + "base64", + "byteorder", + "bytes 1.1.0", + "http", + "httparse", + "input_buffer", + "log", + "rand 0.8.4", + "sha-1", + "url", + "utf-8", +] + +[[package]] +name = "twoway" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59b11b2b5241ba34be09c3cc85a36e56e48f9888862e19cedf23336d35316ed1" +dependencies = [ + "memchr", +] + +[[package]] +name = "twox-hash" +version = "1.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ee73e6e4924fe940354b8d4d98cad5231175d615cd855b758adc658c0aac6a0" +dependencies = [ + "cfg-if 1.0.0", + "rand 0.8.4", + "static_assertions", +] + +[[package]] +name = "typed-builder" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78cea224ddd4282dfc40d1edabbd0c020a12e946e3a48e2c2b8f6ff167ad29fe" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "typenum" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "879f6906492a7cd215bfa4cf595b600146ccfac0c79bcbd1f3000162af5e8b06" + +[[package]] +name = "udf" +version = "0.1.0" +dependencies = [ + "arrow_deps", + "base64", + "chrono", + "common_types", + "common_util", + "hyperloglog", + "smallvec", + "snafu", +] + +[[package]] +name = "uncover" +version = "0.1.1" +source = "git+https://github.com/matklad/uncover.git?rev=1d0770d997e29731b287e9e11e4ffbbea5f456da#1d0770d997e29731b287e9e11e4ffbbea5f456da" +dependencies = [ + "lazy_static", +] + +[[package]] +name = "unicase" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50f37be617794602aabbeee0be4f259dc1778fabe05e2d67ee8f79326d5cb4f6" +dependencies = [ + "version_check", +] + +[[package]] +name = "unicode-bidi" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "246f4c42e67e7a4e3c6106ff716a5d067d4132a642840b242e357e468a2a0085" + +[[package]] +name = "unicode-normalization" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d54590932941a9e9266f0832deed84ebe1bf2e4c9e4a3554d393d18f5e854bf9" +dependencies = [ + "tinyvec", +] + +[[package]] +name = "unicode-segmentation" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8895849a949e7845e06bd6dc1aa51731a103c42707010a5b591c0038fb73385b" + +[[package]] +name = "unicode-width" 
+version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9337591893a19b88d8d87f2cec1e73fad5cdfd10e5a6f349f498ad6ea2ffb1e3" + +[[package]] +name = "unicode-xid" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" + +[[package]] +name = "url" +version = "2.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a507c383b2d33b5fc35d1861e77e6b383d158b2da5e14fe51b83dfedf6fd578c" +dependencies = [ + "form_urlencoded", + "idna", + "matches", + "percent-encoding", +] + +[[package]] +name = "utf-8" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" + +[[package]] +name = "uuid" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc5cf98d8186244414c848017f0e2676b3fcb46807f6668a97dfe67359a3c4b7" +dependencies = [ + "getrandom 0.2.3", + "serde", +] + +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + +[[package]] +name = "vec_map" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" + +[[package]] +name = "vergen" +version = "5.1.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "265455aab08c55a1ab13f07c8d5e25c7d46900f4484dd7cbd682e77171f93f3c" +dependencies = [ + "anyhow", + "cfg-if 1.0.0", + "chrono", + "enum-iterator", + "getset", + "git2", + "rustversion", + "thiserror", +] + +[[package]] +name = "version_check" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fecdca9a5291cc2b8dcf7dc02453fee791a280f3743cb0905f8822ae463b3fe" + +[[package]] +name = "wal" +version = "0.1.0" +dependencies = [ + "async-trait", + "common_types", + "common_util", + "futures", + "log", + "rocksdb", + "snafu", + "tempfile", + "tokio", +] + +[[package]] +name = "walkdir" +version = "2.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "808cf2735cd4b6866113f648b791c6adc5714537bc222d9347bb203386ffda56" +dependencies = [ + "same-file", + "winapi", + "winapi-util", +] + +[[package]] +name = "want" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ce8a968cb1cd110d136ff8b819a556d6fb6d919363c61534f6860c7eb172ba0" +dependencies = [ + "log", + "try-lock", +] + +[[package]] +name = "warp" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "332d47745e9a0c38636dbd454729b147d16bd1ed08ae67b3ab281c4506771054" +dependencies = [ + "bytes 1.1.0", + "futures", + "headers", + "http", + "hyper", + "log", + "mime", + "mime_guess", + "multipart", + "percent-encoding", + "pin-project 1.0.8", + "scoped-tls", + "serde", + "serde_json", + "serde_urlencoded", + "tokio", + "tokio-stream", + "tokio-tungstenite", + "tokio-util", + "tower-service", + "tracing 0.1.26", +] + +[[package]] +name = "wasi" +version = "0.9.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" + +[[package]] +name = "wasi" +version = "0.10.2+wasi-snapshot-preview1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6" + +[[package]] +name = "wasm-bindgen" +version = "0.2.76" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ce9b1b516211d33767048e5d47fa2a381ed8b76fc48d2ce4aa39877f9f183e0" +dependencies = [ + "cfg-if 1.0.0", + "serde", + "serde_json", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.76" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfe8dc78e2326ba5f845f4b5bf548401604fa20b1dd1d365fb73b6c1d6364041" +dependencies = [ + "bumpalo", + "lazy_static", + "log", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95fded345a6559c2cfee778d562300c581f7d4ff3edb9b0d230d69800d213972" +dependencies = [ + "cfg-if 1.0.0", + "js-sys", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.76" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44468aa53335841d9d6b6c023eaab07c0cd4bddbcfdee3e2bb1e8d2cb8069fef" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.76" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0195807922713af1e67dc66132c7328206ed9766af3858164fb583eedc25fbad" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.76" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acdb075a845574a1fa5f09fd77e43f7747599301ea3417a9fbffdeedfc1f4a29" + +[[package]] +name = "web-sys" +version = "0.3.53" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "224b2f6b67919060055ef1a67807367c2066ed520c3862cc013d26cf893a783c" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "which" +version = "4.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea187a8ef279bc014ec368c27a920da2024d2a711109bfbe3440585d5cf27ad9" +dependencies = [ + "either", + "lazy_static", + "libc", +] + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-util" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" +dependencies = [ + "winapi", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "winreg" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0120db82e8a1e0b9fb3345a539c478767c0048d842860994d96113d5b667bd69" +dependencies = [ + "winapi", +] + 
+[[package]] +name = "yatp" +version = "0.0.1" +source = "git+https://github.com/tikv/yatp.git?rev=4b71f8abd86890f0d1e95778c2b6bf5a9ee4c502#4b71f8abd86890f0d1e95778c2b6bf5a9ee4c502" +dependencies = [ + "crossbeam-deque 0.7.4", + "dashmap", + "fail", + "lazy_static", + "num_cpus", + "parking_lot_core", + "prometheus 0.10.0", + "rand 0.7.3", +] + +[[package]] +name = "zerocopy" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6580539ad917b7c026220c4b3f2c08d52ce54d6ce0dc491e66002e35388fab46" +dependencies = [ + "byteorder", + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d498dbd1fd7beb83c86709ae1c33ca50942889473473d287d56ce4770a18edfb" +dependencies = [ + "proc-macro2", + "syn", + "synstructure", +] + +[[package]] +name = "zstd" +version = "0.9.0+zstd.1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07749a5dc2cb6b36661290245e350f15ec3bbb304e493db54a1d354480522ccd" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "4.1.1+zstd.1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c91c90f2c593b003603e5e0493c837088df4469da25aafff8bce42ba48caf079" +dependencies = [ + "libc", + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "1.6.1+zstd.1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "615120c7a2431d16cf1cf979e7fc31ba7a5b5e5707b29c8a99e5dbf8a8392a33" +dependencies = [ + "cc", + "libc", +] diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000000..7ad1ca4a7f --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,74 @@ +[package] +name = "ceresdbx" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" +resolver = "2" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[workspace] +# In alphabetical order +members = [ + "analytic_engine", + "arrow_deps", + "benchmarks", + "catalog", + "catalog_impls", + "common_types", + "common_util", + "components/arena", + "components/bytes", + "components/logger", + "components/object_store", + "components/parquet", + "components/profile", + "components/rust-hyperloglog", + "components/skiplist", + "components/tracing", + "components/tracing_util", + "components/tracing_examples", + "grpcio", + "interpreters", + "meta_client", + "proto", + "query_engine", + "server", + "sql", + "system_catalog", + "table_engine", + "udf", + "wal", +] + +[[bin]] +name = "ceresdb-server" + +[dependencies] +# Workspace dependencies, in alphabetical order +analytic_engine = { path = "analytic_engine" } +catalog = { path = "catalog" } +catalog_impls = { path = "catalog_impls" } +clap = "2.0" +common_util = { path = "common_util" } +log = "0.4" +logger = { path = "components/logger" } +query_engine = { path = "query_engine" } +server = { path = "server" } +table_engine = { path = "table_engine" } +tracing_util = { path = "components/tracing_util" } +udf = { path = "udf" } + +# Crates.io dependencies, in alphabetical order +signal-hook = "0.3" + +[build-dependencies] +vergen = { version = "5", default-features = false, features = ["build", "git"] } + +[profile.release] +debug = true +opt-level = 2 +overflow-checks = true + +[profile.bench] +debug = true diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000..37cf72300f --- /dev/null +++ b/Dockerfile @@ -0,0 +1,39 @@ +ARG RUST_VERSION=1.59.0 
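+# Two-stage build: the first stage compiles ceresdb-server via `make build`;
+# the final `FROM ubuntu:20.04` stage copies only the release binary into the
+# runtime image.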
+FROM rust:${RUST_VERSION}-slim-bullseye as build + +# cache mounts below may already exist and owned by root +USER root + +RUN apt update && apt install --yes gcc g++ libssl-dev pkg-config cmake && rm -rf /var/lib/apt/lists/* + +# Build ceresdb +COPY . /ceresdb +WORKDIR /ceresdb + +RUN make build + +FROM ubuntu:20.04 +# create admin user +ARG USER=admin +ARG PASS="1q2w3s" +RUN useradd -m -s /bin/bash $USER && echo "$USER:$PASS" | chpasswd + +COPY --from=build /ceresdb/target/release/ceresdb-server /usr/bin/ceresdb-server + +RUN apt update && apt install --yes curl gdb iotop cron + +ENV RUST_BACKTRACE 1 + +COPY ./docker/entrypoint.py /entrypoint.py +COPY ./docker/supervisor/supervisord.conf /etc/supervisor/supervisord.conf +COPY ./docker/supervisor/conf.d /etc/supervisor/conf.d +COPY ./configs/ceresdb.toml /usr/bin/ + +RUN mkdir -p /etc/ceresdb +RUN chmod +x /usr/bin/ceresdb-server + +COPY ./configs /etc/ceresdb + +COPY ./docker/tini /tini +RUN chmod +x /tini +ENTRYPOINT ["/tini", "--", "/entrypoint.py"] diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000..7a4a3ea242 --- /dev/null +++ b/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
\ No newline at end of file diff --git a/Makefile b/Makefile new file mode 100644 index 0000000000..dfc71ea25a --- /dev/null +++ b/Makefile @@ -0,0 +1,64 @@ +SHELL = /bin/bash + +DIR=$(shell pwd) + +init: + echo "init" + echo "Git branch: $GITBRANCH" + +build: + ls -alh + cd $(DIR); cargo build --release + +build-asan: + ls -alh + export RUSTFLAGS=-Zsanitizer=address RUSTDOCFLAGS=-Zsanitizer=address + cd $(DIR); cargo build -Zbuild-std --target x86_64-unknown-linux-gnu --release + +build-arm64: + ls -alh + cd $(DIR); cargo build --release --no-default-features + +test: + cd $(DIR); cargo test --workspace -- --test-threads=4 + +# grcov needs build first, then run test +build-ut: + echo $(CARGO_INCREMENTAL) + echo $(RUSTFLAGS) + echo $(RUSTDOCFLAGS) + cd $(DIR); cargo build -j 4 --workspace + +test-ut: + echo $(CARGO_INCREMENTAL) + echo $(RUSTFLAGS) + echo $(RUSTDOCFLAGS) + cd $(DIR); cargo test -j 4 --workspace -- -Z unstable-options --format json | tee results.json; \ + cat results.json | cargo2junit > ${WORKSPACE}/testresult/TEST-all.xml + +fmt: + cd $(DIR); cargo fmt -- --check + +clippy: + cd $(DIR); cargo clippy --all-targets --all-features --workspace -- -D warnings + +# test with address sanitizer +asan-test: + export RUSTFLAGS=-Zsanitizer=address RUSTDOCFLAGS=-Zsanitizer=address + cd $(DIR); cargo test -Zbuild-std --target x86_64-unknown-linux-gnu --workspace + +# test with address sanitizer under release mode to workaround `attempt to create unaligned or null slice` +# error in parquet crate. +asan-test-release: + export RUSTFLAGS=-Zsanitizer=address RUSTDOCFLAGS=-Zsanitizer=address + cd $(DIR); cargo test -Zbuild-std --target x86_64-unknown-linux-gnu --release --workspace + +# test with memory sanitizer +mem-test: + export RUSTFLAGS=-Zsanitizer=memory RUSTDOCFLAGS=-Zsanitizer=memory + cd $(DIR); cargo test -Zbuild-std --target x86_64-unknown-linux-gnu --workspace + +# test with miri. +# only list packages will be tested. 
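+# Currently only the arena package is listed; extend the target below as more
+# crates are expected to pass under miri.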
+miri: + cd $(DIR); cargo miri test --package arena diff --git a/README.md b/README.md new file mode 100644 index 0000000000..4e9d118178 --- /dev/null +++ b/README.md @@ -0,0 +1,90 @@ +# ceresdbx + +## Building +Install clang (for rocksdb) + +Install deps (required by rust-rocksdb) +```bash +brew install cmake +brew install lz4 +``` + +Build in debug mode +```bash +cargo build --bin ceresdb-server +``` + +Build in release mode +```bash +cargo build --release --bin ceresdb-server +``` + +## Usage +Run the server +```bash +./ceresdb-server +``` + +## RESTful API +```bash +curl -L -X POST 'http://localhost:5000/sql' \ +-H 'Content-Type: application/json' \ +-d '{ + "query": "your DDL sql" +}' +``` + +Describe a table +```bash +curl -L -X POST 'http://localhost:5000/sql' \ +-H 'Content-Type: application/json' \ +-d '{ + "query": "DESCRIBE TABLE mytest" +}' +``` + +Insert data +```bash +curl -L -X POST 'http://localhost:5000/sql' \ +-H 'Content-Type: application/json' \ +--data-raw '{ + "query": "INSERT INTO mytest(c1, c2, c3, c4, c5, c6) VALUES(1618310218001, 12.5, '\''hello world'\'', 3.14159265, true, 2147483650)" +}' +``` + +Query +```bash +curl -L -X POST 'http://localhost:5000/sql' \ +-H 'Content-Type: application/json' \ +-d '{ + "query": "SELECT c1, c2, c3, c4, c5, c6 FROM mytest LIMIT 3" +}' +``` + +Query from system tables +```bash +curl -L -X POST 'http://localhost:5000/sql' \ +-H 'Content-Type: application/json' \ +-d '{ + "query": "SELECT * FROM system.numbers LIMIT 3" +}' +``` + +## Support Data Type +| SQL | CeresDB | Arrow | +| --- | --- | --- | +| null | Null | Null | +| timestamp | Timestamp | Timestamp(TimeUnit::Millisecond, None) | +| double | Double | Float64 | +| float | Float | Float32 | +| string | String | String | +| Varbinary | Varbinary | Binary | +| uint64 | UInt64 | UInt64 | +| uint32 | UInt32 | UInt32 | +| uint16 | UInt16 | UInt16 | +| uint8 | UInt8 | UInt8 | +| int64/bigint | Int64 | Int64 | +| int32/int | Int32 | Int32 | +| int16/smallint | Int16 | Int16 | +| int8/tinyint | Int8 | Int8 | +| boolean | Boolean | Boolean | diff --git a/analytic_engine/Cargo.toml b/analytic_engine/Cargo.toml new file mode 100644 index 0000000000..3be6760574 --- /dev/null +++ b/analytic_engine/Cargo.toml @@ -0,0 +1,43 @@ +[package] +name = "analytic_engine" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[features] +test = ["tempfile"] + +[dependencies] +# In alphabetical order +arc-swap = "1.4.0" +arena = { path = "../components/arena" } +arrow_deps = { path = "../arrow_deps" } +async-trait = "0.1.41" +base64 = "0.13" +common_types = { path = "../common_types" } +common_util = { path = "../common_util"} +futures = "0.3" +lazy_static = "1.4.0" +log = "0.4" +object_store = { path = "../components/object_store" } +parquet = { path = "../components/parquet" } +prometheus = "0.12" +proto = { path = "../proto" } +protobuf = "2.20" +serde = "1.0" +serde_derive = "1.0" +skiplist = { path = "../components/skiplist" } +smallvec = "1.6" +snafu = { version = "0.6.10", features = ["backtraces"] } +table_engine = { path = "../table_engine" } +tokio = { version = "1.0", features = ["sync", "time"] } +wal = { path = "../wal" } +tempfile = { version = "3.1.0", optional = true } + +[dev-dependencies] +common_types = { path = "../common_types", features = ["test"] } +common_util = { path = "../common_util", features = ["test"] } +env_logger = "0.6" +tempfile = "3.1.0" diff --git 
a/analytic_engine/src/compaction/metrics.rs b/analytic_engine/src/compaction/metrics.rs new file mode 100644 index 0000000000..61d76453e3 --- /dev/null +++ b/analytic_engine/src/compaction/metrics.rs @@ -0,0 +1,15 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Metrics of compaction. + +use lazy_static::lazy_static; +use prometheus::{register_int_gauge, IntGauge}; + +lazy_static! { + // Counters: + pub static ref COMPACTION_PENDING_REQUEST_GAUGE: IntGauge = register_int_gauge!( + "compaction_pending_request_gauge", + "Pending request queue length of compaction" + ) + .unwrap(); +} diff --git a/analytic_engine/src/compaction/mod.rs b/analytic_engine/src/compaction/mod.rs new file mode 100644 index 0000000000..a76ce2324a --- /dev/null +++ b/analytic_engine/src/compaction/mod.rs @@ -0,0 +1,494 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Compaction. + +use std::{collections::HashMap, sync::Arc}; + +use common_util::config::{ReadableSize, TimeUnit}; +use serde_derive::Deserialize; +use snafu::{ensure, Backtrace, GenerateBacktrace, ResultExt, Snafu}; +use tokio::sync::oneshot; + +use crate::{ + compaction::picker::{CommonCompactionPicker, CompactionPickerRef}, + instance::write_worker::CompactionNotifier, + sst::file::{FileHandle, Level}, + table::data::TableDataRef, + table_options::COMPACTION_STRATEGY, +}; + +mod metrics; +pub mod picker; +pub mod scheduler; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Unable to parse compaction strategy, value: {}", value))] + ParseStrategy { value: String, backtrace: Backtrace }, + #[snafu(display("Unable to parse float, key: {}, value: {}", key, value))] + ParseFloat { + key: String, + value: String, + source: std::num::ParseFloatError, + backtrace: Backtrace, + }, + #[snafu(display("Unable to parse int, key: {}, value: {}", key, value))] + ParseInt { + key: String, + value: String, + source: std::num::ParseIntError, + backtrace: Backtrace, + }, + #[snafu(display("Unable to parse readable size, key: {}, value: {}", key, value))] + ParseSize { + key: String, + value: String, + error: String, + backtrace: Backtrace, + }, + #[snafu(display("Unable to parse time unit, key: {}, value: {}", key, value))] + ParseTimeUnit { + key: String, + value: String, + error: String, + backtrace: Backtrace, + }, + #[snafu(display("Invalid compaction option value, err: {}", error))] + InvalidOption { error: String, backtrace: Backtrace }, +} + +#[derive(Debug, Clone, Copy, Deserialize, PartialEq)] +pub enum CompactionStrategy { + Default, + TimeWindow(TimeWindowCompactionOptions), + SizeTiered(SizeTieredCompactionOptions), +} + +#[derive(Debug, Clone, Copy, Deserialize, PartialEq)] +pub struct SizeTieredCompactionOptions { + pub bucket_low: f32, + pub bucket_high: f32, + pub min_sstable_size: ReadableSize, + pub min_threshold: usize, + pub max_threshold: usize, +} + +#[derive(Debug, Clone, Copy, Deserialize, PartialEq)] +pub struct TimeWindowCompactionOptions { + pub size_tiered: SizeTieredCompactionOptions, + // TODO(boyan) In fact right now we only supports TimeUnit::Milliseconds resolution. 
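+    // Seconds/micro/nanosecond resolutions still pass `validate()` and are
+    // normalized to milliseconds by the time-window picker.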
+ pub timestamp_resolution: TimeUnit, +} + +impl protobuf::Clear for SizeTieredCompactionOptions { + fn clear(&mut self) { + *self = SizeTieredCompactionOptions::default() + } +} + +impl protobuf::Clear for TimeWindowCompactionOptions { + fn clear(&mut self) { + *self = TimeWindowCompactionOptions::default() + } +} + +impl Default for SizeTieredCompactionOptions { + fn default() -> Self { + Self { + bucket_low: 0.5, + bucket_high: 1.5, + min_sstable_size: ReadableSize::mb(50), + min_threshold: 4, + max_threshold: 16, + } + } +} + +impl Default for TimeWindowCompactionOptions { + fn default() -> Self { + Self { + size_tiered: SizeTieredCompactionOptions::default(), + timestamp_resolution: TimeUnit::Milliseconds, + } + } +} + +impl Default for CompactionStrategy { + fn default() -> Self { + CompactionStrategy::Default + } +} + +const BUCKET_LOW_KEY: &str = "compaction_bucket_low"; +const BUCKET_HIGH_KEY: &str = "compaction_bucket_high"; +const MIN_THRESHOLD_KEY: &str = "compaction_min_threshold"; +const MAX_THRESHOLD_KEY: &str = "compaction_max_threshold"; +const MIN_SSTABLE_SIZE_KEY: &str = "compaction_min_sstable_size"; +const TIMESTAMP_RESOLUTION_KEY: &str = "compaction_timestamp_resolution"; +const DEFAULT_STRATEGY: &str = "default"; +const STC_STRATEGY: &str = "size_tiered"; +const TWC_STRATEGY: &str = "time_window"; + +impl CompactionStrategy { + pub(crate) fn parse_from( + value: &str, + options: &HashMap, + ) -> Result { + match value.trim().to_lowercase().as_str() { + DEFAULT_STRATEGY => Ok(CompactionStrategy::Default), + STC_STRATEGY => Ok(CompactionStrategy::SizeTiered( + SizeTieredCompactionOptions::parse_from(options)?, + )), + TWC_STRATEGY => Ok(CompactionStrategy::TimeWindow( + TimeWindowCompactionOptions::parse_from(options)?, + )), + _ => ParseStrategy { + value: value.to_string(), + } + .fail(), + } + } + + pub(crate) fn fill_raw_map(&self, m: &mut HashMap) { + match self { + CompactionStrategy::Default => { + m.insert( + COMPACTION_STRATEGY.to_string(), + DEFAULT_STRATEGY.to_string(), + ); + } + CompactionStrategy::SizeTiered(opts) => { + m.insert(COMPACTION_STRATEGY.to_string(), STC_STRATEGY.to_string()); + opts.fill_raw_map(m); + } + CompactionStrategy::TimeWindow(opts) => { + m.insert(COMPACTION_STRATEGY.to_string(), TWC_STRATEGY.to_string()); + opts.fill_raw_map(m); + } + } + } +} + +impl SizeTieredCompactionOptions { + pub(crate) fn validate(&self) -> Result<(), Error> { + ensure!( + self.bucket_high > self.bucket_low, + InvalidOption { + error: format!( + "{} value({}) is less than or equal to the {} value({}) ", + BUCKET_HIGH_KEY, self.bucket_high, BUCKET_LOW_KEY, self.bucket_low + ), + } + ); + + Ok(()) + } + + fn fill_raw_map(&self, m: &mut HashMap) { + m.insert(BUCKET_LOW_KEY.to_string(), format!("{}", self.bucket_low)); + m.insert(BUCKET_HIGH_KEY.to_string(), format!("{}", self.bucket_high)); + m.insert( + MIN_SSTABLE_SIZE_KEY.to_string(), + format!("{}", self.min_sstable_size.0), + ); + m.insert( + MAX_THRESHOLD_KEY.to_string(), + format!("{}", self.max_threshold), + ); + m.insert( + MIN_THRESHOLD_KEY.to_string(), + format!("{}", self.min_threshold), + ); + } + + pub(crate) fn parse_from( + options: &HashMap, + ) -> Result { + let mut opts = SizeTieredCompactionOptions::default(); + if let Some(v) = options.get(BUCKET_LOW_KEY) { + opts.bucket_low = v.parse().context(ParseFloat { + key: BUCKET_HIGH_KEY, + value: v, + })?; + } + if let Some(v) = options.get(BUCKET_HIGH_KEY) { + opts.bucket_high = v.parse().context(ParseFloat { + key: BUCKET_HIGH_KEY, + value: v, 
+ })?; + } + if let Some(v) = options.get(MIN_SSTABLE_SIZE_KEY) { + opts.min_sstable_size = v.parse::().map_err(|err| Error::ParseSize { + key: MIN_SSTABLE_SIZE_KEY.to_string(), + value: v.to_string(), + error: err, + backtrace: Backtrace::generate(), + })?; + } + if let Some(v) = options.get(MAX_THRESHOLD_KEY) { + opts.max_threshold = v.parse().context(ParseInt { + key: MAX_THRESHOLD_KEY, + value: v, + })?; + } + if let Some(v) = options.get(MIN_THRESHOLD_KEY) { + opts.min_threshold = v.parse().context(ParseInt { + key: MIN_THRESHOLD_KEY, + value: v, + })?; + } + + opts.validate()?; + + Ok(opts) + } +} + +impl TimeWindowCompactionOptions { + /// TODO(boyan) In fact right now we only supports TimeUnit::Milliseconds + /// resolution. + fn valid_timestamp_unit(unit: TimeUnit) -> bool { + matches!( + unit, + TimeUnit::Seconds + | TimeUnit::Milliseconds + | TimeUnit::Microseconds + | TimeUnit::Nanoseconds + ) + } + + fn fill_raw_map(&self, m: &mut HashMap) { + self.size_tiered.fill_raw_map(m); + + m.insert( + TIMESTAMP_RESOLUTION_KEY.to_string(), + format!("{}", self.timestamp_resolution), + ); + } + + pub(crate) fn validate(&self) -> Result<(), Error> { + if !Self::valid_timestamp_unit(self.timestamp_resolution) { + return InvalidOption { + error: format!( + "{:?} is not valid for {}) ", + self.timestamp_resolution, TIMESTAMP_RESOLUTION_KEY + ), + } + .fail(); + } + + Ok(()) + } + + pub(crate) fn parse_from( + options: &HashMap, + ) -> Result { + let mut opts = TimeWindowCompactionOptions { + size_tiered: SizeTieredCompactionOptions::parse_from(options)?, + ..Default::default() + }; + + if let Some(v) = options.get(TIMESTAMP_RESOLUTION_KEY) { + opts.timestamp_resolution = + v.parse::().map_err(|err| Error::ParseTimeUnit { + key: TIMESTAMP_RESOLUTION_KEY.to_string(), + value: v.to_string(), + error: err, + backtrace: Backtrace::generate(), + })?; + } + + opts.validate()?; + + Ok(opts) + } +} + +#[derive(Debug, Clone)] +pub struct CompactionInputFiles { + /// Level of the files to be compacted. + pub level: Level, + /// Files to be compacted. + pub files: Vec, + /// The output level of the merged file. + pub output_level: Level, +} + +#[derive(Default, Clone)] +pub struct ExpiredFiles { + /// Level of the expired files. + pub level: Level, + /// Expired files. 
+ pub files: Vec, +} + +#[derive(Default, Clone)] +pub struct CompactionTask { + pub compaction_inputs: Vec, + pub expired: Vec, +} + +impl CompactionTask { + pub fn mark_files_being_compacted(&self, being_compacted: bool) { + for input in &self.compaction_inputs { + for file in &input.files { + file.set_being_compacted(being_compacted); + } + } + for expired in &self.expired { + for file in &expired.files { + file.set_being_compacted(being_compacted); + } + } + } +} + +pub struct PickerManager { + default_picker: CompactionPickerRef, + time_window_picker: CompactionPickerRef, + size_tiered_picker: CompactionPickerRef, +} + +impl Default for PickerManager { + fn default() -> Self { + let size_tiered_picker = Arc::new(CommonCompactionPicker::new( + CompactionStrategy::SizeTiered(SizeTieredCompactionOptions::default()), + )); + let time_window_picker = Arc::new(CommonCompactionPicker::new( + CompactionStrategy::TimeWindow(TimeWindowCompactionOptions::default()), + )); + + Self { + default_picker: time_window_picker.clone(), + size_tiered_picker, + time_window_picker, + } + } +} + +impl PickerManager { + pub fn get_picker(&self, strategy: CompactionStrategy) -> CompactionPickerRef { + match strategy { + CompactionStrategy::Default => self.default_picker.clone(), + CompactionStrategy::SizeTiered(_) => self.size_tiered_picker.clone(), + CompactionStrategy::TimeWindow(_) => self.time_window_picker.clone(), + } + } +} + +#[derive(Debug, Snafu)] +pub enum WaitError { + #[snafu(display("The compaction is canceled"))] + Canceled, + + #[snafu(display("Failed to compact, err:{}", source))] + Compaction { + source: Arc, + }, +} + +pub type WaitResult = std::result::Result; + +pub struct WaiterNotifier { + waiter: Option>>, +} + +impl WaiterNotifier { + pub fn new(waiter: Option>>) -> Self { + Self { waiter } + } + + pub fn notify_wait_result(mut self, res: WaitResult<()>) { + // Ignore error if failed to send result. + if let Some(waiter) = self.waiter.take() { + let _ = waiter.send(res); + } + } +} + +impl Drop for WaiterNotifier { + fn drop(&mut self) { + if let Some(waiter) = self.waiter.take() { + // The compaction result hasn't been sent before the notifier dropped, we + // send a canceled error to waiter. + let _ = waiter.send(Canceled.fail()); + } + } +} + +/// Request to compact single table. 
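+///
+/// A minimal construction sketch (assuming a `TableDataRef` and a
+/// `CompactionNotifier` from the write worker are already at hand):
+///
+/// ```ignore
+/// let request = TableCompactionRequest::no_waiter(table_data, notifier);
+/// ```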
+pub struct TableCompactionRequest { + pub table_data: TableDataRef, + pub compaction_notifier: CompactionNotifier, + pub waiter: Option>>, +} + +impl TableCompactionRequest { + pub fn no_waiter(table_data: TableDataRef, compaction_notifier: CompactionNotifier) -> Self { + TableCompactionRequest { + table_data, + compaction_notifier, + waiter: None, + } + } +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + + use super::*; + + #[test] + fn test_fill_raw_map_then_parse() { + let c = CompactionStrategy::Default; + let mut m = HashMap::new(); + c.fill_raw_map(&mut m); + assert_eq!(1, m.len()); + assert_eq!(m[COMPACTION_STRATEGY], "default"); + assert_eq!(c, CompactionStrategy::parse_from("default", &m).unwrap()); + + let opts = SizeTieredCompactionOptions { + bucket_low: 0.1, + min_sstable_size: ReadableSize(1024), + max_threshold: 10, + ..Default::default() + }; + + let c = CompactionStrategy::SizeTiered(opts); + let mut m = HashMap::new(); + c.fill_raw_map(&mut m); + assert_eq!(6, m.len()); + assert_eq!(m[COMPACTION_STRATEGY], "size_tiered"); + assert_eq!(m[BUCKET_LOW_KEY], "0.1"); + assert_eq!(m[BUCKET_HIGH_KEY], "1.5"); + assert_eq!(m[MIN_SSTABLE_SIZE_KEY], "1024"); + assert_eq!(m[MIN_THRESHOLD_KEY], "4"); + assert_eq!(m[MAX_THRESHOLD_KEY], "10"); + assert_eq!( + c, + CompactionStrategy::parse_from("size_tiered", &m).unwrap() + ); + + let twc_opts = TimeWindowCompactionOptions { + size_tiered: opts, + ..Default::default() + }; + let c = CompactionStrategy::TimeWindow(twc_opts); + let mut m = HashMap::new(); + c.fill_raw_map(&mut m); + + assert_eq!(7, m.len()); + assert_eq!(m[COMPACTION_STRATEGY], "time_window"); + assert_eq!(m[BUCKET_LOW_KEY], "0.1"); + assert_eq!(m[BUCKET_HIGH_KEY], "1.5"); + assert_eq!(m[MIN_SSTABLE_SIZE_KEY], "1024"); + assert_eq!(m[MIN_THRESHOLD_KEY], "4"); + assert_eq!(m[MAX_THRESHOLD_KEY], "10"); + assert_eq!(m[TIMESTAMP_RESOLUTION_KEY], "milliseconds"); + assert_eq!( + c, + CompactionStrategy::parse_from("time_window", &m).unwrap() + ); + } +} diff --git a/analytic_engine/src/compaction/picker.rs b/analytic_engine/src/compaction/picker.rs new file mode 100644 index 0000000000..5cc9f2afc9 --- /dev/null +++ b/analytic_engine/src/compaction/picker.rs @@ -0,0 +1,740 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Compaction picker. + +use std::{ + collections::{BTreeSet, HashMap}, + sync::Arc, + time::Duration, +}; + +use common_types::time::Timestamp; +use common_util::{config::TimeUnit, define_result}; +use log::{debug, info}; +use snafu::Snafu; + +use crate::{ + compaction::{ + CompactionInputFiles, CompactionStrategy, CompactionTask, SizeTieredCompactionOptions, + TimeWindowCompactionOptions, + }, + sst::{ + file::{FileHandle, Level}, + manager::LevelsController, + }, +}; + +#[derive(Debug, Snafu)] +pub enum Error {} + +define_result!(Error); + +#[derive(Clone)] +pub struct PickerContext { + pub segment_duration: Duration, + /// The ttl of the data in sst. + pub ttl: Option, + pub strategy: CompactionStrategy, +} + +impl PickerContext { + fn size_tiered_opts(&self) -> SizeTieredCompactionOptions { + match self.strategy { + CompactionStrategy::SizeTiered(opts) => opts, + _ => SizeTieredCompactionOptions::default(), + } + } + + fn time_window_opts(&self) -> TimeWindowCompactionOptions { + match self.strategy { + CompactionStrategy::TimeWindow(opts) => opts, + _ => TimeWindowCompactionOptions::default(), + } + } +} + +pub trait CompactionPicker { + /// Pick candidate files for compaction. 
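+    /// according to the strategy carried in the [`PickerContext`]. A call-site
+    /// sketch (assuming `picker`, `ctx` and `levels` are in scope):
+    ///
+    /// ```ignore
+    /// let task = picker.pick_compaction(ctx, &levels)?;
+    /// ```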
+ /// + /// Note: files being compacted should be ignored. + fn pick_compaction( + &self, + ctx: PickerContext, + levels_controller: &LevelsController, + ) -> Result; +} + +pub type CompactionPickerRef = Arc; + +trait LevelPicker { + /// Pick candidate files for compaction at level + fn pick_candidates_at_level( + &self, + ctx: &PickerContext, + levels_controller: &LevelsController, + level: Level, + expire_time: Option, + ) -> Option>; +} + +type LevelPickerRef = Arc; + +pub struct CommonCompactionPicker { + level_picker: LevelPickerRef, +} + +impl CommonCompactionPicker { + pub fn new(strategy: CompactionStrategy) -> Self { + let level_picker: LevelPickerRef = match strategy { + CompactionStrategy::SizeTiered(_) | CompactionStrategy::Default => { + Arc::new(SizeTieredPicker::default()) + } + CompactionStrategy::TimeWindow(_) => Arc::new(TimeWindowPicker::default()), + }; + Self { level_picker } + } + + fn pick_compact_candidates( + &self, + ctx: &PickerContext, + levels_controller: &LevelsController, + expire_time: Option, + ) -> Option { + let num_levels = levels_controller.num_levels(); + //TODO(boyan) level compaction strategy + for level in 0..num_levels { + if let Some(files) = self.level_picker.pick_candidates_at_level( + ctx, + levels_controller, + level, + expire_time, + ) { + return Some(CompactionInputFiles { + level, + files, + // Now, we always output to the same level. + output_level: level, + }); + } + } + + None + } +} + +impl CompactionPicker for CommonCompactionPicker { + fn pick_compaction( + &self, + ctx: PickerContext, + levels_controller: &LevelsController, + ) -> Result { + let expire_time = ctx.ttl.map(Timestamp::expire_time); + let mut compaction_task = CompactionTask { + expired: levels_controller.expired_ssts(expire_time), + ..Default::default() + }; + + if let Some(input_files) = + self.pick_compact_candidates(&ctx, levels_controller, expire_time) + { + info!( + "Compaction strategy: {:?} picker pick files to compact, input_files:{:?}", + ctx.strategy, input_files + ); + + compaction_task.compaction_inputs = vec![input_files]; + } + + Ok(compaction_task) + } +} + +#[inline] +fn find_uncompact_files( + levels_controller: &LevelsController, + level: Level, + expire_time: Option, +) -> Vec { + levels_controller + .iter_ssts_at_level(level) + // Only use files not being compacted and not expired. 
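+        // (Expired files are not lost here: `pick_compaction` collects them
+        // separately into `CompactionTask::expired` via `expired_ssts`.)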
+ .filter(|file| !file.being_compacted() && !file.time_range().is_expired(expire_time)) + .map(Clone::clone) + .collect() +} + +/// Size tiered compaction strategy +/// See https://github.com/jeffjirsa/twcs/blob/master/src/main/java/com/jeffjirsa/cassandra/db/compaction/SizeTieredCompactionStrategy.java +#[derive(Default)] +pub struct SizeTieredPicker {} + +/// Similar size files group +#[derive(Debug)] +struct Bucket { + pub avg_size: usize, + pub files: Vec, +} + +impl Bucket { + fn with_file(file: &FileHandle) -> Self { + Self { + avg_size: file.size() as usize, + files: vec![file.clone()], + } + } + + fn with_files(files: Vec) -> Self { + let total: usize = files.iter().map(|f| f.size() as usize).sum(); + Self { + avg_size: total / files.len(), + files, + } + } + + fn insert_file(&mut self, file: &FileHandle) { + let total_size = self.files.len() * self.avg_size + file.size() as usize; + self.avg_size = total_size / (self.files.len() + 1); + self.files.push(file.clone()); + } + + fn get_hotness_map(&self) -> HashMap { + self.files + .iter() + .map(|f| (f.clone(), Self::hotness(f))) + .collect() + } + + #[inline] + fn hotness(f: &FileHandle) -> f64 { + let row_num = match f.row_num() { + 0 => 1, //prevent NAN hotness + v => v, + }; + f.read_meter().h2_rate() / (row_num as f64) + } +} + +impl LevelPicker for SizeTieredPicker { + fn pick_candidates_at_level( + &self, + ctx: &PickerContext, + levels_controller: &LevelsController, + level: Level, + expire_time: Option, + ) -> Option> { + let files_by_segment = + Self::files_by_segment(levels_controller, level, ctx.segment_duration, expire_time); + if files_by_segment.is_empty() { + return None; + } + + let all_segments: BTreeSet<_> = files_by_segment.keys().collect(); + let opts = ctx.size_tiered_opts(); + + // Iterate the segment in reverse order, so newest segment is examined first. + for (idx, segment_key) in all_segments.iter().rev().enumerate() { + // segment_key should always exist. + if let Some(segment) = files_by_segment.get(segment_key) { + let buckets = Self::get_buckets( + segment.to_vec(), + opts.bucket_high, + opts.bucket_low, + opts.min_sstable_size.as_bytes() as f32, + ); + + let files = + Self::most_interesting_bucket(buckets, opts.min_threshold, opts.max_threshold); + + if files.is_some() { + info!( + "Compact segment, idx: {}, size:{}, segment_key:{:?}, files:{:?}", + idx, + segment.len(), + segment_key, + segment + ); + return files; + } + debug!( + "No compaction necessary for segment, size:{}, segment_key:{:?}, idx:{}", + segment.len(), + segment_key, + idx + ); + } + } + + None + } +} + +impl SizeTieredPicker { + /// Group files of similar size into buckets. 
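+    ///
+    /// A rough worked example with the default options (`bucket_low = 0.5`,
+    /// `bucket_high = 1.5`, `min_sstable_size = 50MB`): 60MB and 80MB files land
+    /// in one bucket (80MB is within 0.5x..1.5x of the running average size),
+    /// while a 200MB file starts a bucket of its own.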
+ fn get_buckets( + mut files: Vec, + bucket_high: f32, + bucket_low: f32, + min_sst_size: f32, + ) -> Vec { + // sort by file length + files.sort_unstable_by_key(FileHandle::size); + + let mut buckets: Vec = Vec::new(); + 'outer: for sst in &files { + let size = sst.size() as f32; + // look for a bucket containing similar-sized files: + // group in the same bucket if it's w/in 50% of the average for this bucket, + // or this file and the bucket are all considered "small" (less than + // `min_sst_size`) + for bucket in buckets.iter_mut() { + let old_avg_size = bucket.avg_size as f32; + if (size > (old_avg_size * bucket_low) && size < (old_avg_size * bucket_high)) + || (size < min_sst_size && old_avg_size < min_sst_size) + { + // find a similar file, insert it into bucket + bucket.insert_file(sst); + continue 'outer; + } + } + + // no similar bucket found + // put it in a new bucket + buckets.push(Bucket::with_file(sst)); + } + + debug!("Group files of similar size into buckets: {:?}", buckets); + + buckets + } + + fn most_interesting_bucket( + buckets: Vec, + min_threshold: usize, + max_threshold: usize, + ) -> Option> { + let mut pruned_bucket_and_hotness = Vec::with_capacity(buckets.len()); + // skip buckets containing less than min_threshold sstables, + // and limit other buckets to max_threshold sstables + for bucket in buckets { + let (bucket, hotness) = Self::trim_to_threshold_with_hotness(bucket, max_threshold); + if bucket.files.len() >= min_threshold { + pruned_bucket_and_hotness.push((bucket, hotness)); + } + } + + if pruned_bucket_and_hotness.is_empty() { + return None; + } + + // Find the hotest bucket + if let Some((bucket, hotness)) = + pruned_bucket_and_hotness + .into_iter() + .max_by(|(b1, h1), (b2, h2)| { + let c = h1.partial_cmp(h2).unwrap(); + if !c.is_eq() { + return c; + } + //TODO(boyan), compacting smallest sstables first? + b1.avg_size.cmp(&b2.avg_size) + }) + { + debug!( + "Find the hotest bucket, hotness: {}, bucket: {:?}", + hotness, bucket + ); + Some(bucket.files) + } else { + None + } + } + + fn files_by_segment( + levels_controller: &LevelsController, + level: Level, + segment_duration: Duration, + expire_time: Option, + ) -> HashMap> { + let mut files_by_segment = HashMap::new(); + let uncompact_files = find_uncompact_files(levels_controller, level, expire_time); + for file in uncompact_files { + // We use the end time of the range to calculate segment. + let segment = file + .time_range() + .exclusive_end() + .truncate_by(segment_duration); + let files = files_by_segment.entry(segment).or_insert_with(Vec::new); + files.push(file); + } + + files_by_segment + } + + fn trim_to_threshold_with_hotness(bucket: Bucket, max_threshold: usize) -> (Bucket, f64) { + let hotness_snapshot = bucket.get_hotness_map(); + + // Sort by sstable hotness (descending). 
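+        // `partial_cmp(..).unwrap()` below relies on hotness never being NaN;
+        // `Bucket::hotness` guards this by treating a zero row count as 1.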
+ let mut sorted_files = bucket.files.to_vec(); + sorted_files.sort_unstable_by(|f1, f2| { + hotness_snapshot[f1] + .partial_cmp(&hotness_snapshot[f2]) + .unwrap() + .reverse() + }); + + // and then trim the coldest sstables off the end to meet the max_threshold + let len = sorted_files.len(); + let pruned_bucket: Vec = sorted_files + .into_iter() + .take(std::cmp::min(max_threshold, len)) + .collect(); + + // bucket hotness is the sum of the hotness of all sstable members + let bucket_hotness = pruned_bucket.iter().map(Bucket::hotness).sum(); + + (Bucket::with_files(pruned_bucket), bucket_hotness) + } +} + +/// Time window compaction strategy +/// See https://github.com/jeffjirsa/twcs/blob/master/src/main/java/com/jeffjirsa/cassandra/db/compaction/TimeWindowCompactionStrategy.java +#[derive(Default)] +pub struct TimeWindowPicker {} + +impl TimeWindowPicker { + fn get_window_bounds_in_millis(window: &Duration, ts: i64) -> (i64, i64) { + let ts_secs = ts / 1000; + + let size = window.as_secs() as i64; + + let lower = ts_secs - (ts_secs % size); + let upper = lower + size - 1; + + (lower * 1000, upper * 1000) + } + + #[inline] + fn resolve_timetamp(ts: i64, timestamp_resolution: TimeUnit) -> i64 { + match timestamp_resolution { + TimeUnit::Microseconds => ts / 1000, + TimeUnit::Nanoseconds => ts / 1000000, + TimeUnit::Seconds => ts * 1000, + TimeUnit::Milliseconds => ts, + // the option is validated before, so it won't reach here + _ => unreachable!(), + } + } + + /// Group files of similar timestamp into buckets. + fn get_buckets( + files: &[FileHandle], + window: &Duration, + timestamp_resolution: TimeUnit, + ) -> (HashMap>, i64) { + let mut max_ts = 0i64; + let mut buckets: HashMap> = HashMap::new(); + for f in files { + let ts = f.time_range_ref().exclusive_end().as_i64(); + + let ts = Self::resolve_timetamp(ts, timestamp_resolution); + + let (left, _) = Self::get_window_bounds_in_millis(window, ts); + + let bucket_files = buckets.entry(left).or_insert_with(Vec::new); + + bucket_files.push(f.clone()); + + if left > max_ts { + max_ts = left; + } + } + + debug!( + "Group files of similar timestamp into buckets: {:?}", + buckets + ); + (buckets, max_ts) + } + + fn newest_bucket( + buckets: HashMap>, + size_tiered_opts: SizeTieredCompactionOptions, + now: i64, + ) -> Option> { + // If the current bucket has at least minThreshold SSTables, choose that one. + // For any other bucket, at least 2 SSTables is enough. + // In any case, limit to max_threshold SSTables. 
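        // Editor's note, added for clarity and not part of the original patch:
        // concretely, the newest window only becomes a candidate once it holds
        // at least `min_threshold` SSTs (4 in the tests below) and is then
        // pruned by the size-tiered picker, while any older window is compacted
        // as soon as it holds 2 SSTs, trimmed to at most `max_threshold` files.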
+ + let all_keys: BTreeSet<_> = buckets.keys().collect(); + + for key in all_keys.into_iter().rev() { + if let Some(bucket) = buckets.get(key) { + debug!("Key {}, now {}", key, now); + + if bucket.len() >= size_tiered_opts.min_threshold && *key >= now { + // If we're in the newest bucket, we'll use STCS to prioritize sstables + let buckets = SizeTieredPicker::get_buckets( + bucket.to_vec(), + size_tiered_opts.bucket_high, + size_tiered_opts.bucket_low, + size_tiered_opts.min_sstable_size.as_bytes() as f32, + ); + let files = SizeTieredPicker::most_interesting_bucket( + buckets, + size_tiered_opts.min_threshold, + size_tiered_opts.max_threshold, + ); + + if files.is_some() { + return files; + } + } else if bucket.len() >= 2 && *key < now { + debug!("Bucket size {} >= 2 and not in current bucket, compacting what's here: {:?}", bucket.len(), bucket); + return Some(Self::trim_to_threshold( + bucket, + size_tiered_opts.max_threshold, + )); + } else { + debug!( + "No compaction necessary for bucket size {} , key {}, now {}", + bucket.len(), + key, + now + ); + } + } + } + + None + } + + fn trim_to_threshold(files: &[FileHandle], max_threshold: usize) -> Vec { + // Sort by sstable file size + let mut sorted_files = files.to_vec(); + sorted_files.sort_unstable_by_key(FileHandle::size); + + // Trim the largest sstables off the end to meet the maxThreshold + let len = sorted_files.len(); + sorted_files + .into_iter() + .take(std::cmp::min(max_threshold, len)) + .collect() + } + + /// Get current window timestamp, the caller MUST ensure the level has ssts, + /// panic otherwise. + fn get_current_window( + levels_controller: &LevelsController, + level: Level, + window: &Duration, + timestamp_resolution: TimeUnit, + ) -> i64 { + // always find the latest sst here + let now = levels_controller + .latest_sst(level) + .unwrap() + .time_range() + .exclusive_end() + .as_i64(); + let now = Self::resolve_timetamp(now, timestamp_resolution); + Self::get_window_bounds_in_millis(window, now).0 + } +} + +impl LevelPicker for TimeWindowPicker { + fn pick_candidates_at_level( + &self, + ctx: &PickerContext, + levels_controller: &LevelsController, + level: Level, + expire_time: Option, + ) -> Option> { + let uncompact_files = find_uncompact_files(levels_controller, level, expire_time); + + if uncompact_files.is_empty() { + return None; + } + + let opts = ctx.time_window_opts(); + + debug!("TWCS compaction options: {:?}", opts); + + let (buckets, ts) = Self::get_buckets( + &uncompact_files, + &ctx.segment_duration, + opts.timestamp_resolution, + ); + + let now = Self::get_current_window( + levels_controller, + level, + &ctx.segment_duration, + opts.timestamp_resolution, + ); + debug!("now {}, max_ts: {}", now, ts); + assert!(now >= ts); + + Self::newest_bucket(buckets, opts.size_tiered, now) + } +} + +#[cfg(test)] +mod tests { + use std::time::Duration; + + use common_types::{ + bytes::Bytes, + tests::build_schema, + time::{TimeRange, Timestamp}, + }; + + use crate::{ + compaction::{picker::PickerContext, CompactionStrategy, PickerManager}, + sst::{ + file::SstMetaData, + manager::{tests::LevelsControllerMockBuilder, LevelsController}, + }, + }; + + fn build_sst_meta_data(time_range: TimeRange, size: u64) -> SstMetaData { + SstMetaData { + min_key: Bytes::from_static(b"100"), + max_key: Bytes::from_static(b"200"), + time_range, + max_sequence: 200, + schema: build_schema(), + size, + row_num: 2, + } + } + + // testcase 0: file buckets: old bucket:[0,1] newest bucket:[2], expired:[3] + fn build_old_bucket_case(now: 
i64) -> LevelsController { + let builder = LevelsControllerMockBuilder::default(); + let sst_meta_vec = vec![ + build_sst_meta_data( + TimeRange::new_unchecked(Timestamp::new(now - 14000), Timestamp::new(now - 13000)), + 2, + ), + build_sst_meta_data( + TimeRange::new_unchecked(Timestamp::new(now - 14000), Timestamp::new(now - 13000)), + 2, + ), + build_sst_meta_data( + TimeRange::new_unchecked(Timestamp::new(now - 4000), Timestamp::new(now - 3000)), + 2, + ), + build_sst_meta_data( + TimeRange::new_unchecked(Timestamp::new(100), Timestamp::new(200)), + 2, + ), + ]; + builder.add_sst(sst_meta_vec).build() + } + + // testcase 1: file buckets: old bucket:[0,1] newest bucket:[2,3,4,5] + // default min_threshold=4 + fn build_newest_bucket_case(now: i64) -> LevelsController { + let builder = LevelsControllerMockBuilder::default(); + let sst_meta_vec = vec![ + build_sst_meta_data( + TimeRange::new_unchecked(Timestamp::new(now - 14000), Timestamp::new(now - 13000)), + 2, + ), + build_sst_meta_data( + TimeRange::new_unchecked(Timestamp::new(now - 14000), Timestamp::new(now - 13000)), + 2, + ), + build_sst_meta_data( + TimeRange::new_unchecked(Timestamp::new(now - 4000), Timestamp::new(now - 3000)), + 2, + ), + build_sst_meta_data( + TimeRange::new_unchecked(Timestamp::new(now - 4000), Timestamp::new(now - 3000)), + 2, + ), + build_sst_meta_data( + TimeRange::new_unchecked(Timestamp::new(now - 4000), Timestamp::new(now - 3000)), + 2, + ), + build_sst_meta_data( + TimeRange::new_unchecked(Timestamp::new(now - 4000), Timestamp::new(now - 3000)), + 2, + ), + ]; + builder.add_sst(sst_meta_vec).build() + } + + // testcase 2: file buckets: old bucket:[0] newest bucket:[1,2,3] + // default min_threshold=4 + fn build_newest_bucket_no_match_case(now: i64) -> LevelsController { + let builder = LevelsControllerMockBuilder::default(); + let sst_meta_vec = vec![ + build_sst_meta_data( + TimeRange::new_unchecked(Timestamp::new(now - 14000), Timestamp::new(now - 13000)), + 2, + ), + build_sst_meta_data( + TimeRange::new_unchecked(Timestamp::new(now - 4000), Timestamp::new(now - 3000)), + 2, + ), + build_sst_meta_data( + TimeRange::new_unchecked(Timestamp::new(now - 4000), Timestamp::new(now - 3000)), + 2, + ), + build_sst_meta_data( + TimeRange::new_unchecked(Timestamp::new(now - 4000), Timestamp::new(now - 3000)), + 2, + ), + ]; + builder.add_sst(sst_meta_vec).build() + } + + #[test] + fn test_time_window_picker() { + let picker_manager = PickerManager::default(); + let twp = picker_manager.get_picker(CompactionStrategy::Default); + let mut ctx = PickerContext { + segment_duration: Duration::from_millis(1000), + ttl: Some(Duration::from_secs(100000)), + strategy: CompactionStrategy::Default, + }; + let now = Timestamp::now(); + { + let lc = build_old_bucket_case(now.as_i64()); + let task = twp.pick_compaction(ctx.clone(), &lc).unwrap(); + assert_eq!(task.compaction_inputs[0].files.len(), 2); + assert_eq!(task.compaction_inputs[0].files[0].id(), 0); + assert_eq!(task.compaction_inputs[0].files[1].id(), 1); + assert_eq!(task.expired[0].files.len(), 1); + assert_eq!(task.expired[0].files[0].id(), 3); + } + + { + let lc = build_newest_bucket_case(now.as_i64()); + let task = twp.pick_compaction(ctx.clone(), &lc).unwrap(); + assert_eq!(task.compaction_inputs[0].files.len(), 4); + assert_eq!(task.compaction_inputs[0].files[0].id(), 2); + assert_eq!(task.compaction_inputs[0].files[1].id(), 3); + assert_eq!(task.compaction_inputs[0].files[2].id(), 4); + assert_eq!(task.compaction_inputs[0].files[3].id(), 5); + } + + { + 
let lc = build_newest_bucket_no_match_case(now.as_i64()); + let task = twp.pick_compaction(ctx.clone(), &lc).unwrap(); + assert_eq!(task.compaction_inputs.len(), 0); + } + + // If ttl is None, then no file is expired. + ctx.ttl = None; + { + let lc = build_old_bucket_case(now.as_i64()); + let task = twp.pick_compaction(ctx, &lc).unwrap(); + assert_eq!(task.compaction_inputs[0].files.len(), 2); + assert_eq!(task.compaction_inputs[0].files[0].id(), 0); + assert_eq!(task.compaction_inputs[0].files[1].id(), 1); + assert!(task.expired[0].files.is_empty()); + } + } +} diff --git a/analytic_engine/src/compaction/scheduler.rs b/analytic_engine/src/compaction/scheduler.rs new file mode 100644 index 0000000000..d06925d6d2 --- /dev/null +++ b/analytic_engine/src/compaction/scheduler.rs @@ -0,0 +1,595 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +// Compaction scheduler. + +use std::{ + collections::{HashMap, VecDeque}, + hash::Hash, + sync::{ + atomic::{AtomicBool, AtomicUsize, Ordering}, + Arc, RwLock, + }, + time::Duration, +}; + +use async_trait::async_trait; +use common_types::{request_id::RequestId, time::Timestamp}; +use common_util::{ + config::ReadableDuration, + define_result, + runtime::{JoinHandle, Runtime}, +}; +use log::{debug, error, info, warn}; +use object_store::ObjectStore; +use serde_derive::Deserialize; +use snafu::{ResultExt, Snafu}; +use table_engine::table::TableId; +use tokio::{ + sync::{ + mpsc::{self, Receiver, Sender}, + Mutex, + }, + time, +}; + +use crate::{ + compaction::{ + metrics::COMPACTION_PENDING_REQUEST_GAUGE, picker::PickerContext, CompactionTask, + PickerManager, TableCompactionRequest, WaitError, WaiterNotifier, + }, + instance::SpaceStore, + meta::Manifest, + sst::factory::Factory, + table::data::TableDataRef, + TableOptions, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to join compaction schedule worker, err:{}", source))] + JoinWorker { source: common_util::runtime::Error }, +} + +define_result!(Error); + +#[derive(Debug, Clone, Deserialize)] +#[serde(default)] +pub struct SchedulerConfig { + pub schedule_channel_len: usize, + pub schedule_interval: ReadableDuration, + pub max_ongoing_tasks: usize, +} + +// TODO(boyan), a better default value? +const MAX_GOING_COMPACTION_TASKS: usize = 8; +const MAX_PENDING_COMPACTION_TASKS: usize = 1024; + +impl Default for SchedulerConfig { + fn default() -> Self { + Self { + schedule_channel_len: 16, + // 30 minutes schedule interval. + schedule_interval: ReadableDuration(Duration::from_secs(60 * 30)), + max_ongoing_tasks: MAX_GOING_COMPACTION_TASKS, + } + } +} + +enum ScheduleTask { + Request(TableCompactionRequest), + Schedule, + Exit, +} + +#[async_trait] +pub trait CompactionScheduler { + /// Stop the scheduler. + async fn stop_scheduler(&self) -> Result<()>; + + /// Schedule a compaction job to background workers. + async fn schedule_table_compaction(&self, request: TableCompactionRequest); +} + +// A FIFO queue that remove duplicate values by key. 
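
Editor's note: a brief usage sketch of the queue defined below, not part of the original patch; it assumes the generic signature `RequestQueue<K: Eq + Hash + Clone, V>` that the rendered diff elides. Re-pushing an existing key replaces the stored value but keeps the key's original position, which is how a newer compaction request for a table supersedes a pending one without letting that table jump the queue.

let mut q: RequestQueue<u32, &str> = RequestQueue::default();
assert!(q.push_back(1, "first request for table 1")); // new key, queued at the back
assert!(q.push_back(2, "request for table 2"));
assert!(!q.push_back(1, "newer request for table 1")); // value replaced, position kept
assert_eq!(q.len(), 2);
assert_eq!(q.pop_front(), Some("newer request for table 1"));
assert_eq!(q.pop_front(), Some("request for table 2"));
assert!(q.pop_front().is_none());
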
+struct RequestQueue { + keys: VecDeque, + values: HashMap, +} + +impl Default for RequestQueue { + fn default() -> Self { + Self { + keys: VecDeque::default(), + values: HashMap::default(), + } + } +} + +impl RequestQueue { + fn push_back(&mut self, key: K, value: V) -> bool { + if self.values.insert(key.clone(), value).is_none() { + self.keys.push_back(key); + return true; + } + false + } + + fn pop_front(&mut self) -> Option { + if let Some(key) = self.keys.pop_front() { + return self.values.remove(&key); + } + None + } + + #[inline] + fn len(&self) -> usize { + self.values.len() + } + + #[inline] + fn is_empty(&self) -> bool { + self.values.is_empty() + } +} + +type RequestBuf = RwLock>; + +struct OngoingTaskLimit { + ongoing_tasks: AtomicUsize, + /// Buffer to hold pending requests + request_buf: RequestBuf, +} + +impl OngoingTaskLimit { + #[inline] + fn start_task(&self) { + self.ongoing_tasks.fetch_add(1, Ordering::SeqCst); + } + + #[inline] + fn finish_task(&self) { + self.ongoing_tasks.fetch_sub(1, Ordering::SeqCst); + } + + #[inline] + fn add_request(&self, request: TableCompactionRequest) { + let mut dropped = 0; + + { + let mut req_buf = self.request_buf.write().unwrap(); + + // Remove older requests + if req_buf.len() >= MAX_PENDING_COMPACTION_TASKS { + while req_buf.len() >= MAX_PENDING_COMPACTION_TASKS { + req_buf.pop_front(); + dropped += 1; + } + COMPACTION_PENDING_REQUEST_GAUGE.sub(dropped) + } + + if req_buf.push_back(request.table_data.id, request) { + COMPACTION_PENDING_REQUEST_GAUGE.add(1) + } + } + + if dropped > 0 { + warn!( + "Too many compaction pending tasks, limit: {}, dropped {} older tasks.", + MAX_PENDING_COMPACTION_TASKS, dropped, + ); + } + } + + fn drain_requests(&self, max_num: usize) -> Vec { + let mut result = Vec::with_capacity(max_num); + let mut req_buf = self.request_buf.write().unwrap(); + + while result.len() < max_num { + if let Some(req) = req_buf.pop_front() { + result.push(req); + } else { + break; + } + } + COMPACTION_PENDING_REQUEST_GAUGE.sub(result.len() as i64); + + result + } + + #[inline] + fn has_pending_requests(&self) -> bool { + !self.request_buf.read().unwrap().is_empty() + } + + #[inline] + fn request_buf_len(&self) -> usize { + self.request_buf.read().unwrap().len() + } + + #[inline] + fn ongoing_tasks(&self) -> usize { + self.ongoing_tasks.load(Ordering::SeqCst) + } +} + +pub type CompactionSchedulerRef = Arc; + +pub struct SchedulerImpl { + sender: Sender, + running: Arc, + handle: Mutex>, +} + +impl SchedulerImpl { + pub fn new< + Wal: Send + Sync + 'static, + Meta: Manifest + Send + Sync + 'static, + Store: ObjectStore + Send + Sync + 'static, + Fa: Factory + Send + Sync + 'static, + >( + space_store: Arc>, + runtime: Arc, + config: SchedulerConfig, + ) -> Self { + let (tx, rx) = mpsc::channel(config.schedule_channel_len); + let running = Arc::new(AtomicBool::new(true)); + + let mut worker = ScheduleWorker { + sender: tx.clone(), + receiver: rx, + space_store, + runtime: runtime.clone(), + schedule_interval: config.schedule_interval.0, + picker_manager: PickerManager::default(), + tables_buf: Vec::new(), + max_ongoing_tasks: config.max_ongoing_tasks, + limit: Arc::new(OngoingTaskLimit { + ongoing_tasks: AtomicUsize::new(0), + request_buf: RwLock::new(RequestQueue::default()), + }), + running: running.clone(), + }; + + let handle = runtime.spawn(async move { + worker.schedule_loop().await; + }); + + Self { + sender: tx, + running, + handle: Mutex::new(handle), + } + } +} + +#[async_trait] +impl CompactionScheduler for 
SchedulerImpl { + async fn stop_scheduler(&self) -> Result<()> { + self.running.store(false, Ordering::Relaxed); + // Wake up the receiver, if the channel is full, the worker should be busy and + // check the running flag later. + let _ = self.sender.try_send(ScheduleTask::Exit); + + let mut handle = self.handle.lock().await; + (&mut *handle).await.context(JoinWorker)?; + + Ok(()) + } + + async fn schedule_table_compaction(&self, request: TableCompactionRequest) { + let send_res = self.sender.send(ScheduleTask::Request(request)).await; + + if let Err(e) = send_res { + error!("Compaction scheduler failed to send request, err:{}", e); + } + } +} + +struct OngoingTask { + limit: Arc, + sender: Sender, +} + +impl OngoingTask { + async fn schedule_worker_if_need(&self) { + if self.limit.has_pending_requests() { + if let Err(e) = self.sender.send(ScheduleTask::Schedule).await { + error!("Fail to schedule worker, err:{}", e); + } + } + } +} + +struct ScheduleWorker { + sender: Sender, + receiver: Receiver, + space_store: Arc>, + runtime: Arc, + schedule_interval: Duration, + picker_manager: PickerManager, + /// Buffer to hold all tables. + tables_buf: Vec, + max_ongoing_tasks: usize, + limit: Arc, + running: Arc, +} + +#[inline] +async fn schedule_table_compaction(sender: Sender, request: TableCompactionRequest) { + if let Err(e) = sender.send(ScheduleTask::Request(request)).await { + error!("Fail to send table compaction request, err:{}", e); + } +} + +impl< + Wal: Send + Sync + 'static, + Meta: Manifest + Send + Sync + 'static, + Store: ObjectStore + Send + Sync + 'static, + Fa: Factory + Send + Sync + 'static, + > ScheduleWorker +{ + async fn schedule_loop(&mut self) { + while self.running.load(Ordering::Relaxed) { + // TODO(yingwen): Maybe add a random offset to the interval. + match time::timeout(self.schedule_interval, self.receiver.recv()).await { + Ok(Some(schedule_task)) => { + self.handle_schedule_task(schedule_task).await; + } + Ok(None) => { + // The channel is disconnected. + info!("Channel disconnected, compaction schedule worker exit"); + break; + } + Err(_) => { + // Timeout. + info!("Periodical compaction schedule start"); + + self.full_ttl_purge(); + + info!("Periodical compaction schedule end"); + } + } + } + + info!("Compaction schedule loop exit"); + } + + // This function is called seqentially, so we can mark files in compaction + // without racy. 
+ async fn handle_schedule_task(&self, schedule_task: ScheduleTask) { + let ongoing = self.limit.ongoing_tasks(); + match schedule_task { + ScheduleTask::Request(compact_req) => { + debug!("Ongoing compaction tasks:{}", ongoing); + if ongoing >= self.max_ongoing_tasks { + self.limit.add_request(compact_req); + warn!( + "Too many compaction ongoing tasks:{}, max:{}, buf_len:{}", + ongoing, + self.max_ongoing_tasks, + self.limit.request_buf_len() + ); + } else { + self.do_table_compaction_request(compact_req).await; + } + } + ScheduleTask::Schedule => { + if self.max_ongoing_tasks > ongoing { + let pending = self.limit.drain_requests(self.max_ongoing_tasks - ongoing); + let len = pending.len(); + for compact_req in pending { + self.do_table_compaction_request(compact_req).await; + } + debug!("Scheduled {} pending compaction tasks.", len); + } + } + ScheduleTask::Exit => (), + }; + } + + async fn do_table_compaction_request(&self, compact_req: TableCompactionRequest) { + let table_data = compact_req.table_data; + let compaction_notifier = compact_req.compaction_notifier; + let waiter_notifier = WaiterNotifier::new(compact_req.waiter); + + let table_options = table_data.table_options(); + let compaction_strategy = table_options.compaction_strategy; + let picker = self.picker_manager.get_picker(compaction_strategy); + let picker_ctx = match new_picker_context(&*table_options) { + Some(v) => v, + None => { + warn!("No valid context can be created, compaction request will be ignored, table_id:{}, table_name:{}", + table_data.id, table_data.name); + return; + } + }; + let version = table_data.current_version(); + + // Pick compaction task. + let compaction_task = version.pick_for_compaction(picker_ctx, &picker); + let compaction_task = match compaction_task { + Ok(v) => v, + Err(e) => { + error!( + "Compaction scheduler failed to pick compaction, table:{}, table_id:{}, err:{}", + table_data.name, table_data.id, e + ); + // Now the error of picking compaction is considered not fatal and not sent to + // compaction notifier. + return; + } + }; + + // Mark files are in compaction. + compaction_task.mark_files_being_compacted(true); + + let keep_scheduling_compaction = !compaction_task.compaction_inputs.is_empty(); + + let runtime = self.runtime.clone(); + let space_store = self.space_store.clone(); + self.limit.start_task(); + let task = OngoingTask { + sender: self.sender.clone(), + limit: self.limit.clone(), + }; + + let sender = self.sender.clone(); + let request_id = RequestId::next_id(); + // Do actual costly compact job in background. + self.runtime.spawn(async move { + let res = space_store + .compact_table(runtime, &table_data, request_id, &compaction_task) + .await; + + if let Err(e) = &res { + // Compaction is failed, we need to unset the compaction mark. + compaction_task.mark_files_being_compacted(false); + + error!( + "Failed to compact table, table_name:{}, table_id:{}, request_id:{}, err:{}", + table_data.name, table_data.id, request_id, e + ); + } + + task.limit.finish_task(); + task.schedule_worker_if_need().await; + + // Notify the background compact table result. 
+ match res { + Ok(()) => { + let new_compaction_notifier = compaction_notifier.clone(); + compaction_notifier.notify_ok(); + waiter_notifier.notify_wait_result(Ok(())); + + if keep_scheduling_compaction { + schedule_table_compaction( + sender, + TableCompactionRequest::no_waiter( + table_data.clone(), + new_compaction_notifier, + ), + ) + .await; + } + } + Err(e) => { + let e = Arc::new(e); + compaction_notifier.notify_err(e.clone()); + let wait_err = WaitError::Compaction { source: e }; + waiter_notifier.notify_wait_result(Err(wait_err)); + } + } + }); + } + + fn full_ttl_purge(&mut self) { + self.tables_buf.clear(); + self.space_store.list_all_tables(&mut self.tables_buf); + + let mut to_purge = Vec::new(); + + let now = Timestamp::now(); + for table_data in &self.tables_buf { + let expire_time = table_data + .table_options() + .ttl() + .map(|ttl| now.sub_duration_or_min(ttl.0)); + + let version = table_data.current_version(); + if !version.has_expired_sst(expire_time) { + debug!( + "Table has no expired sst, table:{}, table_id:{}, expire_time:{:?}", + table_data.name, table_data.id, expire_time + ); + + continue; + } + + // Create a compaction task that only purge expired files. + let compaction_task = CompactionTask { + expired: version.expired_ssts(expire_time), + ..Default::default() + }; + + // Marks being compacted. + compaction_task.mark_files_being_compacted(true); + + to_purge.push((table_data.clone(), compaction_task)); + } + + let runtime = self.runtime.clone(); + let space_store = self.space_store.clone(); + let request_id = RequestId::next_id(); + // Spawn a background job to purge ssts and avoid schedule thread blocked. + self.runtime.spawn(async move { + for (table_data, compaction_task) in to_purge { + info!("Period purge expired files, table:{}, table_id:{}, request_id:{}", table_data.name, table_data.id, request_id); + + if let Err(e) = space_store + .compact_table(runtime.clone(), &table_data, request_id, &compaction_task) + .await + { + error!( + "Failed to purge expired files of table, table:{}, table_id:{}, request_id:{}, err:{}", + table_data.name, table_data.id, request_id, e + ); + + // Unset the compaction mark. + compaction_task.mark_files_being_compacted(false); + } + } + }); + } +} + +// If segment duration is None, then no compaction should be triggered, but we +// return a None context instead of panic here. 
+fn new_picker_context(table_opts: &TableOptions) -> Option { + table_opts + .segment_duration() + .map(|segment_duration| PickerContext { + segment_duration, + ttl: table_opts.ttl().map(|ttl| ttl.0), + strategy: table_opts.compaction_strategy, + }) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_request_queue() { + let mut q: RequestQueue = RequestQueue::default(); + assert!(q.is_empty()); + assert_eq!(0, q.len()); + + q.push_back(1, "task1".to_string()); + q.push_back(2, "task2".to_string()); + q.push_back(3, "task3".to_string()); + + assert_eq!(3, q.len()); + assert!(!q.is_empty()); + + assert_eq!("task1", q.pop_front().unwrap()); + assert_eq!("task2", q.pop_front().unwrap()); + assert_eq!("task3", q.pop_front().unwrap()); + assert!(q.pop_front().is_none()); + assert!(q.is_empty()); + + q.push_back(1, "task1".to_string()); + q.push_back(2, "task2".to_string()); + q.push_back(3, "task3".to_string()); + q.push_back(1, "task11".to_string()); + q.push_back(3, "task33".to_string()); + q.push_back(3, "task333".to_string()); + + assert_eq!(3, q.len()); + assert_eq!("task11", q.pop_front().unwrap()); + assert_eq!("task2", q.pop_front().unwrap()); + assert_eq!("task333", q.pop_front().unwrap()); + assert!(q.pop_front().is_none()); + assert!(q.is_empty()); + assert_eq!(0, q.len()); + } +} diff --git a/analytic_engine/src/context.rs b/analytic_engine/src/context.rs new file mode 100644 index 0000000000..60f2ef17c5 --- /dev/null +++ b/analytic_engine/src/context.rs @@ -0,0 +1,38 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Context for instance + +use std::{fmt, sync::Arc}; + +use parquet::{DataCacheRef, MetaCacheRef}; +use table_engine::engine::EngineRuntimes; + +use crate::Config; + +/// Common context for instance +pub struct CommonContext { + pub db_write_buffer_size: usize, + pub space_write_buffer_size: usize, +} + +/// Context for instance open +pub struct OpenContext { + /// Engine config + pub config: Config, + + /// Background job runtime + pub runtimes: Arc, + + /// Sst meta data cache. + pub meta_cache: Option, + /// Sst page cache. + pub data_cache: Option, +} + +impl fmt::Debug for OpenContext { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("OpenContext") + .field("config", &self.config) + .finish() + } +} diff --git a/analytic_engine/src/engine.rs b/analytic_engine/src/engine.rs new file mode 100644 index 0000000000..82e785186b --- /dev/null +++ b/analytic_engine/src/engine.rs @@ -0,0 +1,163 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Implements the TableEngine trait + +use std::sync::Arc; + +use async_trait::async_trait; +use log::info; +use object_store::ObjectStore; +use snafu::ResultExt; +use table_engine::{ + engine::{Close, CreateTableRequest, DropTableRequest, OpenTableRequest, Result, TableEngine}, + table::TableRef, + ANALYTIC_ENGINE_TYPE, +}; +use wal::manager::WalManager; + +use crate::{ + context::CommonContext, instance::InstanceRef, meta::Manifest, space::SpaceName, + sst::factory::Factory, table::TableImpl, +}; + +/// TableEngine implementation +pub struct TableEngineImpl { + /// Instance of the table engine + instance: InstanceRef, +} + +impl Clone for TableEngineImpl { + fn clone(&self) -> Self { + Self { + instance: self.instance.clone(), + } + } +} + +impl< + Wal: WalManager + Send + Sync + 'static, + Meta: Manifest + Send + Sync + 'static, + Store: ObjectStore, + Fa, + > TableEngineImpl +{ + pub fn new(instance: InstanceRef) -> Self { + Self { instance } + } +} + +impl TableEngineImpl { + pub fn instance(&self) -> InstanceRef { + self.instance.clone() + } +} + +impl Drop for TableEngineImpl { + fn drop(&mut self) { + info!("Table engine dropped"); + } +} + +#[async_trait] +impl< + Wal: WalManager + Send + Sync + 'static, + Meta: Manifest + Send + Sync + 'static, + Store: ObjectStore, + Fa: Factory + Send + Sync + 'static, + > TableEngine for TableEngineImpl +{ + fn engine_type(&self) -> &str { + ANALYTIC_ENGINE_TYPE + } + + async fn close(&self) -> Result<()> { + info!("Try to close table engine"); + + // Close the instance. + self.instance + .close() + .await + .map_err(|e| Box::new(e) as _) + .context(Close)?; + + info!("Table engine closed"); + + Ok(()) + } + + async fn create_table(&self, request: CreateTableRequest) -> Result { + let space = build_space_name(&request.catalog_name, &request.schema_name); + + info!( + "Table engine impl create table, space:{}, request:{:?}", + space, request + ); + + let ctx = CommonContext { + db_write_buffer_size: self.instance.db_write_buffer_size, + space_write_buffer_size: self.instance.space_write_buffer_size, + }; + let space_table = self.instance.create_table(&ctx, &space, request).await?; + + let table_impl = Arc::new(TableImpl::new( + space_table, + self.instance.clone(), + ANALYTIC_ENGINE_TYPE.to_string(), + )); + + Ok(table_impl) + } + + async fn drop_table(&self, request: DropTableRequest) -> Result { + let space = build_space_name(&request.catalog_name, &request.schema_name); + + info!( + "Table engine impl drop table, space:{}, request:{:?}", + space, request + ); + + let ctx = CommonContext { + db_write_buffer_size: self.instance.db_write_buffer_size, + space_write_buffer_size: self.instance.space_write_buffer_size, + }; + let dropped = self.instance.drop_table(&ctx, &space, request).await?; + Ok(dropped) + } + + async fn open_table(&self, request: OpenTableRequest) -> Result> { + let space = build_space_name(&request.catalog_name, &request.schema_name); + + info!( + "Table engine impl open table, space:{}, request:{:?}", + space, request + ); + let ctx = CommonContext { + db_write_buffer_size: self.instance.db_write_buffer_size, + space_write_buffer_size: self.instance.space_write_buffer_size, + }; + let space_table = match self + .instance + .find_table(&ctx, &space, &request.table_name)? 
+ { + Some(v) => v, + None => return Ok(None), + }; + + let table_impl = Arc::new(TableImpl::new( + space_table, + self.instance.clone(), + ANALYTIC_ENGINE_TYPE.to_string(), + )); + + Ok(Some(table_impl)) + } +} + +/// Build the space name from catalog and schema +// TODO(yingwen): Should we store the => space mapping in the +// system catalog, then put it in the CreateTableRequest, avoid generating space +// name here +fn build_space_name(catalog: &str, schema: &str) -> SpaceName { + // FIXME(yingwen): Find out a better way to create space name + format!("{}/{}", catalog, schema) +} diff --git a/analytic_engine/src/instance/alter.rs b/analytic_engine/src/instance/alter.rs new file mode 100644 index 0000000000..e7ee9f6c42 --- /dev/null +++ b/analytic_engine/src/instance/alter.rs @@ -0,0 +1,289 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Alter schema logic of instance + +use std::{collections::HashMap, sync::Arc}; + +use common_types::schema::Version; +use common_util::define_result; +use log::info; +use object_store::ObjectStore; +use snafu::{ensure, Backtrace, ResultExt, Snafu}; +use table_engine::table::AlterSchemaRequest; +use tokio::sync::oneshot; +use wal::manager::WalManager; + +use crate::{ + instance::{ + flush_compaction::TableFlushOptions, + write_worker, + write_worker::{AlterOptionsCommand, AlterSchemaCommand, WorkerLocal}, + Instance, + }, + meta::{ + meta_update::{AlterOptionsMeta, AlterSchemaMeta, MetaUpdate}, + Manifest, + }, + space::SpaceAndTable, + sst::factory::Factory, + table::data::TableDataRef, + table_options, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to alter schema, source:{}", source,))] + AlterSchema { source: write_worker::Error }, + + #[snafu(display("Failed to alter options, source:{}", source,))] + AlterOptions { source: write_worker::Error }, + + #[snafu(display( + "Try to update schema to elder version, table:{}, current_version:{}, given_version:{}.\nBacktrace:\n{}", + table, + current_version, + given_version, + backtrace, + ))] + InvalidSchemaVersion { + table: String, + current_version: Version, + given_version: Version, + backtrace: Backtrace, + }, + + #[snafu(display( + "Invalid previous schema version, table:{}, current_version:{}, pre_version:{}.\nBacktrace:\n{}", + table, + current_version, + pre_version, + backtrace, + ))] + InvalidPreVersion { + table: String, + current_version: Version, + pre_version: Version, + backtrace: Backtrace, + }, + + #[snafu(display("Alter schema of a dropped table:{}", table))] + AlterDroppedTable { table: String }, + + #[snafu(display("Failed to flush table, table:{}, err:{}", table, source))] + FlushTable { + table: String, + source: crate::instance::flush_compaction::Error, + }, + + #[snafu(display("Failed to persist alter update, err:{}", source))] + PersistAlter { + source: Box, + }, + + #[snafu(display("Invalid options, table:{}, err:{}", table, source))] + InvalidOptions { + table: String, + source: Box, + }, +} + +define_result!(Error); + +impl< + Wal: WalManager + Send + Sync + 'static, + Meta: Manifest + Send + Sync + 'static, + Store: ObjectStore, + Fa: Factory + Send + Sync + 'static, + > Instance +{ + // Alter schema need to be handled by write worker. 
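
Editor's note: before the concrete implementation below, here is a minimal self-contained sketch of the round-trip that these write-worker entry points share; the names (`Command::BumpVersion`, `worker_loop`) are hypothetical and not the engine's actual types. The caller packs a oneshot sender into a command, hands it to the single worker that owns the table, and awaits the outcome, which lets the worker apply mutations sequentially without extra locking.

use tokio::sync::{mpsc, oneshot};

// Hypothetical command type; the real engine wraps AlterSchemaCommand,
// DropTableCommand and friends in a similar way.
enum Command {
    BumpVersion {
        new_version: u32,
        tx: oneshot::Sender<Result<(), String>>,
    },
}

// One worker owns the table state and handles its commands one at a time.
async fn worker_loop(mut rx: mpsc::Receiver<Command>) {
    let mut version = 0u32;
    while let Some(Command::BumpVersion { new_version, tx }) = rx.recv().await {
        let res = if new_version > version {
            version = new_version;
            Ok(())
        } else {
            Err(format!("version {new_version} is not newer than {version}"))
        };
        let _ = tx.send(res);
    }
}

#[tokio::main]
async fn main() {
    let (cmd_tx, cmd_rx) = mpsc::channel(8);
    tokio::spawn(worker_loop(cmd_rx));

    // Caller side, mirroring alter_schema_of_table: build the command with a
    // oneshot sender, submit it, then await the worker's answer.
    let (tx, rx) = oneshot::channel();
    cmd_tx
        .send(Command::BumpVersion { new_version: 1, tx })
        .await
        .expect("worker alive");
    assert_eq!(rx.await.expect("worker replied"), Ok(()));
}
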
+ pub async fn alter_schema_of_table( + &self, + space_table: &SpaceAndTable, + request: AlterSchemaRequest, + ) -> Result<()> { + info!( + "Instance alter schema, space_table:{:?}, request:{:?}", + space_table, request + ); + + // Create a oneshot channel to send/receive alter schema result. + let (tx, rx) = oneshot::channel(); + let cmd = AlterSchemaCommand { + space_table: space_table.clone(), + request, + tx, + }; + + // Send alter schema request to write worker, actual works done in + // Self::process_alter_schema_command() + write_worker::process_command_in_write_worker( + cmd.into_command(), + space_table.table_data(), + rx, + ) + .await + .context(AlterSchema) + } + + /// Do the actual alter schema job, must called by write worker in write + /// thread sequentially. + pub(crate) async fn process_alter_schema_command( + self: &Arc, + worker_local: &mut WorkerLocal, + space_table: &SpaceAndTable, + request: AlterSchemaRequest, + ) -> Result<()> { + let table_data = space_table.table_data(); + // Validate alter schema request. + self.validate_before_alter(table_data, &request)?; + + let opts = TableFlushOptions { + block_on_write_thread: true, + ..Default::default() + }; + // We are in write thread now and there is no write request being processed, but + // we need to trigger a flush to ensure all wal entries with old schema + // are flushed, so we won't need to handle them during replaying wal. + self.flush_table_in_worker(worker_local, table_data, opts) + .await + .context(FlushTable { + table: &table_data.name, + })?; + + // Now we can persist and update the schema, since this function is called by + // write worker, so there is no other concurrent writer altering the + // schema. + let meta_update = MetaUpdate::AlterSchema(AlterSchemaMeta { + space_id: space_table.space().id, + table_id: table_data.id, + schema: request.schema.clone(), + pre_schema_version: request.pre_schema_version, + }); + self.space_store + .manifest + .store_update(meta_update) + .await + .map_err(|e| Box::new(e) as _) + .context(PersistAlter)?; + + info!( + "Instance update table schema, new_schema:{:?}", + request.schema + ); + + // Update schema in memory. + table_data.set_schema(request.schema); + + Ok(()) + } + + // Most validation should be done by catalog module, so we don't do too much + // duplicate check here, especially the schema compatibility. + fn validate_before_alter( + &self, + table_data: &TableDataRef, + request: &AlterSchemaRequest, + ) -> Result<()> { + ensure!( + !table_data.is_dropped(), + AlterDroppedTable { + table: &table_data.name, + } + ); + + let current_version = table_data.schema_version(); + ensure!( + current_version < request.schema.version(), + InvalidSchemaVersion { + table: &table_data.name, + current_version, + given_version: request.schema.version(), + } + ); + + ensure!( + current_version == request.pre_schema_version, + InvalidPreVersion { + table: &table_data.name, + current_version, + pre_version: request.pre_schema_version, + } + ); + + Ok(()) + } + + pub async fn alter_options_of_table( + &self, + space_table: &SpaceAndTable, + options: HashMap, + ) -> Result<()> { + info!( + "Instance alter options of table, space_table:{:?}, options:{:?}", + space_table, options + ); + + // Create a oneshot channel to send/receive alter options result. 
+ let (tx, rx) = oneshot::channel(); + let cmd = AlterOptionsCommand { + space_table: space_table.clone(), + options, + tx, + }; + + // Send alter options request to write worker, actual works done in + // Self::process_alter_options_command() + write_worker::process_command_in_write_worker( + cmd.into_command(), + space_table.table_data(), + rx, + ) + .await + .context(AlterOptions) + } + + /// Do the actual alter options job, must called by write worker in write + /// thread sequentially. + pub(crate) async fn process_alter_options_command( + self: &Arc, + worker_local: &mut WorkerLocal, + space_table: &SpaceAndTable, + options: HashMap, + ) -> Result<()> { + let table_data = space_table.table_data(); + let current_table_options = table_data.table_options(); + info!( + "Instance alter options, space:{:?}, tables:{:?}, old_table_opts:{:?}, options:{:?}", + space_table.space().name, + space_table.table_data().name, + current_table_options, + options + ); + let mut table_opts = + table_options::merge_table_options_for_alter(&options, &*current_table_options) + .map_err(|e| Box::new(e) as _) + .context(InvalidOptions { + table: &table_data.name, + })?; + table_opts.sanitize(); + + // Now we can persist and update the options, since this function is called by + // write worker, so there is no other concurrent writer altering the + // options. + let meta_update = MetaUpdate::AlterOptions(AlterOptionsMeta { + space_id: space_table.space().id, + table_id: table_data.id, + options: table_opts.clone(), + }); + self.space_store + .manifest + .store_update(meta_update) + .await + .map_err(|e| Box::new(e) as _) + .context(PersistAlter)?; + + table_data.set_table_options(worker_local, table_opts); + Ok(()) + } +} diff --git a/analytic_engine/src/instance/close.rs b/analytic_engine/src/instance/close.rs new file mode 100644 index 0000000000..6ae34f4eb5 --- /dev/null +++ b/analytic_engine/src/instance/close.rs @@ -0,0 +1,93 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Close table logic of instance + +use std::sync::Arc; + +use log::{info, warn}; +use object_store::ObjectStore; +use snafu::ResultExt; +use table_engine::engine::CloseTableRequest; +use tokio::sync::oneshot; +use wal::manager::WalManager; + +use crate::{ + instance::{ + engine::{FlushTable, OperateByWriteWorker, Result}, + flush_compaction::TableFlushOptions, + write_worker::{self, CloseTableCommand, WorkerLocal}, + Instance, + }, + meta::Manifest, + space::SpaceRef, + sst::factory::Factory, +}; + +impl Instance +where + Wal: WalManager + Send + Sync + 'static, + Meta: Manifest + Send + Sync + 'static, + Store: ObjectStore, + Fa: Factory + Send + Sync + 'static, +{ + /// Close table need to be handled by write worker. + pub async fn do_close_table(&self, space: SpaceRef, request: CloseTableRequest) -> Result<()> { + info!("Instance close table, request:{:?}", request); + + let table_data = match space.find_table_by_id(request.table_id) { + Some(v) => v, + None => return Ok(()), + }; + + let (tx, rx) = oneshot::channel::>(); + let cmd = CloseTableCommand { space, request, tx }; + write_worker::process_command_in_write_worker(cmd.into_command(), &table_data, rx) + .await + .context(OperateByWriteWorker { + space_id: table_data.space_id, + table: &table_data.name, + table_id: table_data.id, + }) + } + + /// Do the actual close table job, must be called by write worker in write + /// thread sequentially. 
+ pub(crate) async fn process_close_table_command( + self: &Arc, + worker_local: &mut WorkerLocal, + space: SpaceRef, + request: CloseTableRequest, + ) -> Result<()> { + let table_data = match space.find_table_by_id(request.table_id) { + Some(v) => v, + None => { + warn!("try to close a closed table, request:{:?}", request); + return Ok(()); + } + }; + + let opts = TableFlushOptions { + block_on_write_thread: true, + // The table will be dropped, no need to trigger a compaction. + compact_after_flush: false, + ..Default::default() + }; + self.flush_table_in_worker(worker_local, &table_data, opts) + .await + .context(FlushTable { + space_id: space.id, + table: &table_data.name, + table_id: table_data.id, + })?; + + // table has been closed so remove it from the space + let removed_table = space.remove_table(&request.table_name); + assert!(removed_table.is_some()); + + info!( + "table:{}-{} has been removed from the space_id:{}", + table_data.name, table_data.id, space.id + ); + Ok(()) + } +} diff --git a/analytic_engine/src/instance/create.rs b/analytic_engine/src/instance/create.rs new file mode 100644 index 0000000000..1597982f27 --- /dev/null +++ b/analytic_engine/src/instance/create.rs @@ -0,0 +1,131 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Create table logic of instance + +use std::sync::Arc; + +use log::info; +use object_store::ObjectStore; +use snafu::ResultExt; +use table_engine::engine::CreateTableRequest; +use tokio::sync::oneshot; +use wal::manager::WalManager; + +use crate::{ + instance::{ + engine::{CreateTableData, InvalidOptions, OperateByWriteWorker, Result, WriteManifest}, + write_worker::{self, CreateTableCommand, WorkerLocal}, + Instance, + }, + meta::{ + meta_update::{AddTableMeta, MetaUpdate}, + Manifest, + }, + space::SpaceRef, + sst::factory::Factory, + table::data::{TableData, TableDataRef}, + table_options, +}; + +impl Instance +where + Wal: WalManager + Send + Sync + 'static, + Meta: Manifest + Send + Sync + 'static, + Store: ObjectStore, + Fa: Factory + Send + Sync + 'static, +{ + /// Create table need to be handled by write worker. + pub async fn do_create_table( + &self, + space: SpaceRef, + request: CreateTableRequest, + ) -> Result { + info!("Instance create table, request:{:?}", request); + + let mut table_opts = + table_options::merge_table_options_for_create(&request.options, &self.table_opts) + .map_err(|e| Box::new(e) as _) + .context(InvalidOptions { + space_id: space.id, + table: &request.table_name, + table_id: request.table_id, + })?; + // Sanitize options before creating table. 
+ table_opts.sanitize(); + + if let Some(table_data) = space.find_table_by_id(request.table_id) { + return Ok(table_data); + } + + // Choose a write worker for this table + let write_handle = space.write_group.choose_worker(request.table_id); + let (table_name, table_id) = (request.table_name.clone(), request.table_id); + + let table_data = Arc::new( + TableData::new( + space.id, + request, + write_handle, + table_opts, + &self.file_purger, + space.mem_usage_collector.clone(), + ) + .context(CreateTableData { + space_id: space.id, + table: &table_name, + table_id, + })?, + ); + + let space_id = space.id; + let (tx, rx) = oneshot::channel(); + let cmd = CreateTableCommand { + space, + table_data: table_data.clone(), + tx, + }; + write_worker::process_command_in_write_worker(cmd.into_command(), &table_data, rx) + .await + .context(OperateByWriteWorker { + space_id, + table: table_name, + table_id: table_data.id, + }) + } + + /// Do the actual create table job, must be called by write worker in write + /// thread sequentially. + pub(crate) async fn process_create_table_command( + self: &Arc, + _worker_local: &mut WorkerLocal, + space: SpaceRef, + table_data: TableDataRef, + ) -> Result { + if let Some(table_data) = space.find_table_by_id(table_data.id) { + // Use the table data from the space instead of the table_data in params. + return Ok(table_data); + }; + + // Store table info into meta + let update = MetaUpdate::AddTable(AddTableMeta { + space_id: space.id, + table_id: table_data.id, + table_name: table_data.name.clone(), + schema: table_data.schema(), + opts: table_data.table_options().as_ref().clone(), + }); + self.space_store + .manifest + .store_update(update) + .await + .map_err(|e| Box::new(e) as _) + .context(WriteManifest { + space_id: space.id, + table: &table_data.name, + table_id: table_data.id, + })?; + + space.insert_table(table_data.clone()); + Ok(table_data) + } +} diff --git a/analytic_engine/src/instance/drop.rs b/analytic_engine/src/instance/drop.rs new file mode 100644 index 0000000000..899d937524 --- /dev/null +++ b/analytic_engine/src/instance/drop.rs @@ -0,0 +1,152 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Drop table logic of instance + +use std::sync::Arc; + +use common_util::define_result; +use log::{info, warn}; +use object_store::ObjectStore; +use snafu::{ResultExt, Snafu}; +use table_engine::engine::DropTableRequest; +use tokio::sync::oneshot; +use wal::manager::WalManager; + +use crate::{ + instance::{ + flush_compaction::TableFlushOptions, + write_worker::{self, DropTableCommand, WorkerLocal}, + Instance, + }, + meta::{ + meta_update::{DropTableMeta, MetaUpdate}, + Manifest, + }, + space::SpaceAndTable, + sst::factory::Factory, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "Failed to drop table space:{}, table:{}, err:{}", + space, + table, + source, + ))] + DropTable { + space: String, + table: String, + source: write_worker::Error, + }, + + #[snafu(display("Flush before drop failed, table:{}, err:{}", table, source))] + FlushTable { + table: String, + source: crate::instance::flush_compaction::Error, + }, + + #[snafu(display("Failed to persist drop table update, err:{}", source))] + PersistDrop { + source: Box, + }, +} + +define_result!(Error); + +impl< + Wal: WalManager + Send + Sync + 'static, + Meta: Manifest + Send + Sync + 'static, + Store: ObjectStore, + Fa: Factory + Send + Sync + 'static, + > Instance +{ + /// Drop table need to be handled by write worker. 
+ pub async fn do_drop_table( + &self, + space_table: SpaceAndTable, + request: DropTableRequest, + ) -> Result<()> { + info!( + "Instance drop table, space_table:{:?}, request:{:?}", + space_table, request + ); + + // Create a oneshot channel to send/receive alter schema result. + let (tx, rx) = oneshot::channel(); + let cmd = DropTableCommand { + space_table: space_table.clone(), + request, + tx, + }; + + write_worker::process_command_in_write_worker( + cmd.into_command(), + space_table.table_data(), + rx, + ) + .await + .context(DropTable { + space: &space_table.space().name, + table: &space_table.table_data().name, + })?; + + Ok(()) + } + + /// Do the actual drop table job, must be called by write worker in write + /// thread sequentially. + pub(crate) async fn process_drop_table_command( + self: &Arc, + worker_local: &mut WorkerLocal, + space_table: &SpaceAndTable, + _request: DropTableRequest, + ) -> Result<()> { + let table_data = space_table.table_data(); + if table_data.is_dropped() { + warn!( + "Process drop table command tries to drop a dropped table, space_table:{:?}", + space_table + ); + return Ok(()); + } + + // Fixme(xikai): Trigger a force flush so that the data of the table in the wal + // is marked for deletable. However, the overhead of the flushing can + // be avoided. + let opts = TableFlushOptions { + block_on_write_thread: true, + // The table will be dropped, no need to trigger a compaction. + compact_after_flush: false, + ..Default::default() + }; + self.flush_table_in_worker(worker_local, table_data, opts) + .await + .context(FlushTable { + table: &table_data.name, + })?; + + // Store the dropping information into meta + let update = MetaUpdate::DropTable(DropTableMeta { + space_id: space_table.space().id, + table_id: table_data.id, + table_name: table_data.name.clone(), + }); + self.space_store + .manifest + .store_update(update) + .await + .map_err(|e| Box::new(e) as _) + .context(PersistDrop)?; + + // Set the table dropped after finishing flushing and storing drop table meta + // information. + table_data.set_dropped(); + + // Clear the memory status after updating manifest and clearing wal so that + // the drop is retryable if fails to update and clear. + space_table.space().remove_table(&table_data.name); + + Ok(()) + } +} diff --git a/analytic_engine/src/instance/engine.rs b/analytic_engine/src/instance/engine.rs new file mode 100644 index 0000000000..a96895070e --- /dev/null +++ b/analytic_engine/src/instance/engine.rs @@ -0,0 +1,230 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Table engine logic of instance + +use std::sync::Arc; + +use common_util::define_result; +use log::info; +use object_store::ObjectStore; +use snafu::{ResultExt, Snafu}; +use table_engine::engine::{CreateTableRequest, DropTableRequest}; +use wal::manager::WalManager; + +use crate::{ + context::CommonContext, + instance::{write_worker::WriteGroup, Instance}, + meta::{ + meta_update::{AddSpaceMeta, MetaUpdate}, + Manifest, + }, + space::{Space, SpaceAndTable, SpaceNameRef, SpaceRef}, + sst::factory::Factory, + table_options, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Space failed to create table, err:{}", source))] + SpaceCreateTable { source: crate::space::Error }, + + #[snafu(display("Failed to drop table, err:{}", source))] + DoDropTable { + source: crate::instance::drop::Error, + }, + + #[snafu(display("Failed to store meta of space, space:{}, err:{}", space, source))] + SpaceWriteMeta { + space: String, + source: Box, + }, + #[snafu(display("Invalid options, table:{}, err:{}", table, source))] + InvalidOptions { + table: String, + source: Box, + }, +} + +define_result!(Error); + +impl From for table_engine::engine::Error { + fn from(err: Error) -> Self { + match err { + Error::SpaceCreateTable { source } => Self::from(source), + + // FIXME(xikai): should map drop table error to a more reasonable table engine error. + Error::DoDropTable { .. } => Self::Unexpected { + source: Box::new(err), + }, + + Error::SpaceWriteMeta { .. } => Self::WriteMeta { + source: Box::new(err), + }, + + Error::InvalidOptions { ref table, .. } => Self::InvalidArguments { + table: table.clone(), + source: Box::new(err), + }, + } + } +} + +impl< + Wal: WalManager + Send + Sync + 'static, + Meta: Manifest + Send + Sync + 'static, + Store: ObjectStore, + Fa: Factory + Send + Sync + 'static, + > Instance +{ + /// Find space by name, create if the space is not exists + pub async fn find_or_create_space( + self: &Arc, + _ctx: &CommonContext, + space_name: SpaceNameRef<'_>, + ) -> Result { + // Find space first + if let Some(space) = self.get_space_by_read_lock(space_name) { + return Ok(space); + } + + // Persist space data into meta, done with `meta_state` guarded + let mut meta_state = self.space_store.meta_state.lock().await; + // The space may already been created by other thread + if let Some(space) = self.get_space_by_read_lock(space_name) { + return Ok(space); + } + // Now we are the one responsible to create and persist the space info into meta + + let space_id = meta_state.alloc_space_id(); + // Create write group for the space + // TODO(yingwen): Expose options + let write_group_opts = self.write_group_options(space_id); + let write_group = WriteGroup::new(write_group_opts, self.clone()); + + // Create space + let space = Arc::new(Space::new( + space_id, + space_name.to_string(), + self.space_write_buffer_size, + write_group, + self.mem_usage_collector.clone(), + )); + + // Create a meta update and store it + let update = MetaUpdate::AddSpace(AddSpaceMeta { + space_id, + space_name: space_name.to_string(), + }); + info!("Instance create space, update:{:?}", update); + self.space_store + .manifest + .store_update(update) + .await + .map_err(|e| Box::new(e) as _) + .context(SpaceWriteMeta { space: space_name })?; + + let mut spaces = self.space_store.spaces.write().unwrap(); + spaces.insert(space_name.to_string(), space.clone()); + // Now we can release the meta state lock + + Ok(space) + } + + /// Find space by name + pub fn find_space( + &self, + _ctx: &CommonContext, + space: 
SpaceNameRef, + ) -> Result> { + let spaces = self.space_store.spaces.read().unwrap(); + Ok(spaces.get_by_name(space).cloned()) + } + + /// Create a table under given space + pub async fn create_table( + self: &Arc, + ctx: &CommonContext, + space: SpaceNameRef<'_>, + request: CreateTableRequest, + ) -> Result { + let mut table_opts = + table_options::merge_table_options_for_create(&request.options, &self.table_opts) + .map_err(|e| Box::new(e) as _) + .context(InvalidOptions { + table: &request.table_name, + })?; + // Sanitize options before creating table. + table_opts.sanitize(); + + info!( + "Instance create table, space:{}, request:{:?}, table_opts:{:?}", + space, request, table_opts + ); + + let space = self.find_or_create_space(ctx, space).await?; + + let table_data = space + .create_table( + request, + &self.space_store.manifest, + &table_opts, + &self.file_purger, + ) + .await + .context(SpaceCreateTable)?; + + Ok(SpaceAndTable::new(space, table_data)) + } + + /// Drop a table under given space + pub async fn drop_table( + self: &Arc, + ctx: &CommonContext, + space: SpaceNameRef<'_>, + request: DropTableRequest, + ) -> Result { + info!( + "Instance drop table, space:{}, request:{:?}", + space, request + ); + + let space = match self.find_space(ctx, space)? { + Some(v) => v, + None => return Ok(false), + }; + + // Checks whether the table is exists + let table = match space.find_table(&request.table_name) { + Some(v) => v, + None => return Ok(false), + }; + + let space_table = SpaceAndTable::new(space.clone(), table); + self.do_drop_table(space_table, request) + .await + .context(DoDropTable)?; + + Ok(true) + } + + /// Find the table under given space by its table name + /// + /// Return None if space or table is not found + pub fn find_table( + &self, + ctx: &CommonContext, + space: SpaceNameRef, + table: &str, + ) -> Result> { + let space = match self.find_space(ctx, space)? { + Some(s) => s, + None => return Ok(None), + }; + + let space_table = space + .find_table(table) + .map(|table_data| SpaceAndTable::new(space, table_data)); + + Ok(space_table) + } +} diff --git a/analytic_engine/src/instance/flush_compaction.rs b/analytic_engine/src/instance/flush_compaction.rs new file mode 100644 index 0000000000..f6fd3debf5 --- /dev/null +++ b/analytic_engine/src/instance/flush_compaction.rs @@ -0,0 +1,1037 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +// Flush and compaction logic of instance + +use std::{cmp, collections::Bound, sync::Arc}; + +use common_types::{ + projected_schema::ProjectedSchema, + record_batch::{RecordBatchWithKey, RecordBatchWithKeyBuilder}, + request_id::RequestId, + row::RowViewOnBatch, + time::TimeRange, + SequenceNumber, +}; +use common_util::{config::ReadableDuration, define_result, runtime::Runtime}; +use futures::{ + channel::{mpsc, mpsc::channel}, + future::try_join_all, + stream, SinkExt, TryStreamExt, +}; +use log::{error, info}; +use object_store::{path::ObjectStorePath, ObjectStore}; +use snafu::{Backtrace, OptionExt, ResultExt, Snafu}; +use table_engine::{predicate::Predicate, table::Result as TableResult}; +use tokio::sync::oneshot; +use wal::manager::{RegionId, WalManager}; + +use crate::{ + compaction::{ + CompactionInputFiles, CompactionTask, ExpiredFiles, TableCompactionRequest, WaitError, + }, + instance::{ + write_worker::{self, CompactTableCommand, FlushTableCommand, WorkerLocal}, + Instance, SpaceStore, + }, + memtable::{ColumnarIterPtr, MemTableRef, ScanContext, ScanRequest}, + meta::{ + meta_update::{AlterOptionsMeta, MetaUpdate, VersionEditMeta}, + Manifest, + }, + row_iter::{ + self, + dedup::DedupIterator, + merge::{MergeBuilder, MergeConfig}, + IterOptions, + }, + space::SpaceAndTable, + sst::{ + builder::RecordBatchStream, + factory::{Factory, SstBuilderOptions, SstReaderOptions, SstType}, + file::{self, FileMeta, SstMetaData}, + }, + table::{ + data::{MemTableId, TableData, TableDataRef}, + version::{FlushableMemTables, MemTableState, SamplingMemTable}, + version_edit::{AddFile, DeleteFile, VersionEdit}, + }, +}; + +const DEFAULT_CHANNEL_SIZE: usize = 5; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to store version edit, err:{}", source))] + StoreVersionEdit { + source: Box, + }, + + #[snafu(display("Failed to purge wal, region_id:{}, sequence:{}", region_id, sequence))] + PurgeWal { + region_id: RegionId, + sequence: SequenceNumber, + source: wal::manager::Error, + }, + + #[snafu(display("Failed to build mem table iterator, source:{}", source))] + InvalidMemIter { + source: Box, + }, + + #[snafu(display( + "Sst type is not found, sst_type:{:?}.\nBacktrace:\n{}", + sst_type, + backtrace + ))] + InvalidSstType { + sst_type: SstType, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to build sst, file_path:{}, source:{}", path, source))] + FailBuildSst { + path: String, + source: Box, + }, + + #[snafu(display("Background flush failed, cannot schedule flush task, err:{}", source))] + BackgroundFlushFailed { + source: crate::instance::write_worker::Error, + }, + + #[snafu(display("Failed to send flush command, err:{}", source))] + SendFlushCmd { + source: crate::instance::write_worker::Error, + }, + + #[snafu(display("Failed to send compact command, err:{}", source))] + SendCompactCmd { + source: crate::instance::write_worker::Error, + }, + + #[snafu(display("Failed to build merge iterator, table:{}, err:{}", table, source))] + BuildMergeIterator { + table: String, + source: crate::row_iter::merge::Error, + }, + + #[snafu(display("Failed to do manual compaction, err:{}", source))] + ManualCompactFailed { + source: crate::compaction::WaitError, + }, + + #[snafu(display("Failed to split record batch, source:{}", source))] + SplitRecordBatch { + source: Box, + }, + + #[snafu(display("Failed to send to channel, source:{}", source))] + ChannelSend { source: mpsc::SendError }, + + #[snafu(display("Runtime join error, source:{}", source))] + RuntimeJoin { 
source: common_util::runtime::Error }, +} + +define_result!(Error); + +/// Options to flush single table. +#[derive(Debug)] +pub struct TableFlushOptions { + /// Flush result sender. + /// + /// Default is None. + pub res_sender: Option>>, + /// Schedule a compaction request after flush. + /// + /// Default is true. + pub compact_after_flush: bool, + /// Whether to block on write thread. + /// + /// Default is false. + pub block_on_write_thread: bool, +} + +impl Default for TableFlushOptions { + fn default() -> Self { + Self { + res_sender: None, + compact_after_flush: true, + block_on_write_thread: false, + } + } +} + +/// Request to flush single table. +pub struct TableFlushRequest { + /// Table to flush. + pub table_data: TableDataRef, + /// Max id of memtable to flush (inclusive). + pub max_memtable_id: MemTableId, +} + +impl< + Wal: WalManager + Send + Sync + 'static, + Meta: Manifest + Send + Sync + 'static, + Store: ObjectStore, + Fa: Factory + Send + Sync + 'static, + > Instance +{ + /// Flush this table. + pub async fn flush_table( + &self, + space_table: &SpaceAndTable, + flush_opts: TableFlushOptions, + ) -> Result<()> { + info!( + "Instance flush table, space_table:{:?}, flush_opts:{:?}", + space_table, flush_opts + ); + + // Create a oneshot channel to send/receive flush result. + let (tx, rx) = oneshot::channel(); + let cmd = FlushTableCommand { + space_table: space_table.clone(), + flush_opts, + tx, + }; + + // Actual work is done in flush_table_in_worker(). + write_worker::process_command_in_write_worker( + cmd.into_command(), + space_table.table_data(), + rx, + ) + .await + .context(SendFlushCmd) + } + + /// Compact the table manually. + pub async fn manual_compact_table(&self, space_table: &SpaceAndTable) -> Result<()> { + info!("Instance compact table, space_table:{:?}", space_table); + + // Create a oneshot channel to send/receive result from write worker. + let (tx, rx) = oneshot::channel(); + let (compact_tx, compact_rx) = oneshot::channel(); + // Create a oneshot channel to send/receive compaction result. + let cmd = CompactTableCommand { + space_table: space_table.clone(), + waiter: Some(compact_tx), + tx, + }; + + // The write worker will call schedule_table_compaction(). + write_worker::process_command_in_write_worker( + cmd.into_command(), + space_table.table_data(), + rx, + ) + .await + .context(SendCompactCmd)?; + + // Now wait for compaction done, if the sender has been dropped, we convert it + // into Error::Canceled. + compact_rx + .await + .unwrap_or(Err(WaitError::Canceled)) + .context(ManualCompactFailed) + } + + /// Flush given table in write worker thread. 
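+    ///
+    /// Runs on the table's write worker, so flushes of a single table are
+    /// processed sequentially: memtables are switched in `preprocess_flush`
+    /// and the actual flush job is then queued via `schedule_table_flush`.
+    /// A minimal sketch of triggering a blocking flush through the public
+    /// entry point (assuming an `instance` and `space_table` are in scope):
+    ///
+    /// ```ignore
+    /// let opts = TableFlushOptions {
+    ///     block_on_write_thread: true,
+    ///     ..Default::default()
+    /// };
+    /// instance.flush_table(&space_table, opts).await?;
+    /// ```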
+ pub async fn flush_table_in_worker( + self: &Arc, + worker_local: &mut WorkerLocal, + table_data: &TableDataRef, + opts: TableFlushOptions, + ) -> Result<()> { + let flush_req = self.preprocess_flush(worker_local, table_data).await?; + + self.schedule_table_flush(worker_local, flush_req, opts) + .await + } + + async fn preprocess_flush( + &self, + worker_local: &mut WorkerLocal, + table_data: &TableDataRef, + ) -> Result { + let current_version = table_data.current_version(); + let last_sequence = table_data.last_sequence(); + // Switch all mutable memtables + if let Some(suggest_segment_duration) = + current_version.switch_memtables_or_suggest_duration(worker_local) + { + info!("Switch memtable and suggest segment duration, table:{}, table_id:{}, segment_duration:{:?}", table_data.name, table_data.id, suggest_segment_duration); + assert!(suggest_segment_duration.as_millis() > 0); + + let mut new_table_opts = (*table_data.table_options()).clone(); + new_table_opts.segment_duration = Some(ReadableDuration(suggest_segment_duration)); + + // Now persist the new options, the `worker_local` ensure there is no race + // condition. + let meta_update = MetaUpdate::AlterOptions(AlterOptionsMeta { + space_id: table_data.space_id, + table_id: table_data.id, + options: new_table_opts.clone(), + }); + self.space_store + .manifest + .store_update(meta_update) + .await + .map_err(|e| Box::new(e) as _) + .context(StoreVersionEdit)?; + + table_data.set_table_options(worker_local, new_table_opts); + + // Now the segment duration is applied, we can stop sampling and freeze the + // sampling memtable. + current_version.freeze_sampling(worker_local); + } + + info!("Try to trigger memtable flush of table, table:{}, table_id:{}, max_memtable_id:{}, last_sequence:{}", + table_data.name, table_data.id, table_data.last_memtable_id(), last_sequence); + + // Try to flush all memtables of current table + Ok(TableFlushRequest { + table_data: table_data.clone(), + max_memtable_id: table_data.last_memtable_id(), + }) + } + + /// Schedule table flush request to background workers + async fn schedule_table_flush( + self: &Arc, + worker_local: &mut WorkerLocal, + flush_req: TableFlushRequest, + opts: TableFlushOptions, + ) -> Result<()> { + // TODO(yingwen): Store pending flush reqs and retry flush on recoverable error, + // or try to recover from background error + let table_data = flush_req.table_data.clone(); + let table = table_data.name.clone(); + + let instance = self.clone(); + let flush_job = async move { instance.flush_memtables_to_outputs(&flush_req).await }; + + let compact_req = TableCompactionRequest::no_waiter( + table_data.clone(), + worker_local.compaction_notifier(), + ); + let instance = self.clone(); + + if opts.compact_after_flush { + // Schedule compaction if flush completed successfully. 
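+            // The request above was built with `no_waiter`, so this closure only
+            // enqueues the compaction; the flush result is reported to the caller
+            // independently of whether the compaction has run yet.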
+ let on_flush_success = async move { + instance.schedule_table_compaction(compact_req).await; + }; + + worker_local + .flush_sequentially( + table, + &table_data.metrics, + flush_job, + on_flush_success, + opts.block_on_write_thread, + opts.res_sender, + ) + .await + .context(BackgroundFlushFailed) + } else { + worker_local + .flush_sequentially( + table, + &table_data.metrics, + flush_job, + async {}, + opts.block_on_write_thread, + opts.res_sender, + ) + .await + .context(BackgroundFlushFailed) + } + } + + /// Caller should guarantee flush of single table is sequential + pub(crate) async fn flush_memtables_to_outputs( + &self, + flush_req: &TableFlushRequest, + ) -> Result<()> { + // TODO(yingwen): Record memtables num to flush as statistics + let TableFlushRequest { + table_data, + max_memtable_id, + } = flush_req; + + let current_version = table_data.current_version(); + let mut mems_to_flush = FlushableMemTables::default(); + + current_version.pick_memtables_to_flush(*max_memtable_id, &mut mems_to_flush); + + if mems_to_flush.is_empty() { + return Ok(()); + } + + let request_id = RequestId::next_id(); + + info!( + "Instance try to flush memtables, table:{}, table_id:{}, request_id:{}, mems_to_flush:{:?}", + table_data.name, table_data.id, request_id, mems_to_flush + ); + + let local_metrics = table_data.metrics.local_flush_metrics(); + // Start flush duration timer. + let _timer = local_metrics.flush_duration_histogram.start_timer(); + let mut files_to_level0 = Vec::with_capacity(mems_to_flush.memtables.len()); + let mut flushed_sequence = 0; + let mut sst_num = 0; + + if let Some(sampling_mem) = &mems_to_flush.sampling_mem { + if let Some(seq) = self + .flush_sampling_memtable( + &*table_data, + request_id, + sampling_mem, + &mut files_to_level0, + ) + .await? + { + flushed_sequence = seq; + sst_num += files_to_level0.len(); + for add_file in &files_to_level0 { + local_metrics.observe_sst_size(add_file.file.meta.size); + } + } + } + + for mem in &mems_to_flush.memtables { + let file = self + .flush_memtable_to_output(&*table_data, request_id, mem) + .await?; + if let Some(file) = file { + let sst_size = file.meta.size; + files_to_level0.push(AddFile { level: 0, file }); + + // Set flushed sequence to max of the last_sequence of memtables. + flushed_sequence = cmp::max(flushed_sequence, mem.last_sequence()); + + sst_num += 1; + // Collect sst size metrics. + local_metrics.observe_sst_size(sst_size); + } + } + + // Collect sst num metrics. + local_metrics.observe_sst_num(sst_num); + + info!( + "Instance flush memtables to output, table:{}, table_id:{}, request_id:{}, mems_to_flush:{:?}, files_to_level0:{:?}, flushed_sequence:{}", + table_data.name, + table_data.id, + request_id, + mems_to_flush, + files_to_level0, + flushed_sequence + ); + + // Persist the flush result to manifest. + let edit_meta = VersionEditMeta { + space_id: table_data.space_id, + table_id: table_data.id, + flushed_sequence, + files_to_add: files_to_level0.clone(), + files_to_delete: Vec::new(), + }; + let meta_update = MetaUpdate::VersionEdit(edit_meta); + self.space_store + .manifest + .store_update(meta_update) + .await + .map_err(|e| Box::new(e) as _) + .context(StoreVersionEdit)?; + + // Apply to the table version. + let mems_to_remove = mems_to_flush.ids(); + let edit = VersionEdit { + flushed_sequence, + mems_to_remove, + files_to_add: files_to_level0, + files_to_delete: Vec::new(), + }; + table_data.current_version().apply_edit(edit); + + // Mark sequence <= flushed_sequence to be deleted. 
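+        // The flushed data is now durable in SSTs and recorded in the manifest,
+        // so WAL entries up to `flushed_sequence` can safely be reclaimed.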
+ self.space_store + .wal_manager + .mark_delete_entries_up_to(table_data.wal_region_id(), flushed_sequence) + .await + .context(PurgeWal { + region_id: table_data.wal_region_id(), + sequence: flushed_sequence, + })?; + + info!( + "Instance flush memtables done, table:{}, table_id:{}, request_id:{}", + table_data.name, table_data.id, request_id + ); + + Ok(()) + } + + /// Flush rows in sampling memtable to multiple ssts according to segment + /// duration. + /// + /// Returns flushed sequence. + async fn flush_sampling_memtable( + &self, + table_data: &TableData, + request_id: RequestId, + sampling_mem: &SamplingMemTable, + files_to_level0: &mut Vec, + ) -> Result> { + let (min_key, max_key) = match (sampling_mem.mem.min_key(), sampling_mem.mem.max_key()) { + (Some(min_key), Some(max_key)) => (min_key, max_key), + _ => { + // the memtable is empty and nothing needs flushing. + return Ok(None); + } + }; + + let max_sequence = sampling_mem.mem.last_sequence(); + let time_ranges = sampling_mem.sampler.ranges(); + + info!("Flush sampling memtable, table_id:{:?}, table_name:{:?}, request_id:{}, sampling memtable time_ranges:{:?}", + table_data.id,table_data.name, request_id, time_ranges); + + let mut batch_record_senders = Vec::with_capacity(time_ranges.len()); + let mut sst_handlers = Vec::with_capacity(time_ranges.len()); + let mut file_ids = Vec::with_capacity(time_ranges.len()); + + let sst_builder_options = SstBuilderOptions { + sst_type: table_data.sst_type, + num_rows_per_row_group: table_data.table_options().num_rows_per_row_group, + compression: table_data.table_options().compression, + }; + + for time_range in &time_ranges { + let (batch_record_sender, batch_record_receiver) = + channel::>(DEFAULT_CHANNEL_SIZE); + let file_id = table_data.alloc_file_id(); + let mut sst_file_path = self.space_store.store.new_path(); + table_data.set_sst_file_path(file_id, &mut sst_file_path); + + // TODO: min_key max_key set in sst_builder build + let mut sst_meta = SstMetaData { + min_key: min_key.clone(), + max_key: max_key.clone(), + time_range: *time_range, + max_sequence, + schema: table_data.schema(), + size: 0, + row_num: 0, + }; + + let store = self.space_store.clone(); + let sst_builder_options_clone = sst_builder_options.clone(); + let sst_type = table_data.sst_type; + + // spawn build sst + let handler = self.runtimes.bg_runtime.spawn(async move { + let mut builder = store + .sst_factory + .new_sst_builder( + &sst_builder_options_clone, + &sst_file_path, + store.store_ref(), + ) + .context(InvalidSstType { sst_type })?; + + let sst_info = builder + .build( + request_id, + &sst_meta, + Box::new(batch_record_receiver.map_err(|e| Box::new(e) as _)), + ) + .await + .map_err(|e| { + error!("Failed to build sst file, meta:{:?}, err:{}", sst_meta, e); + Box::new(e) as _ + }) + .with_context(|| FailBuildSst { + path: sst_file_path.display(), + })?; + + // update sst metadata by built info. + sst_meta.row_num = sst_info.row_num as u64; + sst_meta.size = sst_info.file_size as u64; + Ok(sst_meta) + }); + + batch_record_senders.push(batch_record_sender); + sst_handlers.push(handler); + file_ids.push(file_id); + } + + let iter = build_mem_table_iter(sampling_mem.mem.clone(), table_data)?; + + let timestamp_idx = table_data.schema().timestamp_index(); + + for data in iter { + for (idx, record_batch) in split_record_batch_with_time_ranges( + data.map_err(|e| Box::new(e) as _).context(InvalidMemIter)?, + &time_ranges, + timestamp_idx, + )? 
+ .into_iter() + .enumerate() + { + if !record_batch.is_empty() { + batch_record_senders[idx] + .send(Ok(record_batch)) + .await + .context(ChannelSend)?; + } + } + } + batch_record_senders.clear(); + + let ret = try_join_all(sst_handlers).await; + for (idx, sst_meta) in ret.context(RuntimeJoin)?.into_iter().enumerate() { + files_to_level0.push(AddFile { + level: 0, + file: FileMeta { + id: file_ids[idx], + meta: sst_meta?, + }, + }) + } + + Ok(Some(max_sequence)) + } + + async fn flush_memtable_to_output( + &self, + table_data: &TableData, + request_id: RequestId, + memtable_state: &MemTableState, + ) -> Result> { + let (min_key, max_key) = match (memtable_state.mem.min_key(), memtable_state.mem.max_key()) + { + (Some(min_key), Some(max_key)) => (min_key, max_key), + _ => { + // the memtable is empty and nothing needs flushing. + return Ok(None); + } + }; + let max_sequence = memtable_state.last_sequence(); + let mut sst_meta = SstMetaData { + min_key, + max_key, + time_range: memtable_state.time_range, + max_sequence, + schema: table_data.schema(), + size: 0, + row_num: 0, + }; + + // Alloc file id for next sst file + let file_id = table_data.alloc_file_id(); + let mut sst_file_path = self.space_store.store.new_path(); + table_data.set_sst_file_path(file_id, &mut sst_file_path); + + let sst_builder_options = SstBuilderOptions { + sst_type: table_data.sst_type, + num_rows_per_row_group: table_data.table_options().num_rows_per_row_group, + compression: table_data.table_options().compression, + }; + let mut builder = self + .space_store + .sst_factory + .new_sst_builder( + &sst_builder_options, + &sst_file_path, + self.space_store.store_ref(), + ) + .context(InvalidSstType { + sst_type: table_data.sst_type, + })?; + + let iter = build_mem_table_iter(memtable_state.mem.clone(), table_data)?; + + let record_batch_stream: RecordBatchStream = + Box::new(stream::iter(iter).map_err(|e| Box::new(e) as _)); + + let sst_info = builder + .build(request_id, &sst_meta, record_batch_stream) + .await + .map_err(|e| { + // TODO(yingwen): Maybe remove this log. + error!("Failed to build sst file, meta:{:?}, err:{}", sst_meta, e); + Box::new(e) as _ + }) + .with_context(|| FailBuildSst { + path: sst_file_path.display(), + })?; + + // update sst metadata by built info. + sst_meta.row_num = sst_info.row_num as u64; + sst_meta.size = sst_info.file_size as u64; + + Ok(Some(FileMeta { + id: file_id, + meta: sst_meta, + })) + } + + /// Schedule table compaction request to background workers and return + /// immediately. + pub async fn schedule_table_compaction(&self, compact_req: TableCompactionRequest) { + self.compaction_scheduler + .schedule_table_compaction(compact_req) + .await; + } +} + +impl SpaceStore { + pub(crate) async fn compact_table( + &self, + runtime: Arc, + table_data: &TableData, + request_id: RequestId, + task: &CompactionTask, + ) -> Result<()> { + let mut edit_meta = VersionEditMeta { + space_id: table_data.space_id, + table_id: table_data.id, + flushed_sequence: 0, + // Use the number of compaction inputs as the estimated number of files to add. + files_to_add: Vec::with_capacity(task.compaction_inputs.len()), + files_to_delete: Vec::new(), + }; + + if task.expired.is_empty() && task.compaction_inputs.is_empty() { + // Nothing to compact. 
+ return Ok(()); + } + + for files in &task.expired { + self.delete_expired_files(table_data, request_id, files, &mut edit_meta); + } + + for input in &task.compaction_inputs { + self.compact_input_files( + runtime.clone(), + table_data, + request_id, + input, + &mut edit_meta, + ) + .await?; + } + + let meta_update = MetaUpdate::VersionEdit(edit_meta.clone()); + self.manifest + .store_update(meta_update) + .await + .map_err(|e| Box::new(e) as _) + .context(StoreVersionEdit)?; + + // Apply to the table version. + let edit = edit_meta.into_version_edit(); + table_data.current_version().apply_edit(edit); + + Ok(()) + } + + pub(crate) async fn compact_input_files( + &self, + runtime: Arc, + table_data: &TableData, + request_id: RequestId, + input: &CompactionInputFiles, + edit_meta: &mut VersionEditMeta, + ) -> Result<()> { + if input.files.is_empty() { + return Ok(()); + } + + // metrics + let _timer = table_data + .metrics + .compaction_duration_histogram + .start_timer(); + table_data + .metrics + .compaction_observe_sst_num(input.files.len()); + let mut sst_size = 0; + let mut sst_row_num = 0; + for file in &input.files { + sst_size += file.size(); + sst_row_num += file.row_num(); + } + table_data + .metrics + .compaction_observe_input_sst_size(sst_size); + table_data + .metrics + .compaction_observe_input_sst_row_num(sst_row_num); + + info!( + "Instance try to compact table, table:{}, table_id:{}, request_id:{}, input_files:{:?}", + table_data.name, table_data.id, request_id, input.files, + ); + + // The schema may be modified during compaction, so we acquire it first and use + // the acquired schema as the compacted sst meta. + let schema = table_data.schema(); + let table_options = table_data.table_options(); + + let iter_options = IterOptions::default(); + let merge_iter = { + let space_id = table_data.space_id; + let table_id = table_data.id; + let sequence = table_data.last_sequence(); + let projected_schema = ProjectedSchema::no_projection(schema.clone()); + let sst_reader_options = SstReaderOptions { + sst_type: table_data.sst_type, + read_batch_row_num: table_options.num_rows_per_row_group, + reverse: false, + projected_schema: projected_schema.clone(), + predicate: Arc::new(Predicate::new(TimeRange::min_to_max())), + meta_cache: self.meta_cache.clone(), + data_cache: self.data_cache.clone(), + runtime: runtime.clone(), + }; + let mut builder = MergeBuilder::new(MergeConfig { + request_id, + space_id, + table_id, + sequence, + projected_schema, + predicate: Arc::new(Predicate::empty()), + sst_factory: self.sst_factory.clone(), + sst_reader_options, + store: self.store_ref(), + merge_iter_options: iter_options.clone(), + need_dedup: table_options.need_dedup(), + reverse: false, + }); + // Add all ssts in compaction input to builder. + builder + .mut_ssts_of_level(input.level) + .extend_from_slice(&input.files); + let merge_iter = builder.build().await.context(BuildMergeIterator { + table: table_data.name.clone(), + })?; + merge_iter + }; + + let record_batch_stream = if table_options.need_dedup() { + row_iter::record_batch_with_key_iter_to_stream( + DedupIterator::new(request_id, merge_iter, iter_options), + &runtime, + ) + } else { + row_iter::record_batch_with_key_iter_to_stream(merge_iter, &runtime) + }; + + let mut sst_meta = file::merge_sst_meta(&input.files, schema); + + // Alloc file id for the merged sst. 
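+        // The id is allocated per table and is used by `set_sst_file_path`
+        // below to derive the output path of the merged sst.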
+ let file_id = table_data.alloc_file_id(); + let mut sst_file_path = self.store.new_path(); + table_data.set_sst_file_path(file_id, &mut sst_file_path); + + let sst_builder_options = SstBuilderOptions { + sst_type: table_data.sst_type, + num_rows_per_row_group: table_options.num_rows_per_row_group, + compression: table_options.compression, + }; + let mut sst_builder = self + .sst_factory + .new_sst_builder(&sst_builder_options, &sst_file_path, self.store_ref()) + .context(InvalidSstType { + sst_type: table_data.sst_type, + })?; + + let sst_info = sst_builder + .build(request_id, &sst_meta, record_batch_stream) + .await + .map_err(|e| Box::new(e) as _) + .with_context(|| FailBuildSst { + path: sst_file_path.display(), + })?; + + // update sst metadata by built info. + sst_meta.row_num = sst_info.row_num as u64; + sst_meta.size = sst_info.file_size as u64; + + table_data + .metrics + .compaction_observe_output_sst_size(sst_meta.size); + table_data + .metrics + .compaction_observe_output_sst_row_num(sst_meta.row_num); + + info!( + "Instance files compacted, table:{}, table_id:{}, request_id:{}, output_path:{}, input_files:{:?}, sst_meta:{:?}", + table_data.name, + table_data.id, + request_id, + sst_file_path.display(), + input.files, + sst_meta + ); + + // Store updates to edit_meta. + edit_meta.files_to_delete.reserve(input.files.len()); + // The compacted file can be deleted later. + for file in &input.files { + edit_meta.files_to_delete.push(DeleteFile { + level: input.level, + file_id: file.id(), + }); + } + // Add the newly created file to meta. + edit_meta.files_to_add.push(AddFile { + level: input.output_level, + file: FileMeta { + id: file_id, + meta: sst_meta, + }, + }); + + Ok(()) + } + + pub(crate) fn delete_expired_files( + &self, + table_data: &TableData, + request_id: RequestId, + expired: &ExpiredFiles, + edit_meta: &mut VersionEditMeta, + ) { + if !expired.files.is_empty() { + info!( + "Instance try to delete expired files, table:{}, table_id:{}, request_id:{}, level:{}, files:{:?}", + table_data.name, table_data.id, request_id, expired.level, expired.files, + ); + } + + let files = &expired.files; + edit_meta.files_to_delete.reserve(files.len()); + for file in files { + edit_meta.files_to_delete.push(DeleteFile { + level: expired.level, + file_id: file.id(), + }); + } + } +} + +fn split_record_batch_with_time_ranges( + record_batch: RecordBatchWithKey, + time_ranges: &[TimeRange], + timestamp_idx: usize, +) -> Result> { + let mut builders: Vec = (0..time_ranges.len()) + .into_iter() + .map(|_| RecordBatchWithKeyBuilder::new(record_batch.schema_with_key().clone())) + .collect(); + + for row_idx in 0..record_batch.num_rows() { + let datum = record_batch.column(timestamp_idx).datum(row_idx); + let timestamp = datum.as_timestamp().unwrap(); + let mut idx = None; + for (i, time_range) in time_ranges.iter().enumerate() { + if time_range.contains(timestamp) { + idx = Some(i); + break; + } + } + + if let Some(idx) = idx { + let view = RowViewOnBatch { + record_batch: &record_batch, + row_idx, + }; + builders[idx] + .append_row_view(&view) + .map_err(|e| Box::new(e) as _) + .context(SplitRecordBatch)?; + } else { + panic!( + "Record timestamp is not in time_ranges, timestamp:{:?}, time_ranges:{:?}", + timestamp, time_ranges + ); + } + } + let mut ret = Vec::with_capacity(builders.len()); + for mut builder in builders { + ret.push( + builder + .build() + .map_err(|e| Box::new(e) as _) + .context(SplitRecordBatch)?, + ); + } + Ok(ret) +} + +fn build_mem_table_iter(memtable: 
MemTableRef, table_data: &TableData) -> Result { + let scan_ctx = ScanContext::default(); + let scan_req = ScanRequest { + start_user_key: Bound::Unbounded, + end_user_key: Bound::Unbounded, + sequence: common_types::MAX_SEQUENCE_NUMBER, + projected_schema: ProjectedSchema::no_projection(table_data.schema()), + need_dedup: table_data.dedup(), + reverse: false, + }; + memtable + .scan(scan_ctx, scan_req) + .map_err(|e| Box::new(e) as _) + .context(InvalidMemIter) +} + +#[cfg(test)] +mod tests { + use common_types::{ + tests::{ + build_record_batch_with_key_by_rows, build_row, build_row_opt, + check_record_batch_with_key_with_rows, + }, + time::TimeRange, + }; + + use crate::instance::flush_compaction::split_record_batch_with_time_ranges; + + #[test] + fn test_split_record_batch_with_time_ranges() { + let rows0 = vec![build_row(b"binary key", 20, 10.0, "string value")]; + let rows1 = vec![build_row(b"binary key1", 120, 11.0, "string value 1")]; + let rows2 = vec![ + build_row_opt(b"binary key2", 220, None, Some("string value 2")), + build_row_opt(b"binary key3", 250, Some(13.0), None), + ]; + + let rows = vec![rows0.clone(), rows1.clone(), rows2.clone()] + .into_iter() + .flatten() + .collect(); + let record_batch_with_key = build_record_batch_with_key_by_rows(rows); + let column_num = record_batch_with_key.num_columns(); + let time_ranges = vec![ + TimeRange::new_unchecked_for_test(0, 100), + TimeRange::new_unchecked_for_test(100, 200), + TimeRange::new_unchecked_for_test(200, 300), + ]; + + let timestamp_idx = 1; + let rets = + split_record_batch_with_time_ranges(record_batch_with_key, &time_ranges, timestamp_idx) + .unwrap(); + + check_record_batch_with_key_with_rows(&rets[0], rows0.len(), column_num, rows0); + check_record_batch_with_key_with_rows(&rets[1], rows1.len(), column_num, rows1); + check_record_batch_with_key_with_rows(&rets[2], rows2.len(), column_num, rows2); + } +} diff --git a/analytic_engine/src/instance/mem_collector.rs b/analytic_engine/src/instance/mem_collector.rs new file mode 100644 index 0000000000..c686974b34 --- /dev/null +++ b/analytic_engine/src/instance/mem_collector.rs @@ -0,0 +1,118 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::sync::atomic::{AtomicUsize, Ordering}; + +use arena::{Collector, CollectorRef}; + +/// Space memtable memory usage collector +pub struct MemUsageCollector { + /// Memory size allocated in bytes. + bytes_allocated: AtomicUsize, + /// Memory size used in bytes. 
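+    /// Unlike `bytes_allocated`, which counts the arena blocks handed out by
+    /// the allocator, this counts the bytes actually consumed from those blocks.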
+ bytes_used: AtomicUsize, + parent: Option, +} + +impl Collector for MemUsageCollector { + fn on_alloc(&self, bytes: usize) { + self.bytes_allocated.fetch_add(bytes, Ordering::Relaxed); + + if let Some(c) = &self.parent { + c.on_alloc(bytes); + } + } + + fn on_used(&self, bytes: usize) { + self.bytes_used.fetch_add(bytes, Ordering::Relaxed); + + if let Some(c) = &self.parent { + c.on_used(bytes); + } + } + + fn on_free(&self, used: usize, allocated: usize) { + self.bytes_allocated.fetch_sub(allocated, Ordering::Relaxed); + self.bytes_used.fetch_sub(used, Ordering::Relaxed); + + if let Some(c) = &self.parent { + c.on_free(used, allocated); + } + } +} + +impl Default for MemUsageCollector { + fn default() -> Self { + Self { + bytes_allocated: AtomicUsize::new(0), + bytes_used: AtomicUsize::new(0), + parent: None, + } + } +} + +impl MemUsageCollector { + pub fn with_parent(collector: CollectorRef) -> Self { + Self { + bytes_allocated: AtomicUsize::new(0), + bytes_used: AtomicUsize::new(0), + parent: Some(collector), + } + } + + #[inline] + pub fn total_memory_allocated(&self) -> usize { + self.bytes_allocated.load(Ordering::Relaxed) + } +} + +#[cfg(test)] +mod tests { + use std::sync::{atomic::Ordering, Arc}; + + use super::*; + #[test] + fn test_collector() { + let collector = MemUsageCollector::default(); + + collector.on_alloc(1024); + collector.on_used(128); + assert_eq!(1024, collector.total_memory_allocated()); + assert_eq!(128, collector.bytes_used.load(Ordering::Relaxed)); + + collector.on_free(64, 512); + assert_eq!(512, collector.total_memory_allocated()); + assert_eq!(64, collector.bytes_used.load(Ordering::Relaxed)); + collector.on_free(64, 512); + assert_eq!(0, collector.total_memory_allocated()); + assert_eq!(0, collector.bytes_used.load(Ordering::Relaxed)); + } + + #[test] + fn test_collector_with_parent() { + let p = Arc::new(MemUsageCollector::default()); + let c1 = MemUsageCollector::with_parent(p.clone()); + let c2 = MemUsageCollector::with_parent(p.clone()); + + c1.on_alloc(1024); + c1.on_used(128); + c2.on_alloc(1024); + c2.on_used(128); + assert_eq!(1024, c1.total_memory_allocated()); + assert_eq!(128, c1.bytes_used.load(Ordering::Relaxed)); + assert_eq!(1024, c2.total_memory_allocated()); + assert_eq!(128, c2.bytes_used.load(Ordering::Relaxed)); + assert_eq!(2048, p.total_memory_allocated()); + assert_eq!(256, p.bytes_used.load(Ordering::Relaxed)); + + c1.on_free(64, 512); + assert_eq!(512, c1.total_memory_allocated()); + assert_eq!(64, c1.bytes_used.load(Ordering::Relaxed)); + assert_eq!(1536, p.total_memory_allocated()); + assert_eq!(192, p.bytes_used.load(Ordering::Relaxed)); + c2.on_free(64, 512); + assert_eq!(512, c2.total_memory_allocated()); + assert_eq!(64, c2.bytes_used.load(Ordering::Relaxed)); + assert_eq!(1024, p.total_memory_allocated()); + assert_eq!(128, p.bytes_used.load(Ordering::Relaxed)); + } +} diff --git a/analytic_engine/src/instance/mod.rs b/analytic_engine/src/instance/mod.rs new file mode 100644 index 0000000000..07bdcf350b --- /dev/null +++ b/analytic_engine/src/instance/mod.rs @@ -0,0 +1,271 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! A table engine instance +//! +//! The root mod only contains common functions of instance, other logics are +//! 
divided into the sub crates + +mod alter; +mod drop; +mod engine; +pub mod flush_compaction; +pub(crate) mod mem_collector; +pub mod open; +mod read; +mod write; +pub mod write_worker; + +use std::{ + collections::HashMap, + sync::{Arc, RwLock}, +}; + +use common_util::{define_result, runtime::Runtime}; +use log::info; +use mem_collector::MemUsageCollector; +use object_store::ObjectStore; +use parquet::{DataCacheRef, MetaCacheRef}; +use snafu::{ResultExt, Snafu}; +use table_engine::engine::EngineRuntimes; +use tokio::sync::Mutex; +use wal::manager::WalManager; + +use crate::{ + compaction::scheduler::CompactionSchedulerRef, + meta::Manifest, + space::{SpaceId, SpaceName, SpaceNameRef, SpaceRef}, + sst::file::FilePurger, + table::data::TableDataRef, + TableOptions, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to stop file purger, err:{}", source))] + StopFilePurger { source: crate::sst::file::Error }, + + #[snafu(display("Failed to stop compaction scheduler, err:{}", source))] + StopScheduler { + source: crate::compaction::scheduler::Error, + }, + + #[snafu(display("Failed to close space, name:{}, err:{}", name, source))] + CloseSpace { + name: String, + source: crate::space::Error, + }, +} + +define_result!(Error); + +/// Meta state +#[derive(Debug)] +struct MetaState { + /// Id of the last space + last_space_id: SpaceId, +} + +impl MetaState { + /// Create a new state + fn new() -> Self { + Self { last_space_id: 1 } + } + + /// Acquire next id for a new space + fn alloc_space_id(&mut self) -> SpaceId { + self.last_space_id += 1; + self.last_space_id + } +} + +impl Default for MetaState { + fn default() -> Self { + Self::new() + } +} + +/// Spaces states +#[derive(Default)] +struct Spaces { + /// Name to space + name_to_space: HashMap, + /// Id to space + id_to_space: HashMap, +} + +impl Spaces { + /// Insert space by name, and also insert id to space mapping + fn insert(&mut self, space_name: SpaceName, space: SpaceRef) { + let space_id = space.id; + self.name_to_space.insert(space_name, space.clone()); + self.id_to_space.insert(space_id, space); + } + + fn get_by_name(&self, name: SpaceNameRef) -> Option<&SpaceRef> { + self.name_to_space.get(name) + } + + /// List all tables of all spaces + fn list_all_tables(&self, tables: &mut Vec) { + let total_tables = self.id_to_space.values().map(|s| s.table_num()).sum(); + tables.reserve(total_tables); + for space in self.id_to_space.values() { + space.list_all_tables(tables); + } + } + + fn list_all_spaces(&self) -> Vec { + self.id_to_space.values().cloned().collect() + } +} + +pub struct SpaceStore { + /// All spaces of the engine. + spaces: RwLock, + /// Manifest (or meta) stores meta data of the engine instance. + manifest: Meta, + /// Wal of all tables + wal_manager: Wal, + /// Sst storage. + store: Arc, + /// Meta lock protects mutation to meta data of the instance. This lock + /// should be held when persisting mutation of the instance level meta data + /// to the manifest. + /// - add a space + /// - delete a space + /// + /// Mutation to space's meta, like add/delete a table, is protected by + /// space's lock instead of this lock. + meta_state: Mutex, + /// Sst factory. + sst_factory: Fa, + + meta_cache: Option, + data_cache: Option, +} + +impl Drop for SpaceStore { + fn drop(&mut self) { + info!("SpaceStore dropped"); + } +} + +impl SpaceStore { + async fn close(&self) -> Result<()> { + let spaces = self.spaces.read().unwrap().list_all_spaces(); + for space in spaces { + // Close all spaces. 
+ space + .close() + .await + .context(CloseSpace { name: &space.name })?; + } + + Ok(()) + } +} + +impl SpaceStore { + fn store_ref(&self) -> &Store { + &*self.store + } + + /// List all tables of all spaces + pub fn list_all_tables(&self, tables: &mut Vec) { + let spaces = self.spaces.read().unwrap(); + spaces.list_all_tables(tables); + } + + /// Find the space which it's all memtables consumes maximum memory. + #[inline] + fn find_maximum_memory_usage_space(&self) -> Option { + let spaces = self.spaces.read().unwrap().list_all_spaces(); + spaces.into_iter().max_by_key(|t| t.memtable_memory_usage()) + } +} + +/// Table engine instance +/// +/// Manages all spaces, also contains needed resources shared across all table +// TODO(yingwen): Track memory usage of all tables (or tables of space) +pub struct Instance { + /// Space storage + space_store: Arc>, + /// Runtime to execute async tasks. + runtimes: Arc, + /// Global table options, overwrite mutable options in each table's + /// TableOptions. + table_opts: TableOptions, + + // Write group options: + write_group_worker_num: usize, + write_group_command_channel_cap: usize, + // End of write group options. + compaction_scheduler: CompactionSchedulerRef, + file_purger: FilePurger, + + meta_cache: Option, + data_cache: Option, + /// Engine memtable memory usage collector + mem_usage_collector: Arc, + /// Engine write buffer size + pub(crate) db_write_buffer_size: usize, + /// Space write buffer size + pub(crate) space_write_buffer_size: usize, +} + +impl Instance { + /// Close the instance gracefully. + pub async fn close(&self) -> Result<()> { + self.file_purger.stop().await.context(StopFilePurger)?; + + self.space_store.close().await?; + + self.compaction_scheduler + .stop_scheduler() + .await + .context(StopScheduler) + } +} + +// TODO(yingwen): Instance builder +impl + Instance +{ + /// Find space using read lock + fn get_space_by_read_lock(&self, space: SpaceNameRef) -> Option { + let spaces = self.space_store.spaces.read().unwrap(); + spaces.get_by_name(space).cloned() + } + + /// Returns options to create a write group for given space + fn write_group_options(&self, space_id: SpaceId) -> write_worker::Options { + write_worker::Options { + space_id, + worker_num: self.write_group_worker_num, + runtime: self.write_runtime().clone(), + command_channel_capacity: self.write_group_command_channel_cap, + } + } + + /// Returns true when engine instance's total memtable memory usage reaches + /// db_write_buffer_size limit. + #[inline] + fn should_flush_instance(&self) -> bool { + self.db_write_buffer_size > 0 + && self.mem_usage_collector.total_memory_allocated() >= self.db_write_buffer_size + } + + #[inline] + fn read_runtime(&self) -> &Arc { + &self.runtimes.read_runtime + } + + #[inline] + fn write_runtime(&self) -> &Arc { + &self.runtimes.write_runtime + } +} + +/// Instance reference +pub type InstanceRef = Arc>; diff --git a/analytic_engine/src/instance/open.rs b/analytic_engine/src/instance/open.rs new file mode 100644 index 0000000000..deb5a047b9 --- /dev/null +++ b/analytic_engine/src/instance/open.rs @@ -0,0 +1,415 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Open logic of instance + +use std::sync::{Arc, RwLock}; + +use common_types::schema::IndexInWriterSchema; +use common_util::define_result; +use log::{debug, error, info, trace}; +use object_store::ObjectStore; +use snafu::{ResultExt, Snafu}; +use tokio::sync::{oneshot, Mutex}; +use wal::{ + log_batch::LogEntry, + manager::{LogIterator, ReadBoundary, ReadContext, ReadRequest, WalManager}, +}; + +use crate::{ + compaction::scheduler::SchedulerImpl, + context::OpenContext, + instance::{ + mem_collector::MemUsageCollector, + write_worker, + write_worker::{RecoverTableCommand, WorkerLocal, WriteGroup}, + Instance, MetaState, SpaceStore, Spaces, + }, + meta::{meta_data::ManifestData, Manifest}, + payload::{ReadPayload, WalDecoder}, + space::{Space, SpaceId}, + sst::{factory::Factory, file::FilePurger}, + table::data::{TableData, TableDataRef}, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to read meta update, err:{}", source))] + ReadMetaUpdate { + source: Box, + }, + + #[snafu(display( + "Failed to recover table data, space_id:{}, table:{}, err:{}", + space_id, + table, + source + ))] + RecoverTableData { + space_id: SpaceId, + table: String, + source: crate::table::data::Error, + }, + + #[snafu(display("Failed to read wal, err:{}", source))] + ReadWal { source: wal::manager::Error }, + + #[snafu(display("Failed to apply log entry to memtable, err:{}", source))] + ApplyMemTable { + source: crate::instance::write::Error, + }, + + #[snafu(display("Failed to recover table, source:{}", source,))] + RecoverTable { source: write_worker::Error }, +} + +define_result!(Error); + +impl< + Wal: WalManager + Send + Sync + 'static, + Meta: Manifest + Send + Sync + 'static, + Store: ObjectStore, + Fa: Factory + Send + Sync + 'static, + > Instance +{ + /// Open a new instance + pub async fn open( + ctx: OpenContext, + manifest: Meta, + wal_manager: Wal, + store: Store, + sst_factory: Fa, + ) -> Result> { + let store = Arc::new(store); + let space_store = Arc::new(SpaceStore { + spaces: RwLock::new(Spaces::default()), + manifest, + wal_manager, + store: store.clone(), + meta_state: Mutex::new(MetaState::default()), + sst_factory, + meta_cache: ctx.meta_cache.clone(), + data_cache: ctx.data_cache.clone(), + }); + + let scheduler_config = ctx.config.compaction_config.clone(); + let bg_runtime = ctx.runtimes.bg_runtime.clone(); + let compaction_scheduler = Arc::new(SchedulerImpl::new( + space_store.clone(), + bg_runtime.clone(), + scheduler_config, + )); + + let file_purger = FilePurger::start(&*bg_runtime, store); + + let instance = Arc::new(Instance { + space_store, + runtimes: ctx.runtimes.clone(), + table_opts: ctx.config.table_opts.clone(), + write_group_worker_num: ctx.config.write_group_worker_num, + write_group_command_channel_cap: ctx.config.write_group_command_channel_cap, + compaction_scheduler, + file_purger, + meta_cache: ctx.meta_cache.clone(), + data_cache: ctx.data_cache.clone(), + mem_usage_collector: Arc::new(MemUsageCollector::default()), + db_write_buffer_size: ctx.config.db_write_buffer_size, + space_write_buffer_size: ctx.config.space_write_buffer_size, + }); + + instance.recover(ctx).await?; + + Ok(instance) + } + + /// Recover the instance + /// + /// Should only called by open() + async fn recover(self: &Arc, ctx: OpenContext) -> Result<()> { + // Recover meta data, such as all spaces and tables + self.recover_from_meta(&ctx).await?; + + // Recover from wal + self.recover_from_wal(&ctx).await?; + + Ok(()) + } + + /// Recover meta data from manifest + async fn 
recover_from_meta(self: &Arc, ctx: &OpenContext) -> Result<()> { + info!("Instance recover from meta begin"); + + // Load manifest, also create a new snapshot at startup. + let manifest_data = self + .space_store + .manifest + .load_data(true) + .await + .map_err(|e| Box::new(e) as _) + .context(ReadMetaUpdate)?; + + self.apply_manifest_data(manifest_data, ctx).await?; + + info!("Instance recover from meta end"); + + Ok(()) + } + + /// Apply manifest data to instance + async fn apply_manifest_data( + self: &Arc, + manifest_data: ManifestData, + ctx: &OpenContext, + ) -> Result<()> { + // Apply all spaces. + for (space_id, space_meta_data) in manifest_data.spaces { + // Create write group for space. + let space_meta = space_meta_data.space_meta; + let write_group_opts = self.write_group_options(space_id); + let write_group = WriteGroup::new(write_group_opts, self.clone()); + + // Add this space to instance. + let space = Arc::new(Space::new( + space_id, + space_meta.space_name.clone(), + ctx.config.space_write_buffer_size, + write_group, + self.mem_usage_collector.clone(), + )); + { + let mut spaces = self.space_store.spaces.write().unwrap(); + spaces.insert(space_meta.space_name, space.clone()); + } + + // Add all tables to the space. + for (table_id, table_meta_data) in space_meta_data.tables { + let table_meta = table_meta_data.table_meta; + let table_name = table_meta.table_name.clone(); + // Choose write worker for this table + let write_handle = space.write_group.choose_worker(table_id); + + debug!("Instance apply add table, meta :{:?}", table_meta); + + let table_data = Arc::new( + TableData::recover_from_add( + table_meta, + write_handle, + &self.file_purger, + space.mem_usage_collector.clone(), + ) + .context(RecoverTableData { + space_id, + table: &table_name, + })?, + ); + // Apply version meta to the table. + let version_meta = table_meta_data.version_meta; + let max_file_id = version_meta.max_file_id_to_add(); + table_data.current_version().apply_meta(version_meta); + // In recovery case, we need to maintain last file id of the table manually. + if table_data.last_file_id() < max_file_id { + table_data.set_last_file_id(max_file_id); + } + // Add table to space. + space.insert_table(table_data); + } + } + + // Update meta state. + let mut meta_state = self.space_store.meta_state.lock().await; + meta_state.last_space_id = manifest_data.last_space_id; + + Ok(()) + } + + /// Recover all table data from wal + async fn recover_from_wal(&self, ctx: &OpenContext) -> Result<()> { + // replay_batch_size == 0 causes infinite loop. 
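+        // `recover_table_from_wal` treats a batch shorter than
+        // `replay_batch_size` as "no more data", so a value of zero would
+        // never terminate the replay loop.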
+ assert!(ctx.config.replay_batch_size > 0); + + info!("Instance recover from wal begin, ctx:{:?}", ctx); + + // For each table, recover data of that table + let tables = { + let mut tables = Vec::new(); + self.space_store.list_all_tables(&mut tables); + tables + }; + + let replay_batch_size = ctx.config.max_replay_tables_per_batch; + let mut replaying_rxs = Vec::with_capacity(replay_batch_size); + let mut replaying_tables = Vec::with_capacity(replay_batch_size); + + for table_data in tables { + // Create a oneshot channel to send/recieve recover result + let (tx, rx) = oneshot::channel(); + let cmd = RecoverTableCommand { + table_data: table_data.clone(), + tx, + replay_batch_size: ctx.config.replay_batch_size, + }; + + // Send recover request to write worker, actual works done in + // Self::recover_table_from_wal() + write_worker::send_command_to_write_worker(cmd.into_command(), &table_data).await; + + replaying_rxs.push(rx); + replaying_tables.push(table_data.clone()); + + if replaying_rxs.len() >= replay_batch_size { + // Wait batch done + write_worker::join_all(&replaying_tables, replaying_rxs) + .await + .context(RecoverTable)?; + + replaying_rxs = Vec::with_capacity(replay_batch_size); + replaying_tables.clear(); + } + } + + // Don't forget to wait the last batch done. + if !replaying_rxs.is_empty() { + write_worker::join_all(&replaying_tables, replaying_rxs) + .await + .context(RecoverTable)?; + } + + info!("Instance recover from wal end"); + + Ok(()) + } + + /// Recover table data from wal + /// + /// Called by write worker + pub(crate) async fn recover_table_from_wal( + &self, + worker_local: &WorkerLocal, + table: TableDataRef, + replay_batch_size: usize, + read_ctx: &ReadContext, + log_entry_buf: &mut Vec>, + ) -> Result<()> { + let decoder = WalDecoder::default(); + + let read_req = ReadRequest { + region_id: table.wal_region_id(), + start: ReadBoundary::Min, + end: ReadBoundary::Max, + }; + + // Read all wal of current table + let mut log_iter = self + .space_store + .wal_manager + .read(read_ctx, &read_req) + .context(ReadWal)?; + + loop { + // fetch entries to log_entry_buf + let no_more_data = { + log_entry_buf.clear(); + + for _ in 0..replay_batch_size { + if let Some(log_entry) = log_iter.next_log_entry(&decoder).context(ReadWal)? { + log_entry_buf.push(log_entry); + } else { + break; + } + } + + log_entry_buf.len() < replay_batch_size + }; + + // Replay all log entries of current table + self.replay_table_log_entries(worker_local, &*table, log_entry_buf) + .await?; + + // No more entries. 
+ if no_more_data { + break; + } + } + + Ok(()) + } + + /// Replay all log entries into memtable + async fn replay_table_log_entries( + &self, + worker_local: &WorkerLocal, + table_data: &TableData, + log_entries: &mut [LogEntry], + ) -> Result<()> { + if log_entries.is_empty() { + // No data in wal + return Ok(()); + } + + let last_sequence = log_entries.last().unwrap().sequence; + + info!( + "Instance replay table log entries begin, table:{}, table_id:{:?}, sequence:{}", + table_data.name, table_data.id, last_sequence + ); + + // TODO(yingwen): Maybe we need to trigger flush if memtable is full during + // recovery Replay entries + for log_entry in log_entries { + let (sequence, payload) = (log_entry.sequence, &mut log_entry.payload); + + // Apply to memtable + match payload { + ReadPayload::Write { row_group } => { + trace!( + "Instance replay row_group, table:{}, row_group:{:?}", + table_data.name, + row_group + ); + + let table_schema_version = table_data.schema_version(); + if table_schema_version != row_group.schema().version() { + // Data with old schema should already been flushed, but we avoid panic + // here. + error!( + "Ignore data with mismatch schema version during replaying, \ + table:{}, \ + table_id:{:?}, \ + expect:{}, \ + actual:{}, \ + last_sequence:{}, \ + sequence:{}", + table_data.name, + table_data.id, + table_schema_version, + row_group.schema().version(), + last_sequence, + sequence, + ); + + continue; + } + + let index_in_writer = + IndexInWriterSchema::for_same_schema(row_group.schema().num_columns()); + Self::write_to_memtable( + worker_local, + table_data, + sequence, + row_group, + index_in_writer, + ) + .context(ApplyMemTable)?; + } + } + } + + info!( + "Instance replay table log entries end, table:{}, table_id:{:?}, last_sequence:{}", + table_data.name, table_data.id, last_sequence + ); + + table_data.set_last_sequence(last_sequence); + + Ok(()) + } +} diff --git a/analytic_engine/src/instance/read.rs b/analytic_engine/src/instance/read.rs new file mode 100644 index 0000000000..8d47d7d8d3 --- /dev/null +++ b/analytic_engine/src/instance/read.rs @@ -0,0 +1,388 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Read logic of instance + +use std::{ + collections::BTreeMap, + pin::Pin, + task::{Context, Poll}, +}; + +use common_types::{ + projected_schema::ProjectedSchema, record_batch::RecordBatch, schema::RecordSchema, + time::TimeRange, +}; +use common_util::{define_result, runtime::Runtime}; +use futures::stream::Stream; +use log::{debug, error, trace}; +use object_store::ObjectStore; +use snafu::{ResultExt, Snafu}; +use table_engine::{ + stream::{ + self, ErrWithSource, PartitionedStreams, RecordBatchStream, SendableRecordBatchStream, + }, + table::ReadRequest, +}; +use tokio::sync::mpsc::{self, Receiver}; +use wal::manager::WalManager; + +use crate::{ + instance::Instance, + meta::Manifest, + row_iter::{ + chain, + chain::{ChainConfig, ChainIterator}, + dedup::DedupIterator, + merge::{MergeBuilder, MergeConfig, MergeIterator}, + IterOptions, RecordBatchWithKeyIterator, + }, + space::SpaceAndTable, + sst::factory::{Factory, SstReaderOptions}, + table::{ + data::TableData, + version::{ReadView, TableVersion}, + }, + table_options::TableOptions, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to scan memtable, table:{}, err:{}", table, source))] + ScanMemTable { + table: String, + source: crate::memtable::Error, + }, + + #[snafu(display("Failed to build merge iterator, table:{}, err:{}", table, source))] + BuildMergeIterator { + table: String, + source: crate::row_iter::merge::Error, + }, + + #[snafu(display("Failed to build chain iterator, table:{}, err:{}", table, source))] + BuildChainIterator { + table: String, + source: crate::row_iter::chain::Error, + }, +} + +define_result!(Error); + +const RECORD_BATCH_READ_BUF_SIZE: usize = 1000; + +/// Check whether it needs to apply merge sorting when reading the table with +/// the `table_options` by the `read_request`. +fn need_merge_sort_streams(table_options: &TableOptions, read_request: &ReadRequest) -> bool { + table_options.need_dedup() || read_request.order.is_in_order() +} + +impl + Instance +{ + /// Read data in multiple time range from table, and return + /// `read_parallelism` output streams. + pub async fn partitioned_read_from_table( + &self, + space_table: &SpaceAndTable, + request: ReadRequest, + ) -> Result { + debug!( + "Instance read from table, space:{}, table:{}, table_id:{:?}, request:{:?}", + space_table.space().name, + space_table.table_data().name, + space_table.table_data().id, + request + ); + + let table_data = space_table.table_data(); + + // Collect metrics. + table_data.metrics.on_read_request_begin(); + + let iter_options = IterOptions::default(); + let table_options = table_data.table_options(); + + if need_merge_sort_streams(&table_data.table_options(), &request) { + let merge_iters = self + .build_merge_iters(table_data, &request, iter_options, &*table_options) + .await?; + self.build_partitioned_streams(&request, merge_iters) + } else { + let chain_iters = self + .build_chain_iters(table_data, &request, &*table_options) + .await?; + self.build_partitioned_streams(&request, chain_iters) + } + } + + fn build_partitioned_streams( + &self, + request: &ReadRequest, + mut partitioned_iters: Vec, + ) -> Result { + let read_parallelism = request.opts.read_parallelism; + + if read_parallelism == 1 && request.order.is_in_desc_order() { + // TODO(xikai): it seems this can be avoided. + partitioned_iters.reverse(); + }; + + // Split iterators into `read_parallelism` groups. 
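+        // Iterators are assigned round-robin (index modulo `read_parallelism`),
+        // so each output stream gets a roughly even share of them.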
+ let mut splited_iters: Vec<_> = std::iter::repeat_with(Vec::new) + .take(read_parallelism) + .collect(); + + for (i, time_aligned_iter) in partitioned_iters.into_iter().enumerate() { + splited_iters[i % read_parallelism].push(time_aligned_iter); + } + + let mut streams = Vec::with_capacity(read_parallelism); + for iters in splited_iters { + let stream = iters_to_stream(iters, self.read_runtime(), &request.projected_schema); + streams.push(stream); + } + + assert_eq!(read_parallelism, streams.len()); + + Ok(PartitionedStreams { streams }) + } + + async fn build_merge_iters( + &self, + table_data: &TableData, + request: &ReadRequest, + iter_options: IterOptions, + table_options: &TableOptions, + ) -> Result>> { + // Current visible sequence + let sequence = table_data.last_sequence(); + let projected_schema = request.projected_schema.clone(); + let sst_reader_options = SstReaderOptions { + sst_type: table_data.sst_type, + read_batch_row_num: table_options.num_rows_per_row_group, + reverse: request.order.is_in_desc_order(), + projected_schema: projected_schema.clone(), + predicate: request.predicate.clone(), + meta_cache: self.meta_cache.clone(), + data_cache: self.data_cache.clone(), + runtime: self.read_runtime().clone(), + }; + + let time_range = request.predicate.time_range; + let version = table_data.current_version(); + let read_views = self.partition_ssts_and_memtables(time_range, version, &*table_options); + + let mut iters = Vec::with_capacity(read_views.len()); + for read_view in read_views { + let merge_config = MergeConfig { + request_id: request.request_id, + space_id: table_data.space_id, + table_id: table_data.id, + sequence, + projected_schema: projected_schema.clone(), + predicate: request.predicate.clone(), + sst_factory: self.space_store.sst_factory.clone(), + sst_reader_options: sst_reader_options.clone(), + store: self.space_store.store_ref(), + merge_iter_options: iter_options.clone(), + need_dedup: table_options.need_dedup(), + reverse: request.order.is_in_desc_order(), + }; + + let merge_iter = MergeBuilder::new(merge_config) + .sampling_mem(read_view.sampling_mem) + .memtables(read_view.memtables) + .ssts_of_level(read_view.leveled_ssts) + .build() + .await + .context(BuildMergeIterator { + table: &table_data.name, + })?; + let dedup_iter = + DedupIterator::new(request.request_id, merge_iter, iter_options.clone()); + + iters.push(dedup_iter); + } + + Ok(iters) + } + + async fn build_chain_iters( + &self, + table_data: &TableData, + request: &ReadRequest, + table_options: &TableOptions, + ) -> Result> { + let projected_schema = request.projected_schema.clone(); + + assert!(request.order.is_out_of_order()); + + let sst_reader_options = SstReaderOptions { + sst_type: table_data.sst_type, + read_batch_row_num: table_options.num_rows_per_row_group, + // no need to read in order so just read in asc order by default. 
+ reverse: false, + projected_schema: projected_schema.clone(), + predicate: request.predicate.clone(), + meta_cache: self.meta_cache.clone(), + data_cache: self.data_cache.clone(), + runtime: self.read_runtime().clone(), + }; + + let time_range = request.predicate.time_range; + let version = table_data.current_version(); + let read_views = self.partition_ssts_and_memtables(time_range, version, &*table_options); + + let mut iters = Vec::with_capacity(read_views.len()); + for read_view in read_views { + let chain_config = ChainConfig { + request_id: request.request_id, + space_id: table_data.space_id, + table_id: table_data.id, + projected_schema: projected_schema.clone(), + predicate: request.predicate.clone(), + sst_reader_options: sst_reader_options.clone(), + sst_factory: self.space_store.sst_factory.clone(), + store: self.space_store.store_ref(), + }; + let builder = chain::Builder::new(chain_config); + let chain_iter = builder + .sampling_mem(read_view.sampling_mem) + .memtables(read_view.memtables) + .ssts(read_view.leveled_ssts) + .build() + .await + .context(BuildChainIterator { + table: &table_data.name, + })?; + + iters.push(chain_iter); + } + + Ok(iters) + } + + fn partition_ssts_and_memtables( + &self, + time_range: TimeRange, + version: &TableVersion, + table_options: &TableOptions, + ) -> Vec { + let read_view = version.pick_read_view(time_range); + + let segment_duration = match table_options.segment_duration { + Some(v) => v.0, + None => { + // Segment duration is unknown, the table maybe still in sampling phase + // or the segment duration is still not applied to the table options, + // just return one partition. + return vec![read_view]; + } + }; + if read_view.contains_sampling() { + // The table contains sampling memtable, just return one partition. + return vec![read_view]; + } + + // Collect the aligned ssts and memtables into the map. + // {aligned timestamp} => {read view} + let mut read_view_by_time = BTreeMap::new(); + for (level, leveled_ssts) in read_view.leveled_ssts.into_iter().enumerate() { + for file in leveled_ssts { + let aligned_ts = file + .time_range() + .inclusive_start() + .truncate_by(segment_duration); + let entry = read_view_by_time + .entry(aligned_ts) + .or_insert_with(ReadView::default); + entry.leveled_ssts[level].push(file); + } + } + + for memtable in read_view.memtables { + let aligned_ts = memtable + .time_range + .inclusive_start() + .truncate_by(segment_duration); + let entry = read_view_by_time + .entry(aligned_ts) + .or_insert_with(ReadView::default); + entry.memtables.push(memtable); + } + + read_view_by_time.into_values().collect() + } +} + +// TODO(xikai): this is a hack way to implement SendableRecordBatchStream for +// MergeIterator. +fn iters_to_stream( + collection: T, + runtime: &Runtime, + schema: &ProjectedSchema, +) -> SendableRecordBatchStream +where + T: IntoIterator + Send + 'static, + T::Item: RecordBatchWithKeyIterator, + T::IntoIter: Send, +{ + let (tx, rx) = mpsc::channel(RECORD_BATCH_READ_BUF_SIZE); + let projected_schema = schema.clone(); + + runtime.spawn(async move { + for mut iter in collection { + while let Some(record_batch) = iter.next_batch().await.transpose() { + let record_batch = + record_batch + .map_err(|e| Box::new(e) as _) + .context(ErrWithSource { + msg: "Read record batch", + }); + + // Apply the projection to RecordBatchWithKey and gets the final RecordBatch. 
+ let record_batch = record_batch.and_then(|batch_with_key| { + // TODO(yingwen): Try to use projector to do this, which precompute row + // indexes to project. + batch_with_key + .try_project(&projected_schema) + .map_err(|e| Box::new(e) as _) + .context(ErrWithSource { + msg: "Project record batch", + }) + }); + + trace!("send next record batch:{:?}", record_batch); + if tx.send(record_batch).await.is_err() { + error!("Failed to send record batch from the merge iterator"); + break; + } + } + } + }); + + Box::pin(ChannelledRecordBatchStream { + schema: schema.to_record_schema(), + rx, + }) +} + +pub struct ChannelledRecordBatchStream { + schema: RecordSchema, + rx: Receiver>, +} + +impl Stream for ChannelledRecordBatchStream { + type Item = stream::Result; + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let this = self.get_mut(); + Pin::new(&mut this.rx).poll_recv(cx) + } +} + +impl RecordBatchStream for ChannelledRecordBatchStream { + fn schema(&self) -> &RecordSchema { + &self.schema + } +} diff --git a/analytic_engine/src/instance/write.rs b/analytic_engine/src/instance/write.rs new file mode 100644 index 0000000000..711e0c9b0d --- /dev/null +++ b/analytic_engine/src/instance/write.rs @@ -0,0 +1,464 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Write logic of instance + +use std::sync::Arc; + +use common_types::{ + bytes::ByteVec, + row::RowGroup, + schema::{IndexInWriterSchema, Schema}, +}; +use common_util::{codec::row, define_result}; +use log::{debug, error, info, trace, warn}; +use object_store::ObjectStore; +use proto::table_requests; +use smallvec::SmallVec; +use snafu::{ensure, Backtrace, ResultExt, Snafu}; +use table_engine::table::WriteRequest; +use tokio::sync::oneshot; +use wal::{ + log_batch::{LogWriteBatch, LogWriteEntry}, + manager::{SequenceNumber, WalManager, WriteContext}, +}; + +use crate::{ + instance::{ + flush_compaction::TableFlushOptions, + write_worker, + write_worker::{BackgroundStatus, WorkerLocal, WriteTableCommand}, + Instance, + }, + memtable::{key::KeySequence, PutContext}, + meta::Manifest, + payload::WritePayload, + space::SpaceAndTable, + sst::factory::Factory, + table::{ + data::{TableData, TableDataRef}, + version::MemTableForWrite, + }, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to write to wal, table:{}, err:{}", table, source))] + WriteLogBatch { + table: String, + source: wal::manager::Error, + }, + + #[snafu(display("Failed to write to memtable, table:{}, err:{}", table, source))] + WriteMemTable { + table: String, + source: crate::table::version::Error, + }, + + #[snafu(display("Try to write to a dropped table, table:{}", table))] + WriteDroppedTable { table: String }, + + #[snafu(display( + "Too many rows to write (more than {}), table:{}, rows:{}.\nBacktrace:\n{}", + MAX_ROWS_TO_WRITE, + table, + rows, + backtrace, + ))] + TooManyRows { + table: String, + rows: usize, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to find mutable memtable, table:{}, err:{}", table, source))] + FindMutableMemTable { + table: String, + source: crate::table::data::Error, + }, + #[snafu(display("Failed to write table, source:{}", source,))] + Write { source: write_worker::Error }, + + #[snafu(display("Failed to flush table, table:{}, err:{}", table, source))] + FlushTable { + table: String, + source: crate::instance::flush_compaction::Error, + }, + + #[snafu(display( + "Background flush failed, cannot write more data, err:{}.\nBacktrace:\n{}", + msg, + backtrace 
+ ))] + BackgroundFlushFailed { msg: String, backtrace: Backtrace }, + + #[snafu(display("Schema of request is incompatible with table, err:{}", source))] + IncompatSchema { + source: common_types::schema::CompatError, + }, + + #[snafu(display("Failed to encode row group, err:{}", source))] + EncodeRowGroup { + source: common_util::codec::row::Error, + }, + + #[snafu(display("Failed to update sequence of memtable, err:{}", source))] + UpdateMemTableSequence { source: crate::memtable::Error }, +} + +define_result!(Error); + +/// Max rows in a write request, must less than [u32::MAX] +const MAX_ROWS_TO_WRITE: usize = 10_000_000; + +pub struct EncodeContext { + row_group: RowGroup, + index_in_writer: IndexInWriterSchema, + encoded_rows: Vec, +} + +impl EncodeContext { + fn new(row_group: RowGroup) -> Self { + Self { + row_group, + index_in_writer: IndexInWriterSchema::default(), + encoded_rows: Vec::new(), + } + } + + fn encode_rows(&mut self, table_schema: &Schema) -> Result<()> { + // Encode the row group into the buffer, which can be reused to write to + // memtable + row::encode_row_group_for_wal( + &self.row_group, + table_schema, + &self.index_in_writer, + &mut self.encoded_rows, + ) + .context(EncodeRowGroup)?; + + assert_eq!(self.row_group.num_rows(), self.encoded_rows.len()); + + Ok(()) + } +} + +impl< + Wal: WalManager + Send + Sync + 'static, + Meta: Manifest + Send + Sync + 'static, + Store: ObjectStore, + Fa: Factory + Send + Sync + 'static, + > Instance +{ + /// Write data to the table under give space. + pub async fn write_to_table( + &self, + space_table: &SpaceAndTable, + request: WriteRequest, + ) -> Result { + // Collect metrics. + space_table.table_data().metrics.on_write_request_begin(); + + self.validate_before_write(space_table, &request)?; + + // Create a oneshot channel to send/receive write result. + let (tx, rx) = oneshot::channel(); + let cmd = WriteTableCommand { + space_table: space_table.clone(), + request, + tx, + }; + + // Send write request to write worker, actual works done in + // Self::process_write_table_command(). + write_worker::process_command_in_write_worker( + cmd.into_command(), + space_table.table_data(), + rx, + ) + .await + .context(Write) + } + + /// Do the actual write, must called by write worker in write thread + /// sequentially. + pub(crate) async fn process_write_table_command( + self: &Arc, + worker_local: &mut WorkerLocal, + space_table: &SpaceAndTable, + request: WriteRequest, + ) -> Result { + let mut encode_ctx = EncodeContext::new(request.row_group); + + self.preprocess_write(worker_local, space_table, &mut encode_ctx) + .await?; + + let table_data = space_table.table_data(); + let schema = table_data.schema(); + encode_ctx.encode_rows(&schema)?; + + let EncodeContext { + row_group, + index_in_writer, + encoded_rows, + } = encode_ctx; + + let sequence = self + .write_to_wal(worker_local, &**table_data, encoded_rows) + .await?; + + Self::write_to_memtable( + worker_local, + &**table_data, + sequence, + &row_group, + index_in_writer, + ) + .map_err(|e| { + error!( + "Failed to write to memtable, space_table:{:?}, err:{}", + space_table, e + ); + e + })?; + + // Failure of writing memtable may cause inconsecutive sequence. 
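+        // For example: if `last_sequence` is 8, a previous write may have consumed WAL
+        // sequence 9 but failed before reaching the memtable, so the current write is
+        // assigned sequence 10 and the check below sees a gap. Such gaps are only warned
+        // about; the current write itself is still applied.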
+ if table_data.last_sequence() + 1 != sequence { + warn!( + "Sequence must be consecutive, space_table:{:?}, last_sequence:{}, wal_sequence:{}", + space_table, + table_data.last_sequence(), + sequence + ); + } + + debug!( + "Instance write finished, update sequence, space_table:{:?}, last_sequence:{}", + space_table, sequence + ); + + table_data.set_last_sequence(sequence); + + let num_rows = row_group.num_rows(); + // Collect metrics. + table_data.metrics.on_write_request_done(num_rows); + + Ok(num_rows) + } + + /// Return Ok if the request is valid, this is done before entering the + /// write thread. + fn validate_before_write( + &self, + space_table: &SpaceAndTable, + request: &WriteRequest, + ) -> Result<()> { + ensure!( + request.row_group.num_rows() < MAX_ROWS_TO_WRITE, + TooManyRows { + table: &space_table.table_data().name, + rows: request.row_group.num_rows(), + } + ); + + Ok(()) + } + + /// Preprocess before write, check: + /// - whether table is dropped + /// - memtable capacity and maybe trigger flush + /// + /// Fills [common_types::schema::IndexInWriterSchema] in [EncodeContext] + async fn preprocess_write( + self: &Arc, + worker_local: &mut WorkerLocal, + space_table: &SpaceAndTable, + encode_ctx: &mut EncodeContext, + ) -> Result<()> { + let space = space_table.space(); + let table_data = space_table.table_data(); + + ensure!( + !table_data.is_dropped(), + WriteDroppedTable { + table: &table_data.name, + } + ); + + // Checks schema compability. + table_data + .schema() + .compatible_for_write( + encode_ctx.row_group.schema(), + &mut encode_ctx.index_in_writer, + ) + .context(IncompatSchema)?; + + // TODO(yingwen): Allow write and retry flush. + // Check background status, if background error occured, not allow to write + // again. + match &*worker_local.background_status() { + // Compaction error is ignored now. + BackgroundStatus::Ok | BackgroundStatus::CompactionFailed(_) => (), + BackgroundStatus::FlushFailed(e) => { + return BackgroundFlushFailed { msg: e.to_string() }.fail(); + } + } + + if self.should_flush_instance() { + if let Some(space) = self.space_store.find_maximum_memory_usage_space() { + if let Some(table) = space.find_maximum_memory_usage_table() { + info!("Trying to flush table {} bytes {} in space {} because engine total memtable memory usage exceeds db_write_buffer_size {}.", + table.name, + table.memtable_memory_usage(), + space.name, + self.db_write_buffer_size, + ); + self.handle_memtable_flush(worker_local, &table).await?; + } + } + } + + if space.should_flush_space() { + if let Some(table) = space.find_maximum_memory_usage_table() { + info!("Trying to flush table {} bytes {} in space {} because space total memtable memory usage exceeds space_write_buffer_size {}.", + table.name, + table.memtable_memory_usage() , + space.name, + space.write_buffer_size, + ); + self.handle_memtable_flush(worker_local, &table).await?; + } + } + + if table_data.should_flush_table(worker_local) { + self.handle_memtable_flush(worker_local, table_data).await?; + } + + Ok(()) + } + + /// Write log_batch into wal, return the sequence number of log_batch. 
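+    /// The returned sequence is later paired with each row's index to build the row's
+    /// `KeySequence` in the memtable, and is finally stored back as the table's
+    /// `last_sequence`.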
+ async fn write_to_wal( + &self, + _worker_local: &WorkerLocal, + table_data: &TableData, + encoded_rows: Vec, + ) -> Result { + // Convert into pb + let mut write_req_pb = table_requests::WriteRequest::new(); + // Use the table schema instead of the schema in request to avoid schema + // mismatch during replaying + write_req_pb.set_schema(table_data.schema().into()); + write_req_pb.set_rows(encoded_rows.into()); + + let mut log_batch = LogWriteBatch::new(table_data.wal_region_id()); + // Now we only have one request, so no need to use with_capacity + log_batch.push(LogWriteEntry { + payload: WritePayload::Write(&write_req_pb), + }); + + // Write to wal manager + let write_ctx = WriteContext::default(); + let sequence = self + .space_store + .wal_manager + .write(&write_ctx, &log_batch) + .await + .context(WriteLogBatch { + table: &table_data.name, + })?; + + Ok(sequence) + } + + // TODO(yingwen): How to trigger flush if we found memtables are full during + // inserting memtable? RocksDB checks memtable size in MemTableInserter + /// Write data into memtable. + /// + /// The data in `encoded_rows` will be moved to memtable. + /// + /// The len of `row_group` and `encoded_rows` must be equal. + pub(crate) fn write_to_memtable( + worker_local: &WorkerLocal, + table_data: &TableData, + sequence: SequenceNumber, + row_group: &RowGroup, + index_in_writer: IndexInWriterSchema, + ) -> Result<()> { + if row_group.is_empty() { + return Ok(()); + } + + let schema = row_group.schema(); + // Store all memtables we wrote and update their last sequence later. + let mut wrote_memtables: SmallVec<[_; 4]> = SmallVec::new(); + let mut last_mutable_mem: Option = None; + + let mut ctx = PutContext::new(index_in_writer); + for (row_idx, row) in row_group.iter().enumerate() { + // TODO(yingwen): Add RowWithSchema and take RowWithSchema as input, then remove + // this unwrap() + let timestamp = row.timestamp(schema).unwrap(); + // skip expired row + if table_data.is_expired(timestamp) { + trace!("Skip expired row when write to memtable, row:{:?}", row); + continue; + } + if last_mutable_mem.is_none() + || !last_mutable_mem + .as_ref() + .unwrap() + .accept_timestamp(timestamp) + { + // The time range is not processed by current memtable, find next one. + let mutable_mem = table_data + .find_or_create_mutable(worker_local, timestamp, schema) + .context(FindMutableMemTable { + table: &table_data.name, + })?; + wrote_memtables.push(mutable_mem.clone()); + last_mutable_mem = Some(mutable_mem); + } + + // We have check the row num is less than `MAX_ROWS_TO_WRITE`, it is safe to + // cast it to u32 here + let key_seq = KeySequence::new(sequence, row_idx as u32); + // TODO(yingwen): Batch sample timestamp in sampling phase. + last_mutable_mem + .as_ref() + .unwrap() + .put(&mut ctx, key_seq, row, schema, timestamp) + .context(WriteMemTable { + table: &table_data.name, + })?; + } + + // Update last sequence of memtable. + for mem_wrote in wrote_memtables { + mem_wrote + .set_last_sequence(sequence) + .context(UpdateMemTableSequence)?; + } + + Ok(()) + } + + /// Flush memtables of table in background. + /// + /// Only flush mutable memtables, assuming all immutable memtables are + /// flushing. + async fn handle_memtable_flush( + self: &Arc, + worker_local: &mut WorkerLocal, + table_data: &TableDataRef, + ) -> Result<()> { + let opts = TableFlushOptions::default(); + + // Set `block_on_write_thread` to false and let flush do in background. 
+ self.flush_table_in_worker(worker_local, table_data, opts) + .await + .context(FlushTable { + table: &table_data.name, + }) + } +} diff --git a/analytic_engine/src/instance/write_worker.rs b/analytic_engine/src/instance/write_worker.rs new file mode 100644 index 0000000000..41089a2605 --- /dev/null +++ b/analytic_engine/src/instance/write_worker.rs @@ -0,0 +1,970 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Write workers + +use std::{ + collections::HashMap, + future::Future, + sync::{ + atomic::{AtomicBool, AtomicI64, Ordering}, + Arc, + }, + time::Instant, +}; + +use common_util::{ + define_result, + runtime::{JoinHandle, Runtime}, + time::InstantExt, +}; +use futures::future; +use log::{error, info}; +use object_store::ObjectStore; +use snafu::{Backtrace, ResultExt, Snafu}; +use table_engine::{ + engine::DropTableRequest, + table::{ + AlterSchemaRequest, Error as TableError, Result as TableResult, TableId, WriteRequest, + }, +}; +use tokio::sync::{mpsc, oneshot, watch, watch::Ref, Mutex, Notify}; +use wal::{ + log_batch::LogEntry, + manager::{ReadContext, WalManager}, +}; + +use crate::{ + compaction::{TableCompactionRequest, WaitResult}, + instance::{ + alter, drop, + flush_compaction::{self, TableFlushOptions}, + open, write, write_worker, InstanceRef, + }, + meta::Manifest, + payload::ReadPayload, + space::{SpaceAndTable, SpaceId}, + sst::factory::Factory, + table::{data::TableDataRef, metrics::Metrics}, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to wait flush completed, channel disconnected, err:{}", source))] + WaitFlush { + source: Box, + }, + + #[snafu(display( + "Background flush failed, cannot write more data, err:{}.\nBacktrace:\n{}", + msg, + backtrace + ))] + BackgroundFlushFailed { msg: String, backtrace: Backtrace }, + + #[snafu(display( + "Failed to receive cmd result, channel disconnected, table:{}, worker_id:{}.\nBacktrace:\n{}", + table, + worker_id, + backtrace, + ))] + ReceiveFromWorker { + table: String, + worker_id: usize, + backtrace: Backtrace, + }, + + #[snafu(display("Channel error, err:{}", source))] + Channel { + source: Box, + }, +} + +define_result!(Error); + +#[derive(Debug)] +pub enum BackgroundStatus { + Ok, + FlushFailed(Arc), + CompactionFailed(Arc), +} + +/// Local state of worker +/// +/// The worker is single threaded and holding this is equivalent to holding a +/// write lock +#[derive(Debug)] +pub struct WorkerLocal { + data: Arc, + background_rx: watch::Receiver, +} + +/// Notifier for the write worker when finishing flushing. +struct FlushNotifier(Arc); + +impl FlushNotifier { + fn new(data: Arc) -> Self { + data.num_background_jobs.fetch_add(1, Ordering::SeqCst); + + Self(data) + } + + /// Mark flush is done and notify the waiter status ok (write thread). + /// Concurrency: + /// - Caller should guarantee that there is only one thread (the flush + /// thread) calling this method + pub fn notify_ok(self) { + // Mark the worker is not flushing. + self.0.set_is_flushing(false); + // Send message to notify waiter, ignore send result. + let _ = self.0.background_tx.send(BackgroundStatus::Ok); + } + + /// Mark flush is done and notify the waiter error (write thread). + /// Concurrency: + /// - Caller should guarantee that there is only one thread (the flush + /// thread) calling this method + pub fn notify_err(self, err: Arc) { + // Mark the worker is not flushing. + self.0.set_is_flushing(false); + // Send message to notify waiter, ignore send result. 
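+        // A send error only means the worker dropped its `background_rx`, i.e. the worker
+        // is already gone, so there is nobody left to notify and ignoring the result is fine.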
+ let _ = self + .0 + .background_tx + .send(BackgroundStatus::FlushFailed(err)); + } +} + +impl Drop for FlushNotifier { + fn drop(&mut self) { + // SeqCst to ensure subtraction num_background_jobs won't be reordered. + self.0.num_background_jobs.fetch_sub(1, Ordering::SeqCst); + self.0.background_notify.notify_one(); + } +} + +/// Notifier to notify compaction result. If no compaction happened, then the +/// notifier may not be signaled. +pub struct CompactionNotifier(Arc); + +impl CompactionNotifier { + fn new(data: Arc) -> Self { + data.num_background_jobs.fetch_add(1, Ordering::SeqCst); + + Self(data) + } + + pub fn notify_ok(self) { + // Send message to notify waiter, ignore send result. + let _ = self.0.background_tx.send(BackgroundStatus::Ok); + } + + pub fn notify_err(self, err: Arc) { + // Send message to notify waiter, ignore send result. + let _ = self + .0 + .background_tx + .send(BackgroundStatus::CompactionFailed(err)); + } +} + +impl Clone for CompactionNotifier { + fn clone(&self) -> Self { + // It will add num_background_jobs in CompactionNotifier::new, + // so we can't derive Clone for CompactionNotifier. + CompactionNotifier::new(self.0.clone()) + } +} + +impl Drop for CompactionNotifier { + fn drop(&mut self) { + // SeqCst to ensure subtraction num_background_jobs won't be reordered. + self.0.num_background_jobs.fetch_sub(1, Ordering::SeqCst); + self.0.background_notify.notify_one(); + } +} + +fn send_flush_result(res_sender: Option>>, res: TableResult<()>) { + if let Some(tx) = res_sender { + if let Err(send_res) = tx.send(res) { + error!("Fail to send flush result, send_res: {:?}", send_res); + } + } +} + +impl WorkerLocal { + #[inline] + pub fn background_status(&self) -> Ref<'_, BackgroundStatus> { + self.background_rx.borrow() + } + + /// Control the flush procedure and ensure multiple flush procedures to be + /// sequential. + /// + /// REQUIRE: should only be called by the write thread. + pub async fn flush_sequentially( + &mut self, + table: String, + metrics: &Metrics, + flush_job: F, + on_flush_success: T, + block_on_write_thread: bool, + res_sender: Option>>, + ) -> Result<()> + where + F: Future> + Send + 'static, + T: Future + Send + 'static, + { + // If flush operation is running, then we need to wait for it to complete first. + // Actually, the loop waiting ensures the multiple flush procedures to be + // sequential, that is to say, at most one flush is being executed at + // the same time. + let mut stall_begin = None; + while self.data.is_flushing() { + if stall_begin.is_none() { + stall_begin = Some(Instant::now()); + } + + self.background_rx + .changed() + .await + .map_err(|e| Box::new(e) as _) + .context(WaitFlush)?; + } + assert!(!self.data.is_flushing()); + + // Report write stall. + if let Some(instant) = stall_begin { + metrics.on_write_stall(instant.saturating_elapsed()); + } + + // Check background status, if background error occurred, current flush is not + // allowed. + match &*self.background_status() { + // Now background compaction error is ignored. + BackgroundStatus::Ok | BackgroundStatus::CompactionFailed(_) => (), + BackgroundStatus::FlushFailed(e) => { + return BackgroundFlushFailed { msg: e.to_string() }.fail(); + } + } + + // TODO(yingwen): Store pending flush requests and retry flush on recoverable + // error, or try to recover from background error. + + // Mark the worker is flushing. 
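+        // The flag is cleared again by FlushNotifier::notify_ok/notify_err once the
+        // spawned flush job finishes, which is also what wakes up the waiting loop above.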
+ self.data.set_is_flushing(true); + + let worker_data = self.data.clone(); + // Create a notifier, remember to mark flushed and notify wait when we done! + let notifier = FlushNotifier::new(worker_data); + let task = async move { + let flush_res = flush_job.await; + + match flush_res { + Ok(()) => { + notifier.notify_ok(); + on_flush_success.await; + send_flush_result(res_sender, Ok(())); + } + Err(e) => { + let e = Arc::new(e); + notifier.notify_err(e.clone()); + send_flush_result( + res_sender, + Err(TableError::Flush { + source: Box::new(e), + table, + }), + ); + } + } + }; + + if block_on_write_thread { + task.await; + } else { + self.data.runtime.spawn(task); + } + + Ok(()) + } + + pub fn compaction_notifier(&self) -> CompactionNotifier { + let data = self.data.clone(); + CompactionNotifier::new(data) + } +} + +/// Write table command. +pub struct WriteTableCommand { + pub space_table: SpaceAndTable, + pub request: WriteRequest, + /// Sender for the worker to return result of write + pub tx: oneshot::Sender>, +} + +impl WriteTableCommand { + /// Convert into [Command] + pub fn into_command(self) -> Command { + Command::Write(self) + } +} + +/// Recover table command. +pub struct RecoverTableCommand { + /// Table to recover + pub table_data: TableDataRef, + /// Sender for the worker to return result of recover + pub tx: oneshot::Sender>, + + // Options for recover: + /// Batch size to read records from wal to replay + pub replay_batch_size: usize, +} + +impl RecoverTableCommand { + /// Convert into [Command] + pub fn into_command(self) -> Command { + Command::Recover(self) + } +} + +/// Drop table command +pub struct DropTableCommand { + pub space_table: SpaceAndTable, + pub request: DropTableRequest, + pub tx: oneshot::Sender>, +} + +impl DropTableCommand { + /// Convert into [Command] + pub fn into_command(self) -> Command { + Command::Drop(self) + } +} + +/// Alter table command. +pub struct AlterSchemaCommand { + pub space_table: SpaceAndTable, + pub request: AlterSchemaRequest, + /// Sender for the worker to return result of alter schema + pub tx: oneshot::Sender>, +} + +impl AlterSchemaCommand { + /// Convert into [Command] + pub fn into_command(self) -> Command { + Command::AlterSchema(self) + } +} + +/// Alter table options command. +pub struct AlterOptionsCommand { + pub space_table: SpaceAndTable, + pub options: HashMap, + /// Sender for the worker to return result of alter schema + pub tx: oneshot::Sender>, +} + +impl AlterOptionsCommand { + /// Convert into [Command] + pub fn into_command(self) -> Command { + Command::AlterOptions(self) + } +} + +/// Flush table request. +pub struct FlushTableCommand { + pub space_table: SpaceAndTable, + pub flush_opts: TableFlushOptions, + pub tx: oneshot::Sender>, +} + +impl FlushTableCommand { + /// Convert into [Command] + pub fn into_command(self) -> Command { + Command::Flush(self) + } +} + +/// Compact table request. 
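+/// The optional `waiter` is forwarded to the compaction scheduler so the caller can wait
+/// for the compaction result; `tx` only acknowledges that the request has been queued.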
+pub struct CompactTableCommand { + pub space_table: SpaceAndTable, + pub waiter: Option>>, + pub tx: oneshot::Sender>, +} + +impl CompactTableCommand { + /// Convert into [Command] + pub fn into_command(self) -> Command { + Command::Compact(self) + } +} + +/// Command sent to write worker +pub enum Command { + /// Write to table + Write(WriteTableCommand), + + /// Drop table + Drop(DropTableCommand), + + /// Recover table + Recover(RecoverTableCommand), + + /// Alter table schema + AlterSchema(AlterSchemaCommand), + + /// Alter table modify setting + AlterOptions(AlterOptionsCommand), + + /// Flush table + Flush(FlushTableCommand), + + /// Compact table + Compact(CompactTableCommand), + + /// Exit the worker + Exit, +} + +/// Write handle hold by a table +#[derive(Debug, Clone)] +pub struct WriteHandle { + worker_data: Arc, +} + +impl WriteHandle { + /// Send command to write worker. + /// + /// Panic if channel is disconnected + pub async fn send_command(&self, cmd: Command) { + if self.worker_data.tx.send(cmd).await.is_err() { + error!( + "Failed to send command to worker, worker_id:{}", + self.worker_id() + ); + + panic!("write worker {} disconnected", self.worker_id()); + } + } + + /// Returns the id of the worker + pub fn worker_id(&self) -> usize { + self.worker_data.id + } +} + +pub async fn send_command_to_write_worker(cmd: Command, table_data: &TableDataRef) { + table_data.write_handle.send_command(cmd).await; +} + +pub async fn process_command_in_write_worker( + cmd: Command, + table_data: &TableDataRef, + rx: oneshot::Receiver>, +) -> Result { + send_command_to_write_worker(cmd, table_data).await; + + // Receive alter options result. + match rx.await { + Ok(res) => res.map_err(|e| Box::new(e) as _).context(Channel), + Err(_) => ReceiveFromWorker { + table: &table_data.name, + worker_id: table_data.write_handle.worker_id(), + } + .fail(), + } +} + +pub async fn join_all( + table_vec: &[TableDataRef], + rx_vec: Vec>>, +) -> Result<()> { + let results = future::join_all(rx_vec).await; + for (pos, res) in results.into_iter().enumerate() { + let table_data = &table_vec[pos]; + match res { + Ok(res) => { + res.map_err(|e| Box::new(e) as _).context(Channel)?; + } + Err(_) => { + return ReceiveFromWorker { + table: &table_data.name, + worker_id: table_data.write_handle.worker_id(), + } + .fail() + } + } + } + + Ok(()) +} + +/// Write group options +pub struct Options { + pub space_id: SpaceId, + pub worker_num: usize, + pub runtime: Arc, + /// Capacity of the command channel for each worker + pub command_channel_capacity: usize, +} + +// TODO(yingwen): Add method to stop all workers +/// Write group manages all write worker of a space +#[derive(Debug)] +pub struct WriteGroup { + /// Space of the write group. + space_id: SpaceId, + /// Shared datas of workers. + worker_datas: Vec>, + /// Join handles of workers. 
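+    /// Protected by a `Mutex` so that `stop` can await every handle exactly once and then
+    /// clear the vector, avoiding a second await on an already-joined handle.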
+ handles: Mutex>>, +} + +impl WriteGroup { + pub fn new( + opts: Options, + instance: InstanceRef, + ) -> Self + where + Wal: WalManager + Send + Sync + 'static, + Meta: Manifest + Send + Sync + 'static, + Store: ObjectStore, + Fa: Factory + Send + Sync + 'static, + { + let mut worker_datas = Vec::with_capacity(opts.worker_num); + let mut handles = Vec::with_capacity(opts.worker_num); + for id in 0..opts.worker_num { + let (tx, rx) = mpsc::channel(opts.command_channel_capacity); + let (background_tx, background_rx) = watch::channel(BackgroundStatus::Ok); + + let data = Arc::new(WorkerSharedData { + space_id: opts.space_id, + id, + tx, + is_flushing: AtomicBool::new(false), + background_tx, + runtime: opts.runtime.clone(), + num_background_jobs: AtomicI64::new(0), + background_notify: Notify::new(), + }); + + let mut worker = WriteWorker { + rx, + instance: instance.clone(), + local: WorkerLocal { + data: data.clone(), + background_rx, + }, + log_entry_buf: Vec::new(), + }; + + let space_id = opts.space_id; + // Spawn a task to run the worker + let handle = opts.runtime.spawn(async move { + worker.run().await; + + info!( + "Write worker waiting background jobs, space_id:{}, id:{}", + space_id, id + ); + + worker.wait_background_jobs_done().await; + + info!("Write worker exit, space_id:{}, id:{}", space_id, id); + }); + + worker_datas.push(data); + handles.push(handle); + } + + Self { + space_id: opts.space_id, + worker_datas, + handles: Mutex::new(handles), + } + } + + /// Stop the write group. + pub async fn stop(&self) { + for data in &self.worker_datas { + if data.tx.send(Command::Exit).await.is_err() { + error!( + "Failed to send exit command, space_id:{}, worker_id:{}", + self.space_id, data.id + ); + } + } + + let mut handles = self.handles.lock().await; + for (i, handle) in handles.iter_mut().enumerate() { + if let Err(e) = handle.await { + error!( + "Failed to join handle, space_id:{}, index:{}, err:{}", + self.space_id, i, e + ); + } + } + + // Clear all handles to avoid await again. + handles.clear(); + } + + /// Choose worker for table with `table_id`. The worker chose should be + /// consistent, so the caller can cached the handle of the worker + /// + /// Returns the WriteHandle of the worker + pub fn choose_worker(&self, table_id: TableId) -> WriteHandle { + let index = table_id.as_u64() as usize % self.worker_datas.len(); + let worker_data = self.worker_datas[index].clone(); + + WriteHandle { worker_data } + } +} + +/// Data of write worker +#[derive(Debug)] +struct WorkerSharedData { + /// Space this worker belongs to + space_id: SpaceId, + /// Id of the write worker + id: usize, + /// Sender to send command to this worker + tx: mpsc::Sender, + + /// Whether the flush job is already running + /// + /// When `is_flushing` is true, no more flush job should be scheduled + is_flushing: AtomicBool, + /// Channel to notify background status + background_tx: watch::Sender, + + /// Background job runtime. + runtime: Arc, + /// Numbers of background jobs. + num_background_jobs: AtomicI64, + /// Notify when background job finished. + background_notify: Notify, +} + +impl WorkerSharedData { + fn is_flushing(&self) -> bool { + self.is_flushing.load(Ordering::Relaxed) + } + + fn set_is_flushing(&self, is_flushing: bool) { + self.is_flushing.store(is_flushing, Ordering::Relaxed); + } +} + +/// Table write worker +/// +/// Each table is managed by exactly one write worker. Write request to a table +/// will be sent to this thread and done in this worker. 
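+///
+/// A table is pinned to its worker by `table_id % worker_num` (see
+/// `WriteGroup::choose_worker`); e.g. with 4 workers, a table whose id value is 10 always
+/// maps to worker 10 % 4 = 2, so all writes of one table are serialized on one worker.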
+/// +/// The write worker should ensure there is only one flush thread (task) is +/// running. +struct WriteWorker { + /// Command receiver + rx: mpsc::Receiver, + /// Engine instance + instance: InstanceRef, + /// Worker local states + local: WorkerLocal, + /// Log entry buffer for recover + log_entry_buf: Vec>, +} + +impl< + Wal: WalManager + Send + Sync + 'static, + Meta: Manifest + Send + Sync + 'static, + Store: ObjectStore, + Fa: Factory + Send + Sync + 'static, + > WriteWorker +{ + /// Runs the write loop until stopped + async fn run(&mut self) { + // TODO(yingwen): Maybe batch write tasks to improve performance (group commit) + loop { + let command = match self.rx.recv().await { + Some(cmd) => cmd, + None => { + info!( + "Write worker recv None, exit, space_id:{}, id:{}", + self.space_id(), + self.id() + ); + return; + } + }; + + match command { + Command::Write(cmd) => { + self.handle_write_table(cmd).await; + } + Command::Drop(cmd) => { + self.handle_drop_table(cmd).await; + } + Command::Recover(cmd) => { + self.handle_recover_table(cmd).await; + } + Command::AlterSchema(cmd) => { + self.handle_alter_schema(cmd).await; + } + Command::AlterOptions(cmd) => { + self.handle_alter_options(cmd).await; + } + Command::Flush(cmd) => { + self.handle_flush_table(cmd).await; + } + Command::Compact(cmd) => { + self.handle_compact_table(cmd).await; + } + Command::Exit => { + info!( + "Write worker recv Command::Exit, exit, space_id:{}, id:{}", + self.space_id(), + self.id() + ); + return; + } + } + } + } + + async fn wait_background_jobs_done(&self) { + while self.num_background_jobs() > 0 { + self.wait_for_notify().await; + } + } + + async fn handle_write_table(&mut self, cmd: WriteTableCommand) { + let WriteTableCommand { + space_table, + request, + tx, + } = cmd; + + let write_res = self + .instance + .process_write_table_command(&mut self.local, &space_table, request) + .await; + if let Err(res) = tx.send(write_res) { + error!( + "handle write table failed to send result, write_res:{:?}", + res + ); + } + } + + async fn handle_recover_table(&mut self, cmd: RecoverTableCommand) { + let RecoverTableCommand { + table_data, + tx, + replay_batch_size, + } = cmd; + + let read_ctx = ReadContext::default(); + self.log_entry_buf.reserve(replay_batch_size); + + let recover_res = self + .instance + .recover_table_from_wal( + &self.local, + table_data, + replay_batch_size, + &read_ctx, + &mut self.log_entry_buf, + ) + .await; + if let Err(res) = tx.send(recover_res) { + error!( + "handle recover table failed to send result, recover_res:{:?}", + res + ); + } + } + + async fn handle_drop_table(&mut self, cmd: DropTableCommand) { + let DropTableCommand { + space_table, + request, + tx, + } = cmd; + + let drop_res = self + .instance + .process_drop_table_command(&mut self.local, &space_table, request) + .await; + if let Err(res) = tx.send(drop_res) { + error!( + "handle drop table failed to send result, drop_res:{:?}", + res + ); + } + } + + async fn handle_alter_schema(&mut self, cmd: AlterSchemaCommand) { + let AlterSchemaCommand { + space_table, + request, + tx, + } = cmd; + + let alter_res = self + .instance + .process_alter_schema_command(&mut self.local, &space_table, request) + .await + .map_err(|e| Box::new(e) as Box) + .context(Channel); + if let Err(res) = tx.send(alter_res) { + error!( + "handle alter schema failed to send result, alter_res:{:?}", + res + ); + } + } + + async fn handle_alter_options(&mut self, cmd: AlterOptionsCommand) { + let AlterOptionsCommand { + space_table, + options, + 
tx, + } = cmd; + + let alter_res = self + .instance + .process_alter_options_command(&mut self.local, &space_table, options) + .await; + if let Err(res) = tx.send(alter_res) { + error!( + "handle alter schema failed to send result, alter_res:{:?}", + res + ); + } + } + + async fn handle_flush_table(&mut self, cmd: FlushTableCommand) { + let FlushTableCommand { + space_table, + flush_opts, + tx, + } = cmd; + + let flush_res = self + .instance + .flush_table_in_worker(&mut self.local, space_table.table_data(), flush_opts) + .await; + if let Err(res) = tx.send(flush_res) { + error!( + "handle flush table failed to send result, flush_res:{:?}", + res + ); + } + } + + async fn handle_compact_table(&mut self, cmd: CompactTableCommand) { + let CompactTableCommand { + space_table, + waiter, + tx, + } = cmd; + + let request = TableCompactionRequest { + table_data: space_table.table_data().clone(), + compaction_notifier: self.local.compaction_notifier(), + waiter, + }; + + self.instance.schedule_table_compaction(request).await; + if let Err(_res) = tx.send(Ok(())) { + error!("handle compact table failed to send result"); + } + } + + #[inline] + fn space_id(&self) -> SpaceId { + self.local.data.space_id + } + + #[inline] + fn id(&self) -> usize { + self.local.data.id + } + + #[inline] + fn num_background_jobs(&self) -> i64 { + self.local.data.num_background_jobs.load(Ordering::SeqCst) + } + + async fn wait_for_notify(&self) { + self.local.data.background_notify.notified().await; + } +} + +#[cfg(test)] +pub mod tests { + use common_util::runtime; + + use super::*; + + pub struct MockedWriteHandle { + pub write_handle: WriteHandle, + pub rx: mpsc::Receiver, + pub worker_local: WorkerLocal, + } + + pub struct WriteHandleMocker { + space_id: SpaceId, + runtime: Option>, + } + + impl Default for WriteHandleMocker { + fn default() -> Self { + Self { + space_id: 1, + runtime: None, + } + } + } + + impl WriteHandleMocker { + pub fn space_id(mut self, space_id: SpaceId) -> Self { + self.space_id = space_id; + self + } + + pub fn build(self) -> MockedWriteHandle { + let (tx, rx) = mpsc::channel(1); + let (background_tx, background_rx) = watch::channel(BackgroundStatus::Ok); + let runtime = self.runtime.unwrap_or_else(|| { + let rt = runtime::Builder::default().build().unwrap(); + Arc::new(rt) + }); + + let worker_data = Arc::new(WorkerSharedData { + space_id: self.space_id, + id: 0, + tx, + is_flushing: AtomicBool::new(false), + background_tx, + runtime, + num_background_jobs: AtomicI64::new(0), + background_notify: Notify::new(), + }); + + let write_handle = WriteHandle { + worker_data: worker_data.clone(), + }; + + MockedWriteHandle { + write_handle, + rx, + worker_local: WorkerLocal { + data: worker_data, + background_rx, + }, + } + } + } +} diff --git a/analytic_engine/src/lib.rs b/analytic_engine/src/lib.rs new file mode 100644 index 0000000000..a4fc60c14f --- /dev/null +++ b/analytic_engine/src/lib.rs @@ -0,0 +1,98 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Analytic table engine implementations + +mod compaction; +mod context; +mod engine; +mod instance; +pub mod memtable; +mod meta; +mod payload; +pub mod row_iter; +mod sampler; +pub mod setup; +pub mod space; +pub mod sst; +pub mod table; +pub mod table_options; + +#[cfg(any(test, feature = "test"))] +pub mod tests; + +use object_store::disk::File; +use serde_derive::Deserialize; +use wal::rocks_impl::manager::RocksImpl; + +pub use crate::{compaction::scheduler::SchedulerConfig, table_options::TableOptions}; +use crate::{ + engine::TableEngineImpl, + instance::InstanceRef, + meta::details::{ManifestImpl, Options as ManifestOptions}, + sst::factory::FactoryImpl, +}; + +/// Analytic table engine +pub type AnalyticTableEngine = + TableEngineImpl, File, FactoryImpl>; +/// Default instance +pub(crate) type EngineInstance = InstanceRef, File, FactoryImpl>; + +/// Config of analytic engine. +#[derive(Debug, Clone, Deserialize)] +#[serde(default)] +pub struct Config { + /// Data path of the engine. + pub data_path: String, + + /// Batch size to read records from wal to replay. + pub replay_batch_size: usize, + /// Batch size to replay tables. + pub max_replay_tables_per_batch: usize, + // Write group options: + pub write_group_worker_num: usize, + pub write_group_command_channel_cap: usize, + // End of write group options. + /// Default options for table. + pub table_opts: TableOptions, + + pub compaction_config: SchedulerConfig, + + /// sst meta cache capacity. + pub sst_meta_cache_cap: Option, + /// sst data cache capacity. + pub sst_data_cache_cap: Option, + + /// Manifest options. + pub manifest: ManifestOptions, + + // Global write buffer options: + /// The maximum write buffer size used for single space. + pub space_write_buffer_size: usize, + /// The maximum size of all Write Buffers across all spaces. + pub db_write_buffer_size: usize, + // End of global write buffer options. +} + +impl Default for Config { + fn default() -> Self { + Self { + data_path: String::from("/tmp/ceresdbx"), + replay_batch_size: 500, + max_replay_tables_per_batch: 64, + write_group_worker_num: 8, + write_group_command_channel_cap: 128, + table_opts: TableOptions::default(), + compaction_config: SchedulerConfig::default(), + sst_meta_cache_cap: Some(1000), + sst_data_cache_cap: Some(1000), + manifest: ManifestOptions::default(), + /// Zero means disabling this param, give a postive value to enable + /// it. + space_write_buffer_size: 0, + /// Zero means disabling this param, give a postive value to enable + /// it. + db_write_buffer_size: 0, + } + } +} diff --git a/analytic_engine/src/memtable/factory.rs b/analytic_engine/src/memtable/factory.rs new file mode 100644 index 0000000000..0867bba2da --- /dev/null +++ b/analytic_engine/src/memtable/factory.rs @@ -0,0 +1,38 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! MemTable factory + +use std::{fmt, sync::Arc}; + +use arena::CollectorRef; +use common_types::{schema::Schema, SequenceNumber}; +use common_util::define_result; +use snafu::Snafu; + +use crate::memtable::MemTableRef; + +#[derive(Debug, Snafu)] +pub enum Error {} + +define_result!(Error); + +/// MemTable options +pub struct Options { + /// Schema of the skiplist. + pub schema: Schema, + /// Block size of arena in bytes. + pub arena_block_size: u32, + /// Log sequence at the memtable creation. 
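+    /// The skiplist implementation also uses this as the initial `last_sequence` of the
+    /// created memtable.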
+ pub creation_sequence: SequenceNumber, + /// Memory usage colllector + pub collector: CollectorRef, +} + +/// MemTable factory +pub trait Factory: fmt::Debug { + /// Create a new memtable instance + fn create_memtable(&self, opts: Options) -> Result; +} + +/// MemTable Factory reference +pub type FactoryRef = Arc; diff --git a/analytic_engine/src/memtable/key.rs b/analytic_engine/src/memtable/key.rs new file mode 100644 index 0000000000..6c11837028 --- /dev/null +++ b/analytic_engine/src/memtable/key.rs @@ -0,0 +1,249 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Memtable key +//! +//! Some concepts: +//! - User key (row key) is a bytes encoded from the key columns of a row +//! - Internal key contains +//! - user key +//! - memtable key sequence +//! - sequence number +//! - index + +use std::mem; + +use common_types::{ + bytes::{BytesMut, MemBuf, MemBufMut}, + row::Row, + schema::Schema, + SequenceNumber, +}; +use common_util::{ + codec::{memcomparable::MemComparable, Decoder, Encoder}, + define_result, +}; +use snafu::{ensure, Backtrace, ResultExt, Snafu}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to encode key datum, err:{}", source))] + EncodeKeyDatum { + source: common_util::codec::memcomparable::Error, + }, + + #[snafu(display("Failed to encode sequence, err:{}", source))] + EncodeSequence { source: common_types::bytes::Error }, + + #[snafu(display("Failed to encode row index, err:{}", source))] + EncodeIndex { source: common_types::bytes::Error }, + + #[snafu(display("Failed to decode sequence, err:{}", source))] + DecodeSequence { source: common_types::bytes::Error }, + + #[snafu(display("Failed to decode row index, err:{}", source))] + DecodeIndex { source: common_types::bytes::Error }, + + #[snafu(display( + "Insufficent internal key length, len:{}.\nBacktrace:\n{}", + len, + backtrace + ))] + InternalKeyLen { len: usize, backtrace: Backtrace }, +} + +define_result!(Error); + +// u64 + u32 +const KEY_SEQUENCE_BYTES_LEN: usize = 12; + +/// Row index in the batch +pub type RowIndex = u32; + +/// Sequence number of row in memtable +/// +/// Contains: +/// - sequence number in wal (sequence number of the write batch) +/// - unique index of the row in the write batch +/// +/// Ordering: +/// 1. ordered by sequence desc +/// 2. ordered by index desc +/// +/// The desc order is implemented via MAX - seq +/// +/// The index is used to distinguish rows with same key of the same write batch +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct KeySequence(SequenceNumber, RowIndex); + +impl KeySequence { + pub fn new(sequence: SequenceNumber, index: RowIndex) -> Self { + Self(sequence, index) + } + + #[inline] + pub fn sequence(&self) -> SequenceNumber { + self.0 + } + + #[inline] + pub fn row_index(&self) -> RowIndex { + self.1 + } +} + +// TODO(yingwen): We also need opcode (PUT/DELETE), put it in key or row value +/// Comparable internal key encoder +/// +/// Key order: +/// 1. ordered by user key ascend (key parts of a row) +/// 2. 
ordered by sequence descend +/// +/// Encoding: +/// user_key + sequence +/// +/// REQUIRE: The schema of row to encode matches the Self::schema +pub struct ComparableInternalKey<'a> { + /// Sequence number of the row + sequence: KeySequence, + /// Schema of row + schema: &'a Schema, +} + +impl<'a> ComparableInternalKey<'a> { + pub fn new(sequence: KeySequence, schema: &'a Schema) -> Self { + Self { sequence, schema } + } +} + +impl<'a> Encoder for ComparableInternalKey<'a> { + type Error = Error; + + fn encode(&self, buf: &mut B, value: &Row) -> Result<()> { + let encoder = MemComparable; + for idx in 0..self.schema.num_key_columns() { + // Encode each column in primary key + encoder.encode(buf, &value[idx]).context(EncodeKeyDatum)?; + } + SequenceCodec.encode(buf, &self.sequence)?; + + Ok(()) + } + + fn estimate_encoded_size(&self, value: &Row) -> usize { + let encoder = MemComparable; + let mut total_len = 0; + for idx in 0..self.schema.num_key_columns() { + // Size of each column in primary key + total_len += encoder.estimate_encoded_size(&value[idx]); + } + // The size of sequence + total_len += KEY_SEQUENCE_BYTES_LEN; + + total_len + } +} + +struct SequenceCodec; + +impl Encoder for SequenceCodec { + type Error = Error; + + fn encode(&self, buf: &mut B, value: &KeySequence) -> Result<()> { + // Encode sequence number and index in descend order + encode_sequence_number(buf, value.sequence())?; + let reversed_index = RowIndex::MAX - value.row_index(); + buf.write_u32(reversed_index).context(EncodeIndex)?; + + Ok(()) + } + + fn estimate_encoded_size(&self, _value: &KeySequence) -> usize { + KEY_SEQUENCE_BYTES_LEN + } +} + +impl Decoder for SequenceCodec { + type Error = Error; + + fn decode(&self, buf: &mut B) -> Result { + let sequence = buf.read_u64().context(DecodeSequence)?; + // Reverse sequence + let sequence = SequenceNumber::MAX - sequence; + let row_index = buf.read_u32().context(DecodeIndex)?; + // Reverse row index + let row_index = RowIndex::MAX - row_index; + + Ok(KeySequence::new(sequence, row_index)) + } +} + +#[inline] +fn encode_sequence_number(buf: &mut B, sequence: SequenceNumber) -> Result<()> { + // The sequence need to encode in descend order + let reversed_sequence = SequenceNumber::MAX - sequence; + // Encode sequence + buf.write_u64(reversed_sequence).context(EncodeSequence)?; + Ok(()) +} + +// TODO(yingwen): Maybe make decoded internal key a type? 
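+
+// Putting the pieces above together, an internal key is laid out as
+//   memcomparable(key columns) ++ encode(u64::MAX - sequence) ++ encode(u32::MAX - row_index)
+// so keys sort ascending by user key first, and entries with the same user key sort
+// newest-first (a larger sequence encodes to a smaller value).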
+ +/// Encode internal key from user key for seek +/// +/// - user_key: the user key to encode +/// - sequence: the sequence number to encode into internal key +/// - scratch: buffer to store the encoded internal key, the scratch will be +/// clear +/// +/// Returns the slice to the encoded internal key +pub fn internal_key_for_seek<'a>( + user_key: &[u8], + sequence: SequenceNumber, + scratch: &'a mut BytesMut, +) -> Result<&'a [u8]> { + scratch.clear(); + + scratch.reserve(user_key.len() + mem::size_of::()); + scratch.extend_from_slice(user_key); + encode_sequence_number(scratch, sequence)?; + + Ok(&scratch[..]) +} + +/// Decode user key and sequence number from the internal key +pub fn user_key_from_internal_key(internal_key: &[u8]) -> Result<(&[u8], KeySequence)> { + // Empty user key is meaningless + ensure!( + internal_key.len() > KEY_SEQUENCE_BYTES_LEN, + InternalKeyLen { + len: internal_key.len(), + } + ); + + let (left, mut right) = internal_key.split_at(internal_key.len() - KEY_SEQUENCE_BYTES_LEN); + // Decode sequence number from right part + let sequence = SequenceCodec.decode(&mut right)?; + + Ok((left, sequence)) +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_sequence_codec() { + let codec = SequenceCodec; + + let sequence = KeySequence::new(123, 456); + assert_eq!(12, codec.estimate_encoded_size(&sequence)); + let mut buf = Vec::new(); + codec.encode(&mut buf, &sequence).unwrap(); + assert_eq!(12, buf.len()); + + let mut b = &buf[..]; + let decoded_sequence = codec.decode(&mut b).unwrap(); + + assert_eq!(sequence, decoded_sequence); + } +} diff --git a/analytic_engine/src/memtable/mod.rs b/analytic_engine/src/memtable/mod.rs new file mode 100644 index 0000000000..5074eff34c --- /dev/null +++ b/analytic_engine/src/memtable/mod.rs @@ -0,0 +1,198 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
MemTable + +pub mod factory; +pub mod key; +pub mod skiplist; + +use std::{ops::Bound, sync::Arc}; + +use common_types::{ + bytes::{ByteVec, Bytes}, + projected_schema::ProjectedSchema, + record_batch::RecordBatchWithKey, + row::Row, + schema::{IndexInWriterSchema, Schema}, + SequenceNumber, +}; +use common_util::define_result; +use snafu::{Backtrace, Snafu}; + +use crate::memtable::key::KeySequence; + +const DEFAULT_SCAN_BATCH_SIZE: usize = 500; + +#[derive(Debug, Snafu)] +#[snafu(visibility(pub(crate)))] +pub enum Error { + #[snafu(display("Failed to encode internal key, err:{}", source))] + EncodeInternalKey { source: crate::memtable::key::Error }, + + #[snafu(display("Failed to decode internal key, err:{}", source))] + DecodeInternalKey { source: crate::memtable::key::Error }, + + #[snafu(display("Failed to decode row, err:{}", source))] + DecodeRow { + source: common_util::codec::row::Error, + }, + + #[snafu(display("Failed to append row to batch builder, err:{}", source))] + AppendRow { + source: common_types::record_batch::Error, + }, + + #[snafu(display("Failed to build record batch, err:{}", source,))] + BuildRecordBatch { + source: common_types::record_batch::Error, + }, + + #[snafu(display("Failed to project memtable schema, err:{}", source))] + ProjectSchema { + source: common_types::projected_schema::Error, + }, + + #[snafu(display( + "Invalid sequence number to put, given:{}, last:{}.\nBacktrace:\n{}", + given, + last, + backtrace + ))] + InvalidPutSequence { + given: SequenceNumber, + last: SequenceNumber, + backtrace: Backtrace, + }, + + #[snafu(display("Invalid row, err:{}", source))] + InvalidRow { + source: Box, + }, + + #[snafu(display("Fail to iter in reverse order, err:{}", source))] + IterReverse { + source: Box, + }, +} + +define_result!(Error); + +/// Options for put and context for tracing +pub struct PutContext { + /// Buffer for encoding key, can reuse during put + pub key_buf: ByteVec, + /// Buffer for encoding value, can reuse during put + pub value_buf: ByteVec, + /// Used to encode row. + pub index_in_writer: IndexInWriterSchema, +} + +impl PutContext { + pub fn new(index_in_writer: IndexInWriterSchema) -> Self { + Self { + key_buf: ByteVec::new(), + value_buf: ByteVec::new(), + index_in_writer, + } + } +} + +/// Options for scan and context for tracing +#[derive(Debug, Clone)] +pub struct ScanContext { + /// Suggested row number per batch + pub batch_size: usize, +} + +impl Default for ScanContext { + fn default() -> Self { + Self { + batch_size: DEFAULT_SCAN_BATCH_SIZE, + } + } +} + +/// Scan request +/// +/// Now we only support forward scan. +#[derive(Debug, Clone)] +pub struct ScanRequest { + /// The start key of the encoded user key (without sequence). + pub start_user_key: Bound, + /// The end key of the encoded user key (without sequence). + pub end_user_key: Bound, + /// Max visible sequence (inclusive), row key with sequence <= this can be + /// visible. + pub sequence: SequenceNumber, + /// Schema and projection to read. + pub projected_schema: ProjectedSchema, + pub need_dedup: bool, + pub reverse: bool, +} + +/// In memory storage for table's data. +/// +/// # Concurrency +/// The memtable is designed for single-writer and mutltiple-reader usage, so +/// not all function supports concurrent writer, the caller should guarantee not +/// writing to the memtable concurrrently. 
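+///
+/// A rough usage sketch (`index_in_writer`, `wal_sequence`, `row`, `schema` and
+/// `scan_request` are placeholders, error handling omitted):
+///
+/// ```ignore
+/// let mut ctx = PutContext::new(index_in_writer);
+/// memtable.put(&mut ctx, KeySequence::new(wal_sequence, 0), &row, &schema)?;
+/// memtable.set_last_sequence(wal_sequence)?;
+/// let iter = memtable.scan(ScanContext::default(), scan_request)?;
+/// ```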
+// All operation is done in memory, no need to use async trait +pub trait MemTable { + /// Schema of this memtable + /// + /// The schema of a memtable is not allowed to change now. Modifying the + /// schema of a table requires a memtable switch and external + /// synchronization + fn schema(&self) -> &Schema; + + /// Peek the min key of this memtable. + fn min_key(&self) -> Option; + + /// Peek the max key of this memtable. + fn max_key(&self) -> Option; + + /// Insert one row into the memtable. + /// + ///.- ctx: The put context + /// - sequence: The sequence of the row + /// - row: The row to insert + /// - schema: The schema of the row + /// + /// REQUIRE: + /// - The schema of RowGroup must equal to the schema of memtable. How to + /// handle duplicate entries is implementation specific. + fn put( + &self, + ctx: &mut PutContext, + sequence: KeySequence, + row: &Row, + schema: &Schema, + ) -> Result<()>; + + /// Scan the memtable. + /// + /// Returns the data in columnar format. The returned rows is guaranteed + /// to be ordered by the primary key. + fn scan(&self, ctx: ScanContext, request: ScanRequest) -> Result; + + /// Returns an estimate of the number of bytes of data in used + fn approximate_memory_usage(&self) -> usize; + + /// Set last sequence of the memtable, returns error if the given `sequence` + /// is less than existing last sequence. + /// + /// REQUIRE: + /// - External synchronization is required. + fn set_last_sequence(&self, sequence: SequenceNumber) -> Result<()>; + + /// Returns the last sequence of the memtable. + /// + /// If the memtable is empty, then the last sequence is 0. + fn last_sequence(&self) -> SequenceNumber; +} + +/// A reference to memtable +pub type MemTableRef = Arc; + +/// A pointer to columnar iterator +pub type ColumnarIterPtr = Box> + Send + Sync>; diff --git a/analytic_engine/src/memtable/skiplist/factory.rs b/analytic_engine/src/memtable/skiplist/factory.rs new file mode 100644 index 0000000000..89dd453587 --- /dev/null +++ b/analytic_engine/src/memtable/skiplist/factory.rs @@ -0,0 +1,32 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Skiplist memtable factory + +use std::sync::{atomic::AtomicU64, Arc}; + +use arena::MonoIncArena; +use skiplist::Skiplist; + +use crate::memtable::{ + factory::{Factory, Options, Result}, + skiplist::{BytewiseComparator, SkiplistMemTable}, + MemTableRef, +}; + +/// Factory to create memtable +#[derive(Debug)] +pub struct SkiplistMemTableFactory; + +impl Factory for SkiplistMemTableFactory { + fn create_memtable(&self, opts: Options) -> Result { + let arena = MonoIncArena::with_collector(opts.arena_block_size as usize, opts.collector); + let skiplist = Skiplist::with_arena(BytewiseComparator, arena); + let memtable = Arc::new(SkiplistMemTable { + schema: opts.schema, + skiplist, + last_sequence: AtomicU64::new(opts.creation_sequence), + }); + + Ok(memtable) + } +} diff --git a/analytic_engine/src/memtable/skiplist/iter.rs b/analytic_engine/src/memtable/skiplist/iter.rs new file mode 100644 index 0000000000..0cf60cc90e --- /dev/null +++ b/analytic_engine/src/memtable/skiplist/iter.rs @@ -0,0 +1,346 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Skiplist memtable iterator + +use std::{cmp::Ordering, iter::Rev, ops::Bound}; + +use arena::{Arena, BasicStats}; +use common_types::{ + bytes::{Bytes, BytesMut}, + projected_schema::{ProjectedSchema, RowProjector}, + record_batch::{RecordBatchWithKey, RecordBatchWithKeyBuilder}, + row::contiguous::{ContiguousRowReader, ProjectedContiguousRow}, + schema::Schema, + SequenceNumber, +}; +use common_util::codec::row; +use log::trace; +use skiplist::{ArenaSlice, IterRef, Skiplist}; +use snafu::ResultExt; + +use crate::memtable::{ + key::{self, KeySequence}, + skiplist::{BytewiseComparator, SkiplistMemTable}, + AppendRow, BuildRecordBatch, DecodeInternalKey, EncodeInternalKey, IterReverse, ProjectSchema, + Result, ScanContext, ScanRequest, +}; + +/// Iterator state +#[derive(Debug, PartialEq)] +enum State { + /// The iterator struct is created but not initialized + Uninitialized, + /// The iterator is initialized (seek) + Initialized, + /// No more element the iterator can return + Finished, +} + +/// Columnar iterator for [SkiplistMemTable] +pub struct ColumnarIterImpl + Clone + Sync + Send> { + /// The internal skiplist iter + iter: IterRef, BytewiseComparator, A>, + + // Schema related: + /// Schema of this memtable, used to decode row + memtable_schema: Schema, + /// Projection of schema to read + projected_schema: ProjectedSchema, + projector: RowProjector, + + // Options related: + batch_size: usize, + + start_user_key: Bound, + end_user_key: Bound, + /// Max visible sequence + sequence: SequenceNumber, + /// State of iterator + state: State, + /// Last internal key this iterator returned + // TODO(yingwen): Wrap a internal key struct? + last_internal_key: Option>, + + /// Dedup rows with key + need_dedup: bool, +} + +impl + Clone + Sync + Send> ColumnarIterImpl { + /// Create a new [ColumnarIterImpl] + pub fn new( + memtable: &SkiplistMemTable, + ctx: ScanContext, + request: ScanRequest, + ) -> Result { + // Create projection for the memtable schema + let projector = request + .projected_schema + .try_project_with_key(&memtable.schema) + .context(ProjectSchema)?; + + let iter = memtable.skiplist.iter(); + let mut columnar_iter = Self { + iter, + memtable_schema: memtable.schema.clone(), + projected_schema: request.projected_schema, + projector, + batch_size: ctx.batch_size, + start_user_key: request.start_user_key, + end_user_key: request.end_user_key, + sequence: request.sequence, + state: State::Uninitialized, + last_internal_key: None, + need_dedup: request.need_dedup, + }; + + columnar_iter.init()?; + + Ok(columnar_iter) + } + + /// Init the iterator, will seek to the proper position for first next() + /// call, so the first entry next() returned is after the + /// `start_user_key`, but we still need to check `end_user_key` + fn init(&mut self) -> Result<()> { + match &self.start_user_key { + Bound::Included(user_key) => { + // Construct seek key + let mut key_buf = BytesMut::new(); + let seek_key = key::internal_key_for_seek(user_key, self.sequence, &mut key_buf) + .context(EncodeInternalKey)?; + + // Seek the skiplist + self.iter.seek(seek_key); + } + Bound::Excluded(user_key) => { + // Construct seek key, just seek to the key with next prefix, so there is no + // need to skip the key until we meet the first key > + // start_user_key + let next_user_key = row::key_prefix_next(user_key); + let mut key_buf = BytesMut::new(); + let seek_key = + key::internal_key_for_seek(&next_user_key, self.sequence, &mut key_buf) + .context(EncodeInternalKey)?; + + // Seek the skiplist + 
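+                // (key_prefix_next presumably yields the smallest key strictly greater than
+                // any key having `user_key` as a prefix, roughly b"abc" -> b"abd", so the
+                // seek itself already skips the excluded start key.)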
self.iter.seek(seek_key); + } + Bound::Unbounded => self.iter.seek_to_first(), + } + + self.state = State::Initialized; + + Ok(()) + } + + /// Fetch next record batch + fn fetch_next_record_batch(&mut self) -> Result> { + debug_assert_eq!(State::Initialized, self.state); + assert!(self.batch_size > 0); + + let mut builder = RecordBatchWithKeyBuilder::with_capacity( + self.projected_schema.to_record_schema_with_key(), + self.batch_size, + ); + let mut num_rows = 0; + while self.iter.valid() && num_rows < self.batch_size { + if let Some(row) = self.fetch_next_row()? { + let row_reader = ContiguousRowReader::with_schema(&row, &self.memtable_schema); + let projected_row = ProjectedContiguousRow::new(row_reader, &self.projector); + + trace!("Column iterator fetch next row, row:{:?}", projected_row); + + builder + .append_projected_contiguous_row(&projected_row) + .context(AppendRow)?; + num_rows += 1; + } else { + // There is no more row to fetch + self.finish(); + break; + } + } + + if num_rows > 0 { + let batch = builder.build().context(BuildRecordBatch)?; + trace!("column iterator send one batch:{:?}", batch); + + Ok(Some(batch)) + } else { + // If iter is invalid after seek (nothing matched), then it may not be marked as + // finished yet + self.finish(); + Ok(None) + } + } + + /// Fetch next row matched the given condition, the current entry of iter + /// will be considered + /// + /// REQUIRE: The iter is valid + fn fetch_next_row(&mut self) -> Result>> { + debug_assert_eq!(State::Initialized, self.state); + + // TODO(yingwen): Some operation like delete needs to be considered during + // iterating: we need to ignore this key if found a delete mark + while self.iter.valid() { + // Fetch current entry + let key = self.iter.key(); + let (user_key, sequence) = + key::user_key_from_internal_key(key).context(DecodeInternalKey)?; + + // Check user key is still in range + if self.is_after_end_bound(user_key) { + // Out of bound + self.finish(); + return Ok(None); + } + + if self.need_dedup { + // Whether this user key is already returned + let same_key = match &self.last_internal_key { + Some(last_internal_key) => { + // TODO(yingwen): Actually this call wont fail, only valid internal key will + // be set as last_internal_key so maybe we can just + // unwrap it? 
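+                        // Internal keys sort by (user key asc, sequence desc), so the first
+                        // visible entry seen for a user key is its newest visible version;
+                        // any later entry with the same user key is older and is skipped.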
+ let (last_user_key, _) = key::user_key_from_internal_key(last_internal_key) + .context(DecodeInternalKey)?; + user_key == last_user_key + } + // This is the first user key + None => false, + }; + + if same_key { + // We meet duplicate key, move forward and continue to find next user key + self.iter.next(); + continue; + } + // Now this is a new user key + } + + // Check whether this key is visible + if !self.is_visible(sequence) { + // The sequence of this key is not visible, move forward + self.iter.next(); + continue; + } + + // This is the row we want + let row = self.iter.value_with_arena(); + + // Store the last key + self.last_internal_key = Some(self.iter.key_with_arena()); + // Move iter forward + self.iter.next(); + + return Ok(Some(row)); + } + + // No more row in range, we can stop the iterator + self.finish(); + Ok(None) + } + + /// Return true if the sequence is visible + #[inline] + fn is_visible(&self, sequence: KeySequence) -> bool { + sequence.sequence() <= self.sequence + } + + /// Return true if the key is after the `end_user_key` bound + fn is_after_end_bound(&self, key: &[u8]) -> bool { + match &self.end_user_key { + Bound::Included(end) => match key.cmp(end) { + Ordering::Less | Ordering::Equal => false, + Ordering::Greater => true, + }, + Bound::Excluded(end) => match key.cmp(end) { + Ordering::Less => false, + Ordering::Equal | Ordering::Greater => true, + }, + // All key is valid + Bound::Unbounded => false, + } + } + + /// Mark the iterator state to finished and return None + fn finish(&mut self) { + self.state = State::Finished; + } +} + +impl + Clone + Sync + Send> Iterator for ColumnarIterImpl { + type Item = Result; + + fn next(&mut self) -> Option { + if self.state != State::Initialized { + return None; + } + + self.fetch_next_record_batch().transpose() + } +} + +/// Reversed columnar iterator. +// TODO(xikai): Now the implementation is not perfect: read all the entries +// into a buffer and reverse read it. The memtable should support scan in +// reverse order naturally. +pub struct ReversedColumnarIterator { + iter: I, + reversed_iter: Option>>>, + num_record_batch: usize, +} + +impl ReversedColumnarIterator +where + I: Iterator>, +{ + pub fn new(iter: I, num_rows: usize, batch_size: usize) -> Self { + Self { + iter, + reversed_iter: None, + num_record_batch: num_rows / batch_size, + } + } + + fn init_if_necessary(&mut self) { + if self.reversed_iter.is_some() { + return; + } + + let mut buf = Vec::with_capacity(self.num_record_batch); + for item in &mut self.iter { + buf.push(item); + } + self.reversed_iter = Some(buf.into_iter().rev()); + } +} + +impl Iterator for ReversedColumnarIterator +where + I: Iterator>, +{ + type Item = Result; + + fn next(&mut self) -> Option { + self.init_if_necessary(); + self.reversed_iter + .as_mut() + .unwrap() + .next() + .map(|v| match v { + Ok(mut batch_with_key) => { + batch_with_key + .reverse_data() + .map_err(|e| Box::new(e) as _) + .context(IterReverse)?; + + Ok(batch_with_key) + } + Err(e) => Err(e), + }) + } +} + +// TODO(yingwen): Test diff --git a/analytic_engine/src/memtable/skiplist/mod.rs b/analytic_engine/src/memtable/skiplist/mod.rs new file mode 100644 index 0000000000..2a1459bc80 --- /dev/null +++ b/analytic_engine/src/memtable/skiplist/mod.rs @@ -0,0 +1,363 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
MemTable based on skiplist + +pub mod factory; +pub mod iter; + +use std::{ + cmp::Ordering, + convert::TryInto, + sync::atomic::{self, AtomicU64}, +}; + +use arena::{Arena, BasicStats}; +use common_types::{ + bytes::Bytes, + row::{contiguous::ContiguousRowWriter, Row}, + schema::Schema, + SequenceNumber, +}; +use common_util::codec::Encoder; +use log::{debug, trace}; +use skiplist::{KeyComparator, Skiplist}; +use snafu::{ensure, ResultExt}; + +use crate::memtable::{ + key::{ComparableInternalKey, KeySequence}, + skiplist::iter::{ColumnarIterImpl, ReversedColumnarIterator}, + ColumnarIterPtr, EncodeInternalKey, InvalidPutSequence, InvalidRow, MemTable, PutContext, + Result, ScanContext, ScanRequest, +}; + +/// MemTable implementation based on skiplist +pub struct SkiplistMemTable + Clone + Sync + Send> { + /// Schema of this memtable, is immutable. + schema: Schema, + skiplist: Skiplist, + /// The last sequence of the rows in this memtable. Update to this field + /// require external synchronization. + last_sequence: AtomicU64, +} + +impl + Clone + Sync + Send + 'static> MemTable + for SkiplistMemTable +{ + fn schema(&self) -> &Schema { + &self.schema + } + + fn min_key(&self) -> Option { + let mut iter = self.skiplist.iter(); + iter.seek_to_first(); + if !iter.valid() { + None + } else { + Some(iter.key().to_vec().into()) + } + } + + fn max_key(&self) -> Option { + let mut iter = self.skiplist.iter(); + iter.seek_to_last(); + if !iter.valid() { + None + } else { + Some(iter.key().to_vec().into()) + } + } + + // TODO(yingwen): Encode value if value_buf is not set. + // Now the caller is required to encode the row into the `value_buf` in + // PutContext first. + fn put( + &self, + ctx: &mut PutContext, + sequence: KeySequence, + row: &Row, + schema: &Schema, + ) -> Result<()> { + trace!("skiplist put row, sequence:{:?}, row:{:?}", sequence, row); + + let key_encoder = ComparableInternalKey::new(sequence, schema); + + let internal_key = &mut ctx.key_buf; + // Reset key buffer + internal_key.clear(); + // Reserve capacity for key + internal_key.reserve(key_encoder.estimate_encoded_size(row)); + // Encode key + key_encoder + .encode(internal_key, row) + .context(EncodeInternalKey)?; + + // Encode row value. The ContiguousRowWriter will clear the buf. 
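+        // What one put stores into the skiplist (layout inferred from the encoders
+        // used in this function; the exact byte format is owned by the key codec and
+        // ContiguousRowWriter):
+        //   key   = memcomparable-encoded key columns of `row` + encoded KeySequence
+        //   value = contiguous encoding of the whole row
+        // so ordering and dedup only ever look at the key bytes.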
+ let row_value = &mut ctx.value_buf; + let mut row_writer = ContiguousRowWriter::new(row_value, schema, &ctx.index_in_writer); + row_writer + .write_row(row) + .map_err(|e| Box::new(e) as _) + .context(InvalidRow)?; + + self.skiplist.put(internal_key, row_value); + + Ok(()) + } + + fn scan(&self, ctx: ScanContext, request: ScanRequest) -> Result { + debug!( + "Scan skiplist memtable, ctx:{:?}, request:{:?}", + ctx, request + ); + + let num_rows = self.skiplist.len(); + let (reverse, batch_size) = (request.reverse, ctx.batch_size); + let iter = ColumnarIterImpl::new(self, ctx, request)?; + if reverse { + Ok(Box::new(ReversedColumnarIterator::new( + iter, num_rows, batch_size, + ))) + } else { + Ok(Box::new(iter)) + } + } + + fn approximate_memory_usage(&self) -> usize { + // Mem size of skiplist is u32, need to cast to usize + match self.skiplist.mem_size().try_into() { + Ok(v) => v, + // The skiplist already use bytes larger than usize + Err(_) => usize::MAX, + } + } + + fn set_last_sequence(&self, sequence: SequenceNumber) -> Result<()> { + let last = self.last_sequence(); + ensure!( + sequence >= last, + InvalidPutSequence { + given: sequence, + last + } + ); + + self.last_sequence + .store(sequence, atomic::Ordering::Relaxed); + + Ok(()) + } + + fn last_sequence(&self) -> SequenceNumber { + self.last_sequence.load(atomic::Ordering::Relaxed) + } +} + +#[derive(Debug, Clone)] +pub struct BytewiseComparator; + +impl KeyComparator for BytewiseComparator { + #[inline] + fn compare_key(&self, lhs: &[u8], rhs: &[u8]) -> Ordering { + lhs.cmp(rhs) + } + + #[inline] + fn same_key(&self, lhs: &[u8], rhs: &[u8]) -> bool { + lhs == rhs + } +} + +#[cfg(test)] +mod tests { + + use std::{ops::Bound, sync::Arc}; + + use arena::NoopCollector; + use common_types::{ + bytes::ByteVec, + datum::Datum, + projected_schema::ProjectedSchema, + record_batch::RecordBatchWithKey, + schema::IndexInWriterSchema, + tests::{build_row, build_schema}, + time::Timestamp, + }; + use common_util::codec::memcomparable::MemComparable; + + use super::*; + use crate::memtable::{ + factory::{Factory, Options}, + skiplist::factory::SkiplistMemTableFactory, + }; + + fn test_memtable_scan_for_scan_request( + schema: Schema, + memtable: Arc, + ) { + let projection: Vec = (0..schema.num_columns()).collect(); + let projected_schema = ProjectedSchema::new(schema, Some(projection)).unwrap(); + + let testcases = vec![ + ( + // limited by sequence + ScanRequest { + start_user_key: Bound::Unbounded, + end_user_key: Bound::Unbounded, + sequence: 2, + projected_schema: projected_schema.clone(), + need_dedup: true, + reverse: false, + }, + vec![ + build_row(b"a", 1, 10.0, "v1"), + build_row(b"b", 2, 10.0, "v2"), + build_row(b"c", 3, 10.0, "v3"), + build_row(b"d", 4, 10.0, "v4"), + build_row(b"e", 5, 10.0, "v5"), + build_row(b"f", 6, 10.0, "v6"), + ], + ), + ( + // limited by sequence and start/end key + ScanRequest { + start_user_key: Bound::Included(build_scan_key("a", 1)), + end_user_key: Bound::Excluded(build_scan_key("e", 5)), + sequence: 2, + projected_schema: projected_schema.clone(), + need_dedup: true, + reverse: false, + }, + vec![ + build_row(b"a", 1, 10.0, "v1"), + build_row(b"b", 2, 10.0, "v2"), + build_row(b"c", 3, 10.0, "v3"), + build_row(b"d", 4, 10.0, "v4"), + ], + ), + ( + // limited by sequence and start/end key + // but seq is one smaller than last case + ScanRequest { + start_user_key: Bound::Included(build_scan_key("a", 1)), + end_user_key: Bound::Excluded(build_scan_key("e", 5)), + sequence: 1, + projected_schema, + 
need_dedup: true, + reverse: false, + }, + vec![ + build_row(b"a", 1, 10.0, "v1"), + build_row(b"b", 2, 10.0, "v2"), + build_row(b"c", 3, 10.0, "v3"), + ], + ), + ]; + + for (req, expected) in testcases { + let scan_ctx = ScanContext::default(); + let iter = memtable.scan(scan_ctx, req).unwrap(); + check_iterator(iter, expected); + } + } + + fn test_memtable_scan_for_projection( + schema: Schema, + memtable: Arc, + ) { + let projection: Vec = (0..2).collect(); + let projected_schema = ProjectedSchema::new(schema, Some(projection)).unwrap(); + + let testcases = vec![( + ScanRequest { + start_user_key: Bound::Included(build_scan_key("a", 1)), + end_user_key: Bound::Excluded(build_scan_key("e", 5)), + sequence: 2, + projected_schema, + need_dedup: true, + reverse: false, + }, + vec![ + build_row_for_two_column(b"a", 1), + build_row_for_two_column(b"b", 2), + build_row_for_two_column(b"c", 3), + build_row_for_two_column(b"d", 4), + ], + )]; + + for (req, expected) in testcases { + let scan_ctx = ScanContext::default(); + let iter = memtable.scan(scan_ctx, req).unwrap(); + check_iterator(iter, expected); + } + } + + #[test] + fn test_memtable_scan() { + let schema = build_schema(); + let factory = SkiplistMemTableFactory; + let memtable = factory + .create_memtable(Options { + schema: schema.clone(), + arena_block_size: 512, + creation_sequence: 1, + collector: Arc::new(NoopCollector {}), + }) + .unwrap(); + + let mut ctx = PutContext::new(IndexInWriterSchema::for_same_schema(schema.num_columns())); + let input = vec![ + (KeySequence::new(1, 1), build_row(b"a", 1, 10.0, "v1")), + (KeySequence::new(1, 2), build_row(b"b", 2, 10.0, "v2")), + ( + KeySequence::new(1, 3), + build_row(b"c", 3, 10.0, "primary_key same with next row"), + ), + (KeySequence::new(1, 4), build_row(b"c", 3, 10.0, "v3")), + (KeySequence::new(2, 1), build_row(b"d", 4, 10.0, "v4")), + (KeySequence::new(2, 1), build_row(b"e", 5, 10.0, "v5")), + (KeySequence::new(2, 3), build_row(b"f", 6, 10.0, "v6")), + (KeySequence::new(3, 4), build_row(b"g", 7, 10.0, "v7")), + ]; + + for (seq, row) in input { + memtable.put(&mut ctx, seq, &row, &schema).unwrap(); + } + + test_memtable_scan_for_scan_request(schema.clone(), memtable.clone()); + test_memtable_scan_for_projection(schema, memtable); + } + + fn check_iterator>>( + iter: T, + expected_rows: Vec, + ) { + let mut visited_rows = 0; + for batch in iter { + let batch = batch.unwrap(); + for row_idx in 0..batch.num_rows() { + assert_eq!(batch.clone_row_at(row_idx), expected_rows[visited_rows]); + visited_rows += 1; + } + } + + assert_eq!(visited_rows, expected_rows.len()); + } + + fn build_scan_key(c1: &str, c2: i64) -> Bytes { + let mut buf = ByteVec::new(); + let encoder = MemComparable; + encoder.encode(&mut buf, &Datum::from(c1)).unwrap(); + encoder.encode(&mut buf, &Datum::from(c2)).unwrap(); + + Bytes::from(buf) + } + + pub fn build_row_for_two_column(key1: &[u8], key2: i64) -> Row { + let datums = vec![ + Datum::Varbinary(Bytes::copy_from_slice(key1)), + Datum::Timestamp(Timestamp::new(key2)), + ]; + + Row::from_datums(datums) + } +} diff --git a/analytic_engine/src/meta/details.rs b/analytic_engine/src/meta/details.rs new file mode 100644 index 0000000000..ae9c5a1741 --- /dev/null +++ b/analytic_engine/src/meta/details.rs @@ -0,0 +1,1282 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Implementation of Manifest + +use std::sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, +}; + +use async_trait::async_trait; +use common_types; +use common_util::define_result; +use log::{error, info, warn}; +use serde_derive::Deserialize; +use snafu::{ResultExt, Snafu}; +use tokio::sync::Mutex; +use wal::{ + log_batch::{LogWriteBatch, LogWriteEntry}, + manager::{ + LogIterator, ReadBoundary, ReadContext, ReadRequest, RegionId, SequenceNumber, WalManager, + WriteContext, + }, +}; + +use crate::meta::{ + meta_data::ManifestData, + meta_update::{ + MetaUpdate, MetaUpdateDecoder, MetaUpdatePayload, SnapshotManifestMeta, VersionEditMeta, + }, + Manifest, +}; + +/// The region id manifest used. +const MANIFEST_REGION_ID: RegionId = 1; +/// The region id to store snapshot state. +const SNAPSHOT_STATE_REGION_ID: RegionId = 2; +// The first region id of snapshot region. +const FIRST_SNAPSHOT_REGION_ID: RegionId = 3; +// The second region id of snapshot region. +const SECOND_SNAPSHOT_REGION_ID: RegionId = 4; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to write update to wal, err:{}", source))] + WriteWal { source: wal::manager::Error }, + + #[snafu(display("Failed to read wal, err:{}", source))] + ReadWal { source: wal::manager::Error }, + + #[snafu(display("Failed to read log entry, err:{}", source))] + ReadEntry { source: wal::manager::Error }, + + #[snafu(display("Failed to apply meta update, err:{}", source))] + ApplyUpdate { + source: crate::meta::meta_data::Error, + }, + + #[snafu(display("Failed to clean wal, err:{}", source))] + CleanWal { source: wal::manager::Error }, + + #[snafu(display("Failed to clean snapshot, region_id:{}, err:{}", region_id, source))] + CleanSnapshot { + region_id: RegionId, + source: wal::manager::Error, + }, + + #[snafu(display("Failed to load sequence of manifest, err:{}", source))] + LoadSequence { source: wal::manager::Error }, + + #[snafu(display("Failed to load sequence of snapshot state, err:{}", source))] + LoadSnapshotMetaSequence { source: wal::manager::Error }, + + #[snafu(display("Failed to clean snapshot state, err:{}", source))] + CleanSnapshotState { source: wal::manager::Error }, +} + +define_result!(Error); + +const STORE_UPDATE_BATCH: usize = 500; + +/// Implementation of [MetaUpdateReader] +#[derive(Debug)] +pub struct MetaUpdateReaderImpl { + iter: W::Iterator, +} + +impl MetaUpdateReaderImpl { + async fn next_update(&mut self) -> Result> { + let decoder = MetaUpdateDecoder; + + match self.iter.next_log_entry(&decoder).context(ReadEntry)? { + Some(entry) => Ok(Some(entry.payload)), + None => Ok(None), + } + } +} + +/// State to track manifest snapshot. +#[derive(Debug, Default)] +struct SnapshotState { + /// Meta data of the snapshot of the manifest, `None` if there is no + /// snapshot. 
+ snapshot_meta: Option, +} + +impl SnapshotState { + fn install_snapshot_meta(&mut self, snapshot_meta: SnapshotManifestMeta) { + self.snapshot_meta = Some(snapshot_meta); + } + + fn next_snapshot_region_id(&self) -> RegionId { + match self.snapshot_meta { + Some(snapshot_meta) => { + if snapshot_meta.snapshot_region_id == FIRST_SNAPSHOT_REGION_ID { + SECOND_SNAPSHOT_REGION_ID + } else { + FIRST_SNAPSHOT_REGION_ID + } + } + None => FIRST_SNAPSHOT_REGION_ID, + } + } +} + +#[derive(Debug, Clone, Deserialize)] +pub struct Options { + pub snapshot_every_n_updates: usize, + pub paranoid_checks: bool, +} + +impl Default for Options { + fn default() -> Self { + Self { + snapshot_every_n_updates: 10_000, + paranoid_checks: true, + } + } +} + +// TODO(yingwen): Wrap into an inner struct if there are too many Arc fields. +/// Implementation of [Manifest]. +#[derive(Debug, Clone)] +pub struct ManifestImpl { + /// Region id for this manifest. + manifest_region_id: RegionId, + /// Wal manager, the manifest use its own wal manager instance. + wal_manager: Arc, + opts: Options, + + // Snapshot related: + /// Region id to store snapshot state. + snapshot_state_region_id: RegionId, + snapshot_state: Arc>, + /// Number of updates wrote to wal since last snapshot. + num_updates_since_snapshot: Arc, +} + +impl ManifestImpl { + pub async fn open(wal_manager: W, opts: Options) -> Result { + let mut manifest = Self { + manifest_region_id: MANIFEST_REGION_ID, + wal_manager: Arc::new(wal_manager), + opts, + snapshot_state_region_id: SNAPSHOT_STATE_REGION_ID, + snapshot_state: Arc::new(Mutex::new(SnapshotState::default())), + num_updates_since_snapshot: Arc::new(AtomicUsize::new(0)), + }; + + manifest.load_snapshot_state().await?; + + Ok(manifest) + } + + async fn load_snapshot_state(&mut self) -> Result<()> { + // Load snapshot state. + let mut reader = self.read_updates_from_region( + self.snapshot_state_region_id, + ReadBoundary::Min, + ReadBoundary::Max, + )?; + + let mut last_snapshot_meta = None; + while let Some(update) = reader.next_update().await? { + // If the entry is a snapshot entry. + if let Some(snapshot_meta) = update.snapshot_manifest_meta() { + last_snapshot_meta = Some(snapshot_meta); + } else { + error!( + "Manifest found non snapshot state entry, entry:{:?}", + update + ); + } + } + + let mut snapshot_state = self.snapshot_state.lock().await; + if let Some(snapshot_meta) = last_snapshot_meta { + // Previous snapshot exists. + snapshot_state.install_snapshot_meta(snapshot_meta); + + info!( + "Manifest found snapshot_meta, snapshot_state:{:?}, last_snapshot_meta:{:?}", + snapshot_state, last_snapshot_meta + ); + } + + Ok(()) + } + + fn read_updates_from_region( + &self, + region_id: RegionId, + start: ReadBoundary, + end: ReadBoundary, + ) -> Result> { + let request = ReadRequest { + region_id, + start, + end, + }; + let ctx = ReadContext::default(); + + let iter = self.wal_manager.read(&ctx, &request).context(ReadWal)?; + + Ok(MetaUpdateReaderImpl { iter }) + } + + /// Load meta update from region of given `region_id` and apply into + /// `manifest_data`. + async fn load_data_from_region( + &self, + region_id: RegionId, + manifest_data: &mut ManifestData, + ) -> Result<()> { + self.load_data_from_region_in_range( + region_id, + ReadBoundary::Min, + ReadBoundary::Max, + manifest_data, + ) + .await?; + + Ok(()) + } + + /// Load meta update in given range from region of given `region_id` + /// boundary, and apply into `manifest_data`. Returns number of MetaUpdates + /// loaded. 
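+    // How the WAL regions declared at the top of this file are used (summarised
+    // from SnapshotState and the snapshot routines below):
+    //   MANIFEST_REGION_ID              - append-only log of every MetaUpdate
+    //   SNAPSHOT_STATE_REGION_ID        - tiny log whose newest entry is the
+    //                                     SnapshotManifestMeta pointing at the snapshot
+    //   FIRST/SECOND_SNAPSHOT_REGION_ID - used alternately, so a new snapshot is
+    //                                     written into the idle region while the old
+    //                                     one stays readable until the switch.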
+ async fn load_data_from_region_in_range( + &self, + region_id: RegionId, + start: ReadBoundary, + end: ReadBoundary, + manifest_data: &mut ManifestData, + ) -> Result { + let mut reader = self.read_updates_from_region(region_id, start, end)?; + let mut loaded = 0; + + while let Some(update) = reader.next_update().await? { + if let Err(e) = manifest_data.apply_meta_update(update).context(ApplyUpdate) { + if self.opts.paranoid_checks { + return Err(e); + } else { + warn!("Manifest load meta update failed, err:{:?}", e); + continue; + } + } + loaded += 1; + } + Ok(loaded) + } + + /// Load data and create a snapshot. + async fn create_snapshot(&self) -> Result { + info!("Manifest try to create snapshot"); + + // Acquire snapshot lock. + let mut snapshot_state = self.snapshot_state.lock().await; + let last_snapshot_meta = snapshot_state.snapshot_meta; + let next_snapshot_region_id = snapshot_state.next_snapshot_region_id(); + + // Clean next snapshot region. + self.clean_snapshot(next_snapshot_region_id).await?; + + // Load previous snapshot. + let mut manifest_start = ReadBoundary::Min; + let mut manifest_data = ManifestData::default(); + if let Some(snapshot_meta) = last_snapshot_meta { + // Load manifest from last snapshot first. + self.load_data_from_region(snapshot_meta.snapshot_region_id, &mut manifest_data) + .await?; + // The sequence after snapshot. + manifest_start = ReadBoundary::Excluded(snapshot_meta.sequence); + } + + // Get current sequence, data until this sequence can be loaded to create next + // snapshot. + let snapshot_sequence = self + .wal_manager + .sequence_num(self.manifest_region_id) + .context(LoadSequence)?; + + // Load manifest up to `snapshot_sequence`. + let num_loaded_from_manifest = self + .load_data_from_region_in_range( + self.manifest_region_id, + manifest_start, + ReadBoundary::Included(snapshot_sequence), + &mut manifest_data, + ) + .await?; + + // Store snapshot. + self.store_snapshot_to_region(next_snapshot_region_id, &manifest_data) + .await?; + + // Store snapshot state. + let next_snapshot_meta = SnapshotManifestMeta { + snapshot_region_id: next_snapshot_region_id, + sequence: snapshot_sequence, + }; + self.store_snapshot_state(next_snapshot_meta).await?; + + info!( + "Manifest stored snapshot, + next_snapshot_meta:{:?}, + last_snapshot_meta:{:?}, + snapshot_state_before_install:{:?}, + num_updates_since_snapshot:{}", + next_snapshot_meta, + last_snapshot_meta, + snapshot_state, + self.num_updates_since_snapshot() + ); + + // Install new snapshot, also bump next snapshot region id. + snapshot_state.install_snapshot_meta(next_snapshot_meta); + + // Data before sequence of the snapshot can also be removed. 
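+        // Everything up to `snapshot_sequence` is now covered by the snapshot region
+        // and the SnapshotManifestMeta stored above, so the matching entries in the
+        // manifest log are redundant and can be marked deleted below.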
+ self.wal_manager + .mark_delete_entries_up_to(self.manifest_region_id, snapshot_sequence) + .await + .context(CleanWal)?; + + self.decrease_num_updates(num_loaded_from_manifest); + + info!( + "Manifest create snapshot done, + next_snapshot_meta:{:?}, + last_snapshot_meta:{:?}, + snapshot_state:{:?}, + num_loaded_from_manifest:{}, + num_updates:{}", + next_snapshot_meta, + last_snapshot_meta, + snapshot_state, + num_loaded_from_manifest, + self.num_updates_since_snapshot() + ); + + Ok(manifest_data) + } + + async fn clean_snapshot(&self, snapshot_region_id: RegionId) -> Result<()> { + info!("Clean snapshot, snapshot_region_id:{}", snapshot_region_id); + + self.wal_manager + .mark_delete_entries_up_to(snapshot_region_id, common_types::MAX_SEQUENCE_NUMBER) + .await + .context(CleanSnapshot { + region_id: snapshot_region_id, + }) + .map_err(|e| { + error!( + "Failed to clean snapshot, region_id:{}, err:{}", + snapshot_region_id, e + ); + e + }) + } + + async fn store_snapshot_state(&self, snapshot_meta: SnapshotManifestMeta) -> Result<()> { + // Get current snapshot state sequence. + let snapshot_state_sequence = self + .wal_manager + .sequence_num(self.snapshot_state_region_id) + .context(LoadSnapshotMetaSequence)?; + // Write a snapshot entry into the region. + + self.store_update_to_region( + self.snapshot_state_region_id, + MetaUpdate::SnapshotManifest(snapshot_meta), + ) + .await?; + // Clean old snapshot state. + self.wal_manager + .mark_delete_entries_up_to(self.snapshot_state_region_id, snapshot_state_sequence) + .await + .context(CleanSnapshotState) + } + + async fn store_update_to_region( + &self, + region_id: RegionId, + update: MetaUpdate, + ) -> Result { + info!( + "Manifest impl store update, region_id:{}, update:{:?}", + region_id, update + ); + + let mut log_batch = LogWriteBatch::new(region_id); + log_batch.push(LogWriteEntry { + payload: MetaUpdatePayload::from(update), + }); + + let write_ctx = WriteContext::default(); + + self.wal_manager + .write(&write_ctx, &log_batch) + .await + .context(WriteWal) + } + + async fn store_updates_to_region( + &self, + region_id: RegionId, + updates: &[MetaUpdate], + ) -> Result { + let mut log_batch = LogWriteBatch::new(region_id); + for update in updates { + log_batch.push(LogWriteEntry { + payload: MetaUpdatePayload::from(update), + }); + } + + let write_ctx = WriteContext::default(); + + self.wal_manager + .write(&write_ctx, &log_batch) + .await + .context(WriteWal) + } + + async fn store_snapshot_to_region( + &self, + region_id: RegionId, + snapshot: &ManifestData, + ) -> Result<()> { + info!("Manifest store snapshot to region, region_id:{}", region_id); + + let mut meta_updates = Vec::with_capacity(STORE_UPDATE_BATCH); + + // Store all spaces. + for (space_id, space_meta_data) in &snapshot.spaces { + let space_meta = space_meta_data.space_meta.clone(); + // Add this space. + meta_updates.push(MetaUpdate::AddSpace(space_meta)); + + // Add all tables to the space. + for (table_id, table_meta_data) in &space_meta_data.tables { + let table_meta = table_meta_data.table_meta.clone(); + // Store table meta. + meta_updates.push(MetaUpdate::AddTable(table_meta)); + + // Store version edit. 
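+                // A snapshot is just a replayable run of ordinary MetaUpdates: AddSpace,
+                // then per table an AddTable plus one merged VersionEdit carrying all of
+                // the table's current files in `files_to_add` (and nothing in
+                // `files_to_delete`), flushed in batches of STORE_UPDATE_BATCH entries.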
+ let version_meta = &table_meta_data.version_meta; + let version_edit_meta = VersionEditMeta { + space_id: *space_id, + table_id: *table_id, + flushed_sequence: version_meta.flushed_sequence, + files_to_add: version_meta.ordered_files(), + files_to_delete: Vec::new(), + }; + meta_updates.push(MetaUpdate::VersionEdit(version_edit_meta)); + + if meta_updates.len() >= STORE_UPDATE_BATCH { + self.store_updates_to_region(region_id, &meta_updates) + .await?; + meta_updates.clear(); + } + } + } + + if !meta_updates.is_empty() { + self.store_updates_to_region(region_id, &meta_updates) + .await?; + meta_updates.clear(); + } + + Ok(()) + } + + #[inline] + fn num_updates_since_snapshot(&self) -> usize { + self.num_updates_since_snapshot.load(Ordering::Relaxed) + } + + // Guarded by snapshot state lock. + #[inline] + fn decrease_num_updates(&self, num: usize) { + if num >= self.num_updates_since_snapshot() { + self.num_updates_since_snapshot.store(0, Ordering::Relaxed); + } else { + self.num_updates_since_snapshot + .fetch_sub(num, Ordering::Relaxed); + } + } +} + +#[async_trait] +impl Manifest for ManifestImpl { + type Error = Error; + + async fn store_update(&self, update: MetaUpdate) -> Result<()> { + self.store_update_to_region(self.manifest_region_id, update) + .await?; + + let num_updates = self + .num_updates_since_snapshot + .fetch_add(1, Ordering::Relaxed); + if num_updates >= self.opts.snapshot_every_n_updates { + info!( + "Enough updates in manifest, trigger snapshot, num_updates:{}", + num_updates + ); + + self.create_snapshot().await?; + } + + Ok(()) + } + + async fn load_data(&self, do_snapshot: bool) -> Result { + if do_snapshot { + let manifest_data = self.create_snapshot().await?; + + Ok(manifest_data) + } else { + let mut manifest_data = ManifestData::default(); + + let last_snapshot_meta = { + let snapshot_state = self.snapshot_state.lock().await; + snapshot_state.snapshot_meta + }; + let mut manifest_start = ReadBoundary::Min; + // Load from snapshot. + if let Some(snapshot_meta) = last_snapshot_meta { + self.load_data_from_region(snapshot_meta.snapshot_region_id, &mut manifest_data) + .await?; + // The sequence after snapshot. + manifest_start = ReadBoundary::Excluded(snapshot_meta.sequence); + } + + // Load remaining data from wal. 
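+            // Recovery path without taking a new snapshot: the snapshot region was
+            // replayed above (when one exists), and here the manifest log tail is
+            // applied from `manifest_start` (exclusive of the snapshot sequence)
+            // up to the latest entry.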
+ self.load_data_from_region_in_range( + self.manifest_region_id, + manifest_start, + ReadBoundary::Max, + &mut manifest_data, + ) + .await?; + + Ok(manifest_data) + } + } +} + +#[cfg(test)] +mod tests { + use std::{path::PathBuf, sync::Arc}; + + use common_types::{column_schema, datum::DatumKind, schema, schema::Schema}; + use common_util::{runtime, runtime::Runtime, tests::init_log_for_test}; + use table_engine::table::TableId; + use wal::rocks_impl::manager::{Builder as WalBuilder, RocksImpl}; + + use super::*; + use crate::{ + meta::{ + details::{ManifestImpl, Options}, + meta_update::{ + AddSpaceMeta, AddTableMeta, AlterOptionsMeta, AlterSchemaMeta, DropTableMeta, + MetaUpdate, VersionEditMeta, + }, + Manifest, + }, + TableOptions, + }; + + fn build_altered_schema(schema: &Schema) -> Schema { + let mut builder = schema::Builder::new().auto_increment_column_id(true); + for column_schema in schema.key_columns() { + builder = builder + .add_key_column(column_schema.clone()) + .expect("should succeed to add key column"); + } + for column_schema in schema.normal_columns() { + builder = builder + .add_normal_column(column_schema.clone()) + .expect("should succeed to add normal column"); + } + builder + .add_normal_column( + column_schema::Builder::new("field3".to_string(), DatumKind::String) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .build() + .unwrap() + } + + fn build_runtime(thread_num: usize) -> Arc { + Arc::new( + runtime::Builder::default() + .worker_threads(thread_num) + .enable_all() + .build() + .unwrap(), + ) + } + + async fn build_manifest( + dir: impl Into, + runtime: Arc, + opts: Options, + ) -> ManifestImpl { + let manifest_wal = WalBuilder::with_default_rocksdb_config(dir, runtime.clone()) + .build() + .unwrap(); + + ManifestImpl::open(manifest_wal, opts).await.unwrap() + } + + async fn assert_expected( + dir: impl Into, + runtime: Arc, + opts: Options, + expected: &str, + ) -> Result<()> { + let manifest = build_manifest(dir, runtime, opts).await; + let data = manifest.load_data(false).await?; + assert_eq!(format!("{:#?}", data), expected); + Ok(()) + } + + async fn test_manifest_add_space( + dir: impl Into, + runtime: Arc, + opts: Options, + ) { + let space_id = 10; + let space_name = "test".to_string(); + + let manifest = build_manifest(dir, runtime, opts).await; + let add_space = MetaUpdate::AddSpace(AddSpaceMeta { + space_id, + space_name: space_name.clone(), + }); + manifest.store_update(add_space).await.unwrap(); + let data = manifest.load_data(false).await.unwrap(); + assert_eq!(data.spaces.len(), 1); + assert_eq!(data.spaces.get(&10).unwrap().space_meta.space_id, space_id); + assert_eq!( + data.spaces.get(&10).unwrap().space_meta.space_name, + space_name + ); + } + + async fn check_add_table( + dir: impl Into, + runtime: Arc, + opts: Options, + ) -> Result<()> { + let expected = r#"ManifestData { + spaces: { + 10: SpaceMetaData { + space_meta: AddSpaceMeta { + space_id: 10, + space_name: "test", + }, + tables: { + TableId(100, 0, 100): TableMetaData { + table_meta: AddTableMeta { + space_id: 10, + table_id: TableId(100, 0, 100), + table_name: "test_table", + schema: Schema { + num_key_columns: 2, + timestamp_index: 1, + tsid_index: None, + enable_tsid_primary_key: false, + column_schemas: ColumnSchemas { + columns: [ + ColumnSchema { + id: 1, + name: "key1", + data_type: Varbinary, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 2, + name: "key2", + data_type: Timestamp, + is_nullable: false, + 
is_tag: false, + comment: "", + }, + ColumnSchema { + id: 3, + name: "field1", + data_type: Double, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 4, + name: "field2", + data_type: String, + is_nullable: false, + is_tag: false, + comment: "", + }, + ], + }, + version: 1, + }, + opts: TableOptions { + segment_duration: None, + update_mode: Overwrite, + enable_ttl: true, + ttl: ReadableDuration( + 604800s, + ), + arena_block_size: 2097152, + write_buffer_size: 33554432, + compaction_strategy: Default, + num_rows_per_row_group: 8192, + compression: Zstd, + }, + }, + version_meta: TableVersionMeta { + flushed_sequence: 0, + files: {}, + max_file_id: 0, + }, + }, + }, + }, + }, + last_space_id: 10, +}"#; + assert_expected(dir, runtime, opts, expected).await + } + + async fn test_manifest_add_table( + dir: impl Into, + runtime: Arc, + opts: Options, + ) -> Result<()> { + let space_id = 10; + let manifest = build_manifest(dir, runtime, opts).await; + + let table_id = TableId::from(100); + let table_name = "test_table".to_string(); + let add_table = MetaUpdate::AddTable(AddTableMeta { + space_id, + table_id, + table_name, + schema: common_types::tests::build_schema(), + opts: TableOptions::default(), + }); + manifest.store_update(add_table).await + } + + async fn check_drop_table( + dir: impl Into, + runtime: Arc, + opts: Options, + ) -> Result<()> { + let expected = r#"ManifestData { + spaces: { + 10: SpaceMetaData { + space_meta: AddSpaceMeta { + space_id: 10, + space_name: "test", + }, + tables: {}, + }, + }, + last_space_id: 10, +}"#; + assert_expected(dir, runtime, opts, expected).await + } + + async fn test_manifest_drop_table( + dir: impl Into, + runtime: Arc, + opts: Options, + ) -> Result<()> { + let space_id = 10; + + let manifest = build_manifest(dir, runtime, opts).await; + + let table_id = TableId::from(100); + let table_name = "test_table".to_string(); + let add_table = MetaUpdate::DropTable(DropTableMeta { + space_id, + table_id, + table_name, + }); + manifest.store_update(add_table).await + } + + async fn check_version_edit_with_table( + dir: impl Into, + runtime: Arc, + opts: Options, + ) -> Result<()> { + let expected = r#"ManifestData { + spaces: { + 10: SpaceMetaData { + space_meta: AddSpaceMeta { + space_id: 10, + space_name: "test", + }, + tables: { + TableId(100, 0, 100): TableMetaData { + table_meta: AddTableMeta { + space_id: 10, + table_id: TableId(100, 0, 100), + table_name: "test_table", + schema: Schema { + num_key_columns: 2, + timestamp_index: 1, + tsid_index: None, + enable_tsid_primary_key: false, + column_schemas: ColumnSchemas { + columns: [ + ColumnSchema { + id: 1, + name: "key1", + data_type: Varbinary, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 2, + name: "key2", + data_type: Timestamp, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 3, + name: "field1", + data_type: Double, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 4, + name: "field2", + data_type: String, + is_nullable: false, + is_tag: false, + comment: "", + }, + ], + }, + version: 1, + }, + opts: TableOptions { + segment_duration: None, + update_mode: Overwrite, + enable_ttl: true, + ttl: ReadableDuration( + 604800s, + ), + arena_block_size: 2097152, + write_buffer_size: 33554432, + compaction_strategy: Default, + num_rows_per_row_group: 8192, + compression: Zstd, + }, + }, + version_meta: TableVersionMeta { + flushed_sequence: 3, + files: {}, + max_file_id: 
0, + }, + }, + }, + }, + }, + last_space_id: 10, +}"#; + assert_expected(dir, runtime, opts, expected).await + } + + async fn check_version_edit_no_table( + dir: impl Into, + runtime: Arc, + opts: Options, + ) -> Result<()> { + let expected = r#"ManifestData { + spaces: { + 10: SpaceMetaData { + space_meta: AddSpaceMeta { + space_id: 10, + space_name: "test", + }, + tables: {}, + }, + }, + last_space_id: 10, +}"#; + assert_expected(dir, runtime, opts, expected).await + } + + async fn test_manifest_version_edit( + dir: impl Into, + runtime: Arc, + opts: Options, + ) -> Result<()> { + let space_id = 10; + + let manifest = build_manifest(dir, runtime, opts).await; + + let table_id = TableId::from(100); + let version_edit = MetaUpdate::VersionEdit(VersionEditMeta { + space_id, + table_id, + flushed_sequence: 3, + files_to_add: Vec::new(), + files_to_delete: Vec::new(), + }); + manifest.store_update(version_edit).await + } + + async fn check_alter_schema( + dir: impl Into, + runtime: Arc, + opts: Options, + ) -> Result<()> { + let expected = r#"ManifestData { + spaces: { + 10: SpaceMetaData { + space_meta: AddSpaceMeta { + space_id: 10, + space_name: "test", + }, + tables: { + TableId(100, 0, 100): TableMetaData { + table_meta: AddTableMeta { + space_id: 10, + table_id: TableId(100, 0, 100), + table_name: "test_table", + schema: Schema { + num_key_columns: 2, + timestamp_index: 1, + tsid_index: None, + enable_tsid_primary_key: false, + column_schemas: ColumnSchemas { + columns: [ + ColumnSchema { + id: 1, + name: "key1", + data_type: Varbinary, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 2, + name: "key2", + data_type: Timestamp, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 3, + name: "field1", + data_type: Double, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 4, + name: "field2", + data_type: String, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 5, + name: "field3", + data_type: String, + is_nullable: false, + is_tag: false, + comment: "", + }, + ], + }, + version: 1, + }, + opts: TableOptions { + segment_duration: None, + update_mode: Overwrite, + enable_ttl: true, + ttl: ReadableDuration( + 604800s, + ), + arena_block_size: 2097152, + write_buffer_size: 33554432, + compaction_strategy: Default, + num_rows_per_row_group: 8192, + compression: Zstd, + }, + }, + version_meta: TableVersionMeta { + flushed_sequence: 3, + files: {}, + max_file_id: 0, + }, + }, + }, + }, + }, + last_space_id: 10, +}"#; + assert_expected(dir, runtime, opts, expected).await + } + + async fn test_manifest_alter_schema( + dir: impl Into, + runtime: Arc, + opts: Options, + ) -> Result<()> { + let space_id = 10; + let manifest = build_manifest(dir, runtime, opts).await; + + let table_id = TableId::from(100); + let alter_schema = MetaUpdate::AlterSchema(AlterSchemaMeta { + space_id, + table_id, + schema: build_altered_schema(&common_types::tests::build_schema()), + pre_schema_version: 1, + }); + manifest.store_update(alter_schema).await + } + + async fn check_alter_options( + dir: impl Into, + runtime: Arc, + opts: Options, + ) -> Result<()> { + let expected = r#"ManifestData { + spaces: { + 10: SpaceMetaData { + space_meta: AddSpaceMeta { + space_id: 10, + space_name: "test", + }, + tables: { + TableId(100, 0, 100): TableMetaData { + table_meta: AddTableMeta { + space_id: 10, + table_id: TableId(100, 0, 100), + table_name: "test_table", + schema: Schema { + num_key_columns: 2, 
+ timestamp_index: 1, + tsid_index: None, + enable_tsid_primary_key: false, + column_schemas: ColumnSchemas { + columns: [ + ColumnSchema { + id: 1, + name: "key1", + data_type: Varbinary, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 2, + name: "key2", + data_type: Timestamp, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 3, + name: "field1", + data_type: Double, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 4, + name: "field2", + data_type: String, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 5, + name: "field3", + data_type: String, + is_nullable: false, + is_tag: false, + comment: "", + }, + ], + }, + version: 1, + }, + opts: TableOptions { + segment_duration: None, + update_mode: Overwrite, + enable_ttl: false, + ttl: ReadableDuration( + 604800s, + ), + arena_block_size: 2097152, + write_buffer_size: 33554432, + compaction_strategy: Default, + num_rows_per_row_group: 8192, + compression: Zstd, + }, + }, + version_meta: TableVersionMeta { + flushed_sequence: 3, + files: {}, + max_file_id: 0, + }, + }, + }, + }, + }, + last_space_id: 10, +}"#; + assert_expected(dir, runtime, opts, expected).await + } + + async fn test_manifest_alter_options( + dir: impl Into, + runtime: Arc, + opts: Options, + ) -> Result<()> { + let space_id = 10; + + let manifest = build_manifest(dir, runtime, opts).await; + + let table_id = TableId::from(100); + let alter_options = MetaUpdate::AlterOptions(AlterOptionsMeta { + space_id, + table_id, + options: TableOptions { + enable_ttl: false, + ..Default::default() + }, + }); + manifest.store_update(alter_options).await + } + + #[test] + fn test_manifest() { + init_log_for_test(); + let dir = tempfile::tempdir().unwrap(); + let runtime = build_runtime(2); + let runtime_clone = runtime.clone(); + runtime.block_on(async move { + let opts = Options { + snapshot_every_n_updates: 2, + paranoid_checks: false, + }; + + test_manifest_add_space(dir.path(), runtime_clone.clone(), opts.clone()).await; + + test_manifest_add_table(dir.path(), runtime_clone.clone(), opts.clone()) + .await + .unwrap(); + assert!( + check_add_table(dir.path(), runtime_clone.clone(), opts.clone()) + .await + .is_ok() + ); + + test_manifest_drop_table(dir.path(), runtime_clone.clone(), opts.clone()) + .await + .unwrap(); + assert!( + check_drop_table(dir.path(), runtime_clone.clone(), opts.clone()) + .await + .is_ok() + ); + { + let opts = Options { + snapshot_every_n_updates: 2, + paranoid_checks: true, + }; + test_manifest_version_edit(dir.path(), runtime_clone.clone(), opts.clone()) + .await + .unwrap(); + assert!(check_version_edit_no_table( + dir.path(), + runtime_clone.clone(), + opts.clone() + ) + .await + .is_ok()); + + test_manifest_alter_schema(dir.path(), runtime_clone.clone(), opts.clone()) + .await + .unwrap(); + assert!( + check_alter_schema(dir.path(), runtime_clone.clone(), opts.clone()) + .await + .is_err() + ); + + test_manifest_alter_options(dir.path(), runtime_clone.clone(), opts.clone()) + .await + .unwrap(); + assert!(check_alter_options(dir.path(), runtime_clone.clone(), opts) + .await + .is_err()); + } + { + test_manifest_add_table(dir.path(), runtime_clone.clone(), opts.clone()) + .await + .unwrap(); + assert!( + check_add_table(dir.path(), runtime_clone.clone(), opts.clone()) + .await + .is_ok() + ); + + test_manifest_version_edit(dir.path(), runtime_clone.clone(), opts.clone()) + .await + .unwrap(); + 
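+                // With the table re-added above, the alter_schema / alter_options checks
+                // that returned errors in the previous block are expected to pass now.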
assert!(check_version_edit_with_table( + dir.path(), + runtime_clone.clone(), + opts.clone() + ) + .await + .is_ok()); + + test_manifest_alter_schema(dir.path(), runtime_clone.clone(), opts.clone()) + .await + .unwrap(); + assert!( + check_alter_schema(dir.path(), runtime_clone.clone(), opts.clone()) + .await + .is_ok() + ); + + test_manifest_alter_options(dir.path(), runtime_clone.clone(), opts.clone()) + .await + .unwrap(); + assert!(check_alter_options(dir.path(), runtime_clone, opts) + .await + .is_ok()); + } + }); + } +} diff --git a/analytic_engine/src/meta/meta_data.rs b/analytic_engine/src/meta/meta_data.rs new file mode 100644 index 0000000000..07467d9b9f --- /dev/null +++ b/analytic_engine/src/meta/meta_data.rs @@ -0,0 +1,193 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Meta data of manifest. + +use std::collections::BTreeMap; + +use common_util::define_result; +use log::{debug, info}; +use snafu::{ensure, Backtrace, OptionExt, Snafu}; +use table_engine::table::TableId; + +use crate::{ + meta::meta_update::{AddSpaceMeta, AddTableMeta, MetaUpdate}, + space::SpaceId, + table::version::TableVersionMeta, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "Space id corrupted (last >= given), last:{}, given:{}.\nBacktrace:\n{}", + last, + given, + backtrace + ))] + SpaceIdCorrupted { + last: SpaceId, + given: SpaceId, + backtrace: Backtrace, + }, + + #[snafu(display( + "Space of table is missing, maybe corrupted, space_id:{}, table:{}.\nBacktrace:\n{}", + space_id, + table_name, + backtrace, + ))] + TableSpaceMiss { + space_id: SpaceId, + table_name: String, + backtrace: Backtrace, + }, + + #[snafu(display( + "Space is missing, maybe corrupted, space_id:{}.\nBacktrace:\n{}", + space_id, + backtrace, + ))] + SpaceMiss { + space_id: SpaceId, + backtrace: Backtrace, + }, + + #[snafu(display( + "Table is missing, maybe corrupted, space_id:{}, table_id:{}.\nBacktrace:\n{}", + space_id, + table_id, + backtrace, + ))] + TableMiss { + space_id: SpaceId, + table_id: TableId, + backtrace: Backtrace, + }, +} + +define_result!(Error); + +#[derive(Debug)] +pub struct TableMetaData { + pub table_meta: AddTableMeta, + pub version_meta: TableVersionMeta, +} + +#[derive(Debug)] +pub struct SpaceMetaData { + pub space_meta: AddSpaceMeta, + // Use BTreeMap to order table meta by table id. + pub tables: BTreeMap, +} + +/// Holds the final view of the data in manifest. +#[derive(Debug, Default)] +pub struct ManifestData { + // Use BTreeMap to order space meta by space id, so space with smaller id + // can be processed first. This is necessary especially in creating snapshot. + pub spaces: BTreeMap, + pub last_space_id: SpaceId, +} + +impl ManifestData { + pub fn apply_meta_update(&mut self, update: MetaUpdate) -> Result<()> { + debug!("Apply meta update, update:{:?}", update); + + // TODO(yingwen): Ignore space not found error when we support drop space. 
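+        // Illustrative replay (ids are made up): applying, in order,
+        //   AddSpace { space_id: 1 } -> AddTable { space_id: 1, table_id: t1 }
+        //   -> VersionEdit for t1 -> DropTable for t1
+        // leaves `spaces[1]` present with an empty table map, which is how a dropped
+        // table looks after a full manifest replay.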
+ match update { + MetaUpdate::AddSpace(meta) => { + ensure!( + self.last_space_id <= meta.space_id, + SpaceIdCorrupted { + last: self.last_space_id, + given: meta.space_id, + } + ); + + self.last_space_id = meta.space_id; + self.spaces.insert( + meta.space_id, + SpaceMetaData { + space_meta: meta, + tables: BTreeMap::new(), + }, + ); + } + MetaUpdate::AddTable(meta) => { + let space = self + .spaces + .get_mut(&meta.space_id) + .context(TableSpaceMiss { + space_id: meta.space_id, + table_name: &meta.table_name, + })?; + space.tables.insert( + meta.table_id, + TableMetaData { + table_meta: meta, + version_meta: TableVersionMeta::default(), + }, + ); + } + MetaUpdate::VersionEdit(meta) => { + let space = self.spaces.get_mut(&meta.space_id).context(SpaceMiss { + space_id: meta.space_id, + })?; + // If there is a background compaction/flush job, then version edit + // may be stored after a drop table entry being stored. We ignore + // that case and won't return error if table is not found. + let table = match space.tables.get_mut(&meta.table_id) { + Some(v) => v, + None => { + info!("Table of version edit not found, meta:{:?}", meta); + + return Ok(()); + } + }; + let edit = meta.into_version_edit(); + table.version_meta.apply_edit(edit); + } + MetaUpdate::AlterSchema(meta) => { + let space = self.spaces.get_mut(&meta.space_id).context(SpaceMiss { + space_id: meta.space_id, + })?; + let table = space.tables.get_mut(&meta.table_id).context(TableMiss { + space_id: meta.space_id, + table_id: meta.table_id, + })?; + + // Update schema of AddTableMeta. + table.table_meta.schema = meta.schema; + } + MetaUpdate::AlterOptions(meta) => { + let space = self.spaces.get_mut(&meta.space_id).context(SpaceMiss { + space_id: meta.space_id, + })?; + let table = space.tables.get_mut(&meta.table_id).context(TableMiss { + space_id: meta.space_id, + table_id: meta.table_id, + })?; + + // Update options of AddTableMeta. + table.table_meta.opts = meta.options; + } + MetaUpdate::DropTable(meta) => { + let space = self.spaces.get_mut(&meta.space_id).context(SpaceMiss { + space_id: meta.space_id, + })?; + + let removed_table = space.tables.remove(&meta.table_id); + + debug!( + "Apply drop table meta update, removed table:{}, removed:{}", + meta.table_name, + removed_table.is_some() + ); + } + MetaUpdate::SnapshotManifest(_) => { + // A snapshot record, no need to handle this. + } + } + + Ok(()) + } +} diff --git a/analytic_engine/src/meta/meta_update.rs b/analytic_engine/src/meta/meta_update.rs new file mode 100644 index 0000000000..06e8f86099 --- /dev/null +++ b/analytic_engine/src/meta/meta_update.rs @@ -0,0 +1,463 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Update to meta + +use std::convert::{TryFrom, TryInto}; + +use common_types::{ + bytes::{MemBuf, MemBufMut, Writer}, + schema::{Schema, Version}, + SequenceNumber, +}; +use common_util::define_result; +use proto::{analytic_common, common as common_pb, meta_update as meta_pb}; +use protobuf::Message; +use snafu::{Backtrace, ResultExt, Snafu}; +use table_engine::table::TableId; +use wal::{ + log_batch::{Payload, PayloadDecoder}, + manager::RegionId, +}; + +use crate::{ + space::SpaceId, + table::version_edit::{AddFile, DeleteFile, VersionEdit}, + TableOptions, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to encode payload, err:{}.\nBacktrace:\n{}", source, backtrace))] + EncodePayloadPb { + source: protobuf::error::ProtobufError, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to convert schema, err:{}", source))] + ConvertSchema { source: common_types::schema::Error }, + + #[snafu(display("Empty meta update.\nBacktrace:\n{}", backtrace))] + EmptyMetaUpdate { backtrace: Backtrace }, + + #[snafu(display("Failed to decode payload, err:{}.\nBacktrace:\n{}", source, backtrace))] + DecodePayloadPb { + source: protobuf::error::ProtobufError, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to convert version edit, err:{}", source))] + ConvertVersionEdit { + source: crate::table::version_edit::Error, + }, +} + +define_result!(Error); + +/// Modifications to meta data in meta +#[derive(Debug, Clone)] +pub enum MetaUpdate { + AddSpace(AddSpaceMeta), + AddTable(AddTableMeta), + DropTable(DropTableMeta), + VersionEdit(VersionEditMeta), + AlterSchema(AlterSchemaMeta), + AlterOptions(AlterOptionsMeta), + SnapshotManifest(SnapshotManifestMeta), +} + +impl MetaUpdate { + pub fn into_pb(self) -> meta_pb::MetaUpdate { + let mut meta_update = meta_pb::MetaUpdate::new(); + + match self { + MetaUpdate::AddSpace(v) => { + meta_update.set_add_space(v.into_pb()); + } + MetaUpdate::AddTable(v) => { + meta_update.set_add_table(v.into_pb()); + } + MetaUpdate::VersionEdit(v) => { + meta_update.set_version_edit(v.into_pb()); + } + MetaUpdate::AlterSchema(v) => { + meta_update.set_alter_schema(v.into_pb()); + } + MetaUpdate::AlterOptions(v) => { + meta_update.set_alter_options(v.into_pb()); + } + MetaUpdate::DropTable(v) => { + meta_update.set_drop_table(v.into_pb()); + } + MetaUpdate::SnapshotManifest(v) => { + meta_update.set_snapshot_manifest(v.into_pb()); + } + } + + meta_update + } + + pub fn snapshot_manifest_meta(&self) -> Option { + if let MetaUpdate::SnapshotManifest(v) = self { + Some(*v) + } else { + None + } + } +} + +impl TryFrom for MetaUpdate { + type Error = Error; + + fn try_from(src: meta_pb::MetaUpdate) -> Result { + let meta_update = match src.meta { + Some(meta_pb::MetaUpdate_oneof_meta::add_space(v)) => { + let add_space = AddSpaceMeta::from(v); + MetaUpdate::AddSpace(add_space) + } + Some(meta_pb::MetaUpdate_oneof_meta::add_table(v)) => { + let add_table = AddTableMeta::try_from(v)?; + MetaUpdate::AddTable(add_table) + } + Some(meta_pb::MetaUpdate_oneof_meta::version_edit(v)) => { + let version_edit = VersionEditMeta::try_from(v)?; + MetaUpdate::VersionEdit(version_edit) + } + Some(meta_pb::MetaUpdate_oneof_meta::alter_schema(v)) => { + let alter_schema = AlterSchemaMeta::try_from(v)?; + MetaUpdate::AlterSchema(alter_schema) + } + Some(meta_pb::MetaUpdate_oneof_meta::alter_options(v)) => { + let alter_options = AlterOptionsMeta::from(v); + MetaUpdate::AlterOptions(alter_options) + } + Some(meta_pb::MetaUpdate_oneof_meta::drop_table(v)) => { + let 
drop_table = DropTableMeta::from(v); + MetaUpdate::DropTable(drop_table) + } + Some(meta_pb::MetaUpdate_oneof_meta::snapshot_manifest(v)) => { + let snapshot_manifest = SnapshotManifestMeta::from(v); + MetaUpdate::SnapshotManifest(snapshot_manifest) + } + None => { + // Meta update should not be empty. + return EmptyMetaUpdate.fail(); + } + }; + + Ok(meta_update) + } +} + +/// Meta data for a new space +#[derive(Debug, Clone)] +pub struct AddSpaceMeta { + pub space_id: SpaceId, + pub space_name: String, +} + +impl AddSpaceMeta { + fn into_pb(self) -> meta_pb::AddSpaceMeta { + let mut target = meta_pb::AddSpaceMeta::new(); + target.set_space_id(self.space_id); + target.set_space_name(self.space_name); + + target + } +} + +impl From for AddSpaceMeta { + fn from(src: meta_pb::AddSpaceMeta) -> Self { + Self { + space_id: src.space_id, + space_name: src.space_name, + } + } +} + +/// Meta data for a new table +#[derive(Debug, Clone)] +pub struct AddTableMeta { + /// Space id of the table + pub space_id: SpaceId, + pub table_id: TableId, + pub table_name: String, + /// Schema of the table + pub schema: Schema, + // Options needed to persist + pub opts: TableOptions, +} + +impl AddTableMeta { + fn into_pb(self) -> meta_pb::AddTableMeta { + let mut target = meta_pb::AddTableMeta::new(); + target.set_space_id(self.space_id); + target.set_table_id(self.table_id.as_u64()); + target.set_table_name(self.table_name); + target.set_schema(common_pb::TableSchema::from(self.schema)); + target.set_options(analytic_common::TableOptions::from(self.opts)); + + target + } +} + +impl TryFrom for AddTableMeta { + type Error = Error; + + fn try_from(mut src: meta_pb::AddTableMeta) -> Result { + let table_schema = src.take_schema(); + let opts = src.take_options(); + + Ok(Self { + space_id: src.space_id, + table_id: TableId::from(src.table_id), + table_name: src.table_name, + schema: Schema::try_from(table_schema).context(ConvertSchema)?, + opts: TableOptions::from(opts), + }) + } +} + +/// Meta data for dropping a table +#[derive(Debug, Clone)] +pub struct DropTableMeta { + /// Space id of the table + pub space_id: SpaceId, + pub table_id: TableId, + pub table_name: String, +} + +impl DropTableMeta { + fn into_pb(self) -> meta_pb::DropTableMeta { + let mut target = meta_pb::DropTableMeta::new(); + target.set_space_id(self.space_id); + target.set_table_id(self.table_id.as_u64()); + target.set_table_name(self.table_name); + + target + } +} + +impl From for DropTableMeta { + fn from(src: meta_pb::DropTableMeta) -> Self { + Self { + space_id: src.space_id, + table_id: TableId::from(src.table_id), + table_name: src.table_name, + } + } +} + +/// Meta data of version edit to table +#[derive(Debug, Clone)] +pub struct VersionEditMeta { + pub space_id: SpaceId, + pub table_id: TableId, + /// Sequence number of the flushed data. Set to 0 if this edit is not + /// created by a flush request. 
+ pub flushed_sequence: SequenceNumber, + pub files_to_add: Vec, + pub files_to_delete: Vec, +} + +impl VersionEditMeta { + fn into_pb(self) -> meta_pb::VersionEditMeta { + let mut target = meta_pb::VersionEditMeta::new(); + target.set_space_id(self.space_id); + target.set_table_id(self.table_id.as_u64()); + target.set_flushed_sequence(self.flushed_sequence); + + let mut files_to_add = Vec::with_capacity(self.files_to_add.len()); + for file in self.files_to_add { + files_to_add.push(file.into_pb()); + } + target.files_to_add = files_to_add.into(); + + let mut files_to_delete = Vec::with_capacity(self.files_to_delete.len()); + for file in self.files_to_delete { + files_to_delete.push(file.into_pb()); + } + target.files_to_delete = files_to_delete.into(); + + target + } + + /// Convert into [crate::table::version_edit::VersionEdit]. The + /// `mems_to_remove` field is left empty. + pub fn into_version_edit(self) -> VersionEdit { + VersionEdit { + mems_to_remove: Vec::new(), + flushed_sequence: self.flushed_sequence, + files_to_add: self.files_to_add, + files_to_delete: self.files_to_delete, + } + } +} + +impl TryFrom for VersionEditMeta { + type Error = Error; + + fn try_from(src: meta_pb::VersionEditMeta) -> Result { + let mut files_to_add = Vec::with_capacity(src.files_to_add.len()); + for file_meta in src.files_to_add { + files_to_add.push(AddFile::try_from(file_meta).context(ConvertVersionEdit)?); + } + + let mut files_to_delete = Vec::with_capacity(src.files_to_delete.len()); + for file_meta in src.files_to_delete { + files_to_delete.push(DeleteFile::try_from(file_meta).context(ConvertVersionEdit)?); + } + + Ok(Self { + space_id: src.space_id, + table_id: TableId::from(src.table_id), + flushed_sequence: src.flushed_sequence, + files_to_add, + files_to_delete, + }) + } +} + +/// Meta data of schema update. +#[derive(Debug, Clone)] +pub struct AlterSchemaMeta { + pub space_id: SpaceId, + pub table_id: TableId, + pub schema: Schema, + pub pre_schema_version: Version, +} + +impl AlterSchemaMeta { + fn into_pb(self) -> meta_pb::AlterSchemaMeta { + let mut target = meta_pb::AlterSchemaMeta::new(); + target.set_space_id(self.space_id); + target.set_table_id(self.table_id.as_u64()); + target.set_schema(common_pb::TableSchema::from(self.schema)); + target.set_pre_schema_version(self.pre_schema_version); + + target + } +} + +impl TryFrom for AlterSchemaMeta { + type Error = Error; + + fn try_from(mut src: meta_pb::AlterSchemaMeta) -> Result { + let table_schema = src.take_schema(); + + Ok(Self { + space_id: src.space_id, + table_id: TableId::from(src.table_id), + schema: Schema::try_from(table_schema).context(ConvertSchema)?, + pre_schema_version: src.pre_schema_version, + }) + } +} + +/// Meta data of options update. 
+#[derive(Debug, Clone)] +pub struct AlterOptionsMeta { + pub space_id: SpaceId, + pub table_id: TableId, + pub options: TableOptions, +} + +impl AlterOptionsMeta { + fn into_pb(self) -> meta_pb::AlterOptionsMeta { + let mut target = meta_pb::AlterOptionsMeta::new(); + target.set_space_id(self.space_id); + target.set_table_id(self.table_id.as_u64()); + target.set_options(analytic_common::TableOptions::from(self.options)); + + target + } +} + +impl From for AlterOptionsMeta { + fn from(mut src: meta_pb::AlterOptionsMeta) -> Self { + let table_options = src.take_options(); + + Self { + space_id: src.space_id, + table_id: TableId::from(src.table_id), + options: TableOptions::from(table_options), + } + } +} + +#[derive(Debug, Clone, Copy)] +pub struct SnapshotManifestMeta { + pub snapshot_region_id: RegionId, + /// The last sequence (inclusive) of the data in this snapshot. + /// + /// Note that the sequence refers to the manifest region. + pub sequence: SequenceNumber, +} + +impl SnapshotManifestMeta { + fn into_pb(self) -> meta_pb::SnapshotManifestMeta { + let mut target = meta_pb::SnapshotManifestMeta::new(); + target.set_region_id(self.snapshot_region_id); + target.set_sequence(self.sequence); + + target + } +} + +impl From for SnapshotManifestMeta { + fn from(src: meta_pb::SnapshotManifestMeta) -> SnapshotManifestMeta { + Self { + snapshot_region_id: src.region_id, + sequence: src.sequence, + } + } +} + +/// An adapter to implement [wal::log_batch::Payload] for +/// [proto::meta_update::MetaUpdate] +#[derive(Debug)] +pub struct MetaUpdatePayload(meta_pb::MetaUpdate); + +impl From for MetaUpdatePayload { + fn from(src: MetaUpdate) -> Self { + MetaUpdatePayload(src.into_pb()) + } +} + +impl From<&MetaUpdate> for MetaUpdatePayload { + fn from(src: &MetaUpdate) -> Self { + MetaUpdatePayload(src.clone().into_pb()) + } +} + +impl Payload for MetaUpdatePayload { + type Error = Error; + + fn encode_size(&self) -> usize { + self.0.compute_size().try_into().unwrap_or(usize::MAX) + } + + fn encode_to(&self, buf: &mut B) -> Result<()> { + let mut writer = Writer::new(buf); + self.0 + .write_to_writer(&mut writer) + .context(EncodePayloadPb)?; + Ok(()) + } +} + +/// Decoder to decode MetaUpdate from log entry +pub struct MetaUpdateDecoder; + +impl PayloadDecoder for MetaUpdateDecoder { + type Error = Error; + type Target = MetaUpdate; + + fn decode(&self, buf: &mut B) -> Result { + let meta_update = meta_pb::MetaUpdate::parse_from_bytes(buf.remaining_slice()) + .context(DecodePayloadPb)?; + + let meta_update = MetaUpdate::try_from(meta_update)?; + + Ok(meta_update) + } +} diff --git a/analytic_engine/src/meta/mod.rs b/analytic_engine/src/meta/mod.rs new file mode 100644 index 0000000000..3bea46d26e --- /dev/null +++ b/analytic_engine/src/meta/mod.rs @@ -0,0 +1,29 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Manage meta data of the engine + +pub mod details; +pub mod meta_data; +pub mod meta_update; + +use std::fmt; + +use async_trait::async_trait; + +use crate::meta::{meta_data::ManifestData, meta_update::MetaUpdate}; + +/// Manifest holds meta data of all tables +#[async_trait] +pub trait Manifest: fmt::Debug { + type Error: std::error::Error + Send + Sync + 'static; + + /// Store update to manifest + async fn store_update(&self, update: MetaUpdate) -> Result<(), Self::Error>; + + /// Load all data from manifest. + /// + /// If `do_snapshot` is true, the manifest will try to create a snapshot of + /// the manifest data. 
The caller should ensure `store_update()` wont be + /// called during loading data. + async fn load_data(&self, do_snapshot: bool) -> Result; +} diff --git a/analytic_engine/src/payload.rs b/analytic_engine/src/payload.rs new file mode 100644 index 0000000000..02cf58fe0a --- /dev/null +++ b/analytic_engine/src/payload.rs @@ -0,0 +1,174 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Payloads to write to wal + +use std::convert::TryInto; + +use common_types::{ + bytes::{MemBuf, MemBufMut, Writer}, + row::{RowGroup, RowGroupBuilder}, + schema::Schema, +}; +use common_util::{ + codec::{row::WalRowDecoder, Decoder}, + define_result, +}; +use proto::table_requests; +use protobuf::Message; +use snafu::{Backtrace, ResultExt, Snafu}; +use wal::log_batch::{Payload, PayloadDecoder}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to encode header, err:{}", source))] + EncodeHeader { source: common_types::bytes::Error }, + + #[snafu(display("Failed to encode body, err:{}.\nBacktrace:\n{}", source, backtrace))] + EncodeBody { + source: protobuf::ProtobufError, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to decode header, err:{}", source))] + DecodeHeader { source: common_types::bytes::Error }, + + #[snafu(display( + "Invalid wal entry header, value:{}.\nBacktrace:\n{}", + value, + backtrace + ))] + InvalidHeader { value: u8, backtrace: Backtrace }, + + #[snafu(display("Failed to decode body, err:{}.\nBacktrace:\n{}", source, backtrace))] + DecodeBody { + source: protobuf::ProtobufError, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to decode schema, err:{}", source))] + DecodeSchema { source: common_types::schema::Error }, + + #[snafu(display("Failed to decode row, err:{}", source))] + DecodeRow { + source: common_util::codec::row::Error, + }, +} + +define_result!(Error); + +/// Wal entry header +#[derive(Clone, Copy)] +enum Header { + Write = 1, +} + +impl Header { + pub fn to_u8(self) -> u8 { + self as u8 + } + + pub fn from_u8(value: u8) -> Option { + match value { + value if value == Self::Write as u8 => Some(Self::Write), + _ => None, + } + } +} + +fn write_header(header: Header, buf: &mut B) -> Result<()> { + buf.write_u8(header.to_u8()).context(EncodeHeader)?; + Ok(()) +} + +/// Header size in bytes +const HEADER_SIZE: usize = 1; + +/// Write request to persist in wal +#[derive(Debug)] +pub enum WritePayload<'a> { + Write(&'a table_requests::WriteRequest), +} + +impl<'a> Payload for WritePayload<'a> { + type Error = Error; + + fn encode_size(&self) -> usize { + let body_size = match self { + WritePayload::Write(req) => req.compute_size(), + }; + + HEADER_SIZE + body_size as usize + } + + fn encode_to(&self, buf: &mut B) -> Result<()> { + match self { + WritePayload::Write(req) => { + write_header(Header::Write, buf)?; + let mut writer = Writer::new(buf); + req.write_to_writer(&mut writer).context(EncodeBody)?; + } + } + + Ok(()) + } +} + +/// Payload decoded from wal +#[derive(Debug)] +pub enum ReadPayload { + Write { row_group: RowGroup }, +} + +/// Wal payload decoder +#[derive(Default)] +pub struct WalDecoder; + +impl PayloadDecoder for WalDecoder { + type Error = Error; + type Target = ReadPayload; + + fn decode(&self, buf: &mut B) -> Result { + let header_value = buf.read_u8().context(DecodeHeader)?; + let header = match Header::from_u8(header_value) { + Some(header) => header, + None => { + return InvalidHeader { + value: header_value, + } + .fail() + } + }; + + let payload = match header { + Header::Write => 
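+            // Wire format recap (the inverse of `encode_to` above): one header byte
+            // (Header::Write == 1) followed by the protobuf-encoded WriteRequest. The
+            // header byte has already been read, so the remaining slice is handed to
+            // the protobuf parser as-is.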
{ + let mut write_req_pb: table_requests::WriteRequest = + Message::parse_from_bytes(buf.remaining_slice()).context(DecodeBody)?; + + // Consume and convert schema in pb + let schema: Schema = write_req_pb + .take_schema() + .try_into() + .context(DecodeSchema)?; + + // Consume and convert rows in pb + let encoded_rows = write_req_pb.take_rows().into_vec(); + let mut builder = + RowGroupBuilder::with_capacity(schema.clone(), encoded_rows.len()); + let row_decoder = WalRowDecoder::new(&schema); + for row_bytes in &encoded_rows { + let row = row_decoder + .decode(&mut row_bytes.as_slice()) + .context(DecodeRow)?; + // We skip schema check here + builder.push_checked_row(row); + } + + let row_group = builder.build(); + + ReadPayload::Write { row_group } + } + }; + + Ok(payload) + } +} diff --git a/analytic_engine/src/row_iter/chain.rs b/analytic_engine/src/row_iter/chain.rs new file mode 100644 index 0000000000..881c96db3b --- /dev/null +++ b/analytic_engine/src/row_iter/chain.rs @@ -0,0 +1,373 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::{fmt, time::Instant}; + +use async_trait::async_trait; +use common_types::{ + projected_schema::ProjectedSchema, record_batch::RecordBatchWithKey, request_id::RequestId, + schema::RecordSchemaWithKey, +}; +use common_util::define_result; +use futures::StreamExt; +use log::debug; +use object_store::ObjectStore; +use snafu::{ResultExt, Snafu}; +use table_engine::{predicate::PredicateRef, table::TableId}; + +use crate::{ + row_iter::{ + record_batch_stream, record_batch_stream::SequencedRecordBatchStream, + RecordBatchWithKeyIterator, + }, + space::SpaceId, + sst::{ + factory::{Factory, SstReaderOptions}, + file::FileHandle, + }, + table::version::{MemTableVec, SamplingMemTable}, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Fail to build stream from the memtable, err:{}", source))] + BuildStreamFromMemtable { + source: crate::row_iter::record_batch_stream::Error, + }, + + #[snafu(display("Fail to build stream from the sst file, err:{}", source))] + BuildStreamFromSst { + source: crate::row_iter::record_batch_stream::Error, + }, + + #[snafu(display("Fail to poll next record batch, err:{}", source))] + PollNextRecordBatch { + source: Box, + }, +} + +define_result!(Error); + +/// Required parameters to construct the [Builder]. +#[derive(Clone, Debug)] +pub struct ChainConfig<'a, S, Fa> { + pub request_id: RequestId, + pub space_id: SpaceId, + pub table_id: TableId, + /// The projected schema to read. + pub projected_schema: ProjectedSchema, + /// Predicate of the query. + pub predicate: PredicateRef, + + pub sst_reader_options: SstReaderOptions, + pub sst_factory: Fa, + /// Sst storage + pub store: &'a S, +} + +/// Builder for [ChainIterator]. +#[must_use] +pub struct Builder<'a, S, Fa> { + config: ChainConfig<'a, S, Fa>, + /// Sampling memtable to read. 
+ sampling_mem: Option, + memtables: MemTableVec, + ssts: Vec>, +} + +impl<'a, S, Fa> Builder<'a, S, Fa> { + pub fn new(config: ChainConfig<'a, S, Fa>) -> Self { + Self { + config, + sampling_mem: None, + memtables: Vec::new(), + ssts: Vec::new(), + } + } + + pub fn sampling_mem(mut self, sampling_mem: Option) -> Self { + self.sampling_mem = sampling_mem; + self + } + + pub fn memtables(mut self, memtables: MemTableVec) -> Self { + self.memtables = memtables; + self + } + + pub fn ssts(mut self, ssts: Vec>) -> Self { + self.ssts = ssts; + self + } +} + +impl<'a, S: ObjectStore, Fa: Factory> Builder<'a, S, Fa> { + pub async fn build(self) -> Result { + let total_sst_streams: usize = self.ssts.iter().map(|v| v.len()).sum(); + let mut total_streams = self.memtables.len() + total_sst_streams; + if self.sampling_mem.is_some() { + total_streams += 1; + } + let mut streams = Vec::with_capacity(total_streams); + + if let Some(v) = &self.sampling_mem { + let stream = record_batch_stream::filtered_stream_from_memtable( + self.config.projected_schema.clone(), + false, + &v.mem, + false, + self.config.predicate.as_ref(), + ) + .context(BuildStreamFromMemtable)?; + streams.push(stream); + } + + for memtable in &self.memtables { + let stream = record_batch_stream::filtered_stream_from_memtable( + self.config.projected_schema.clone(), + false, + // chain iterator only handle the case reading in no order so just read in asc + // order by default. + &memtable.mem, + false, + self.config.predicate.as_ref(), + ) + .context(BuildStreamFromMemtable)?; + streams.push(stream); + } + + for leveled_ssts in &self.ssts { + for sst in leveled_ssts { + let stream = record_batch_stream::filtered_stream_from_sst_file( + self.config.space_id, + self.config.table_id, + sst, + &self.config.sst_factory, + &self.config.sst_reader_options, + self.config.store, + ) + .await + .context(BuildStreamFromSst)?; + streams.push(stream); + } + } + + debug!( + "Build chain iterator, table_id:{:?}, request_id:{}, memtables:{:?}, ssts:{:?}", + self.config.table_id, self.config.request_id, self.memtables, self.ssts + ); + + Ok(ChainIterator { + space_id: self.config.space_id, + table_id: self.config.table_id, + request_id: self.config.request_id, + schema: self.config.projected_schema.to_record_schema_with_key(), + streams, + next_stream_idx: 0, + inited: false, + metrics: Metrics::new(self.memtables.len(), total_sst_streams), + }) + } +} + +/// Metrics for [ChainIterator]. +struct Metrics { + num_memtables: usize, + num_ssts: usize, + /// Total batch fetched. + total_batch_fetched: usize, + /// Total rows fetched. + total_rows_fetched: usize, + /// Create time of the metrics. + create_at: Instant, + /// Inited time of the iterator. 
+ inited_at: Option, +} + +impl Metrics { + fn new(num_memtables: usize, num_ssts: usize) -> Self { + Self { + num_memtables, + num_ssts, + total_batch_fetched: 0, + total_rows_fetched: 0, + create_at: Instant::now(), + inited_at: None, + } + } + + fn set_inited_time(&mut self) { + self.inited_at = Some(Instant::now()); + } +} + +impl fmt::Debug for Metrics { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("Metrics") + .field("num_memtables", &self.num_memtables) + .field("num_ssts", &self.num_ssts) + .field("total_batch_fetched", &self.total_batch_fetched) + .field("total_rows_fetched", &self.total_rows_fetched) + .field("duration_since_create", &self.create_at.elapsed()) + .field("duration_since_init", &self.inited_at.map(|v| v.elapsed())) + .finish() + } +} + +/// ChainIter chains memtables and ssts and reads the [RecordBatch] from them +/// batch by batch. +/// +/// Note: The chain order is `memtable -> sst level 0 -> sst_level 1`. +pub struct ChainIterator { + space_id: SpaceId, + table_id: TableId, + request_id: RequestId, + schema: RecordSchemaWithKey, + streams: Vec, + /// The range of the index is [0, streams.len()] and the iterator is + /// exhausted if it reaches `streams.len()`. + next_stream_idx: usize, + inited: bool, + + // metrics for the iterator. + metrics: Metrics, +} + +impl ChainIterator { + fn init_if_necessary(&mut self) { + if self.inited { + return; + } + self.inited = true; + self.metrics.set_inited_time(); + + debug!("Init ChainIterator, space_id:{}, table_id:{:?}, request_id:{}, total_streams:{}, schema:{:?}", + self.space_id, self.table_id, self.request_id, self.streams.len(), self.schema + ); + } +} + +impl Drop for ChainIterator { + fn drop(&mut self) { + debug!( + "Chain iterator dropped, space_id:{}, table_id:{:?}, request_id:{}, metrics:{:?}", + self.space_id, self.table_id, self.request_id, self.metrics, + ); + } +} + +#[async_trait] +impl RecordBatchWithKeyIterator for ChainIterator { + type Error = Error; + + fn schema(&self) -> &RecordSchemaWithKey { + &self.schema + } + + async fn next_batch(&mut self) -> Result> { + self.init_if_necessary(); + + while self.next_stream_idx < self.streams.len() { + let read_stream = &mut self.streams[self.next_stream_idx]; + let sequenced_record_batch = read_stream + .next() + .await + .transpose() + .context(PollNextRecordBatch)?; + + match sequenced_record_batch { + Some(v) => { + self.metrics.total_rows_fetched += v.num_rows(); + self.metrics.total_batch_fetched += 1; + + if v.num_rows() > 0 { + return Ok(Some(v.record_batch)); + } + } + // Fetch next stream only if the current sequence_record_batch is None. 
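+ // Streams are consumed strictly one after another here; no merge sort across streams is performed.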
+ None => self.next_stream_idx += 1, + } + } + + Ok(None) + } +} + +#[cfg(test)] +mod tests { + use common_types::{ + self, + row::Row, + tests::{build_row, build_schema}, + SequenceNumber, + }; + + use super::*; + use crate::row_iter::tests::check_iterator; + + async fn run_and_check(testcases: Vec<(SequenceNumber, Vec)>) { + let schema = build_schema(); + + let expect_rows: Vec<_> = testcases + .iter() + .flat_map(|(_, rows)| rows.clone()) + .collect(); + + let streams = + record_batch_stream::tests::build_sequenced_record_batch_stream(&schema, testcases); + + let mut chain_iter = ChainIterator { + space_id: 0, + table_id: TableId::MIN, + request_id: RequestId::next_id(), + schema: schema.to_record_schema_with_key(), + streams, + next_stream_idx: 0, + inited: false, + metrics: Metrics::new(0, 0), + }; + + check_iterator(&mut chain_iter, expect_rows).await; + } + + #[tokio::test] + async fn test_chain_multiple_streams() { + let testcases = vec![ + // (sequence, rows) + (10, vec![build_row(b"key4", 1000000, 10.0, "v4")]), + (20, vec![build_row(b"key2", 1000000, 10.0, "v2")]), + (100, vec![build_row(b"key3", 1000000, 10.0, "v3")]), + (1, vec![build_row(b"key1", 1000000, 10.0, "v1")]), + ]; + run_and_check(testcases).await; + } + + #[tokio::test] + async fn test_chain_empty_streams() { + let testcases = vec![ + // (sequence, rows) + (10, vec![]), + (20, vec![]), + (100, vec![]), + (1, vec![]), + ]; + run_and_check(testcases).await; + } + + #[tokio::test] + async fn test_chain_no_streams() { + let testcases = vec![]; + run_and_check(testcases).await; + } + + #[tokio::test] + async fn test_chain_half_empty_streams() { + let testcases = vec![ + // (sequence, rows) + (10, vec![build_row(b"key4", 1000000, 10.0, "v4")]), + (20, vec![]), + (100, vec![]), + (1, vec![build_row(b"key1", 1000000, 10.0, "v1")]), + ]; + run_and_check(testcases).await; + } +} diff --git a/analytic_engine/src/row_iter/dedup.rs b/analytic_engine/src/row_iter/dedup.rs new file mode 100644 index 0000000000..cd58b0157f --- /dev/null +++ b/analytic_engine/src/row_iter/dedup.rs @@ -0,0 +1,243 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::cmp::Ordering; + +use async_trait::async_trait; +use common_types::{ + record_batch::{RecordBatchWithKey, RecordBatchWithKeyBuilder}, + request_id::RequestId, + row::{Row, RowViewOnBatch, RowWithMeta}, + schema::RecordSchemaWithKey, +}; +use common_util::define_result; +use log::{info, trace}; +use snafu::{ResultExt, Snafu}; + +use crate::row_iter::{IterOptions, RecordBatchWithKeyIterator}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to iterate column, error:{:?}", source))] + IterateColumn { source: common_types::row::Error }, + + #[snafu(display("Failed to build record batch, error:{:?}", source))] + BuildRecordBatch { + source: common_types::record_batch::Error, + }, + + #[snafu(display("Failed to append row, err:{:?}", source))] + AppendRow { + source: common_types::record_batch::Error, + }, + + #[snafu(display("Failed to read data from the sub iterator, err:{:?}", source))] + ReadFromSubIter { + source: Box, + }, +} + +define_result!(Error); + +/// Dedup the elements from the `iter` by choosing the first one in the +/// duplicate rows. +pub struct DedupIterator { + request_id: RequestId, + schema: RecordSchemaWithKey, + record_batch_builder: RecordBatchWithKeyBuilder, + iter: I, + /// Previous row returned. + prev_row: Option, + /// Store which row in record batch is keep, use Vec is a bit faster + /// than a bitmap. 
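+ /// The buffer is resized to the batch length before every dedup pass; `true` marks a row that survives deduplication.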
+ selected_rows: Vec, + + // Metrics: + total_duplications: usize, + total_selected_rows: usize, +} + +impl DedupIterator { + pub fn new(request_id: RequestId, iter: I, iter_options: IterOptions) -> Self { + let schema = iter.schema(); + + let record_batch_builder = + RecordBatchWithKeyBuilder::with_capacity(schema.clone(), iter_options.batch_size); + Self { + request_id, + schema: schema.clone(), + record_batch_builder, + iter, + prev_row: None, + selected_rows: Vec::new(), + total_duplications: 0, + total_selected_rows: 0, + } + } + + fn dedup_batch(&mut self, record_batch: RecordBatchWithKey) -> Result { + self.selected_rows.clear(); + // Ignore all rows by default. + self.selected_rows.resize(record_batch.num_rows(), false); + + if record_batch.is_empty() { + return Ok(record_batch); + } + + // Dedup batch. + for col_idx in 0..self.schema.num_key_columns() { + let column = record_batch.column(col_idx); + + column.dedup(&mut self.selected_rows); + } + + // Dedup first row in record batch with previous row. + if let Some(prev_row) = &self.prev_row { + let prev_row_view = RowWithMeta { + row: prev_row, + schema: &self.schema, + }; + let curr_row_view = RowViewOnBatch { + record_batch: &record_batch, + // First row. + row_idx: 0, + }; + + let is_equal = matches!( + // TODO(yingwen): Compare row needs clone data of row. + self.schema.compare_row(&prev_row_view, &curr_row_view), + Ordering::Equal + ); + + if is_equal { + // Depulicate with previous row. + self.selected_rows[0] = false; + } + } + + let selected_num = self + .selected_rows + .iter() + .map(|v| if *v { 1 } else { 0 }) + .sum(); + + // Eventhough all rows are duplicate, we can still use row pointed by + // prev_row_idx because they have same row key. + self.prev_row = Some(record_batch.clone_row_at(record_batch.num_rows() - 1)); + + self.filter_batch(record_batch, selected_num) + } + + /// Filter batch by `selected_rows`. + fn filter_batch( + &mut self, + record_batch: RecordBatchWithKey, + selected_num: usize, + ) -> Result { + self.total_selected_rows += selected_num; + self.total_duplications += record_batch.num_rows() - selected_num; + + if selected_num == record_batch.num_rows() { + // No duplicate rows in batch. + return Ok(record_batch); + } + + self.record_batch_builder.clear(); + for (row_idx, selected) in self.selected_rows.iter().enumerate() { + if *selected { + self.record_batch_builder + .append_row_view(&RowViewOnBatch { + record_batch: &record_batch, + row_idx, + }) + .context(AppendRow)?; + } + } + + self.record_batch_builder.build().context(BuildRecordBatch) + } +} + +#[async_trait] +impl RecordBatchWithKeyIterator for DedupIterator { + type Error = Error; + + fn schema(&self) -> &RecordSchemaWithKey { + &self.schema + } + + async fn next_batch(&mut self) -> Result> { + match self + .iter + .next_batch() + .await + .map_err(|e| Box::new(e) as _) + .context(ReadFromSubIter)? 
+ { + Some(record_batch) => { + trace!( + "DedupIterator received next record batch, request_id:{}, batch:{:?}", + self.request_id, + record_batch + ); + + self.dedup_batch(record_batch).map(Some) + } + None => { + info!( + "DedupIterator received none record batch, request_id:{}, total_duplications:{}, total_selected_rows:{}", + self.request_id, self.total_duplications, self.total_selected_rows, + ); + + Ok(None) + } + } + } +} + +#[cfg(test)] +mod tests { + use common_types::tests::{build_row, build_schema}; + + use super::*; + use crate::row_iter::tests::{build_record_batch_with_key, check_iterator, VectorIterator}; + + #[tokio::test] + async fn test_dedup_iterator() { + // first two columns are key columns + let schema = build_schema(); + let iter = VectorIterator::new( + schema.to_record_schema_with_key(), + vec![ + build_record_batch_with_key( + schema.clone(), + vec![ + build_row(b"a", 1, 10.0, "v1"), + build_row(b"a", 1, 10.0, "v"), + build_row(b"a", 2, 10.0, "v2"), + ], + ), + build_record_batch_with_key( + schema, + vec![ + build_row(b"a", 2, 10.0, "v"), + build_row(b"a", 3, 10.0, "v3"), + build_row(b"a", 3, 10.0, "v"), + build_row(b"a", 4, 10.0, "v4"), + ], + ), + ], + ); + + let mut iter = DedupIterator::new(RequestId::next_id(), iter, IterOptions::default()); + check_iterator( + &mut iter, + vec![ + build_row(b"a", 1, 10.0, "v1"), + build_row(b"a", 2, 10.0, "v2"), + build_row(b"a", 3, 10.0, "v3"), + build_row(b"a", 4, 10.0, "v4"), + ], + ) + .await; + } +} diff --git a/analytic_engine/src/row_iter/merge.rs b/analytic_engine/src/row_iter/merge.rs new file mode 100644 index 0000000000..49403c90ae --- /dev/null +++ b/analytic_engine/src/row_iter/merge.rs @@ -0,0 +1,957 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
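To make the composition pattern concrete, here is a minimal sketch of wrapping an arbitrary iterator with the dedup wrapper defined above; the helper name `dedup_wrap` is illustrative only, and the paths assume the crate layout introduced by this patch:

    use common_types::request_id::RequestId;
    use crate::row_iter::{dedup::DedupIterator, IterOptions, RecordBatchWithKeyIterator};

    /// Wrap any iterator so that duplicate keys collapse to their first occurrence,
    /// using the default batch size of 500 rows.
    fn dedup_wrap<I: RecordBatchWithKeyIterator>(iter: I) -> DedupIterator<I> {
        DedupIterator::new(RequestId::next_id(), iter, IterOptions::default())
    }

The wrapped iterator still exposes `schema()` and `next_batch()`, so it can be consumed like any other `RecordBatchWithKeyIterator` or converted to a stream with `record_batch_with_key_iter_to_stream`.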
+ +use std::{ + cmp, + cmp::Ordering, + collections::BinaryHeap, + fmt, mem, + ops::{Deref, DerefMut}, + time::{Duration, Instant}, +}; + +use async_trait::async_trait; +use common_types::{ + projected_schema::ProjectedSchema, + record_batch::{RecordBatchWithKey, RecordBatchWithKeyBuilder}, + request_id::RequestId, + row::RowViewOnBatch, + schema::RecordSchemaWithKey, + SequenceNumber, +}; +use common_util::define_result; +use futures::StreamExt; +use log::{debug, info, trace}; +use object_store::ObjectStore; +use snafu::{ensure, Backtrace, ResultExt, Snafu}; +use table_engine::{predicate::PredicateRef, table::TableId}; + +use crate::{ + row_iter::{ + record_batch_stream, + record_batch_stream::{SequencedRecordBatch, SequencedRecordBatchStream}, + IterOptions, RecordBatchWithKeyIterator, + }, + space::SpaceId, + sst::{ + factory::{Factory, SstReaderOptions}, + file::FileHandle, + manager::{FileId, MAX_LEVEL}, + }, + table::version::{MemTableVec, SamplingMemTable}, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "Expect the same schema, expect:{:?}, given:{:?}.\nBacktrace:\n{}", + expect, + given, + backtrace + ))] + MismatchedSchema { + expect: RecordSchemaWithKey, + given: RecordSchemaWithKey, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to pull record batch, error:{}", source))] + PullRecordBatch { + source: Box, + }, + + #[snafu(display("Failed to build record batch, error:{}", source))] + BuildRecordBatch { + source: common_types::record_batch::Error, + }, + + #[snafu(display("Failed to append row, err:{:?}", source))] + AppendRow { + source: common_types::record_batch::Error, + }, + + #[snafu(display("Failed to build stream from memtable, err:{}", source))] + BuildStreamFromMemtable { + source: crate::row_iter::record_batch_stream::Error, + }, + + #[snafu(display("Failed to build record batch from sst, err:{}", source))] + BuildStreamFromSst { + source: crate::row_iter::record_batch_stream::Error, + }, +} + +define_result!(Error); + +/// Required parameters to construct the [MergeBuilder] +#[derive(Debug)] +pub struct MergeConfig<'a, S, Fa> { + pub request_id: RequestId, + pub space_id: SpaceId, + pub table_id: TableId, + /// Max visible sequence (inclusive) + pub sequence: SequenceNumber, + /// The projected schema to read. + pub projected_schema: ProjectedSchema, + /// The predicate of the query. + pub predicate: PredicateRef, + + pub sst_reader_options: SstReaderOptions, + pub sst_factory: Fa, + /// Sst storage + pub store: &'a S, + + pub merge_iter_options: IterOptions, + + pub need_dedup: bool, + pub reverse: bool, +} + +/// Builder for building merge stream from memtables and sst files. +#[must_use] +pub struct MergeBuilder<'a, S, Fa> { + config: MergeConfig<'a, S, Fa>, + + /// Sampling memtable to read. + sampling_mem: Option, + /// MemTables to read. + memtables: MemTableVec, + /// Ssts to read of each level. 
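+ /// The outer vector is indexed by level, so `ssts[0]` holds the level 0 files (see `mut_ssts_of_level`).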
+ ssts: Vec>, +} + +impl<'a, S: ObjectStore, Fa: Factory> MergeBuilder<'a, S, Fa> { + pub fn new(config: MergeConfig<'a, S, Fa>) -> Self { + Self { + config, + sampling_mem: None, + memtables: Vec::new(), + ssts: vec![Vec::new(); MAX_LEVEL], + } + } + + pub fn sampling_mem(mut self, sampling_mem: Option) -> Self { + self.sampling_mem = sampling_mem; + self + } + + pub fn memtables(mut self, memtables: MemTableVec) -> Self { + self.memtables = memtables; + self + } + + pub fn ssts_of_level(mut self, ssts: Vec>) -> Self { + self.ssts = ssts; + self + } + + pub fn mut_memtables(&mut self) -> &mut MemTableVec { + &mut self.memtables + } + + /// Returns file handles in `level`, panic if level >= MAX_LEVEL + pub fn mut_ssts_of_level(&mut self, level: u16) -> &mut Vec { + &mut self.ssts[usize::from(level)] + } + + pub async fn build(self) -> Result { + let sst_streams_num: usize = self + .ssts + .iter() + .map(|leveled_ssts| leveled_ssts.len()) + .sum(); + let mut streams_num = sst_streams_num + self.memtables.len(); + if self.sampling_mem.is_some() { + streams_num += 1; + } + let mut streams = Vec::with_capacity(streams_num); + + debug!( + "Build merge iterator, table_id:{:?}, request_id:{}, sampling_mem:{:?}, memtables:{:?}, ssts:{:?}", + self.config.table_id, + self.config.request_id, + self.sampling_mem, + self.memtables, + self.ssts + ); + + if let Some(v) = &self.sampling_mem { + let stream = record_batch_stream::filtered_stream_from_memtable( + self.config.projected_schema.clone(), + self.config.need_dedup, + &v.mem, + self.config.reverse, + self.config.predicate.as_ref(), + ) + .context(BuildStreamFromMemtable)?; + streams.push(stream); + } + + for memtable in &self.memtables { + let stream = record_batch_stream::filtered_stream_from_memtable( + self.config.projected_schema.clone(), + self.config.need_dedup, + &memtable.mem, + self.config.reverse, + self.config.predicate.as_ref(), + ) + .context(BuildStreamFromMemtable)?; + streams.push(stream); + } + + let mut sst_ids = Vec::with_capacity(self.ssts.len()); + for leveled_ssts in &self.ssts { + for f in leveled_ssts { + let stream = record_batch_stream::filtered_stream_from_sst_file( + self.config.space_id, + self.config.table_id, + f, + &self.config.sst_factory, + &self.config.sst_reader_options, + self.config.store, + ) + .await + .context(BuildStreamFromSst)?; + streams.push(stream); + sst_ids.push(f.id()); + } + } + + Ok(MergeIterator::new( + self.config.table_id, + self.config.request_id, + // Use the schema after projection as the schema of the merge iterator. + self.config.projected_schema.to_record_schema_with_key(), + streams, + self.config.merge_iter_options, + self.config.reverse, + Metrics::new(self.memtables.len(), sst_streams_num, sst_ids), + )) + } +} + +struct BufferedStreamState { + /// Buffered record batch. + /// + /// invariant: `buffered_record_batch` is not empty. + buffered_record_batch: SequencedRecordBatch, + /// Cursor for reading buffered record batch. + /// + /// `cursor` increases monotonically from 0 to + /// `buffered_record_batch.num_rows()` and `cursor == + /// buffered_record_batch.num_rows()` means no more buffered rows to read. 
+ cursor: usize, +} + +impl BufferedStreamState { + #[inline] + fn is_valid(&self) -> bool { + self.cursor < self.buffered_record_batch.num_rows() + } + + #[inline] + fn is_empty(&self) -> bool { + self.cursor >= self.buffered_record_batch.num_rows() + } + + #[inline] + fn sequence(&self) -> SequenceNumber { + self.buffered_record_batch.sequence + } + + #[inline] + fn first_row(&self) -> RowViewOnBatch<'_> { + assert!(self.is_valid()); + + RowViewOnBatch { + record_batch: &self.buffered_record_batch.record_batch, + row_idx: self.cursor, + } + } + + #[inline] + fn last_row(&self) -> RowViewOnBatch<'_> { + assert!(self.is_valid()); + + RowViewOnBatch { + record_batch: &self.buffered_record_batch.record_batch, + row_idx: self.buffered_record_batch.num_rows() - 1, + } + } + + /// Returns the next available row in the buffer and advance the cursor by + /// one step. + fn next_row(&mut self) -> Option> { + if self.cursor < self.buffered_record_batch.num_rows() { + let row_view = RowViewOnBatch { + record_batch: &self.buffered_record_batch.record_batch, + row_idx: self.cursor, + }; + self.cursor += 1; + Some(row_view) + } else { + None + } + } + + /// Append `len` rows from cursor to the `builder` and advance the cursor. + /// + /// Returns number of rows added. + fn append_rows_to( + &mut self, + builder: &mut RecordBatchWithKeyBuilder, + len: usize, + ) -> Result { + let added = builder + .append_batch_range(&self.buffered_record_batch.record_batch, self.cursor, len) + .context(AppendRow)?; + self.cursor += added; + Ok(added) + } + + /// Take record batch slice with at most `len` rows from cursor and advance + /// the cursor. + fn take_record_batch_slice(&mut self, len: usize) -> RecordBatchWithKey { + let len_to_fetch = cmp::min( + self.buffered_record_batch.record_batch.num_rows() - self.cursor, + len, + ); + let record_batch = self + .buffered_record_batch + .record_batch + .slice(self.cursor, len_to_fetch); + self.cursor += record_batch.num_rows(); + record_batch + } + + #[inline] + fn reset(&mut self, record_batch: SequencedRecordBatch) { + self.buffered_record_batch = record_batch; + self.cursor = 0; + } +} + +struct BufferedStream { + schema: RecordSchemaWithKey, + stream: SequencedRecordBatchStream, + /// `None` state means the stream is exhausted. + state: Option, +} + +impl BufferedStream { + async fn build( + schema: RecordSchemaWithKey, + mut stream: SequencedRecordBatchStream, + metrics: &mut Metrics, + ) -> Result { + // TODO(xikai): do the metrics collection in the `pull_next_non_empty_batch`. + let pull_start = Instant::now(); + let buffered_record_batch = Self::pull_next_non_empty_batch(&mut stream).await?; + metrics.scan_duration += pull_start.elapsed(); + metrics.scan_count += 1; + + let state = buffered_record_batch.map(|v| BufferedStreamState { + buffered_record_batch: v, + cursor: 0, + }); + + Ok(Self { + schema, + stream, + state, + }) + } + + fn sequence_in_buffer(&self) -> SequenceNumber { + self.state.as_ref().unwrap().sequence() + } + + /// REQUIRE: the buffer is not exhausted. + fn first_row_in_buffer(&self) -> RowViewOnBatch<'_> { + self.state.as_ref().unwrap().first_row() + } + + /// REQUIRE: the buffer is not exhausted. + fn last_row_in_buffer(&self) -> RowViewOnBatch<'_> { + self.state.as_ref().unwrap().last_row() + } + + /// REQUIRE: the buffer is not exhausted. + fn next_row_in_buffer(&mut self) -> Option> { + self.state.as_mut().unwrap().next_row() + } + + /// REQUIRE: the buffer is not exhausted. 
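+ /// That is, `self.state` must be `Some`; the `unwrap` below would panic otherwise.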
+ fn append_rows_to( + &mut self, + builder: &mut RecordBatchWithKeyBuilder, + len: usize, + ) -> Result { + self.state.as_mut().unwrap().append_rows_to(builder, len) + } + + /// REQUIRE: the buffer is not exhausted. + fn take_record_batch_slice(&mut self, len: usize) -> RecordBatchWithKey { + self.state.as_mut().unwrap().take_record_batch_slice(len) + } + + /// Pull the next non empty record batch. + /// + /// The returned record batch is ensured `num_rows() > 0`. + async fn pull_next_non_empty_batch( + stream: &mut SequencedRecordBatchStream, + ) -> Result> { + loop { + match stream.next().await.transpose().context(PullRecordBatch)? { + Some(record_batch) => { + trace!( + "MergeIterator one record batch is fetched:{:?}", + record_batch + ); + + if record_batch.num_rows() > 0 { + return Ok(Some(record_batch)); + } + } + None => return Ok(None), + } + } + } + + /// Pull the next batch if the stream is not exhausted and the inner state + /// is empty. + async fn pull_next_batch_if_necessary(&mut self, metrics: &mut Metrics) -> Result { + let need_pull_new_batch = !self.is_exhausted() && self.state.as_ref().unwrap().is_empty(); + if !need_pull_new_batch { + return Ok(false); + } + + // TODO(xikai): do the metrics collection in the `pull_next_non_empty_batch`. + let pull_start = Instant::now(); + let pulled = match Self::pull_next_non_empty_batch(&mut self.stream).await? { + None => { + self.state = None; + Ok(false) + } + Some(record_batch) => { + self.state.as_mut().unwrap().reset(record_batch); + Ok(true) + } + }; + + metrics.scan_duration += pull_start.elapsed(); + metrics.scan_count += 1; + + pulled + } + + #[inline] + fn is_exhausted(&self) -> bool { + self.state.is_none() + } + + fn into_heaped(self, reverse: bool) -> HeapBufferedStream { + HeapBufferedStream { + stream: self, + reverse, + } + } + + #[inline] + fn schema(&self) -> &RecordSchemaWithKey { + &self.schema + } +} + +/// The wrapper struct determines the compare result for the min binary heap. +struct HeapBufferedStream { + stream: BufferedStream, + reverse: bool, +} + +impl HeapBufferedStream { + /// Check whether all the buffered rows in the `stream` is after the + /// `boundary_row`. + /// + /// NOTE: + /// - The first row in the stream is actually the max row if in reverse + /// order and should check whether it is smaller than `boundary_row`. + /// - The first row in the stream is actually the min row if in normal + /// order and should check whether it is greater than `boundary_row`. + fn is_after_boundary( + &self, + schema: &RecordSchemaWithKey, + boundary_row: &RowViewOnBatch, + ) -> bool { + if self.reverse { + // Compare the max row(the first row) in of the stream with the boundary row. + // The stream is after the boundary if the max row is smaller than boundary. + // is_after: (boundary_row) > [first_row in buffer] + matches!( + schema.compare_row(boundary_row, &self.first_row_in_buffer()), + Ordering::Greater + ) + } else { + // compare the min row(the first row) in of the stream with the boundary row. + // The stream is after the boundary if the min row is greater than boundary. 
+ // is_after: (boundary_row) < [first_row in buffer] + matches!( + schema.compare_row(&self.first_row_in_buffer(), boundary_row), + Ordering::Greater + ) + } + } +} + +impl Deref for HeapBufferedStream { + type Target = BufferedStream; + + fn deref(&self) -> &BufferedStream { + &self.stream + } +} + +impl DerefMut for HeapBufferedStream { + fn deref_mut(&mut self) -> &mut BufferedStream { + &mut self.stream + } +} + +impl PartialEq for HeapBufferedStream { + fn eq(&self, other: &Self) -> bool { + let ordering = self + .schema + .compare_row(&self.first_row_in_buffer(), &other.first_row_in_buffer()); + if let Ordering::Equal = ordering { + self.sequence_in_buffer() == other.sequence_in_buffer() + } else { + false + } + } +} + +impl Eq for HeapBufferedStream {} + +impl PartialOrd for HeapBufferedStream { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for HeapBufferedStream { + fn cmp(&self, other: &Self) -> Ordering { + let ordering = if self.reverse { + // keep the original ordering so the greater row comes before the smaller one. + self.schema + .compare_row(&self.first_row_in_buffer(), &other.first_row_in_buffer()) + } else { + // reverse the original ordering so the smaller row comes before the greater + // one. + self.schema + .compare_row(&other.first_row_in_buffer(), &self.first_row_in_buffer()) + }; + + if let Ordering::Equal = ordering { + // The larger sequence number should always comes before the smaller one. + self.sequence_in_buffer().cmp(&other.sequence_in_buffer()) + } else { + ordering + } + } +} + +pub struct Metrics { + num_memtables: usize, + num_ssts: usize, + sst_ids: Vec, + /// Times to fetch rows from one stream. + times_fetch_rows_from_one: usize, + /// Total rows collected using fetch_rows_from_one_stream(). + total_rows_fetch_from_one: usize, + /// Times to fetch one row from multiple stream. + times_fetch_row_from_multiple: usize, + /// Create time of the metrics. + create_at: Instant, + /// Init time cost of the metrics. + init_duration: Duration, + /// Scan time cost of the metrics. + scan_duration: Duration, + /// Scan count + scan_count: usize, +} + +impl Metrics { + fn new(num_memtables: usize, num_ssts: usize, sst_ids: Vec) -> Self { + Self { + num_memtables, + num_ssts, + sst_ids, + times_fetch_rows_from_one: 0, + total_rows_fetch_from_one: 0, + times_fetch_row_from_multiple: 0, + create_at: Instant::now(), + init_duration: Duration::default(), + scan_duration: Duration::default(), + scan_count: 0, + } + } +} + +impl fmt::Debug for Metrics { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("Metrics") + .field("num_memtables", &self.num_memtables) + .field("num_ssts", &self.num_ssts) + .field("sst_ids", &self.sst_ids) + .field("times_fetch_rows_from_one", &self.times_fetch_rows_from_one) + .field("total_rows_fetch_from_one", &self.total_rows_fetch_from_one) + .field( + "times_fetch_row_from_multiple", + &self.times_fetch_row_from_multiple, + ) + .field("duration_since_create", &self.create_at.elapsed()) + .field("init_duration", &self.init_duration) + .field("scan_duration", &self.scan_duration) + .field("scan_count", &self.scan_count) + .finish() + } +} + +pub struct MergeIterator { + table_id: TableId, + request_id: RequestId, + inited: bool, + schema: RecordSchemaWithKey, + record_batch_builder: RecordBatchWithKeyBuilder, + origin_streams: Vec, + /// Any [BufferedStream] in the hot heap is not empty. + hot: BinaryHeap, + /// Any [BufferedStream] in the cold heap is not empty. 
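+ /// Streams whose buffered rows all sort after the current merge window are parked here, and `refill_hot` moves them back to `hot` once the window advances.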
+ cold: BinaryHeap, + iter_options: IterOptions, + reverse: bool, + metrics: Metrics, +} + +impl MergeIterator { + pub fn new( + table_id: TableId, + request_id: RequestId, + schema: RecordSchemaWithKey, + streams: Vec, + iter_options: IterOptions, + reverse: bool, + metrics: Metrics, + ) -> Self { + let heap_cap = streams.len(); + let record_batch_builder = + RecordBatchWithKeyBuilder::with_capacity(schema.clone(), iter_options.batch_size); + Self { + table_id, + request_id, + inited: false, + schema, + record_batch_builder, + origin_streams: streams, + hot: BinaryHeap::with_capacity(heap_cap), + cold: BinaryHeap::with_capacity(heap_cap), + iter_options, + reverse, + metrics, + } + } + + fn merge_window_end(&self) -> Option { + self.hot.peek().as_ref().map(|v| v.last_row_in_buffer()) + } + + async fn init_if_necessary(&mut self) -> Result<()> { + if self.inited { + return Ok(()); + } + + info!( + "Merge iterator init, table_id:{:?}, request_id:{}, schema:{:?}", + self.table_id, self.request_id, self.schema + ); + let init_start = Instant::now(); + + let current_schema = &self.schema; + for stream in mem::take(&mut self.origin_streams) { + let buffered_stream = + BufferedStream::build(self.schema.clone(), stream, &mut self.metrics).await?; + let stream_schema = buffered_stream.schema(); + ensure!( + current_schema == stream_schema, + MismatchedSchema { + expect: current_schema.clone(), + given: stream_schema.clone(), + } + ); + + if !buffered_stream.is_exhausted() { + self.cold.push(buffered_stream.into_heaped(self.reverse)); + } + } + + self.refill_hot(); + + self.inited = true; + self.metrics.init_duration = init_start.elapsed(); + Ok(()) + } + + fn refill_hot(&mut self) { + while !self.cold.is_empty() { + if !self.hot.is_empty() { + let merge_window_end = self.merge_window_end().unwrap(); + let warmest = self.cold.peek().unwrap(); + if warmest.is_after_boundary(&self.schema, &merge_window_end) { + // if the warmest stream in the cold stream sets is totally after the + // merge_window_end then no need to add more streams into + // the hot stream sets for merge sorting. + break; + } + } + + let warmest = self.cold.pop().unwrap(); + self.hot.push(warmest); + } + } + + /// Pull the next batch Rearrange the heap + async fn reheap(&mut self, mut buffered_stream: HeapBufferedStream) -> Result<()> { + let pulled_new_batch = buffered_stream + .pull_next_batch_if_necessary(&mut self.metrics) + .await?; + + if buffered_stream.is_exhausted() { + self.refill_hot(); + } else if pulled_new_batch { + // TODO(xikai): it seems no need to decide to which heap push the + // `buffered_stream`. Just put the new batch into the cold heap if + // the max bound of the hottest batch is smaller than the min bound + // of new one. + let cold_new_batch = if let Some(hottest) = self.hot.peek() { + buffered_stream.is_after_boundary(&self.schema, &hottest.last_row_in_buffer()) + } else { + false + }; + + if cold_new_batch { + self.cold.push(buffered_stream); + } else { + self.hot.push(buffered_stream); + } + self.refill_hot(); + } else { + // No new batch is pulled and the `buffered_stream` is not exhausted so just put + // it back to the hot heap. + self.hot.push(buffered_stream); + } + + Ok(()) + } + + /// Fetch at most `num_rows_to_fetch` rows from the hottest + /// `BufferedStream`. + /// + /// If the inner builder is empty, returns a slice of the record batch in + /// stream. 
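+ /// Slicing avoids copying those rows into the builder row by row.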
+ async fn fetch_rows_from_one_stream( + &mut self, + num_rows_to_fetch: usize, + ) -> Result> { + assert_eq!(self.hot.len(), 1); + self.metrics.times_fetch_rows_from_one += 1; + + let mut buffered_stream = self.hot.pop().unwrap(); + + let record_batch = if self.record_batch_builder.is_empty() { + let record_batch = buffered_stream.take_record_batch_slice(num_rows_to_fetch); + + self.metrics.total_rows_fetch_from_one += record_batch.num_rows(); + + Some(record_batch) + } else { + let fetched_row_num = buffered_stream + .append_rows_to(&mut self.record_batch_builder, num_rows_to_fetch)?; + + self.metrics.total_rows_fetch_from_one += fetched_row_num; + + None + }; + + self.reheap(buffered_stream).await?; + + Ok(record_batch) + } + + /// Fetch one row from the hottest `BufferedStream`. + /// + /// REQUIRES: `self.hot` is not empty. + async fn fetch_one_row_from_multiple_streams(&mut self) -> Result<()> { + assert!(!self.hot.is_empty()); + self.metrics.times_fetch_row_from_multiple += 1; + + let mut hottest = self.hot.pop().unwrap(); + let row = hottest.next_row_in_buffer().unwrap(); + self.record_batch_builder + .append_row_view(&row) + .context(AppendRow)?; + self.reheap(hottest).await + } + + /// Fetch the next batch from the streams. + /// + /// `init_if_necessary` should be finished before this method. + async fn fetch_next_batch(&mut self) -> Result> { + self.init_if_necessary().await?; + + self.record_batch_builder.clear(); + + while !self.hot.is_empty() && self.record_batch_builder.len() < self.iter_options.batch_size + { + // no need to do merge sort if only one batch in the hot heap. + if self.hot.len() == 1 { + let fetch_row_num = self.iter_options.batch_size - self.record_batch_builder.len(); + + if let Some(record_batch) = self.fetch_rows_from_one_stream(fetch_row_num).await? { + // The builder is empty and we have fetch a record batch from this stream, just + // return that batch. + return Ok(Some(record_batch)); + } + // Else, some rows may have been pushed into the builder. 
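+ // The surrounding `while` keeps fetching until the builder reaches `batch_size` or the hot heap is drained.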
+ } else { + self.fetch_one_row_from_multiple_streams().await?; + } + } + + if self.record_batch_builder.is_empty() { + Ok(None) + } else { + let record_batch = self + .record_batch_builder + .build() + .context(BuildRecordBatch)?; + Ok(Some(record_batch)) + } + } +} + +impl Drop for MergeIterator { + fn drop(&mut self) { + info!( + "Merge iterator dropped, table_id:{:?}, request_id:{}, metrics:{:?}, iter_options:{:?},", + self.table_id, self.request_id, self.metrics, self.iter_options, + ); + } +} + +#[async_trait] +impl RecordBatchWithKeyIterator for MergeIterator { + type Error = Error; + + fn schema(&self) -> &RecordSchemaWithKey { + &self.schema + } + + async fn next_batch(&mut self) -> Result> { + let record_batch = self.fetch_next_batch().await?; + + trace!("MergeIterator send next record batch:{:?}", record_batch); + + Ok(record_batch) + } +} + +#[cfg(test)] +mod tests { + use common_types::{ + self, + tests::{build_row, build_schema}, + }; + + use super::*; + use crate::row_iter::tests::check_iterator; + + #[tokio::test] + async fn test_row_merge_iterator() { + // first two columns are key columns + let schema = build_schema(); + + let testcases = vec![ + // (sequence, rows) + (10, vec![build_row(b"y", 1000000, 10.0, "v4")]), + (20, vec![build_row(b"y", 1000000, 10.0, "v3")]), + (100, vec![build_row(b"b", 1000000, 10.0, "v2")]), + (1, vec![build_row(b"a", 1000000, 10.0, "v1")]), + ]; + + let streams = + record_batch_stream::tests::build_sequenced_record_batch_stream(&schema, testcases); + let mut iter = MergeIterator::new( + TableId::MIN, + RequestId::next_id(), + schema.to_record_schema_with_key(), + streams, + IterOptions::default(), + false, + Metrics::new(1, 1, vec![]), + ); + + check_iterator( + &mut iter, + vec![ + build_row(b"a", 1000000, 10.0, "v1"), + build_row(b"b", 1000000, 10.0, "v2"), + build_row(b"y", 1000000, 10.0, "v3"), + build_row(b"y", 1000000, 10.0, "v4"), + ], + ) + .await; + } + + #[tokio::test] + async fn test_row_merge_iterator_reverse() { + // first two columns are key columns + let schema = build_schema(); + + let testcases = vec![ + // (sequence, rows) + ( + 10, + vec![ + build_row(b"y", 1000001, 10.0, "v5"), + build_row(b"y", 1000000, 10.0, "v4"), + ], + ), + (20, vec![build_row(b"y", 1000000, 10.0, "v3")]), + (100, vec![build_row(b"b", 1000000, 10.0, "v2")]), + (1, vec![build_row(b"a", 1000000, 10.0, "v1")]), + ]; + + let streams = + record_batch_stream::tests::build_sequenced_record_batch_stream(&schema, testcases); + let mut iter = MergeIterator::new( + TableId::MIN, + RequestId::next_id(), + schema.to_record_schema_with_key(), + streams, + IterOptions::default(), + true, + Metrics::new(1, 1, vec![]), + ); + + check_iterator( + &mut iter, + vec![ + build_row(b"y", 1000001, 10.0, "v5"), + build_row(b"y", 1000000, 10.0, "v3"), + build_row(b"y", 1000000, 10.0, "v4"), + build_row(b"b", 1000000, 10.0, "v2"), + build_row(b"a", 1000000, 10.0, "v1"), + ], + ) + .await; + } +} diff --git a/analytic_engine/src/row_iter/mod.rs b/analytic_engine/src/row_iter/mod.rs new file mode 100644 index 0000000000..8c30523396 --- /dev/null +++ b/analytic_engine/src/row_iter/mod.rs @@ -0,0 +1,87 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Iterators for row. 
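For orientation, a minimal sketch of how the merge builder defined above is typically driven; the function name is illustrative, the `config`, `mems`, and `level0_files` bindings are assumed to exist, and error handling is reduced to `?`:

    use object_store::ObjectStore;
    use crate::{
        row_iter::{
            merge::{Error, MergeBuilder, MergeConfig},
            RecordBatchWithKeyIterator,
        },
        sst::{factory::Factory, file::FileHandle},
        table::version::MemTableVec,
    };

    // Register memtables and level-0 SSTs, build the merge iterator and drain it.
    async fn merge_read<S: ObjectStore, Fa: Factory>(
        config: MergeConfig<'_, S, Fa>,
        mems: MemTableVec,
        level0_files: Vec<FileHandle>,
    ) -> Result<(), Error> {
        let mut builder = MergeBuilder::new(config);
        builder.mut_memtables().extend(mems);
        builder.mut_ssts_of_level(0).extend(level0_files);

        let mut iter = builder.build().await?;
        while let Some(batch) = iter.next_batch().await? {
            // Each batch is sorted by key across all inputs; hand it to the caller.
            let _ = batch;
        }
        Ok(())
    }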
+ +use std::{ + pin::Pin, + task::{Context, Poll}, +}; + +use async_trait::async_trait; +use common_types::{record_batch::RecordBatchWithKey, schema::RecordSchemaWithKey}; +use common_util::runtime::Runtime; +use futures::stream::Stream; +use log::{debug, error}; +use tokio::sync::mpsc::{self, Receiver}; + +use crate::sst::builder::{RecordBatchStream, RecordBatchStreamItem}; + +pub mod chain; +pub mod dedup; +pub mod merge; +pub mod record_batch_stream; +#[cfg(test)] +pub mod tests; + +const RECORD_BATCH_READ_BUF_SIZE: usize = 10; + +#[derive(Debug, Clone)] +pub struct IterOptions { + pub batch_size: usize, +} + +impl Default for IterOptions { + fn default() -> Self { + Self { batch_size: 500 } + } +} + +/// The iterator for reading RecordBatch from a table. +/// +/// The `schema()` should be the same as the RecordBatch from `read()`. +/// The reader is exhausted if the `read()` returns the `Ok(None)`. +#[async_trait] +pub trait RecordBatchWithKeyIterator: Send { + type Error: std::error::Error + Send + Sync + 'static; + + fn schema(&self) -> &RecordSchemaWithKey; + + async fn next_batch(&mut self) -> std::result::Result, Self::Error>; +} + +struct ReceiverStream { + rx: Receiver, +} + +impl Stream for ReceiverStream { + type Item = RecordBatchStreamItem; + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let this = self.get_mut(); + Pin::new(&mut this.rx).poll_recv(cx) + } +} + +// TODO(yingwen): This is a hack way to convert an async trait to stream. +pub fn record_batch_with_key_iter_to_stream( + mut iter: I, + runtime: &Runtime, +) -> RecordBatchStream { + let (tx, rx) = mpsc::channel(RECORD_BATCH_READ_BUF_SIZE); + runtime.spawn(async move { + while let Some(record_batch) = iter.next_batch().await.transpose() { + let record_batch = record_batch.map_err(|e| Box::new(e) as _); + + debug!( + "compact table send next record batch, batch:{:?}", + record_batch + ); + if tx.send(record_batch).await.is_err() { + error!("Failed to send record batch from the merge iterator"); + break; + } + } + }); + + Box::new(ReceiverStream { rx }) +} diff --git a/analytic_engine/src/row_iter/record_batch_stream.rs b/analytic_engine/src/row_iter/record_batch_stream.rs new file mode 100644 index 0000000000..13cf049b13 --- /dev/null +++ b/analytic_engine/src/row_iter/record_batch_stream.rs @@ -0,0 +1,287 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
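A minimal consumer of the `RecordBatchWithKeyIterator` trait defined above might look like the following sketch; the helper name is illustrative only:

    use crate::row_iter::RecordBatchWithKeyIterator;

    /// Drain an iterator and count the rows it produced; the iterator is
    /// exhausted once `next_batch()` returns `Ok(None)`.
    async fn count_rows<I: RecordBatchWithKeyIterator>(mut iter: I) -> Result<usize, I::Error> {
        let mut total = 0;
        while let Some(batch) = iter.next_batch().await? {
            total += batch.num_rows();
        }
        Ok(total)
    }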
+ +use std::ops::Bound; + +use common_types::{ + projected_schema::ProjectedSchema, record_batch::RecordBatchWithKey, SequenceNumber, +}; +use common_util::define_result; +use futures::stream::{self, Stream, StreamExt}; +use log::{error, trace}; +use snafu::{Backtrace, OptionExt, ResultExt, Snafu}; +use table_engine::{ + predicate::{filter_record_batch::RecordBatchFilter, Predicate}, + table::TableId, +}; + +use crate::{ + memtable::{MemTableRef, ScanContext, ScanRequest}, + space::SpaceId, + sst, + sst::{factory::SstReaderOptions, file::FileHandle}, + table::sst_util, +}; + +#[derive(Debug, Snafu)] +#[snafu(visibility(pub))] +pub enum Error { + #[snafu(display( + "No sst reader found, sst_reader_options:{:?}.\nBacktrace:\n{}", + options, + backtrace + ))] + SstReaderNotFound { + options: SstReaderOptions, + backtrace: Backtrace, + }, + + #[snafu(display("Fail to read sst meta, err:{}", source))] + ReadSstMeta { source: crate::sst::reader::Error }, + + #[snafu(display("Fail to read sst data, err:{}", source))] + ReadSstData { source: crate::sst::reader::Error }, + + #[snafu(display("Fail to scan memtable, err:{}", source))] + ScanMemtable { source: crate::memtable::Error }, +} + +define_result!(Error); + +const REBUILD_FILTERED_RECORD_BATCH_MAGNIFICATION: usize = 2; + +// TODO(yingwen): Can we move sequence to RecordBatchWithKey and remove this +// struct? But what is the sequence after merge? +#[derive(Debug)] +pub struct SequencedRecordBatch { + pub record_batch: RecordBatchWithKey, + pub sequence: SequenceNumber, +} + +impl SequencedRecordBatch { + #[inline] + pub fn num_rows(&self) -> usize { + self.record_batch.num_rows() + } +} + +pub type SequencedRecordBatchStream = Box< + dyn Stream< + Item = std::result::Result< + SequencedRecordBatch, + Box, + >, + > + Send + + Unpin, +>; + +/// Filter the `sequenced_record_batch` according to the `filter` if necessary. +/// Returns the original batch if only a small proportion of the batch is +/// filtered out. +/// The `selected_rows_buf` is for reuse. +fn maybe_filter_record_batch( + mut sequenced_record_batch: SequencedRecordBatch, + filter: &RecordBatchFilter, + selected_rows_buf: &mut Vec, +) -> Option { + if filter.is_empty() { + return Some(sequenced_record_batch); + } + + // The filter requires the `selected_rows_buf.len() >= + // sequenced_record_batch.num_rows()`. + selected_rows_buf.resize(sequenced_record_batch.num_rows(), true); + let num_selected_rows = filter.filter( + &sequenced_record_batch.record_batch, + selected_rows_buf.as_mut_slice(), + ); + + trace!( + "filter record batch, selected_rows:{}, origin_rows:{}", + num_selected_rows, + sequenced_record_batch.num_rows() + ); + + // No row is selected. + if num_selected_rows == 0 { + return None; + } + + if num_selected_rows + > sequenced_record_batch.num_rows() / REBUILD_FILTERED_RECORD_BATCH_MAGNIFICATION + { + // just use the original record batch because only a small proportion is + // filtered out. + return Some(sequenced_record_batch); + } + + // select the rows according to the filter result. + if let Err(e) = sequenced_record_batch + .record_batch + .select_data(selected_rows_buf.as_slice()) + { + error!( + "Fail to select record batch, data:{:?}, selected_rows:{:?}, err:{}", + sequenced_record_batch, selected_rows_buf, e, + ); + } + + Some(sequenced_record_batch) +} + +/// Filter the sequenced record batch stream by applying the `predicate`. +/// However, the output record batches is not ensured to meet the requirements +/// of the `predicate`. 
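+ /// A record batch is rebuilt only when no more than half of its rows are selected; otherwise the original batch is forwarded untouched, so callers must not assume every returned row satisfies the predicate.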
+pub fn filter_stream( + origin_stream: SequencedRecordBatchStream, + predicate: &Predicate, +) -> SequencedRecordBatchStream { + if predicate.exprs.is_empty() { + return origin_stream; + } + + let mut select_row_buf = Vec::new(); + let filter = RecordBatchFilter::from(predicate.exprs.as_slice()); + let stream = origin_stream.filter_map(move |sequence_record_batch| { + let v = match sequence_record_batch { + Ok(v) => maybe_filter_record_batch(v, &filter, &mut select_row_buf).map(Ok), + Err(e) => Some(Err(e)), + }; + + futures::future::ready(v) + }); + + Box::new(stream) +} + +/// Build filtered (by `predicate`) [SequencedRecordBatchStream] from a +/// memtable. +pub fn filtered_stream_from_memtable( + projected_schema: ProjectedSchema, + need_dedup: bool, + memtable: &MemTableRef, + reverse: bool, + predicate: &Predicate, +) -> Result { + stream_from_memtable(projected_schema, need_dedup, memtable, reverse) + .map(|origin_stream| filter_stream(origin_stream, predicate)) +} + +/// Build [SequencedRecordBatchStream] from a memtable. +pub fn stream_from_memtable( + projected_schema: ProjectedSchema, + need_dedup: bool, + memtable: &MemTableRef, + reverse: bool, +) -> Result { + let scan_ctx = ScanContext::default(); + let max_seq = memtable.last_sequence(); + let scan_req = ScanRequest { + start_user_key: Bound::Unbounded, + end_user_key: Bound::Unbounded, + sequence: max_seq, + projected_schema, + need_dedup, + reverse, + }; + + let iter = memtable.scan(scan_ctx, scan_req).context(ScanMemtable)?; + let stream = stream::iter(iter).map(move |v| { + v.map(|record_batch| SequencedRecordBatch { + record_batch, + sequence: max_seq, + }) + .map_err(|e| Box::new(e) as _) + }); + + Ok(Box::new(stream)) +} + +/// Build the filtered by `sst_read_options.predicate` +/// [SequencedRecordBatchStream] from a sst. +pub async fn filtered_stream_from_sst_file( + space_id: SpaceId, + table_id: TableId, + sst_file: &FileHandle, + sst_factory: &Fa, + sst_reader_options: &SstReaderOptions, + store: &S, +) -> Result +where + Fa: sst::factory::Factory, + S: object_store::ObjectStore, +{ + stream_from_sst_file( + space_id, + table_id, + sst_file, + sst_factory, + sst_reader_options, + store, + ) + .await + .map(|origin_stream| filter_stream(origin_stream, sst_reader_options.predicate.as_ref())) +} + +/// Build the [SequencedRecordBatchStream] from a sst. +pub async fn stream_from_sst_file( + space_id: SpaceId, + table_id: TableId, + sst_file: &FileHandle, + sst_factory: &Fa, + sst_reader_options: &SstReaderOptions, + store: &S, +) -> Result +where + Fa: sst::factory::Factory, + S: object_store::ObjectStore, +{ + sst_file.read_meter().mark(); + let mut path = store.new_path(); + sst_util::set_sst_file_path(space_id, table_id, sst_file.id(), &mut path); + let mut sst_reader = sst_factory + .new_sst_reader(sst_reader_options, &path, store) + .with_context(|| SstReaderNotFound { + options: sst_reader_options.clone(), + })?; + let meta = sst_reader.meta_data().await.context(ReadSstMeta)?; + let max_seq = meta.max_sequence; + let sst_stream = sst_reader.read().await.context(ReadSstData)?; + + let stream = Box::new(sst_stream.map(move |v| { + v.map(|record_batch| SequencedRecordBatch { + record_batch, + sequence: max_seq, + }) + .map_err(|e| Box::new(e) as _) + })); + + Ok(stream) +} + +#[cfg(test)] +pub mod tests { + use common_types::{row::Row, schema::Schema}; + + use super::*; + use crate::row_iter; + + /// Build [SequencedRecordBatchStream] from the sequenced rows. 
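+ /// Every `(sequence, rows)` pair becomes its own single-batch stream.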
+ pub fn build_sequenced_record_batch_stream( + schema: &Schema, + batches: Vec<(SequenceNumber, Vec)>, + ) -> Vec { + batches + .into_iter() + .map(|(seq, rows)| { + let batch = SequencedRecordBatch { + record_batch: row_iter::tests::build_record_batch_with_key( + schema.clone(), + rows, + ), + sequence: seq, + }; + Box::new(stream::iter(vec![Ok(batch)])) as SequencedRecordBatchStream + }) + .collect() + } +} diff --git a/analytic_engine/src/row_iter/tests.rs b/analytic_engine/src/row_iter/tests.rs new file mode 100644 index 0000000000..ce929b852a --- /dev/null +++ b/analytic_engine/src/row_iter/tests.rs @@ -0,0 +1,93 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use async_trait::async_trait; +use common_types::{ + projected_schema::ProjectedSchema, + record_batch::{RecordBatchWithKey, RecordBatchWithKeyBuilder}, + row::{ + contiguous::{ContiguousRowReader, ContiguousRowWriter, ProjectedContiguousRow}, + Row, + }, + schema::{IndexInWriterSchema, RecordSchemaWithKey, Schema}, +}; +use common_util::define_result; +use snafu::Snafu; + +use crate::row_iter::RecordBatchWithKeyIterator; + +#[derive(Debug, Snafu)] +pub enum Error {} + +define_result!(Error); + +pub struct VectorIterator { + schema: RecordSchemaWithKey, + items: Vec>, + idx: usize, +} + +impl VectorIterator { + pub fn new(schema: RecordSchemaWithKey, items: Vec) -> Self { + Self { + schema, + items: items.into_iter().map(Some).collect(), + idx: 0, + } + } +} + +#[async_trait] +impl RecordBatchWithKeyIterator for VectorIterator { + type Error = Error; + + fn schema(&self) -> &RecordSchemaWithKey { + &self.schema + } + + async fn next_batch(&mut self) -> Result> { + if self.idx == self.items.len() { + return Ok(None); + } + + let ret = Ok(self.items[self.idx].take()); + self.idx += 1; + + ret + } +} + +pub fn build_record_batch_with_key(schema: Schema, rows: Vec) -> RecordBatchWithKey { + assert!(schema.num_columns() > 1); + let projection: Vec = (0..schema.num_columns()).collect(); + let projected_schema = ProjectedSchema::new(schema.clone(), Some(projection)).unwrap(); + let row_projected_schema = projected_schema.try_project_with_key(&schema).unwrap(); + let mut builder = + RecordBatchWithKeyBuilder::with_capacity(projected_schema.to_record_schema_with_key(), 2); + let index_in_writer = IndexInWriterSchema::for_same_schema(schema.num_columns()); + + let mut buf = Vec::new(); + for row in rows { + let mut writer = ContiguousRowWriter::new(&mut buf, &schema, &index_in_writer); + + writer.write_row(&row).unwrap(); + + let source_row = ContiguousRowReader::with_schema(&buf, &schema); + let projected_row = ProjectedContiguousRow::new(source_row, &row_projected_schema); + builder + .append_projected_contiguous_row(&projected_row) + .unwrap(); + } + builder.build().unwrap() +} + +pub async fn check_iterator(iter: &mut T, expected_rows: Vec) { + let mut visited_rows = 0; + while let Some(batch) = iter.next_batch().await.unwrap() { + for row_idx in 0..batch.num_rows() { + assert_eq!(batch.clone_row_at(row_idx), expected_rows[visited_rows]); + visited_rows += 1; + } + } + + assert_eq!(visited_rows, expected_rows.len()); +} diff --git a/analytic_engine/src/sampler.rs b/analytic_engine/src/sampler.rs new file mode 100644 index 0000000000..304d052327 --- /dev/null +++ b/analytic_engine/src/sampler.rs @@ -0,0 +1,448 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Segment duration sampler. 
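As a rough, illustrative sketch of the duration sampling implemented in this file (the arithmetic mirrors the `test_pick_duration` table below): points spaced about 15 s apart scale to 1 500 000 ms with the expected 100 points per series, so the smallest available segment duration above that is 2 hours. The snippet assumes the module is importable as `crate::sampler`:

    use std::time::Duration;
    use common_types::time::Timestamp;
    // The DurationSampler trait must be in scope for the method calls below.
    use crate::sampler::{DefaultSampler, DurationSampler};

    fn demo_duration_sampling() {
        let sampler = DefaultSampler::default();
        // Ten points spaced 15 s apart.
        for i in 0..10i64 {
            sampler.collect(Timestamp::new(1_000_000 + i * 15_000)).unwrap();
        }
        // 15_000 ms * 100 expected points per series = 1_500_000 ms (~25 min);
        // the smallest available segment duration above that is 2 hours.
        assert_eq!(sampler.suggest_duration(), Duration::from_millis(2 * 3600 * 1000));
    }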
+ +use std::{ + collections::HashSet, + sync::{Arc, Mutex}, + time::Duration, +}; + +use common_types::time::{TimeRange, Timestamp}; +use common_util::define_result; +use snafu::{ensure, Backtrace, Snafu}; + +use crate::table_options; + +/// Initial size of timestamps set. +const INIT_CAPACITY: usize = 1000; +const HOUR_MS: u64 = 3600 * 1000; +const DAY_MS: u64 = 24 * HOUR_MS; +const AVAILABLE_DURATIONS: [u64; 8] = [ + 2 * HOUR_MS, + DAY_MS, + 7 * DAY_MS, + 30 * DAY_MS, + 180 * DAY_MS, + 360 * DAY_MS, + 5 * 360 * DAY_MS, + 10 * 360 * DAY_MS, +]; +const INTERVAL_RATIO: f64 = 0.9; +/// Expected points per timeseries in a segment, used to pick a proper segment +/// duration. +const POINTS_PER_SERIES: u64 = 100; +/// Max timestamp that wont overflow even using max duration. +const MAX_TIMESTAMP_MS_FOR_DURATION: i64 = + i64::MAX - 2 * AVAILABLE_DURATIONS[AVAILABLE_DURATIONS.len() - 1] as i64; +/// Minimun sample timestamps to compute duration. +const MIN_SAMPLES: usize = 2; + +#[derive(Debug, Snafu)] +#[snafu(display( + "Invalid timestamp to collect, timestamp:{:?}.\nBacktrace:\n{}", + timestamp, + backtrace +))] +pub struct Error { + timestamp: Timestamp, + backtrace: Backtrace, +} + +define_result!(Error); + +/// Segment duration sampler. +/// +/// Collects all timestamps and then yield a suggested segment duration to hold +/// all data with similar timestamp interval. +pub trait DurationSampler { + /// Collect a timestamp. + fn collect(&self, timestamp: Timestamp) -> Result<()>; + + /// Returns a suggested duration to partition the timestamps or default + /// duration if no enough timestamp has been sampled. + /// + /// Note that this method may be invoked more than once. + fn suggest_duration(&self) -> Duration; + + /// Returns a vector of time range with suggested duration that can hold all + /// timestamps collected by this sampler. + fn ranges(&self) -> Vec; + + // TODO(yingwen): Memory usage. +} + +pub type SamplerRef = Arc; + +struct State { + /// Deduplicated timestamps. + deduped_timestamps: HashSet, + /// Cached suggested duration. + duration: Option, + /// Sorted timestamps cache, empty if `duration` is None. + sorted_timestamps: Vec, +} + +impl State { + fn clear_cache(&mut self) { + self.duration = None; + self.sorted_timestamps.clear(); + } +} + +pub struct DefaultSampler { + state: Mutex, +} + +impl Default for DefaultSampler { + fn default() -> Self { + Self { + state: Mutex::new(State { + deduped_timestamps: HashSet::with_capacity(INIT_CAPACITY), + duration: None, + sorted_timestamps: Vec::new(), + }), + } + } +} + +impl DurationSampler for DefaultSampler { + fn collect(&self, timestamp: Timestamp) -> Result<()> { + ensure!( + timestamp.as_i64() < MAX_TIMESTAMP_MS_FOR_DURATION, + Context { timestamp } + ); + + let mut state = self.state.lock().unwrap(); + state.deduped_timestamps.insert(timestamp); + state.clear_cache(); + + Ok(()) + } + + fn suggest_duration(&self) -> Duration { + if let Some(v) = self.duration() { + return v; + } + + let timestamps = self.compute_sorted_timestamps(); + let picked = match evaluate_interval(×tamps) { + Some(interval) => pick_duration(interval), + None => table_options::DEFAULT_SEGMENT_DURATION, + }; + + { + // Cache the picked duration. 
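+ // Later suggest_duration()/ranges() calls reuse this cache until collect() clears it again.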
+ let mut state = self.state.lock().unwrap(); + state.duration = Some(picked); + state.sorted_timestamps = timestamps; + } + + picked + } + + fn ranges(&self) -> Vec { + let duration = self.suggest_duration(); + let sorted_timestamps = self.cached_sorted_timestamps(); + // This type hint is needed to make `ranges.last()` work. + let mut ranges: Vec = Vec::new(); + + for ts in sorted_timestamps { + if let Some(range) = ranges.last() { + if range.contains(ts) { + continue; + } + } + + // collect() ensures timestamp won't overflow. + let range = TimeRange::bucket_of(ts, duration).unwrap(); + ranges.push(range); + } + + ranges + } +} + +impl DefaultSampler { + fn cached_sorted_timestamps(&self) -> Vec { + self.state.lock().unwrap().sorted_timestamps.clone() + } + + fn compute_sorted_timestamps(&self) -> Vec { + let mut timestamps: Vec<_> = { + let state = self.state.lock().unwrap(); + state.deduped_timestamps.iter().copied().collect() + }; + + timestamps.sort_unstable(); + + timestamps + } + + fn duration(&self) -> Option { + self.state.lock().unwrap().duration + } +} + +fn evaluate_interval(sorted_timestamps: &[Timestamp]) -> Option { + if sorted_timestamps.len() < MIN_SAMPLES { + return None; + } + + let mut intervals = Vec::with_capacity(sorted_timestamps.len()); + for i in 0..sorted_timestamps.len() - 1 { + let current = sorted_timestamps[i]; + let next = sorted_timestamps[i + 1]; + let interval = next.as_i64() - current.as_i64(); + intervals.push(interval); + } + + intervals.sort_unstable(); + + let mut index = (intervals.len() as f64 * INTERVAL_RATIO) as usize; + if index > 1 { + index -= 1; + }; + let selected = intervals[index]; + // Interval should larger than 0. + assert!(selected > 0); + + Some(selected as u64) +} + +fn pick_duration(interval: u64) -> Duration { + let scaled_interval = interval.checked_mul(POINTS_PER_SERIES).unwrap_or(u64::MAX); + for du_ms in AVAILABLE_DURATIONS { + if du_ms > scaled_interval { + return Duration::from_millis(du_ms); + } + } + + // No duration larger than scaled interval, returns the largest duration. 
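// (Editorial example: a 5s interval scales to 5s * 100 = 500s, so the loop above already
// returned the first larger bucket, 2 hours; only an interval of 36 days or more, where
// 36 days * 100 equals the largest 10 * 360 day bucket, scales past every bucket and
// reaches the fallback below.)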
+ let du_ms = AVAILABLE_DURATIONS[AVAILABLE_DURATIONS.len() - 1]; + + Duration::from_millis(du_ms) +} + +#[cfg(test)] +mod tests { + use super::*; + + const SEC_MS: u64 = 1000; + const MIN_MS: u64 = 60 * SEC_MS; + + #[test] + fn test_pick_duration() { + let cases = [ + (1, 2 * HOUR_MS), + (5 * SEC_MS, 2 * HOUR_MS), + (15 * SEC_MS, 2 * HOUR_MS), + (MIN_MS, 2 * HOUR_MS), + (5 * MIN_MS, DAY_MS), + (10 * MIN_MS, DAY_MS), + (30 * MIN_MS, 7 * DAY_MS), + (HOUR_MS, 7 * DAY_MS), + (4 * HOUR_MS, 30 * DAY_MS), + (8 * HOUR_MS, 180 * DAY_MS), + (DAY_MS, 180 * DAY_MS), + (3 * DAY_MS, 360 * DAY_MS), + (7 * DAY_MS, 5 * 360 * DAY_MS), + (30 * DAY_MS, 10 * 360 * DAY_MS), + (360 * DAY_MS, 10 * 360 * DAY_MS), + (10 * 360 * DAY_MS, 10 * 360 * DAY_MS), + (20 * 360 * DAY_MS, 10 * 360 * DAY_MS), + ]; + + for (i, (interval, expect)) in cases.iter().enumerate() { + assert_eq!( + *expect, + pick_duration(*interval).as_millis() as u64, + "Case {}", + i + ); + } + } + + #[test] + fn test_empty_sampler() { + let sampler = DefaultSampler::default(); + + assert_eq!( + table_options::DEFAULT_SEGMENT_DURATION, + sampler.suggest_duration() + ); + assert!(sampler.ranges().is_empty()); + } + + #[test] + fn test_one_sample() { + let sampler = DefaultSampler::default(); + + sampler.collect(Timestamp::new(0)).unwrap(); + + assert_eq!( + table_options::DEFAULT_SEGMENT_DURATION, + sampler.suggest_duration() + ); + let time_range = + TimeRange::bucket_of(Timestamp::new(0), table_options::DEFAULT_SEGMENT_DURATION) + .unwrap(); + assert_eq!(&[time_range], &sampler.ranges()[..]); + } + + #[test] + fn test_all_sample_same() { + let sampler = DefaultSampler::default(); + + let ts = Timestamp::now(); + for _ in 0..5 { + sampler.collect(ts).unwrap(); + } + + assert_eq!( + table_options::DEFAULT_SEGMENT_DURATION, + sampler.suggest_duration() + ); + let time_range = TimeRange::bucket_of(ts, table_options::DEFAULT_SEGMENT_DURATION).unwrap(); + assert_eq!(&[time_range], &sampler.ranges()[..]); + } + + #[test] + fn test_collect_invalid() { + let sampler = DefaultSampler::default(); + + assert!(sampler + .collect(Timestamp::new(MAX_TIMESTAMP_MS_FOR_DURATION - 1)) + .is_ok()); + assert!(sampler + .collect(Timestamp::new(MAX_TIMESTAMP_MS_FOR_DURATION)) + .is_err()); + } + + #[test] + fn test_sampler_cache() { + let sampler = DefaultSampler::default(); + + let ts1 = Timestamp::now(); + for i in 0..3 { + sampler + .collect(Timestamp::new(ts1.as_i64() + i * SEC_MS as i64)) + .unwrap(); + } + + assert_eq!( + table_options::DEFAULT_SEGMENT_DURATION, + sampler.suggest_duration() + ); + let time_range1 = + TimeRange::bucket_of(ts1, table_options::DEFAULT_SEGMENT_DURATION).unwrap(); + assert_eq!(&[time_range1], &sampler.ranges()[..]); + + // A new timestamp is sampled. 
+ let ts2 = Timestamp::new(ts1.as_i64() + DAY_MS as i64); + sampler.collect(ts2).unwrap(); + + assert!(sampler.state.lock().unwrap().duration.is_none()); + assert!(sampler.state.lock().unwrap().sorted_timestamps.is_empty()); + + assert_eq!( + table_options::DEFAULT_SEGMENT_DURATION, + sampler.suggest_duration() + ); + let time_range2 = + TimeRange::bucket_of(ts2, table_options::DEFAULT_SEGMENT_DURATION).unwrap(); + assert_eq!(&[time_range1, time_range2], &sampler.ranges()[..]); + } + + fn test_suggest_duration_and_ranges_case( + timestamps: &[i64], + duration: u64, + ranges: &[(i64, i64)], + ) { + let sampler = DefaultSampler::default(); + + for ts in timestamps { + sampler.collect(Timestamp::new(*ts)).unwrap(); + } + + assert_eq!(Duration::from_millis(duration), sampler.suggest_duration()); + + let suggested_ranges = sampler.ranges(); + for (range, suggested_range) in ranges.iter().zip(suggested_ranges) { + assert_eq!(range.0, suggested_range.inclusive_start().as_i64()); + assert_eq!(range.1, suggested_range.exclusive_end().as_i64()); + } + } + + #[test] + fn test_suggest_duration_and_ranges() { + test_suggest_duration_and_ranges_case( + // Intervals: 3, 5 + &[100, 103, 108], + 2 * HOUR_MS, + &[(0, 2 * HOUR_MS as i64)], + ); + + let now_ts = Timestamp::now(); + let now = now_ts.as_i64(); + let sec_ms_i64 = SEC_MS as i64; + + let bucket = TimeRange::bucket_of(now_ts, Duration::from_millis(2 * HOUR_MS)).unwrap(); + let expect_range = ( + bucket.inclusive_start().as_i64(), + bucket.exclusive_end().as_i64(), + ); + test_suggest_duration_and_ranges_case( + // Intervals: 5s, 5s, 5s, 5s, 100s, + &[ + now, + now + 5 * sec_ms_i64, + now + 2 * 5 * sec_ms_i64, + now + 3 * 5 * sec_ms_i64, + now + 4 * 5 * sec_ms_i64, + now + 4 * 5 * sec_ms_i64 + 100 * sec_ms_i64, + ], + 2 * HOUR_MS, + &[expect_range], + ); + + // Same with previous case, but shuffle the input timestamps. + test_suggest_duration_and_ranges_case( + &[ + now + 3 * 5 * sec_ms_i64, + now, + now + 5 * sec_ms_i64, + now + 4 * 5 * sec_ms_i64, + now + 2 * 5 * sec_ms_i64, + now + 4 * 5 * sec_ms_i64 + 100 * sec_ms_i64, + ], + 2 * HOUR_MS, + &[expect_range], + ); + + test_suggest_duration_and_ranges_case( + // Intervals: nine 5s and one 8h + &[ + now + 5 * 5 * sec_ms_i64 + 8 * HOUR_MS as i64, + now, + now + 5 * sec_ms_i64, + now + 2 * 5 * sec_ms_i64, + now + 7 * 5 * sec_ms_i64 + 8 * HOUR_MS as i64, + now + 3 * 5 * sec_ms_i64, + now + 4 * 5 * sec_ms_i64, + now + 4 * 5 * sec_ms_i64 + 8 * HOUR_MS as i64, + now + 6 * 5 * sec_ms_i64 + 8 * HOUR_MS as i64, + now + 8 * 5 * sec_ms_i64 + 8 * HOUR_MS as i64, + now + 9 * 5 * sec_ms_i64 + 8 * HOUR_MS as i64, + ], + 2 * HOUR_MS, + &[ + expect_range, + ( + expect_range.0 + 8 * HOUR_MS as i64, + expect_range.1 + 8 * HOUR_MS as i64, + ), + ], + ); + } +} diff --git a/analytic_engine/src/setup.rs b/analytic_engine/src/setup.rs new file mode 100644 index 0000000000..80e673778a --- /dev/null +++ b/analytic_engine/src/setup.rs @@ -0,0 +1,103 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Setup the analytic engine + +use std::{path::Path, sync::Arc}; + +use common_util::define_result; +use object_store::disk::File; +use parquet::{ + cache::{LruDataCache, LruMetaCache}, + DataCacheRef, MetaCacheRef, +}; +use snafu::{ResultExt, Snafu}; +use table_engine::engine::EngineRuntimes; +use wal::{manager, rocks_impl::manager::Builder as WalBuilder}; + +use crate::{ + context::OpenContext, engine::TableEngineImpl, instance::Instance, meta::details::ManifestImpl, + sst::factory::FactoryImpl, AnalyticTableEngine, Config, EngineInstance, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to open engine instance, err:{}", source))] + OpenInstance { + source: crate::instance::open::Error, + }, + + #[snafu(display("Failed to open wal, err:{}", source))] + OpenWal { source: manager::error::Error }, + + #[snafu(display("Failed to open wal for manifest, err:{}", source))] + OpenManifestWal { source: manager::error::Error }, + + #[snafu(display("Failed to open manifest, err:{}", source))] + OpenManifest { source: crate::meta::details::Error }, +} + +define_result!(Error); + +const WAL_DIR_NAME: &str = "wal"; +const MANIFEST_DIR_NAME: &str = "manifest"; +const STORE_DIR_NAME: &str = "store"; + +/// Open an [AnalyticTableEngine] instance +pub async fn open_analytic_table_engine( + config: Config, + engine_runtimes: Arc, +) -> Result { + let instance = open_instance(config.clone(), engine_runtimes).await?; + + Ok(TableEngineImpl::new(instance)) +} + +async fn open_instance( + config: Config, + engine_runtimes: Arc, +) -> Result { + let write_runtime = engine_runtimes.write_runtime.clone(); + let data_path = Path::new(&config.data_path); + let wal_path = data_path.join(WAL_DIR_NAME); + let wal_manager = WalBuilder::with_default_rocksdb_config(wal_path, write_runtime.clone()) + .build() + .context(OpenWal)?; + + let manifest_path = data_path.join(MANIFEST_DIR_NAME); + let manifest_wal = WalBuilder::with_default_rocksdb_config(manifest_path, write_runtime) + .build() + .context(OpenManifestWal)?; + + let manifest = ManifestImpl::open(manifest_wal, config.manifest.clone()) + .await + .context(OpenManifest)?; + + let meta_cache: Option = + if let Some(sst_meta_cache_cap) = &config.sst_meta_cache_cap { + Some(Arc::new(LruMetaCache::new(*sst_meta_cache_cap))) + } else { + None + }; + + let data_cache: Option = + if let Some(sst_data_cache_cap) = &config.sst_data_cache_cap { + Some(Arc::new(LruDataCache::new(*sst_data_cache_cap))) + } else { + None + }; + + let sst_path = data_path.join(STORE_DIR_NAME); + let store = File::new(sst_path); + let open_ctx = OpenContext { + config, + runtimes: engine_runtimes, + meta_cache, + data_cache, + }; + + let instance = Instance::open(open_ctx, manifest, wal_manager, store, FactoryImpl) + .await + .context(OpenInstance)?; + + Ok(instance) +} diff --git a/analytic_engine/src/space.rs b/analytic_engine/src/space.rs new file mode 100644 index 0000000000..d7ab539571 --- /dev/null +++ b/analytic_engine/src/space.rs @@ -0,0 +1,305 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Table space +//! +//! A table space acts like a namespace of a bunch of tables, tables under +//! 
different space can use same table name + +use std::{ + fmt, + sync::{Arc, RwLock}, +}; + +use arena::CollectorRef; +use common_util::define_result; +use log::info; +use snafu::{Backtrace, ResultExt, Snafu}; +use table_engine::{engine::CreateTableRequest, table::TableId}; +use tokio::sync::Mutex; + +use crate::{ + instance::{mem_collector::MemUsageCollector, write_worker::WriteGroup}, + meta::{ + meta_update::{AddTableMeta, MetaUpdate}, + Manifest, + }, + sst::file::FilePurger, + table::data::{TableData, TableDataRef, TableDataSet}, + TableOptions, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Table already exists, table:{}.\nBacktrace:\n{}", table, backtrace))] + TableExists { table: String, backtrace: Backtrace }, + + #[snafu(display("Failed to create table data, table:{}, err:{}", table, source))] + CreateTableData { + table: String, + source: crate::table::data::Error, + }, + + #[snafu(display("Failed to store meta data, err:{}", source))] + WriteMeta { + source: Box, + }, +} + +define_result!(Error); + +impl From for table_engine::engine::Error { + fn from(err: Error) -> Self { + match err { + Error::TableExists { table, backtrace } => Self::TableExists { table, backtrace }, + Error::CreateTableData { ref table, .. } => Self::InvalidArguments { + table: table.clone(), + source: Box::new(err), + }, + Error::WriteMeta { .. } => Self::WriteMeta { + source: Box::new(err), + }, + } + } +} + +/// Holds references to the table data and its space +/// +/// REQUIRE: The table must belongs to the space +#[derive(Clone)] +pub struct SpaceAndTable { + /// The space of the table + space: SpaceRef, + /// Data of the table + table_data: TableDataRef, +} + +impl SpaceAndTable { + /// Create SpaceAndTable + /// + /// REQUIRE: The table must belongs to the space + pub fn new(space: SpaceRef, table_data: TableDataRef) -> Self { + // Checks table is in space + debug_assert!(space + .table_datas + .read() + .unwrap() + .find_table(&table_data.name) + .is_some()); + + Self { space, table_data } + } + + /// Get space info + #[inline] + pub fn space(&self) -> &SpaceRef { + &self.space + } + + /// Get table data + #[inline] + pub fn table_data(&self) -> &TableDataRef { + &self.table_data + } +} + +impl fmt::Debug for SpaceAndTable { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("SpaceAndTable") + .field("space_id", &self.space.id) + .field("space_name", &self.space.name) + .field("table_id", &self.table_data.id) + .field("table_name", &self.table_data.name) + .finish() + } +} + +/// Name type of space +// TODO(yingwen): Or use binary string? +pub type SpaceName = String; +/// Reference of space name +pub type SpaceNameRef<'a> = &'a str; +/// Space id +// TODO(yingwen): Or just use something like uuid as space id? +pub type SpaceId = u32; + +/// A space can hold mulitple tables +pub struct Space { + /// Space id + pub id: SpaceId, + /// Space name + pub name: SpaceName, + /// Data of tables in this space + /// + /// Adding table into it should acquire the space lock first, then the write + /// lock + table_datas: RwLock, + /// Space lock + /// + /// Persisting meta update of this space is protected by this lock + mutex: Mutex<()>, + + /// Write workers + pub write_group: WriteGroup, + /// Space memtable memory usage collector + pub mem_usage_collector: Arc, + /// The maximum write buffer size used for single space. 
+ pub write_buffer_size: usize, +} + +impl Space { + pub fn new( + id: SpaceId, + name: SpaceName, + write_buffer_size: usize, + write_group: WriteGroup, + engine_mem_collector: CollectorRef, + ) -> Self { + Self { + id, + name, + table_datas: RwLock::new(TableDataSet::new()), + mutex: Mutex::new(()), + write_group, + mem_usage_collector: Arc::new(MemUsageCollector::with_parent(engine_mem_collector)), + write_buffer_size, + } + } + + /// Returns true when space total memtable memory usage reaches + /// space_write_buffer_size limit. + #[inline] + pub fn should_flush_space(&self) -> bool { + self.write_buffer_size > 0 && self.memtable_memory_usage() >= self.write_buffer_size + } + + /// Find the table in space which it's memtable consumes maximum memory. + #[inline] + pub fn find_maximum_memory_usage_table(&self) -> Option { + self.table_datas + .read() + .unwrap() + .find_maximum_memory_usage_table() + } + + #[inline] + pub fn memtable_memory_usage(&self) -> usize { + self.mem_usage_collector.total_memory_allocated() + } + + pub async fn close(&self) -> Result<()> { + // Stop the write group. + self.write_group.stop().await; + + Ok(()) + } + + /// Create a table under this space + /// + /// Returns error if the table already exists + pub async fn create_table( + &self, + request: CreateTableRequest, + manifest: &Meta, + table_opts: &TableOptions, + purger: &FilePurger, + ) -> Result { + info!( + "Space create table, space_id:{}, space_name:{}, request:{:?}", + self.id, self.name, request + ); + + // Checks whether the table is exists + if self.find_table(&request.table_name).is_some() { + return TableExists { + table: request.table_name, + } + .fail(); + } + + // Choose a write worker for this table + let write_handle = self.write_group.choose_worker(request.table_id); + + let _lock = self.mutex.lock().await; + + // Double check for table existence under space lock + if self.find_table(&request.table_name).is_some() { + return TableExists { + table: request.table_name, + } + .fail(); + } + + // Store table info into meta + let update = MetaUpdate::AddTable(AddTableMeta { + space_id: self.id, + table_id: request.table_id, + table_name: request.table_name.clone(), + schema: request.table_schema.clone(), + opts: table_opts.clone(), + }); + manifest + .store_update(update) + .await + .map_err(|e| Box::new(e) as _) + .context(WriteMeta)?; + + // Update memory state + let table_name = request.table_name.clone(); + let table_data = Arc::new( + TableData::new( + self.id, + request, + write_handle, + table_opts.clone(), + purger, + self.mem_usage_collector.clone(), + ) + .context(CreateTableData { table: &table_name })?, + ); + + self.insert_table(table_data.clone()); + + Ok(table_data) + } + + /// Insert table data into space memory state if the table is + /// absent. 
For internal use only + /// + /// Panic if the table is already exists + pub(crate) fn insert_table(&self, table_data: TableDataRef) { + let success = self + .table_datas + .write() + .unwrap() + .insert_if_absent(table_data); + assert!(success); + } + + /// Find table under this space by table name + pub fn find_table(&self, table_name: &str) -> Option { + self.table_datas.read().unwrap().find_table(table_name) + } + + /// Find table under this space by its id + pub fn find_table_by_id(&self, table_id: TableId) -> Option { + self.table_datas.read().unwrap().find_table_by_id(table_id) + } + + /// Remove table under this space by table name + pub fn remove_table(&self, table_name: &str) -> Option { + self.table_datas.write().unwrap().remove_table(table_name) + } + + /// Returns the total table num in this space + pub fn table_num(&self) -> usize { + self.table_datas.read().unwrap().table_num() + } + + /// List all tables of this space to `tables` + pub fn list_all_tables(&self, tables: &mut Vec) { + self.table_datas.read().unwrap().list_all_tables(tables) + } +} + +/// A reference to space +pub type SpaceRef = Arc; diff --git a/analytic_engine/src/sst/builder.rs b/analytic_engine/src/sst/builder.rs new file mode 100644 index 0000000000..3eecbcdf2a --- /dev/null +++ b/analytic_engine/src/sst/builder.rs @@ -0,0 +1,76 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Sst builder trait definition + +use async_trait::async_trait; +use common_types::{record_batch::RecordBatchWithKey, request_id::RequestId}; +use futures::Stream; + +use crate::sst::file::SstMetaData; + +pub mod error { + use common_util::define_result; + use snafu::{Backtrace, Snafu}; + + #[derive(Debug, Snafu)] + #[snafu(visibility(pub))] + pub enum Error { + #[snafu(display("Failed to persist sst content, path:{}, err:{}", path, source))] + Persist { + path: String, + source: Box, + }, + + #[snafu(display("Failed to encode meta data, err:{}", source))] + EncodeMetaData { + source: Box, + }, + + #[snafu(display("Failed to get sst file size, path:{}", path))] + GetFileSize { path: String }, + + #[snafu(display( + "Failed to encode record batch into sst, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + EncodeRecordBatch { + source: Box, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to poll record batch, err:{}", source))] + PollRecordBatch { + source: Box, + }, + } + + define_result!(Error); +} + +pub use error::*; + +pub type RecordBatchStreamItem = + std::result::Result>; +// TODO(yingwen): SstReader also has a RecordBatchStream, can we use same type? +pub type RecordBatchStream = Box + Send + Unpin>; + +#[derive(Debug, Copy, Clone)] +pub struct SstInfo { + pub file_size: usize, + pub row_num: usize, +} + +/// The builder for sst. +/// +/// The caller provides a stream of [RecordBatch] and the builder takes +/// responsibilities for persisting the records. +#[async_trait] +pub trait SstBuilder { + async fn build( + &mut self, + request_id: RequestId, + meta: &SstMetaData, + record_stream: RecordBatchStream, + ) -> Result; +} diff --git a/analytic_engine/src/sst/factory.rs b/analytic_engine/src/sst/factory.rs new file mode 100644 index 0000000000..f910468515 --- /dev/null +++ b/analytic_engine/src/sst/factory.rs @@ -0,0 +1,87 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Factory for different kinds sst builder and reader. 
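An editorial sketch of the intended call pattern, mirroring the parquet builder test later in this patch; `sst_file_path`, `store`, `sst_meta` and `record_stream` are placeholders:

    let factory = FactoryImpl;
    let builder_options = SstBuilderOptions {
        sst_type: SstType::Parquet,
        num_rows_per_row_group: 8192,
        compression: table_options::Compression::Uncompressed,
    };
    let mut builder = factory
        .new_sst_builder(&builder_options, &sst_file_path, &store)
        .unwrap();
    let sst_info = builder
        .build(RequestId::next_id(), &sst_meta, record_stream)
        .await
        .unwrap();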
+ +use std::{fmt::Debug, sync::Arc}; + +use common_types::projected_schema::ProjectedSchema; +use common_util::runtime::Runtime; +use object_store::ObjectStore; +use parquet::{DataCacheRef, MetaCacheRef}; +use table_engine::predicate::PredicateRef; + +use crate::{ + sst::{ + builder::SstBuilder, + parquet::{builder::ParquetSstBuilder, reader::ParquetSstReader}, + reader::SstReader, + }, + table_options::Compression, +}; + +pub trait Factory: Clone { + fn new_sst_reader<'a, S: ObjectStore>( + &self, + options: &SstReaderOptions, + path: &'a S::Path, + storage: &'a S, + ) -> Option>; + + fn new_sst_builder<'a, S: ObjectStore>( + &self, + options: &SstBuilderOptions, + path: &'a S::Path, + storage: &'a S, + ) -> Option>; +} + +#[derive(Debug, Copy, Clone)] +pub enum SstType { + Parquet, +} + +#[derive(Debug, Clone)] +pub struct SstReaderOptions { + pub sst_type: SstType, + pub read_batch_row_num: usize, + pub reverse: bool, + pub projected_schema: ProjectedSchema, + pub predicate: PredicateRef, + pub meta_cache: Option, + pub data_cache: Option, + pub runtime: Arc, +} + +#[derive(Debug, Clone)] +pub struct SstBuilderOptions { + pub sst_type: SstType, + pub num_rows_per_row_group: usize, + pub compression: Compression, +} + +#[derive(Debug, Clone)] +pub struct FactoryImpl; + +impl Factory for FactoryImpl { + fn new_sst_reader<'a, S: ObjectStore>( + &self, + options: &SstReaderOptions, + path: &'a S::Path, + storage: &'a S, + ) -> Option> { + match options.sst_type { + SstType::Parquet => Some(Box::new(ParquetSstReader::new(path, storage, options))), + } + } + + fn new_sst_builder<'a, S: ObjectStore>( + &self, + options: &SstBuilderOptions, + path: &'a S::Path, + storage: &'a S, + ) -> Option> { + match options.sst_type { + SstType::Parquet => Some(Box::new(ParquetSstBuilder::new(path, storage, options))), + } + } +} diff --git a/analytic_engine/src/sst/file.rs b/analytic_engine/src/sst/file.rs new file mode 100644 index 0000000000..00bf345e66 --- /dev/null +++ b/analytic_engine/src/sst/file.rs @@ -0,0 +1,699 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Sst file and storage info + +use std::{ + borrow::Borrow, + cmp, + collections::{BTreeMap, HashSet}, + convert::TryFrom, + fmt, + fmt::Debug, + hash::{Hash, Hasher}, + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, + }, +}; + +use common_types::{ + bytes::Bytes, + schema::Schema, + time::{TimeRange, Timestamp}, + SequenceNumber, +}; +use common_util::{ + define_result, + metric::Meter, + runtime::{JoinHandle, Runtime}, +}; +use log::{debug, error, info}; +use object_store::{path::ObjectStorePath, ObjectStore}; +use proto::{common::TimeRange as TimeRangePb, sst::SstMetaData as SstMetaDataPb}; +use snafu::{ResultExt, Snafu}; +use table_engine::table::TableId; +use tokio::sync::{ + mpsc::{self, UnboundedReceiver, UnboundedSender}, + Mutex, +}; + +use crate::{space::SpaceId, sst::manager::FileId, table::sst_util}; + +/// Error of sst file. +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to convert time range, err:{}", source))] + ConvertTimeRange { source: common_types::time::Error }, + + #[snafu(display("Failed to convert table schema, err:{}", source))] + ConvertTableSchema { source: common_types::schema::Error }, + + #[snafu(display("Failed to join purger, err:{}", source))] + StopPurger { source: common_util::runtime::Error }, +} + +define_result!(Error); + +pub type Level = u16; + +// TODO(yingwen): Order or split file by time range to speed up filter (even in +// level 0). 
+/// Manage files of single level +pub struct LevelHandler { + pub level: Level, + /// All files in current level. + files: FileHandleSet, +} + +impl LevelHandler { + pub fn new(level: u16) -> Self { + Self { + level, + files: FileHandleSet::default(), + } + } + + #[inline] + pub fn insert(&mut self, file: FileHandle) { + self.files.insert(file); + } + + pub fn latest_sst(&self) -> Option { + self.files.latest() + } + + pub fn pick_ssts(&self, time_range: TimeRange) -> Vec { + if self.level == 0 { + self.files.files_by_time_range(time_range) + } else { + Vec::new() + } + } + + #[inline] + pub fn remove_ssts(&mut self, file_ids: &[FileId]) { + self.files.remove_by_ids(file_ids); + } + + pub fn iter_ssts(&self) -> Iter { + let iter = self.files.file_map.values(); + Iter(iter) + } + + #[inline] + pub fn collect_expired( + &self, + expire_time: Option, + expired_files: &mut Vec, + ) { + self.files.collect_expired(expire_time, expired_files); + } + + #[inline] + pub fn has_expired_sst(&self, expire_time: Option) -> bool { + self.files.has_expired_sst(expire_time) + } +} + +pub struct Iter<'a>(std::collections::btree_map::Values<'a, FileOrdKey, FileHandle>); + +impl<'a> Iterator for Iter<'a> { + type Item = &'a FileHandle; + + fn next(&mut self) -> Option { + self.0.next() + } +} + +#[derive(Clone)] +pub struct FileHandle { + inner: Arc, +} + +impl PartialEq for FileHandle { + fn eq(&self, other: &Self) -> bool { + self.id() == other.id() + } +} + +impl Eq for FileHandle {} + +impl Hash for FileHandle { + fn hash(&self, state: &mut H) { + self.id().hash(state); + } +} + +impl FileHandle { + pub fn new(meta: FileMeta, purge_queue: FilePurgeQueue) -> Self { + Self { + inner: Arc::new(FileHandleInner { + meta, + purge_queue, + being_compacted: AtomicBool::new(false), + metrics: SstMetrics::default(), + }), + } + } + + #[inline] + pub fn read_meter(&self) -> Arc { + self.inner.metrics.read_meter.clone() + } + + #[inline] + pub fn row_num(&self) -> u64 { + self.inner.meta.meta.row_num + } + + #[inline] + pub fn id(&self) -> FileId { + self.inner.meta.id + } + + #[inline] + pub fn id_ref(&self) -> &FileId { + &self.inner.meta.id + } + + #[inline] + pub fn intersect_with_time_range(&self, time_range: TimeRange) -> bool { + self.inner.meta.intersect_with_time_range(time_range) + } + + #[inline] + pub fn min_key(&self) -> Bytes { + self.inner.meta.meta.min_key.clone() + } + + #[inline] + pub fn max_key(&self) -> Bytes { + self.inner.meta.meta.max_key.clone() + } + + #[inline] + pub fn time_range(&self) -> TimeRange { + self.inner.meta.meta.time_range + } + + #[inline] + pub fn time_range_ref(&self) -> &TimeRange { + &self.inner.meta.meta.time_range + } + + #[inline] + pub fn max_sequence(&self) -> SequenceNumber { + self.inner.meta.meta.max_sequence + } + + #[inline] + pub fn being_compacted(&self) -> bool { + self.inner.being_compacted.load(Ordering::Relaxed) + } + + #[inline] + pub fn size(&self) -> u64 { + self.inner.meta.meta.size + } + + #[inline] + pub fn set_being_compacted(&self, value: bool) { + self.inner.being_compacted.store(value, Ordering::Relaxed); + } +} + +impl fmt::Debug for FileHandle { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("FileHandle") + .field("meta", &self.inner.meta) + .field("being_compacted", &self.being_compacted()) + .field("metrics", &self.inner.metrics) + .finish() + } +} + +struct SstMetrics { + pub read_meter: Arc, + pub key_num: usize, +} + +impl Default for SstMetrics { + fn default() -> Self { + SstMetrics { + read_meter: 
Arc::new(Meter::new()), + key_num: 0, + } + } +} + +impl fmt::Debug for SstMetrics { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("SstMetrics") + .field("read_meter", &self.read_meter.h2_rate()) + .field("key_num", &self.key_num) + .finish() + } +} + +struct FileHandleInner { + meta: FileMeta, + purge_queue: FilePurgeQueue, + /// The file is being compacting. + being_compacted: AtomicBool, + metrics: SstMetrics, +} + +impl Drop for FileHandleInner { + fn drop(&mut self) { + debug!("FileHandle is dropped, meta:{:?}", self.meta); + + // Push file cannot block or be async because we are in drop(). + self.purge_queue.push_file(self.meta.id); + } +} + +/// Used to order [FileHandle] by (end_time, start_time, file_id) +#[derive(PartialEq, Eq, PartialOrd, Ord)] +struct FileOrdKey { + exclusive_end: Timestamp, + inclusive_start: Timestamp, + file_id: FileId, +} + +impl FileOrdKey { + fn for_seek(exclusive_end: Timestamp) -> Self { + Self { + exclusive_end, + inclusive_start: Timestamp::MIN, + file_id: 0, + } + } + + fn key_of(file: &FileHandle) -> Self { + Self { + exclusive_end: file.time_range().exclusive_end(), + inclusive_start: file.time_range().inclusive_start(), + file_id: file.id(), + } + } +} + +/// Used to index [FileHandle] by file_id +struct FileHandleHash(FileHandle); + +impl PartialEq for FileHandleHash { + fn eq(&self, other: &Self) -> bool { + self.0.id() == other.0.id() + } +} + +impl Eq for FileHandleHash {} + +impl Hash for FileHandleHash { + fn hash(&self, state: &mut H) { + self.0.id().hash(state); + } +} + +impl Borrow for FileHandleHash { + #[inline] + fn borrow(&self) -> &FileId { + self.0.id_ref() + } +} + +#[derive(Default)] +struct FileHandleSet { + /// Files ordered by time range and id. + file_map: BTreeMap, + /// Files indexed by file id, used to speed up removal. + id_to_files: HashSet, +} + +impl FileHandleSet { + fn latest(&self) -> Option { + if let Some(file) = self.file_map.values().rev().next() { + return Some(file.clone()); + } + None + } + + fn files_by_time_range(&self, time_range: TimeRange) -> Vec { + // Seek to first sst whose end time >= time_range.inclusive_start(). + let seek_key = FileOrdKey::for_seek(time_range.inclusive_start()); + self.file_map + .range(seek_key..) + .into_iter() + .filter_map(|(_key, file)| { + if file.intersect_with_time_range(time_range) { + Some(file.clone()) + } else { + None + } + }) + .collect() + } + + fn insert(&mut self, file: FileHandle) { + self.file_map + .insert(FileOrdKey::key_of(&file), file.clone()); + self.id_to_files.insert(FileHandleHash(file)); + } + + fn remove_by_ids(&mut self, file_ids: &[FileId]) { + for file_id in file_ids { + if let Some(file) = self.id_to_files.take(file_id) { + let key = FileOrdKey::key_of(&file.0); + self.file_map.remove(&key); + } + } + } + + /// Collect ssts with time range is expired. + fn collect_expired(&self, expire_time: Option, expired_files: &mut Vec) { + for file in self.file_map.values() { + if file.time_range().is_expired(expire_time) { + expired_files.push(file.clone()); + } else { + // Files are sorted by end time first, so there is no more file whose end time + // is less than `expire_time`. + break; + } + } + } + + fn has_expired_sst(&self, expire_time: Option) -> bool { + // Files are sorted by end time first, so check first file is enough. 
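// (Editorial note: file_map is a BTreeMap keyed by FileOrdKey, which orders by
// exclusive_end first, so values() yields handles in ascending end-time order and the
// first entry is the earliest-ending sst.)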
+ if let Some(file) = self.file_map.values().next() { + return file.time_range().is_expired(expire_time); + } + + false + } +} + +/// Meta of a sst file, immutable once created +#[derive(Debug, Clone)] +pub struct FileMeta { + /// Id of the sst file + pub id: FileId, + pub meta: SstMetaData, +} + +impl FileMeta { + pub fn intersect_with_time_range(&self, time_range: TimeRange) -> bool { + self.meta.time_range.intersect_with(time_range) + } +} + +/// Meta data of a sst file, immutable once created +#[derive(Debug, Clone, PartialEq)] +pub struct SstMetaData { + pub min_key: Bytes, + pub max_key: Bytes, + /// Time Range of the sst + pub time_range: TimeRange, + /// Max sequence number in the sst + pub max_sequence: SequenceNumber, + pub schema: Schema, + /// file size in bytes + pub size: u64, + // total row number + pub row_num: u64, +} + +impl From for SstMetaDataPb { + fn from(src: SstMetaData) -> Self { + let mut target = SstMetaDataPb::default(); + target.set_min_key(src.min_key.to_vec()); + target.set_max_key(src.max_key.to_vec()); + target.set_max_sequence(src.max_sequence); + let time_range = TimeRangePb::from(src.time_range); + target.set_time_range(time_range); + target.set_schema(src.schema.into()); + target.set_size(src.size); + target.set_row_num(src.row_num); + + target + } +} + +impl TryFrom for SstMetaData { + type Error = Error; + + fn try_from(mut src: SstMetaDataPb) -> Result { + let time_range = TimeRange::try_from(src.take_time_range()).context(ConvertTimeRange)?; + let schema = Schema::try_from(src.take_schema()).context(ConvertTableSchema)?; + Ok(Self { + min_key: src.min_key.into(), + max_key: src.max_key.into(), + time_range, + max_sequence: src.max_sequence, + schema, + size: src.size, + row_num: src.row_num, + }) + } +} + +// Queue to store files to be deleted for a table. +#[derive(Clone)] +pub struct FilePurgeQueue { + // Wrap a inner struct to avoid storing space/table ids for each file. + inner: Arc, +} + +impl FilePurgeQueue { + pub fn new(space_id: SpaceId, table_id: TableId, sender: UnboundedSender) -> Self { + Self { + inner: Arc::new(FilePurgeQueueInner { + space_id, + table_id, + sender, + closed: AtomicBool::new(false), + }), + } + } + + /// Close the purge queue, then all request pushed to this queue will be + /// ignored. This is mainly used to avoid files being deleted after the + /// db is closed. + pub fn close(&self) { + self.inner.closed.store(true, Ordering::SeqCst); + } + + fn push_file(&self, file_id: FileId) { + if self.inner.closed.load(Ordering::SeqCst) { + return; + } + + // Send the file id via a channel to file purger and delete the file from sst + // store in background. + let request = FilePurgeRequest { + space_id: self.inner.space_id, + table_id: self.inner.table_id, + file_id, + }; + + if let Err(send_res) = self.inner.sender.send(Request::Purge(request)) { + error!( + "Failed to send delete file request, request:{:?}", + send_res.0 + ); + } + } +} + +struct FilePurgeQueueInner { + space_id: SpaceId, + table_id: TableId, + closed: AtomicBool, + sender: UnboundedSender, +} + +#[derive(Debug)] +pub struct FilePurgeRequest { + space_id: SpaceId, + table_id: TableId, + file_id: FileId, +} + +#[derive(Debug)] +pub enum Request { + Purge(FilePurgeRequest), + Exit, +} + +/// Background file purger. 
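A rough editorial sketch of how the purger, queue and file handles defined in this file fit together; `runtime`, `store`, the ids and `file_meta` are placeholders:

    let purger = FilePurger::start(&runtime, store);
    let purge_queue = purger.create_purge_queue(space_id, table_id);
    let handle = FileHandle::new(file_meta, purge_queue);
    // Dropping the last clone of the handle makes FileHandleInner::drop push the file id
    // onto the queue; purge_file_loop then deletes the sst file in the background.
    drop(handle);
    purger.stop().await.unwrap();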
+pub struct FilePurger { + sender: UnboundedSender, + handle: Mutex>>, +} + +impl FilePurger { + pub fn start( + runtime: &Runtime, + store: Arc, + ) -> Self { + // We must use unbound channel, so the sender wont block when the handle is + // dropped. + let (tx, rx) = mpsc::unbounded_channel(); + + // Spawn a background job to purge files. + let handle = runtime.spawn(async { + Self::purge_file_loop(store, rx).await; + }); + + Self { + sender: tx, + handle: Mutex::new(Some(handle)), + } + } + + pub async fn stop(&self) -> Result<()> { + info!("Try to stop file purger"); + + if self.sender.send(Request::Exit).is_err() { + error!("File purge task already exited"); + } + + let mut handle = self.handle.lock().await; + // Also clear the handle to avoid await a ready future. + if let Some(h) = handle.take() { + h.await.context(StopPurger)?; + } + + Ok(()) + } + + pub fn create_purge_queue(&self, space_id: SpaceId, table_id: TableId) -> FilePurgeQueue { + FilePurgeQueue::new(space_id, table_id, self.sender.clone()) + } + + async fn purge_file_loop( + store: Arc, + mut receiver: UnboundedReceiver, + ) { + info!("File purger start"); + + while let Some(request) = receiver.recv().await { + match request { + Request::Purge(purge_request) => { + let mut sst_file_path = store.new_path(); + sst_util::set_sst_file_path( + purge_request.space_id, + purge_request.table_id, + purge_request.file_id, + &mut sst_file_path, + ); + + info!( + "File purger delete file, purge_request:{:?}, sst_file_path:{}", + purge_request, + sst_file_path.display() + ); + + if let Err(e) = store.delete(&sst_file_path).await { + error!( + "File purger failed to delete file, sst_file_path:{}, err:{}", + sst_file_path.display(), + e + ); + } + } + Request::Exit => break, + } + } + + info!("File purger exit"); + } +} + +/// Merge sst meta of given `files`, panic if `files` is empty. +/// +/// The size and row_num of the merged meta is initialized to 0. +pub fn merge_sst_meta(files: &[FileHandle], schema: Schema) -> SstMetaData { + let mut min_key = files[0].min_key(); + let mut max_key = files[0].max_key(); + let mut time_range_start = files[0].time_range().inclusive_start(); + let mut time_range_end = files[0].time_range().exclusive_end(); + let mut max_sequence = files[0].max_sequence(); + + if files.len() > 1 { + for file in &files[1..] 
{ + min_key = cmp::min(file.min_key(), min_key); + max_key = cmp::max(file.max_key(), max_key); + time_range_start = cmp::min(file.time_range().inclusive_start(), time_range_start); + time_range_end = cmp::max(file.time_range().exclusive_end(), time_range_end); + max_sequence = cmp::max(file.max_sequence(), max_sequence); + } + } + + SstMetaData { + min_key, + max_key, + time_range: TimeRange::new(time_range_start, time_range_end).unwrap(), + max_sequence, + schema, + // we don't know file size and total row number yet + size: 0, + row_num: 0, + } +} + +#[cfg(test)] +pub mod tests { + use super::*; + + pub struct FilePurgerMocker; + + impl FilePurgerMocker { + pub fn mock() -> FilePurger { + let (sender, _receiver) = mpsc::unbounded_channel(); + + FilePurger { + sender, + handle: Mutex::new(None), + } + } + } + + #[must_use] + pub struct SstMetaDataMocker { + schema: Schema, + time_range: TimeRange, + max_sequence: SequenceNumber, + } + + impl SstMetaDataMocker { + pub fn new(schema: Schema) -> Self { + Self { + schema, + time_range: TimeRange::min_to_max(), + max_sequence: 1, + } + } + + pub fn time_range(mut self, range: TimeRange) -> Self { + self.time_range = range; + self + } + + pub fn max_sequence(mut self, max_sequence: SequenceNumber) -> Self { + self.max_sequence = max_sequence; + self + } + + pub fn build(&self) -> SstMetaData { + SstMetaData { + min_key: Bytes::new(), + max_key: Bytes::new(), + time_range: self.time_range, + max_sequence: self.max_sequence, + schema: self.schema.clone(), + size: 0, + row_num: 0, + } + } + } +} diff --git a/analytic_engine/src/sst/manager.rs b/analytic_engine/src/sst/manager.rs new file mode 100644 index 0000000000..2d64a8fafb --- /dev/null +++ b/analytic_engine/src/sst/manager.rs @@ -0,0 +1,159 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Multi-level SST management + +use common_types::time::{TimeRange, Timestamp}; + +use crate::{ + compaction::ExpiredFiles, + sst::file::{FileHandle, FileMeta, FilePurgeQueue, Iter, Level, LevelHandler}, +}; + +/// Id for a sst file +pub type FileId = u64; +/// We use two level merge tree, the max level should less than u16::MAX +pub const MAX_LEVEL: usize = 2; + +/// A table level manager that manages all the sst files of the table +pub struct LevelsController { + levels: Vec, + purge_queue: FilePurgeQueue, +} + +impl Drop for LevelsController { + fn drop(&mut self) { + // Close the purge queue to avoid files being deleted. + self.purge_queue.close(); + } +} + +impl LevelsController { + /// Create an empty LevelsController + pub fn new(purge_queue: FilePurgeQueue) -> Self { + let mut levels = Vec::with_capacity(MAX_LEVEL); + for level in 0..MAX_LEVEL { + levels.push(LevelHandler::new(level as Level)); + } + + Self { + levels, + purge_queue, + } + } + + /// Add sst file to level + /// + /// Panic: If the level is greater than the max level + pub fn add_sst_to_level(&mut self, level: Level, file_meta: FileMeta) { + let level_handler = &mut self.levels[usize::from(level)]; + let file = FileHandle::new(file_meta, self.purge_queue.clone()); + + level_handler.insert(file); + } + + pub fn latest_sst(&self, level: Level) -> Option { + self.levels[usize::from(level)].latest_sst() + } + + /// Pick the ssts and collect it by `append_sst`. 
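// (Editorial example of the callback form, assuming a Vec<FileHandle> named picked:
//   controller.pick_ssts(time_range, |_level, files| picked.extend_from_slice(files));)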
+ pub fn pick_ssts( + &self, + time_range: TimeRange, + mut append_sst: impl FnMut(Level, &[FileHandle]), + ) { + for level_handler in self.levels.iter() { + let ssts = level_handler.pick_ssts(time_range); + append_sst(level_handler.level, &ssts); + } + } + + /// Remove sst files from level. + /// + /// Panic: If the level is greater than the max level + pub fn remove_ssts_from_level(&mut self, level: Level, file_ids: &[FileId]) { + let level_handler = &mut self.levels[usize::from(level)]; + level_handler.remove_ssts(file_ids); + } + + /// Total number of levels. + pub fn num_levels(&self) -> Level { + self.levels.len() as Level + } + + /// Iter ssts at given `level`. + /// + /// Panic if level is out of bound. + pub fn iter_ssts_at_level(&self, level: Level) -> Iter { + let level_handler = &self.levels[usize::from(level)]; + level_handler.iter_ssts() + } + + pub fn collect_expired_at_level( + &self, + level: Level, + expire_time: Option, + ) -> Vec { + let level_handler = &self.levels[usize::from(level)]; + let mut expired = Vec::new(); + level_handler.collect_expired(expire_time, &mut expired); + + expired + } + + pub fn has_expired_sst(&self, expire_time: Option) -> bool { + self.levels + .iter() + .any(|level_handler| level_handler.has_expired_sst(expire_time)) + } + + pub fn expired_ssts(&self, expire_time: Option) -> Vec { + let mut expired = Vec::new(); + let num_levels = self.num_levels(); + for level in 0..num_levels { + let files = self.collect_expired_at_level(level, expire_time); + expired.push(ExpiredFiles { level, files }); + } + + expired + } +} + +#[cfg(test)] +pub mod tests { + use table_engine::table::TableId; + use tokio::sync::mpsc; + + use crate::sst::{ + file::{FileMeta, FilePurgeQueue, SstMetaData}, + manager::{FileId, LevelsController}, + }; + + #[must_use] + #[derive(Default)] + pub struct LevelsControllerMockBuilder { + sst_meta_vec: Vec, + } + + impl LevelsControllerMockBuilder { + pub fn add_sst(mut self, mut sst_meta: Vec) -> Self { + self.sst_meta_vec.append(&mut sst_meta); + self + } + + pub fn build(self) -> LevelsController { + let (tx, _rx) = mpsc::unbounded_channel(); + let file_purge_queue = FilePurgeQueue::new(100, TableId::from(101), tx); + let mut levels_controller = LevelsController::new(file_purge_queue); + for (id, sst_meta) in self.sst_meta_vec.into_iter().enumerate() { + levels_controller.add_sst_to_level( + 0, + FileMeta { + id: id as FileId, + meta: sst_meta, + }, + ); + } + levels_controller + } + } +} diff --git a/analytic_engine/src/sst/mod.rs b/analytic_engine/src/sst/mod.rs new file mode 100644 index 0000000000..a6fec9162b --- /dev/null +++ b/analytic_engine/src/sst/mod.rs @@ -0,0 +1,10 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! SST (Sorted String Table) file + +pub mod builder; +pub mod factory; +pub mod file; +pub mod manager; +pub mod parquet; +pub mod reader; diff --git a/analytic_engine/src/sst/parquet/builder.rs b/analytic_engine/src/sst/parquet/builder.rs new file mode 100644 index 0000000000..8bba10cc79 --- /dev/null +++ b/analytic_engine/src/sst/parquet/builder.rs @@ -0,0 +1,560 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Sst builder implementation based on parquet. 
+ +use std::{ + io::SeekFrom, + pin::Pin, + sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, Mutex, + }, + task::{Context, Poll}, +}; + +use arrow_deps::{ + arrow::record_batch::RecordBatch as ArrowRecordBatch, + datafusion::parquet::basic::Compression, + parquet::{ + arrow::ArrowWriter, + file::{properties::WriterProperties, writer::TryClone}, + }, +}; +use async_trait::async_trait; +use common_types::{bytes::BufMut, request_id::RequestId}; +use futures::AsyncRead; +use log::debug; +use object_store::{path::ObjectStorePath, ObjectStore}; +use snafu::{ensure, ResultExt}; + +use crate::sst::{ + builder::{RecordBatchStream, SstBuilder, *}, + factory::SstBuilderOptions, + file::SstMetaData, + parquet::encoding, +}; + +/// The implementation of sst based on parquet and object storage. +#[derive(Debug)] +pub struct ParquetSstBuilder<'a, S: ObjectStore> { + /// The path where the data is persisted. + path: &'a S::Path, + /// The storage where the data is persist. + storage: &'a S, + /// Max row group size. + num_rows_per_row_group: usize, + compression: Compression, +} + +impl<'a, S: ObjectStore> ParquetSstBuilder<'a, S> { + pub fn new(path: &'a S::Path, storage: &'a S, options: &SstBuilderOptions) -> Self { + Self { + path, + storage, + num_rows_per_row_group: options.num_rows_per_row_group, + compression: options.compression.into(), + } + } +} + +/// A memory writer implementing the [ParquetWriter]. +/// +/// The writer accepts the encoded bytes by parquet format and provides the byte +/// stream to the reader. +#[derive(Clone, Debug)] +struct EncodingBuffer { + // In order to reuse the buffer, the buffer must be wrapped in the Arc and the Mutex because + // the writer is consumed when building a ArrowWriter. + inner: Arc>, +} + +impl Default for EncodingBuffer { + fn default() -> Self { + Self { + inner: Arc::new(Mutex::new(EncodingBufferInner { + bytes_written: 0, + read_offset: 0, + buf: Vec::new(), + })), + } + } +} + +impl std::io::Write for EncodingBuffer { + fn write(&mut self, buf: &[u8]) -> std::io::Result { + let mut inner = self.inner.lock().unwrap(); + inner.write(buf) + } + + fn flush(&mut self) -> std::io::Result<()> { + let mut inner = self.inner.lock().unwrap(); + inner.flush() + } +} + +impl std::io::Seek for EncodingBuffer { + fn seek(&mut self, pos: SeekFrom) -> std::io::Result { + let mut inner = self.inner.lock().unwrap(); + inner.seek(pos) + } +} + +impl TryClone for EncodingBuffer { + fn try_clone(&self) -> std::io::Result { + Ok(self.clone()) + } +} + +impl EncodingBuffer { + fn read(&self, read_buf: &mut [u8]) -> usize { + let mut inner = self.inner.lock().unwrap(); + inner.read(read_buf) + } +} + +/// The underlying buffer implementing [ParquetWriter]. +/// +/// Provides the write function for [ArrowWriter] and read function for +/// [AsyncRead]. +#[derive(Clone, Debug)] +struct EncodingBufferInner { + bytes_written: usize, + read_offset: usize, + buf: Vec, +} + +impl std::io::Write for EncodingBufferInner { + /// Write the `buf` to the `self.buf`. + /// + /// The readable bytes should be exhausted before writing new bytes. + /// `self.bytes_written` and `self.read_offset` is updated after writing. 
+ fn write(&mut self, buf: &[u8]) -> std::io::Result { + if self.read_offset != 0 { + assert_eq!(self.buf.len(), self.read_offset); + self.buf.clear(); + self.buf.reserve(buf.len()); + // reset the read offset + self.read_offset = 0; + } + + let bytes_written = self.buf.write(buf)?; + // accumulate the written bytes + self.bytes_written += bytes_written; + + Ok(bytes_written) + } + + /// Actually nothing to flush. + fn flush(&mut self) -> std::io::Result<()> { + Ok(()) + } +} + +impl std::io::Seek for EncodingBufferInner { + /// Given the assumption that the seek usage of the [ParquetWriter] in the + /// parquet project is just `seek(SeekFrom::Current(0))`, the + /// implementation panics if seek to a different target. + fn seek(&mut self, pos: SeekFrom) -> std::io::Result { + if let SeekFrom::Current(offset) = pos { + assert_eq!(offset, 0); + return Ok(self.bytes_written as u64); + } + + unreachable!("Only can handle the case where seek to current(0)") + } +} + +impl EncodingBufferInner { + /// Read the content in `self.buf[self.offset..]` into `read_buf`. + /// + /// When finishing reading, advance the `self.offset`. + fn read(&mut self, mut read_buf: &mut [u8]) -> usize { + if self.read_offset >= self.buf.len() { + return 0; + } + let remaining_size = self.buf.len() - self.read_offset; + + let read_len = remaining_size.min(read_buf.len()); + read_buf.put(&self.buf[self.read_offset..self.read_offset + read_len]); + + self.advance(read_len); + read_len + } + + /// Advance the `self.offset` by `len`. + /// + /// Caller should ensures the advanced offset wont exceed `self.buf.len()`. + fn advance(&mut self, len: usize) { + self.read_offset += len; + + assert!(self.read_offset <= self.buf.len()); + } +} + +/// RecordBytesReader provides AsyncRead implementation for the encoded records +/// by parquet. +struct RecordBytesReader { + request_id: RequestId, + record_stream: RecordBatchStream, + encoding_buffer: EncodingBuffer, + arrow_writer: Mutex>>, + num_rows_per_row_group: usize, + compression: Compression, + meta_data: SstMetaData, + total_row_num: Arc, + arrow_record_batch_vec: Vec, + // Whether the underlying `record_stream` is finished + stream_finished: bool, + + fetched_row_num: usize, +} + +/// Build the write properties containing the sst meta data. +fn build_write_properties( + num_rows_per_row_group: usize, + compression: Compression, + meta_data: &SstMetaData, +) -> Result { + let meta_data_kv = encoding::encode_sst_meta_data(meta_data.clone()) + .map_err(|e| Box::new(e) as _) + .context(EncodeMetaData)?; + + Ok(WriterProperties::builder() + .set_key_value_metadata(Some(vec![meta_data_kv])) + .set_max_row_group_size(num_rows_per_row_group) + .set_compression(compression) + .build()) +} + +/// Encode the record batch with [ArrowWriter] and the encoded contents is +/// written to the [EncodingBuffer]. 
+// TODO(xikai): too many parameters +fn encode_record_batch( + arrow_writer: &mut Option>, + num_rows_per_row_group: usize, + compression: Compression, + meta_data: &SstMetaData, + mem_buf_writer: EncodingBuffer, + arrow_record_batch_vec: Vec, +) -> Result { + if arrow_record_batch_vec.is_empty() { + return Ok(0); + } + + let arrow_schema = arrow_record_batch_vec[0].schema(); + + // create arrow writer if not exist + if arrow_writer.is_none() { + let write_props = build_write_properties(num_rows_per_row_group, compression, meta_data)?; + let writer = ArrowWriter::try_new(mem_buf_writer, arrow_schema.clone(), Some(write_props)) + .map_err(|e| Box::new(e) as _) + .context(EncodeRecordBatch)?; + *arrow_writer = Some(writer); + } + + let record_batch = ArrowRecordBatch::concat(&arrow_schema, &arrow_record_batch_vec) + .map_err(|e| Box::new(e) as _) + .context(EncodeRecordBatch)?; + + arrow_writer + .as_mut() + .unwrap() + .write(&record_batch) + .map_err(|e| Box::new(e) as _) + .context(EncodeRecordBatch)?; + + Ok(record_batch.num_rows()) +} + +fn close_writer(arrow_writer: &mut Option>) -> Result<()> { + if let Some(arrow_writer) = arrow_writer { + arrow_writer + .close() + .map_err(|e| Box::new(e) as _) + .context(EncodeRecordBatch)?; + } + + Ok(()) +} + +impl AsyncRead for RecordBytesReader { + fn poll_read( + self: Pin<&mut Self>, + cx: &mut Context<'_>, + buf: &mut [u8], + ) -> Poll> { + let mut reader = self.get_mut(); + let size = reader.encoding_buffer.read(buf); + if size > 0 { + return Poll::Ready(Ok(size)); + } + + // The stream is also finished + if reader.stream_finished { + return Poll::Ready(Ok(0)); + } + + // FIXME(xikai): no data may cause empty sst file. + // fetch more rows from the stream. + while reader.fetched_row_num < reader.num_rows_per_row_group { + match Pin::new(reader.record_stream.as_mut()).poll_next(cx) { + Poll::Pending => return Poll::Pending, + Poll::Ready(v) => match v { + Some(record_batch) => match record_batch.context(PollRecordBatch) { + Ok(record_batch) => { + assert!( + !record_batch.is_empty(), + "found empty record batch, request id:{}", + reader.request_id + ); + + reader.fetched_row_num += record_batch.num_rows(); + reader + .arrow_record_batch_vec + .push(record_batch.into_record_batch().into_arrow_record_batch()); + } + Err(e) => { + return Poll::Ready(Err(std::io::Error::new( + std::io::ErrorKind::Other, + e, + ))) + } + }, + None => { + reader.stream_finished = true; + debug!( + "Record stream finished, request_id:{}, batch_len:{}, fetched_row_num:{}, num_rows_per_row_group:{}", + reader.request_id, + reader.arrow_record_batch_vec.len(), + reader.fetched_row_num, + reader.num_rows_per_row_group, + ); + break; + } + }, + } + } + + assert!(reader.stream_finished || reader.fetched_row_num >= reader.num_rows_per_row_group); + + // Reset fetched row num. 
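// (Editorial note: reaching this point means either a full row group worth of rows has
// been buffered or the input stream ended, so the accumulated Arrow batches are
// concatenated and encoded below before the row counter restarts.)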
+ reader.fetched_row_num = 0; + match encode_record_batch( + reader.arrow_writer.get_mut().unwrap(), + reader.num_rows_per_row_group, + reader.compression, + &reader.meta_data, + reader.encoding_buffer.clone(), + std::mem::take(&mut reader.arrow_record_batch_vec), + ) { + Err(e) => return Poll::Ready(Err(std::io::Error::new(std::io::ErrorKind::Other, e))), + Ok(row_num) => { + reader.total_row_num.fetch_add(row_num, Ordering::Relaxed); + } + } + + if reader.stream_finished { + if let Err(e) = close_writer(reader.arrow_writer.get_mut().unwrap()) { + return Poll::Ready(Err(std::io::Error::new(std::io::ErrorKind::Other, e))); + } + } + + Poll::Ready(Ok(reader.encoding_buffer.read(buf))) + } +} + +#[async_trait] +impl<'a, S: ObjectStore> SstBuilder for ParquetSstBuilder<'a, S> { + async fn build( + &mut self, + request_id: RequestId, + meta: &SstMetaData, + record_stream: RecordBatchStream, + ) -> Result { + debug!( + "Build parquet file, request_id:{}, meta:{:?}, num_rows_per_row_group:{}", + request_id, meta, self.num_rows_per_row_group + ); + + let total_row_num = Arc::new(AtomicUsize::new(0)); + let reader = RecordBytesReader { + request_id, + record_stream, + encoding_buffer: EncodingBuffer::default(), + arrow_writer: Mutex::new(None), + num_rows_per_row_group: self.num_rows_per_row_group, + compression: self.compression, + total_row_num: total_row_num.clone(), + arrow_record_batch_vec: Vec::new(), + // TODO(xikai): should we avoid this clone? + meta_data: meta.to_owned(), + stream_finished: false, + fetched_row_num: 0, + }; + + self.storage + .put(self.path, reader, None) + .await + .map_err(|e| Box::new(e) as _) + .context(Persist { + path: self.path.display(), + })?; + + let result = self + .storage + .list_with_delimiter(self.path) + .await + .map_err(|e| Box::new(e) as _) + .context(Persist { + path: self.path.display(), + })?; + + ensure!( + result.objects.len() == 1, + GetFileSize { + path: self.path.display(), + } + ); + + Ok(SstInfo { + file_size: result.objects[0].size, + row_num: total_row_num.load(Ordering::Relaxed), + }) + } +} + +#[cfg(test)] +mod tests { + + use common_types::{ + bytes::Bytes, + projected_schema::ProjectedSchema, + tests::{build_row, build_schema}, + time::{TimeRange, Timestamp}, + }; + use common_util::runtime::{self, Runtime}; + use futures::stream; + use object_store::disk::File; + use table_engine::predicate::Predicate; + use tempfile::tempdir; + + use super::*; + use crate::{ + row_iter::tests::build_record_batch_with_key, + sst::{ + factory::{Factory, FactoryImpl, SstBuilderOptions, SstReaderOptions, SstType}, + parquet::reader::ParquetSstReader, + reader::{tests::check_stream, SstReader}, + }, + table_options, + }; + + // TODO(xikai): add test for reverse reader + + #[test] + fn test_parquet_build_and_read() { + let runtime = Arc::new(runtime::Builder::default().build().unwrap()); + parquet_write_and_then_read_back(runtime.clone(), 3, vec![3, 3, 3, 3, 3]); + // TODO: num_rows should be [4, 4, 4, 3]? + parquet_write_and_then_read_back(runtime.clone(), 4, vec![4, 2, 4, 2, 3]); + // TODO: num_rows should be [5, 5, 5]? 
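// (Editorial note: the stream built in the helper below yields five 3-row batches with
// Pending polls in between, so for group sizes of 4 and 5 the reader accumulates two
// batches, i.e. 6 rows, before each flush; ArrowWriter then splits every 6-row flush by
// max_row_group_size, which appears to explain the 4+2 and 5+1 patterns instead of
// evenly packed groups.)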
+ parquet_write_and_then_read_back(runtime, 5, vec![5, 1, 5, 1, 3]); + } + + fn parquet_write_and_then_read_back( + runtime: Arc, + num_rows_per_row_group: usize, + expected_num_rows: Vec, + ) { + runtime.block_on(async { + let sst_factory = FactoryImpl; + let sst_builder_options = SstBuilderOptions { + sst_type: SstType::Parquet, + num_rows_per_row_group, + compression: table_options::Compression::Uncompressed, + }; + + let dir = tempdir().unwrap(); + let root = dir.path(); + let store = File::new(root); + let mut sst_file_path = store.new_path(); + sst_file_path.set_file_name("data.par"); + + let schema = build_schema(); + let projected_schema = ProjectedSchema::no_projection(schema.clone()); + let sst_meta = SstMetaData { + min_key: Bytes::from_static(b"100"), + max_key: Bytes::from_static(b"200"), + time_range: TimeRange::new_unchecked(Timestamp::new(1), Timestamp::new(2)), + max_sequence: 200, + schema: schema.clone(), + size: 10, + row_num: 2, + }; + + let mut counter = 10; + let record_batch_stream = Box::new(stream::poll_fn(move |ctx| -> Poll> { + counter -= 1; + if counter == 0 { + return Poll::Ready(None); + } else if counter % 2 == 0 { + ctx.waker().wake_by_ref(); + return Poll::Pending; + } + + // reach here when counter is 9 7 5 3 1 + let ts = 100 + counter; + let rows = vec![ + build_row(b"a", ts, 10.0, "v4"), + build_row(b"b", ts, 10.0, "v4"), + build_row(b"c", ts, 10.0, "v4"), + ]; + let batch = build_record_batch_with_key(schema.clone(), rows); + Poll::Ready(Some(Ok(batch))) + })); + + let mut builder = sst_factory + .new_sst_builder(&sst_builder_options, &sst_file_path, &store) + .unwrap(); + let sst_info = builder + .build(RequestId::next_id(), &sst_meta, record_batch_stream) + .await + .unwrap(); + + assert_eq!(15, sst_info.row_num); + + // read sst back to test + let sst_reader_options = SstReaderOptions { + sst_type: SstType::Parquet, + read_batch_row_num: 5, + reverse: false, + projected_schema, + predicate: Arc::new(Predicate::new(TimeRange::min_to_max())), + meta_cache: None, + data_cache: None, + runtime: runtime.clone(), + }; + + let mut reader = ParquetSstReader::new(&sst_file_path, &store, &sst_reader_options); + assert_eq!(reader.meta_data().await.unwrap(), &sst_meta); + assert_eq!( + expected_num_rows, + reader + .row_groups() + .await + .iter() + .map(|g| g.num_rows()) + .collect::>() + ); + + let mut stream = reader.read().await.unwrap(); + let mut expect_rows = vec![]; + for counter in &[9, 7, 5, 3, 1] { + expect_rows.push(build_row(b"a", 100 + counter, 10.0, "v4")); + expect_rows.push(build_row(b"b", 100 + counter, 10.0, "v4")); + expect_rows.push(build_row(b"c", 100 + counter, 10.0, "v4")); + } + check_stream(&mut stream, expect_rows).await; + }); + } +} diff --git a/analytic_engine/src/sst/parquet/encoding.rs b/analytic_engine/src/sst/parquet/encoding.rs new file mode 100644 index 0000000000..ddb916b14d --- /dev/null +++ b/analytic_engine/src/sst/parquet/encoding.rs @@ -0,0 +1,152 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +use std::convert::TryFrom; + +use arrow_deps::parquet::file::metadata::KeyValue; +use common_types::bytes::{BytesMut, MemBufMut, Writer}; +use common_util::define_result; +use proto::sst::SstMetaData as SstMetaDataPb; +use protobuf::Message; +use snafu::{ensure, Backtrace, OptionExt, ResultExt, Snafu}; + +use crate::sst::file::SstMetaData; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "Failed to encode sst meta data, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + EncodeIntoPb { + source: protobuf::ProtobufError, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to decode sst meta data, base64 of meta value:{}, err:{}.\nBacktrace:\n{}", + meta_value, + source, + backtrace, + ))] + DecodeFromPb { + meta_value: String, + source: protobuf::ProtobufError, + backtrace: Backtrace, + }, + + #[snafu(display( + "Invalid meta key, expect:{}, given:{}.\nBacktrace:\n{}", + expect, + given, + backtrace + ))] + InvalidMetaKey { + expect: String, + given: String, + backtrace: Backtrace, + }, + + #[snafu(display("Base64 meta value not found.\nBacktrace:\n{}", backtrace))] + Base64MetaValueNotFound { backtrace: Backtrace }, + + #[snafu(display( + "Invalid base64 meta value length, base64 of meta value:{}.\nBacktrace:\n{}", + meta_value, + backtrace, + ))] + InvalidBase64MetaValueLen { + meta_value: String, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to decode base64 meta value, base64 of meta value:{}, err:{}", + meta_value, + source + ))] + DecodeBase64MetaValue { + meta_value: String, + source: base64::DecodeError, + }, + + #[snafu(display( + "Invalid meta value length, base64 of meta value:{}.\nBacktrace:\n{}", + meta_value, + backtrace + ))] + InvalidMetaValueLen { + meta_value: String, + backtrace: Backtrace, + }, + + #[snafu(display( + "Invalid meta value header, base64 of meta value:{}.\nBacktrace:\n{}", + meta_value, + backtrace + ))] + InvalidMetaValueHeader { + meta_value: String, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to convert sst meta data from protobuf, err:{}", source))] + ConvertSstMetaData { source: crate::sst::file::Error }, +} + +define_result!(Error); + +pub const META_KEY: &str = "meta"; +pub const META_VALUE_HEADER: u8 = 0; + +/// Encode the sst meta data into binary key value pair. +pub fn encode_sst_meta_data(meta_data: SstMetaData) -> Result { + let meta_data_pb = SstMetaDataPb::from(meta_data); + + let mut buf = BytesMut::with_capacity(meta_data_pb.compute_size() as usize + 1); + buf.write_u8(META_VALUE_HEADER) + .expect("Should write header into the buffer successfully"); + + // encode the sst meta data into protobuf binary + { + let mut writer = Writer::new(&mut buf); + meta_data_pb + .write_to_writer(&mut writer) + .context(EncodeIntoPb)?; + } + Ok(KeyValue { + key: META_KEY.to_string(), + value: Some(base64::encode(buf.as_ref())), + }) +} + +/// Decode the sst meta data from the binary key value pair. 
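+///
+/// A round trip with `encode_sst_meta_data` is expected to give back the
+/// original value; a minimal sketch, assuming an `SstMetaData` value
+/// `meta_data` is in scope (not compiled as a doc-test):
+///
+/// ```ignore
+/// let kv = encode_sst_meta_data(meta_data.clone())?;
+/// assert_eq!(decode_sst_meta_data(&kv)?, meta_data);
+/// ```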
+pub fn decode_sst_meta_data(kv: &KeyValue) -> Result { + ensure!( + kv.key == META_KEY, + InvalidMetaKey { + expect: META_KEY, + given: &kv.key, + } + ); + + let meta_value = kv.value.as_ref().context(Base64MetaValueNotFound)?; + ensure!( + !meta_value.is_empty(), + InvalidBase64MetaValueLen { meta_value } + ); + + let raw_bytes = base64::decode(meta_value).context(DecodeBase64MetaValue { meta_value })?; + + ensure!(!raw_bytes.is_empty(), InvalidMetaValueLen { meta_value }); + + ensure!( + raw_bytes[0] == META_VALUE_HEADER, + InvalidMetaValueHeader { meta_value } + ); + + let meta_data_pb: SstMetaDataPb = + Message::parse_from_bytes(&raw_bytes[1..]).context(DecodeFromPb { meta_value })?; + + SstMetaData::try_from(meta_data_pb).context(ConvertSstMetaData) +} diff --git a/analytic_engine/src/sst/parquet/mod.rs b/analytic_engine/src/sst/parquet/mod.rs new file mode 100644 index 0000000000..aaf82e4671 --- /dev/null +++ b/analytic_engine/src/sst/parquet/mod.rs @@ -0,0 +1,7 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Sst implementation based on parquet. + +pub mod builder; +pub mod encoding; +pub mod reader; diff --git a/analytic_engine/src/sst/parquet/reader.rs b/analytic_engine/src/sst/parquet/reader.rs new file mode 100644 index 0000000000..f515855ff7 --- /dev/null +++ b/analytic_engine/src/sst/parquet/reader.rs @@ -0,0 +1,371 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Sst reader implementation based on parquet. + +use std::{ + fs::File, + pin::Pin, + sync::Arc, + task::{Context, Poll}, + time::Instant, +}; + +use arrow_deps::{ + arrow::{error::Result as ArrowResult, record_batch::RecordBatch}, + parquet::{ + arrow::{ArrowReader, ParquetFileArrowReader}, + file::{metadata::RowGroupMetaData, reader::FileReader}, + }, +}; +use async_trait::async_trait; +use common_types::{ + projected_schema::{ProjectedSchema, RowProjector}, + record_batch::{ArrowRecordBatchProjector, RecordBatchWithKey}, + schema::Schema, +}; +use common_util::runtime::Runtime; +use futures::Stream; +use log::{debug, error, trace}; +use object_store::{path::ObjectStorePath, ObjectStore}; +use parquet::{ + reverse_reader::Builder as ReverseRecordBatchReaderBuilder, CachableSerializedFileReader, + DataCacheRef, MetaCacheRef, +}; +use snafu::{ensure, OptionExt, ResultExt}; +use table_engine::predicate::PredicateRef; +use tokio::sync::mpsc::{self, Receiver, Sender}; + +use crate::sst::{ + factory::SstReaderOptions, + file::SstMetaData, + parquet::encoding, + reader::{error::*, SstReader}, +}; + +const DEFAULT_CHANNEL_CAP: usize = 1000; + +pub async fn read_sst_meta( + storage: &S, + path: &S::Path, + meta_cache: &Option, + data_cache: &Option, +) -> Result<(CachableSerializedFileReader, SstMetaData)> { + let file = storage + .get(path) + .await + .map_err(|e| Box::new(e) as _) + .with_context(|| ReadPersist { + path: path.display(), + })?; + + // generate the file reader + let file_reader = CachableSerializedFileReader::new( + path.display(), + file, + meta_cache.clone(), + data_cache.clone(), + ) + .map_err(|e| Box::new(e) as _) + .with_context(|| ReadPersist { + path: path.display(), + })?; + + // parse sst meta data + let sst_meta = { + let kv_metas = file_reader + .metadata() + .file_metadata() + .key_value_metadata() + .as_ref() + .context(SstMetaNotFound)?; + + ensure!(!kv_metas.is_empty(), EmptySstMeta); + + encoding::decode_sst_meta_data(&kv_metas[0]) + .map_err(|e| Box::new(e) as _) + .context(DecodeSstMeta)? 
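+        // Note: only the first key-value entry is decoded above; this matches
+        // the single `META_KEY` entry written by `encoding::encode_sst_meta_data`.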
+ }; + + Ok((file_reader, sst_meta)) +} + +/// The implementation of sst based on parquet and object storage. +pub struct ParquetSstReader<'a, S: ObjectStore> { + /// The path where the data is persisted. + path: &'a S::Path, + /// The storage where the data is persist. + storage: &'a S, + projected_schema: ProjectedSchema, + predicate: PredicateRef, + meta_data: Option, + file_reader: Option>, + /// The batch of rows in one `record_batch`. + batch_size: usize, + /// Read the rows in reverse order. + reverse: bool, + channel_cap: usize, + + meta_cache: Option, + data_cache: Option, + + runtime: Arc, +} + +impl<'a, S: ObjectStore> ParquetSstReader<'a, S> { + pub fn new(path: &'a S::Path, storage: &'a S, options: &SstReaderOptions) -> Self { + Self { + path, + storage, + projected_schema: options.projected_schema.clone(), + predicate: options.predicate.clone(), + meta_data: None, + file_reader: None, + batch_size: options.read_batch_row_num, + reverse: options.reverse, + channel_cap: DEFAULT_CHANNEL_CAP, + meta_cache: options.meta_cache.clone(), + data_cache: options.data_cache.clone(), + runtime: options.runtime.clone(), + } + } +} + +impl<'a, S: ObjectStore> ParquetSstReader<'a, S> { + async fn init_if_necessary(&mut self) -> Result<()> { + if self.meta_data.is_some() { + return Ok(()); + } + + let (file_reader, sst_meta) = + read_sst_meta(self.storage, self.path, &self.meta_cache, &self.data_cache).await?; + + self.file_reader = Some(file_reader); + self.meta_data = Some(sst_meta); + + Ok(()) + } + + fn read_record_batches(&mut self, tx: Sender>) -> Result<()> { + let path = self.path.display(); + ensure!(self.file_reader.is_some(), ReadAgain { path }); + + let file_reader = self.file_reader.take().unwrap(); + let batch_size = self.batch_size; + let schema = { + let meta_data = self.meta_data.as_ref().unwrap(); + meta_data.schema.clone() + }; + let projected_schema = self.projected_schema.clone(); + let row_projector = projected_schema + .try_project_with_key(&schema) + .map_err(|e| Box::new(e) as _) + .context(Projection)?; + let predicate = self.predicate.clone(); + let reverse = self.reverse; + + let _ = self.runtime.spawn_blocking(move || { + debug!( + "begin reading record batch from the sst:{}, predicate:{:?}, projection:{:?}", + path, predicate, projected_schema, + ); + + let mut send_failed = false; + let send = |v| -> Result<()> { + tx.blocking_send(v) + .map_err(|e| { + send_failed = true; + Box::new(e) as _ + }) + .context(Other)?; + Ok(()) + }; + + let reader = ProjectAndFilterReader { + file_path: path.clone(), + file_reader: Some(file_reader), + schema, + projected_schema, + row_projector, + predicate, + batch_size, + reverse, + }; + + let start_fetch = Instant::now(); + match reader.fetch_and_send_record_batch(send) { + Ok(row_num) => { + debug!( + "finish reading record batch({} rows) from the sst:{}, time cost:{:?}", + row_num, + path, + start_fetch.elapsed(), + ); + } + Err(e) => { + if send_failed { + error!("fail to send the fetched record batch result, err:{}", e); + } else { + error!( + "failed to read record batch from the sst:{}, err:{}", + path, e + ); + let _ = tx.blocking_send(Err(e)); + } + } + } + }); + + Ok(()) + } + + #[cfg(test)] + pub(crate) async fn row_groups(&mut self) -> &[RowGroupMetaData] { + self.init_if_necessary().await.unwrap(); + self.file_reader.as_ref().unwrap().metadata().row_groups() + } +} + +/// A reader for projection and filter on the parquet file. 
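+///
+/// Row groups rejected by the predicate are dropped first, then the column
+/// projection is applied, and the record batches are iterated either in the
+/// natural order or through the reverse reader when `reverse` is set.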
+struct ProjectAndFilterReader { + file_path: String, + file_reader: Option>, + schema: Schema, + projected_schema: ProjectedSchema, + row_projector: RowProjector, + predicate: PredicateRef, + batch_size: usize, + reverse: bool, +} + +impl ProjectAndFilterReader { + fn build_row_group_predicate(&self) -> Box bool + 'static> { + assert!(self.file_reader.is_some()); + + let row_groups = self.file_reader.as_ref().unwrap().metadata().row_groups(); + let filter_results = self.predicate.filter_row_groups(&self.schema, row_groups); + + trace!("Finish build row group predicate, predicate:{:?}, schema:{:?}, filter_results:{:?}, row_groups meta data:{:?}", self.predicate, self.schema, filter_results, row_groups); + + Box::new(move |_, idx: usize| filter_results[idx]) + } + + /// Generate the reader which has processed projection and filter. + /// This `file_reader` is consumed after calling this method. + fn project_and_filter_reader( + &mut self, + ) -> Result>>> { + assert!(self.file_reader.is_some()); + + let row_group_predicate = self.build_row_group_predicate(); + let mut file_reader = self.file_reader.take().unwrap(); + file_reader.filter_row_groups(&row_group_predicate); + + if self.reverse { + let mut builder = + ReverseRecordBatchReaderBuilder::new(Arc::new(file_reader), self.batch_size); + if !self.projected_schema.is_all_projection() { + builder = builder.projection(Some(self.row_projector.existed_source_projection())); + } + + let reverse_reader = builder + .build() + .map_err(|e| Box::new(e) as _) + .context(DecodeRecordBatch)?; + + Ok(Box::new(reverse_reader)) + } else { + let mut arrow_reader = ParquetFileArrowReader::new(Arc::new(file_reader)); + + let reader = if self.projected_schema.is_all_projection() { + arrow_reader.get_record_reader(self.batch_size) + } else { + let projection = self.row_projector.existed_source_projection(); + arrow_reader.get_record_reader_by_columns(projection, self.batch_size) + }; + let reader = reader + .map_err(|e| Box::new(e) as _) + .context(DecodeRecordBatch)?; + + Ok(Box::new(reader)) + } + } + + /// Fetch the record batch from the `reader` and send them. + /// Returns the fetched row number. 
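+    ///
+    /// Decode errors are forwarded through `send` and iteration stops after the
+    /// first failed batch.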
+ fn fetch_and_send_record_batch( + mut self, + mut send: impl FnMut(Result) -> Result<()>, + ) -> Result { + let reader = self.project_and_filter_reader()?; + + let arrow_record_batch_projector = ArrowRecordBatchProjector::from(self.row_projector); + let mut row_num = 0; + for record_batch in reader { + trace!( + "Fetch one record batch from sst:{}, num_rows:{:?}", + self.file_path, + record_batch.as_ref().map(|v| v.num_rows()) + ); + + match record_batch + .map_err(|e| Box::new(e) as _) + .context(DecodeRecordBatch) + { + Ok(record_batch) => { + row_num += record_batch.num_rows(); + + let record_batch_with_key = arrow_record_batch_projector + .project_to_record_batch_with_key(record_batch) + .map_err(|e| Box::new(e) as _) + .context(DecodeRecordBatch); + + send(record_batch_with_key)?; + } + Err(e) => { + send(Err(e))?; + break; + } + }; + } + + Ok(row_num) + } +} + +struct RecordBatchReceiver { + rx: Receiver>, +} + +impl Stream for RecordBatchReceiver { + type Item = Result; + + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + self.as_mut().rx.poll_recv(cx) + } +} + +#[async_trait] +impl<'a, S: ObjectStore> SstReader for ParquetSstReader<'a, S> { + async fn meta_data(&mut self) -> Result<&SstMetaData> { + self.init_if_necessary().await?; + Ok(self.meta_data.as_ref().unwrap()) + } + + // TODO(yingwen): Project the schema in parquet + async fn read( + &mut self, + ) -> Result> + Send + Unpin>> { + debug!( + "read sst:{}, projected_schema:{:?}, predicate:{:?}", + self.path.display(), + self.projected_schema, + self.predicate + ); + + self.init_if_necessary().await?; + let (tx, rx) = mpsc::channel::>(self.channel_cap); + self.read_record_batches(tx)?; + + Ok(Box::new(RecordBatchReceiver { rx })) + } +} diff --git a/analytic_engine/src/sst/reader.rs b/analytic_engine/src/sst/reader.rs new file mode 100644 index 0000000000..ab76c9a044 --- /dev/null +++ b/analytic_engine/src/sst/reader.rs @@ -0,0 +1,90 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Sst reader trait definition. 
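+//!
+//! A typical consumer first fetches the meta data and then drains the record
+//! batch stream; a minimal sketch, assuming an `SstReader` implementation
+//! `reader` and a `process` callback are in scope:
+//!
+//! ```ignore
+//! use futures::StreamExt;
+//!
+//! let meta = reader.meta_data().await?;
+//! let mut stream = reader.read().await?;
+//! while let Some(batch) = stream.next().await {
+//!     process(batch?);
+//! }
+//! ```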
+ +use async_trait::async_trait; +use common_types::record_batch::RecordBatchWithKey; +use futures::Stream; + +use crate::sst::file::SstMetaData; + +pub mod error { + use common_util::define_result; + use snafu::{Backtrace, Snafu}; + + #[derive(Debug, Snafu)] + #[snafu(visibility(pub))] + pub enum Error { + #[snafu(display("Try to read again, path:{}.\nBacktrace:\n{}", path, backtrace))] + ReadAgain { backtrace: Backtrace, path: String }, + + #[snafu(display("Fail to read persisted file, path:{}, err:{}", path, source))] + ReadPersist { + path: String, + source: Box, + }, + + #[snafu(display("Failed to decode record batch, err:{}", source))] + DecodeRecordBatch { + source: Box, + }, + + #[snafu(display("Failed to decode sst meta data, err:{}", source))] + DecodeSstMeta { + source: Box, + }, + + #[snafu(display("Sst meta data is not found.\nBacktrace:\n{}", backtrace))] + SstMetaNotFound { backtrace: Backtrace }, + + #[snafu(display("Fail to projection, err:{}", source))] + Projection { + source: Box, + }, + + #[snafu(display("Sst meta data is empty.\nBacktrace:\n{}", backtrace))] + EmptySstMeta { backtrace: Backtrace }, + + #[snafu(display("Other kind of error:{}", source))] + Other { + source: Box, + }, + } + + define_result!(Error); +} + +pub use error::*; + +#[async_trait] +pub trait SstReader { + async fn meta_data(&mut self) -> Result<&SstMetaData>; + + async fn read( + &mut self, + ) -> Result> + Send + Unpin>>; +} + +#[cfg(test)] +pub mod tests { + use common_types::row::Row; + use futures::StreamExt; + + use super::*; + + pub async fn check_stream(stream: &mut S, expected_rows: Vec) + where + S: Stream> + Unpin, + { + let mut visited_rows = 0; + while let Some(batch) = stream.next().await { + let batch = batch.unwrap(); + for row_idx in 0..batch.num_rows() { + assert_eq!(batch.clone_row_at(row_idx), expected_rows[visited_rows]); + visited_rows += 1; + } + } + + assert_eq!(visited_rows, expected_rows.len()); + } +} diff --git a/analytic_engine/src/table/data.rs b/analytic_engine/src/table/data.rs new file mode 100644 index 0000000000..88dde35166 --- /dev/null +++ b/analytic_engine/src/table/data.rs @@ -0,0 +1,713 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Table data + +use std::{ + collections::HashMap, + convert::TryInto, + sync::{ + atomic::{AtomicBool, AtomicU32, AtomicU64, Ordering}, + Arc, Mutex, + }, + time::Duration, +}; + +use arc_swap::ArcSwap; +use arena::CollectorRef; +use common_types::{ + schema::{Schema, Version}, + time::{TimeRange, Timestamp}, + SequenceNumber, +}; +use common_util::define_result; +use log::{debug, info}; +use object_store::path::ObjectStorePath; +use snafu::{Backtrace, OptionExt, ResultExt, Snafu}; +use table_engine::{engine::CreateTableRequest, table::TableId}; +use wal::manager::RegionId; + +use crate::{ + instance::write_worker::{WorkerLocal, WriteHandle}, + memtable::{ + factory::{FactoryRef as MemTableFactoryRef, Options as MemTableOptions}, + skiplist::factory::SkiplistMemTableFactory, + }, + meta::meta_update::AddTableMeta, + space::SpaceId, + sst::{factory::SstType, file::FilePurger, manager::FileId}, + table::{ + metrics::Metrics, + sst_util, + version::{MemTableForWrite, MemTableState, SamplingMemTable, TableVersion}, + }, + TableOptions, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to create memtable, err:{}", source))] + CreateMemTable { + source: crate::memtable::factory::Error, + }, + + #[snafu(display( + "Failed to find or create memtable, timestamp overflow, timestamp:{:?}, duration:{:?}.\nBacktrace:\n{}", + timestamp, + duration, + backtrace, + ))] + TimestampOverflow { + timestamp: Timestamp, + duration: Duration, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to find memtable for write, err:{}", source))] + FindMemTable { + source: crate::table::version::Error, + }, +} + +define_result!(Error); + +pub type MemTableId = u64; + +/// Data of a table +pub struct TableData { + /// Id of this table + pub id: TableId, + /// Name of this table + pub name: String, + /// Schema of this table + schema: Mutex, + /// Space id of this table + pub space_id: SpaceId, + /// The sst type of this table + pub sst_type: SstType, + + /// Mutable memtable memory size limitation + mutable_limit: AtomicU32, + /// Options of this table. + /// + /// Most modification to `opts` can be done by replacing the old options + /// with a new one. However, altering the segment duration should be done + /// carefully to avoid the reader seeing inconsistent segment duration + /// and memtables/ssts during query/compaction/flush . + opts: ArcSwap, + /// MemTable factory of this table + memtable_factory: MemTableFactoryRef, + /// Space memtable memory usage collector + mem_usage_collector: CollectorRef, + + /// Current table version + current_version: TableVersion, + /// Last sequence visible to the reads + /// + /// Write to last_sequence should be guarded by a mutex and only done by + /// single writer, but reads are allowed to be done concurrently without + /// mutex protected + last_sequence: AtomicU64, + /// Handle to the write worker + pub write_handle: WriteHandle, + /// Auto incremented id to track memtable, reset on engine open + /// + /// Allocating memtable id should be guarded by write lock + last_memtable_id: AtomicU64, + + /// Last id of the sst file + /// + /// Write to last_file_id require external synchronization + last_file_id: AtomicU64, + + /// Flag denoting whether the table is dropped + /// + /// No write/alter is allowed if the table is dropped. + dropped: AtomicBool, + + /// Metrics of this table. 
+ pub metrics: Metrics, +} + +impl Drop for TableData { + fn drop(&mut self) { + debug!("TableData is dropped, id:{}, name:{}", self.id, self.name); + } +} + +#[inline] +fn get_mutable_limit(opts: &TableOptions) -> u32 { + opts.write_buffer_size * 7 / 8 +} + +impl TableData { + /// Create a new TableData + /// + /// This function should only be called when a new table is creating and + /// there is no existing data of the table + pub fn new( + space_id: SpaceId, + request: CreateTableRequest, + write_handle: WriteHandle, + table_opts: TableOptions, + purger: &FilePurger, + mem_usage_collector: CollectorRef, + ) -> Result { + // FIXME(yingwen): Validate TableOptions, such as bucket_duration >= + // segment_duration and bucket_duration is aligned to segment_duration + + let memtable_factory = Arc::new(SkiplistMemTableFactory); + let purge_queue = purger.create_purge_queue(space_id, request.table_id); + let current_version = TableVersion::new(purge_queue); + let metrics = Metrics::new(&request.table_name); + + Ok(Self { + id: request.table_id, + name: request.table_name, + schema: Mutex::new(request.table_schema), + space_id, + // TODO(xikai): sst type should be decided by the `request`. + sst_type: SstType::Parquet, + mutable_limit: AtomicU32::new(get_mutable_limit(&table_opts)), + opts: ArcSwap::new(Arc::new(table_opts)), + memtable_factory, + mem_usage_collector, + current_version, + last_sequence: AtomicU64::new(0), + write_handle, + last_memtable_id: AtomicU64::new(0), + last_file_id: AtomicU64::new(0), + dropped: AtomicBool::new(false), + metrics, + }) + } + + /// Recover table from add table meta + /// + /// This wont recover sequence number, which will be set after wal replayed + pub fn recover_from_add( + add_meta: AddTableMeta, + write_handle: WriteHandle, + purger: &FilePurger, + mem_usage_collector: CollectorRef, + ) -> Result { + let memtable_factory = Arc::new(SkiplistMemTableFactory); + let purge_queue = purger.create_purge_queue(add_meta.space_id, add_meta.table_id); + let current_version = TableVersion::new(purge_queue); + let metrics = Metrics::new(&add_meta.table_name); + + Ok(Self { + id: add_meta.table_id, + name: add_meta.table_name, + schema: Mutex::new(add_meta.schema), + space_id: add_meta.space_id, + // TODO(xikai): it should be recovered from `add_meta` struct. + sst_type: SstType::Parquet, + mutable_limit: AtomicU32::new(get_mutable_limit(&add_meta.opts)), + opts: ArcSwap::new(Arc::new(add_meta.opts)), + memtable_factory, + mem_usage_collector, + current_version, + last_sequence: AtomicU64::new(0), + write_handle, + last_memtable_id: AtomicU64::new(0), + last_file_id: AtomicU64::new(0), + dropped: AtomicBool::new(false), + metrics, + }) + } + + /// Get current schema of the table. + pub fn schema(&self) -> Schema { + self.schema.lock().unwrap().clone() + } + + /// Set current schema of the table. + pub fn set_schema(&self, schema: Schema) { + *self.schema.lock().unwrap() = schema; + } + + /// Get current version of schema. 
+ pub fn schema_version(&self) -> Version { + self.schema.lock().unwrap().version() + } + + /// Get current table version + #[inline] + pub fn current_version(&self) -> &TableVersion { + &self.current_version + } + + /// Get the wal region id of this table + /// + /// Now we just use table id as region id + #[inline] + pub fn wal_region_id(&self) -> RegionId { + self.id.as_u64() + } + + /// Get last sequence number + #[inline] + pub fn last_sequence(&self) -> SequenceNumber { + self.last_sequence.load(Ordering::Acquire) + } + + /// Set last sequence number + #[inline] + pub fn set_last_sequence(&self, seq: SequenceNumber) { + self.last_sequence.store(seq, Ordering::Release); + } + + #[inline] + pub fn table_options(&self) -> Arc { + self.opts.load().clone() + } + + /// Update table options. + /// + /// REQUIRE: The write lock is held. + #[inline] + pub fn set_table_options(&self, _write_lock: &WorkerLocal, opts: TableOptions) { + self.mutable_limit + .store(get_mutable_limit(&opts), Ordering::Relaxed); + self.opts.store(Arc::new(opts)) + } + + #[inline] + pub fn is_dropped(&self) -> bool { + self.dropped.load(Ordering::SeqCst) + } + + /// Set the table is dropped and forbid any writes/alter on this table. + #[inline] + pub fn set_dropped(&self) { + self.dropped.store(true, Ordering::SeqCst); + } + + /// Returns total memtable memory usage in bytes. + #[inline] + pub fn memtable_memory_usage(&self) -> usize { + self.current_version.total_memory_usage() + } + + /// Find memtable for given timestamp to insert, create if not exists + /// + /// If the memtable schema is outdated, switch all memtables and create the + /// needed mutable memtable by current schema. The returned memtable is + /// guaranteed to have same schema of current table + /// + /// REQUIRE: The write lock is held + pub fn find_or_create_mutable( + &self, + write_lock: &WorkerLocal, + timestamp: Timestamp, + table_schema: &Schema, + ) -> Result { + let schema_version = table_schema.version(); + let last_sequence = self.last_sequence(); + + if let Some(mem) = self + .current_version + .memtable_for_write(write_lock, timestamp, schema_version) + .context(FindMemTable)? + { + return Ok(mem); + } + + // Mutable memtable for this timestamp not found, need to create a new one. + let table_options = self.table_options(); + let memtable_opts = MemTableOptions { + schema: table_schema.clone(), + arena_block_size: table_options.arena_block_size, + creation_sequence: last_sequence, + collector: self.mem_usage_collector.clone(), + }; + let mem = self + .memtable_factory + .create_memtable(memtable_opts) + .context(CreateMemTable)?; + + match table_options.segment_duration() { + Some(segment_duration) => { + let time_range = TimeRange::bucket_of(timestamp, segment_duration).context( + TimestampOverflow { + timestamp, + duration: segment_duration, + }, + )?; + let mem_state = MemTableState { + mem, + time_range, + id: self.alloc_memtable_id(), + }; + + // Insert memtable into mutable memtables of current version. + self.current_version.insert_mutable(mem_state.clone()); + + Ok(MemTableForWrite::Normal(mem_state)) + } + None => { + let sampling_mem = SamplingMemTable::new(mem, self.alloc_memtable_id()); + + // Set sampling memtables of current version. 
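+                // During the sampling phase every write goes to this single
+                // sampling memtable; it collects row timestamps so that a segment
+                // duration can be suggested later, and the table only switches to
+                // normal time-ranged memtables once that duration is persisted.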
+ self.current_version.set_sampling(sampling_mem.clone()); + + Ok(MemTableForWrite::Sampling(sampling_mem)) + } + } + } + + /// Returns true if the memory usage of this table reaches flush threshold + /// + /// REQUIRE: Do in write worker + pub fn should_flush_table(&self, _worker_local: &WorkerLocal) -> bool { + // Fallback to usize::MAX if Failed to convert arena_block_size into + // usize (overflow) + let max_write_buffer_size = self + .table_options() + .write_buffer_size + .try_into() + .unwrap_or(usize::MAX); + let mutable_limit = self + .mutable_limit + .load(Ordering::Relaxed) + .try_into() + .unwrap_or(usize::MAX); + + let mutable_usage = self.current_version.mutable_memory_usage(); + let total_usage = self.current_version.total_memory_usage(); + + // Inspired by https://github.com/facebook/rocksdb/blob/main/include/rocksdb/write_buffer_manager.h#L94 + if mutable_usage > mutable_limit { + info!( + "TableData should flush, table:{}, table_id:{}, mutable_usage:{}, mutable_limit: {}, total_usage:{}, max_write_buffer_size:{}", + self.name, self.id, mutable_usage, mutable_limit, total_usage, max_write_buffer_size + ); + return true; + } + + // If the memory exceeds the buffer size, we trigger more aggressive + // flush. But if already more than half memory is being flushed, + // triggering more flush may not help. We will hold it instead. + let should_flush = + total_usage >= max_write_buffer_size && mutable_usage >= max_write_buffer_size / 2; + + debug!( + "Check should flush, table:{}, table_id:{}, mutable_usage:{}, mutable_limit: {}, total_usage:{}, max_write_buffer_size:{}", + self.name, self.id, mutable_usage, mutable_limit, total_usage, max_write_buffer_size + ); + + if should_flush { + info!( + "TableData should flush, table:{}, table_id:{}, mutable_usage:{}, mutable_limit: {}, total_usage:{}, max_write_buffer_size:{}", + self.name, self.id, mutable_usage, mutable_limit, total_usage, max_write_buffer_size + ); + } + + should_flush + } + + /// Set `last_file_id`, mainly used in recover + /// + /// This operation require external synchronization + pub fn set_last_file_id(&self, last_file_id: FileId) { + self.last_file_id.store(last_file_id, Ordering::Relaxed); + } + + /// Returns the last file id + pub fn last_file_id(&self) -> FileId { + self.last_file_id.load(Ordering::Relaxed) + } + + /// Alloc a file id for a new file + pub fn alloc_file_id(&self) -> FileId { + let last = self.last_file_id.fetch_add(1, Ordering::Relaxed); + last + 1 + } + + /// Set the sst file path into the object storage path. 
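+    ///
+    /// The resulting path is `<space_id>/<table_id>/<file_id>.sst` relative to
+    /// the store root (see `sst_util::set_sst_file_path`); e.g. space 1, table 2
+    /// and file 3 would give `1/2/3.sst`.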
+ pub fn set_sst_file_path(&self, file_id: FileId, path: &mut impl ObjectStorePath) { + sst_util::set_sst_file_path(self.space_id, self.id, file_id, path) + } + + /// Allocate next memtable id + fn alloc_memtable_id(&self) -> MemTableId { + let last = self.last_memtable_id.fetch_add(1, Ordering::Relaxed); + last + 1 + } + + /// Returns last memtable id + pub fn last_memtable_id(&self) -> MemTableId { + self.last_memtable_id.load(Ordering::Relaxed) + } + + pub fn dedup(&self) -> bool { + self.table_options().need_dedup() + } + + pub fn is_expired(&self, timestamp: Timestamp) -> bool { + self.table_options().is_expired(timestamp) + } +} + +/// Table data reference +pub type TableDataRef = Arc; + +/// Manages TableDataRef +pub struct TableDataSet { + /// Name to table data + table_datas: HashMap, + /// Id to table data + id_to_tables: HashMap, +} + +impl TableDataSet { + /// Create an empty TableDataSet + pub fn new() -> Self { + Self { + table_datas: HashMap::new(), + id_to_tables: HashMap::new(), + } + } + + /// Insert if absent, if successfully inserted, return true and return + /// false if the data already exists + pub fn insert_if_absent(&mut self, table_data_ref: TableDataRef) -> bool { + let table_name = &table_data_ref.name; + if self.table_datas.contains_key(table_name) { + return false; + } + self.table_datas + .insert(table_name.to_string(), table_data_ref.clone()); + self.id_to_tables.insert(table_data_ref.id, table_data_ref); + true + } + + /// Find table by table name + pub fn find_table(&self, table_name: &str) -> Option { + self.table_datas.get(table_name).cloned() + } + + /// Find table by table id + pub fn find_table_by_id(&self, table_id: TableId) -> Option { + self.id_to_tables.get(&table_id).cloned() + } + + /// Remove table by table name + pub fn remove_table(&mut self, table_name: &str) -> Option { + let table = self.table_datas.remove(table_name)?; + self.id_to_tables.remove(&table.id); + Some(table) + } + + /// Returns the total table num in this set + pub fn table_num(&self) -> usize { + self.table_datas.len() + } + + /// Find the table which consumes maximum memtable memory usag. 
+ pub fn find_maximum_memory_usage_table(&self) -> Option { + self.table_datas + .values() + .max_by_key(|t| t.memtable_memory_usage()) + .cloned() + } + + /// List all tables to `tables` + pub fn list_all_tables(&self, tables: &mut Vec) { + for table_data in self.table_datas.values().cloned() { + tables.push(table_data); + } + } +} + +impl Default for TableDataSet { + fn default() -> Self { + Self::new() + } +} + +#[cfg(test)] +pub mod tests { + use std::sync::Arc; + + use arena::NoopCollector; + use common_types::datum::DatumKind; + use common_util::config::ReadableDuration; + use table_engine::engine::TableState; + + use super::*; + use crate::{ + instance::write_worker::tests::WriteHandleMocker, + memtable::{factory::Factory, MemTableRef}, + sst::file::tests::FilePurgerMocker, + table_options, + tests::table, + }; + + const DEFAULT_SPACE_ID: SpaceId = 1; + + fn default_schema() -> Schema { + table::create_schema_builder( + &[("key", DatumKind::Timestamp)], + &[("value", DatumKind::Double)], + ) + .build() + .unwrap() + } + + #[derive(Default)] + pub struct MemTableMocker; + + impl MemTableMocker { + pub fn build(&self) -> MemTableRef { + let memtable_opts = MemTableOptions { + schema: default_schema(), + arena_block_size: 1024 * 1024, + creation_sequence: 1000, + collector: Arc::new(NoopCollector), + }; + + let factory = SkiplistMemTableFactory; + factory.create_memtable(memtable_opts).unwrap() + } + } + + #[must_use] + pub struct TableDataMocker { + table_id: TableId, + table_name: String, + write_handle: Option, + } + + impl TableDataMocker { + pub fn table_id(mut self, table_id: TableId) -> Self { + self.table_id = table_id; + self + } + + pub fn table_name(mut self, table_name: String) -> Self { + self.table_name = table_name; + self + } + + pub fn write_handle(mut self, write_handle: WriteHandle) -> Self { + self.write_handle = Some(write_handle); + self + } + + pub fn build(self) -> TableData { + let space_id = DEFAULT_SPACE_ID; + let table_schema = default_schema(); + let create_request = CreateTableRequest { + catalog_name: "test_catalog".to_string(), + schema_name: "public".to_string(), + table_id: self.table_id, + table_name: self.table_name, + table_schema, + partition_info: None, + engine: table_engine::ANALYTIC_ENGINE_TYPE.to_string(), + options: HashMap::new(), + state: TableState::Stable, + }; + + let write_handle = self.write_handle.unwrap_or_else(|| { + let mocked_write_handle = WriteHandleMocker::default().space_id(space_id).build(); + mocked_write_handle.write_handle + }); + let table_opts = TableOptions::default(); + let purger = FilePurgerMocker::mock(); + let collector = Arc::new(NoopCollector); + + TableData::new( + space_id, + create_request, + write_handle, + table_opts, + &purger, + collector, + ) + .unwrap() + } + } + + impl Default for TableDataMocker { + fn default() -> Self { + Self { + table_id: table::new_table_id(2, 1), + table_name: "mocked_table".to_string(), + write_handle: None, + } + } + } + + #[test] + fn test_new_table_data() { + let table_id = table::new_table_id(100, 30); + let table_name = "new_table".to_string(); + let table_data = TableDataMocker::default() + .table_id(table_id) + .table_name(table_name.clone()) + .build(); + + assert_eq!(table_id, table_data.id); + assert_eq!(table_name, table_data.name); + assert_eq!(table_data.id.as_u64(), table_data.wal_region_id()); + assert_eq!(0, table_data.last_sequence()); + assert!(!table_data.is_dropped()); + assert_eq!(0, table_data.last_file_id()); + assert_eq!(0, table_data.last_memtable_id()); 
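+        // The mocked table uses `TableOptions::default()`, which presumably
+        // enables deduplication, hence the assertion below.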
+ assert!(table_data.dedup()); + } + + #[test] + fn test_find_or_create_mutable() { + let mocked_write_handle = WriteHandleMocker::default() + .space_id(DEFAULT_SPACE_ID) + .build(); + let table_data = TableDataMocker::default() + .write_handle(mocked_write_handle.write_handle) + .build(); + let worker_local = mocked_write_handle.worker_local; + let schema = table_data.schema(); + + // Create sampling memtable. + let zero_ts = Timestamp::new(0); + let mutable = table_data + .find_or_create_mutable(&worker_local, zero_ts, &schema) + .unwrap(); + assert!(mutable.accept_timestamp(zero_ts)); + let sampling_mem = mutable.as_sampling(); + let sampling_id = sampling_mem.id; + assert_eq!(1, sampling_id); + + // Test memtable is reused. + let now_ts = Timestamp::now(); + let mutable = table_data + .find_or_create_mutable(&worker_local, now_ts, &schema) + .unwrap(); + assert!(mutable.accept_timestamp(now_ts)); + let sampling_mem = mutable.as_sampling(); + // Use same sampling memtable. + assert_eq!(sampling_id, sampling_mem.id); + + let current_version = table_data.current_version(); + // Set segment duration manually. + let mut table_opts = (*table_data.table_options()).clone(); + table_opts.segment_duration = + Some(ReadableDuration(table_options::DEFAULT_SEGMENT_DURATION)); + table_data.set_table_options(&worker_local, table_opts); + // Freeze sampling memtable. + current_version.freeze_sampling(&worker_local); + + // A new mutable memtable should be created. + let mutable = table_data + .find_or_create_mutable(&worker_local, now_ts, &schema) + .unwrap(); + assert!(mutable.accept_timestamp(now_ts)); + let mem_state = mutable.as_normal(); + assert_eq!(2, mem_state.id); + let time_range = + TimeRange::bucket_of(now_ts, table_options::DEFAULT_SEGMENT_DURATION).unwrap(); + assert_eq!(time_range, mem_state.time_range); + } +} diff --git a/analytic_engine/src/table/metrics.rs b/analytic_engine/src/table/metrics.rs new file mode 100644 index 0000000000..0a5d801796 --- /dev/null +++ b/analytic_engine/src/table/metrics.rs @@ -0,0 +1,229 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Metrics of table. + +use std::time::Duration; + +use lazy_static::lazy_static; +use prometheus::{ + exponential_buckets, local::LocalHistogram, register_histogram_vec, register_int_counter_vec, + Histogram, HistogramVec, IntCounter, IntCounterVec, +}; + +const KB: f64 = 1024.0; + +lazy_static! { + // Counters: + static ref TABLE_WRITE_REQUEST_COUNTER: IntCounterVec = register_int_counter_vec!( + "table_write_request_counter", + "Write request counter of table", + &["table"] + ) + .unwrap(); + static ref TABLE_WRITE_ROWS_COUNTER: IntCounterVec = register_int_counter_vec!( + "table_write_rows_counter", + "Number of rows wrote to table", + &["table"] + ) + .unwrap(); + static ref TABLE_READ_REQUEST_COUNTER: IntCounterVec = register_int_counter_vec!( + "table_read_request_counter", + "Read request counter of table", + &["table"] + ) + .unwrap(); + // End of counters. 
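+    // All metrics here are labelled by table name, so the label set grows with
+    // the number of tables and entries are never removed (see the note on the
+    // `Metrics` struct below).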
+ + // Histograms: + // Buckets: 0, 0.002, .., 0.002 * 4^9 + static ref TABLE_FLUSH_DURATION_HISTOGRAM: HistogramVec = register_histogram_vec!( + "table_flush_duration", + "Histogram for flush duration of the table in seconds", + &["table"], + exponential_buckets(0.002, 4.0, 10).unwrap() + ).unwrap(); + // Buckets: 0, 1, .., 2^7 + static ref TABLE_FLUSH_SST_NUM_HISTOGRAM: HistogramVec = register_histogram_vec!( + "table_flush_sst_num", + "Histogram for number of ssts flushed by the table", + &["table"], + exponential_buckets(1.0, 2.0, 8).unwrap() + ).unwrap(); + // Buckets: 0, 1, ..., 4^11 (4GB) + static ref TABLE_FLUSH_SST_SIZE_HISTOGRAM: HistogramVec = register_histogram_vec!( + "table_flush_sst_size", + "Histogram for size of ssts flushed by the table in KB", + &["table"], + exponential_buckets(1.0, 4.0, 12).unwrap() + ).unwrap(); + + // Buckets: 0, 0.02, .., 0.02 * 4^9 + static ref TABLE_COMPACT_DURATION_HISTOGRAM: HistogramVec = register_histogram_vec!( + "table_compaction_duration", + "Histogram for compaction duration of the table in seconds", + &["table"], + exponential_buckets(0.02, 4.0, 10).unwrap() + ).unwrap(); + // Buckets: 0, 1, .., 2^7 + static ref TABLE_COMPACTION_SST_NUM_HISTOGRAM: HistogramVec = register_histogram_vec!( + "table_compaction_sst_num", + "Histogram for number of ssts compacted by the table", + &["table"], + exponential_buckets(1.0, 2.0, 8).unwrap() + ).unwrap(); + // Buckets: 0, 1, ..., 4^11 (4GB) + static ref TABLE_COMPACTION_SST_SIZE_HISTOGRAM: HistogramVec = register_histogram_vec!( + "table_compaction_sst_size", + "Histogram for size of ssts compacted by the table in KB", + &["table", "type"], + exponential_buckets(1.0, 4.0, 12).unwrap() + ).unwrap(); + // Buckets: 0, 1, ..., 10^12(1 billion) + static ref TABLE_COMPACTION_SST_ROW_NUM_HISTOGRAM: HistogramVec = register_histogram_vec!( + "table_compaction_sst_row_num", + "Histogram for row num of ssts compacted by the table", + &["table", "type"], + exponential_buckets(1.0, 10.0, 13).unwrap() + ).unwrap(); + + // Buckets: 0, 0.01, .., 0.01 * 2^12 + static ref TABLE_WRITE_STALL_DURATION_HISTOGRAM: HistogramVec = register_histogram_vec!( + "table_write_stall_duration", + "Histogram for write stall duration of the table in seconds", + &["table"], + exponential_buckets(0.01, 2.0, 13).unwrap() + ).unwrap(); + // End of histograms. +} + +/// Table metrics. +/// +/// Now the registered labels won't remove from the metrics vec to avoid panic +/// on concurrent removal. +pub struct Metrics { + // Counters: + pub write_request_counter: IntCounter, + write_rows_counter: IntCounter, + pub read_request_counter: IntCounter, + // End of counters. + + // Histograms: + pub flush_duration_histogram: Histogram, + flush_sst_num_histogram: Histogram, + flush_sst_size_histogram: Histogram, + + pub compaction_duration_histogram: Histogram, + compaction_sst_num_histogram: Histogram, + compaction_input_sst_size_histogram: Histogram, + compaction_output_sst_size_histogram: Histogram, + compaction_input_sst_row_num_histogram: Histogram, + compaction_output_sst_row_num_histogram: Histogram, + + // Write stall metrics. + write_stall_duration_histogram: Histogram, + // End of histograms. 
+} + +impl Metrics { + pub fn new(table_name: &str) -> Self { + Self { + write_request_counter: TABLE_WRITE_REQUEST_COUNTER.with_label_values(&[table_name]), + write_rows_counter: TABLE_WRITE_ROWS_COUNTER.with_label_values(&[table_name]), + read_request_counter: TABLE_READ_REQUEST_COUNTER.with_label_values(&[table_name]), + + flush_duration_histogram: TABLE_FLUSH_DURATION_HISTOGRAM + .with_label_values(&[table_name]), + flush_sst_num_histogram: TABLE_FLUSH_SST_NUM_HISTOGRAM.with_label_values(&[table_name]), + flush_sst_size_histogram: TABLE_FLUSH_SST_SIZE_HISTOGRAM + .with_label_values(&[table_name]), + + compaction_duration_histogram: TABLE_COMPACT_DURATION_HISTOGRAM + .with_label_values(&[table_name]), + compaction_sst_num_histogram: TABLE_COMPACTION_SST_NUM_HISTOGRAM + .with_label_values(&[table_name]), + compaction_input_sst_size_histogram: TABLE_COMPACTION_SST_SIZE_HISTOGRAM + .with_label_values(&[table_name, "input"]), + compaction_output_sst_size_histogram: TABLE_COMPACTION_SST_SIZE_HISTOGRAM + .with_label_values(&[table_name, "output"]), + compaction_input_sst_row_num_histogram: TABLE_COMPACTION_SST_ROW_NUM_HISTOGRAM + .with_label_values(&[table_name, "input"]), + compaction_output_sst_row_num_histogram: TABLE_COMPACTION_SST_ROW_NUM_HISTOGRAM + .with_label_values(&[table_name, "output"]), + + write_stall_duration_histogram: TABLE_WRITE_STALL_DURATION_HISTOGRAM + .with_label_values(&[table_name]), + } + } + + #[inline] + pub fn on_write_request_begin(&self) { + self.write_request_counter.inc(); + } + + #[inline] + pub fn on_write_request_done(&self, num_rows: usize) { + self.write_rows_counter.inc_by(num_rows as u64); + } + + #[inline] + pub fn on_read_request_begin(&self) { + self.read_request_counter.inc(); + } + + #[inline] + pub fn on_write_stall(&self, duration: Duration) { + self.write_stall_duration_histogram + .observe(duration.as_secs_f64()); + } + + pub fn local_flush_metrics(&self) -> LocalFlushMetrics { + LocalFlushMetrics { + flush_duration_histogram: self.flush_duration_histogram.local(), + flush_sst_num_histogram: self.flush_sst_num_histogram.local(), + flush_sst_size_histogram: self.flush_sst_size_histogram.local(), + } + } + + pub fn compaction_observe_sst_num(&self, sst_num: usize) { + self.compaction_sst_num_histogram.observe(sst_num as f64); + } + + pub fn compaction_observe_input_sst_size(&self, sst_size: u64) { + // Convert bytes to KB. + self.compaction_input_sst_size_histogram + .observe(sst_size as f64 / KB); + } + + pub fn compaction_observe_output_sst_size(&self, sst_size: u64) { + // Convert bytes to KB. + self.compaction_output_sst_size_histogram + .observe(sst_size as f64 / KB); + } + + pub fn compaction_observe_input_sst_row_num(&self, sst_row_num: u64) { + self.compaction_input_sst_row_num_histogram + .observe(sst_row_num as f64); + } + + pub fn compaction_observe_output_sst_row_num(&self, sst_row_num: u64) { + self.compaction_output_sst_row_num_histogram + .observe(sst_row_num as f64); + } +} + +pub struct LocalFlushMetrics { + pub flush_duration_histogram: LocalHistogram, + flush_sst_num_histogram: LocalHistogram, + flush_sst_size_histogram: LocalHistogram, +} + +impl LocalFlushMetrics { + pub fn observe_sst_num(&self, sst_num: usize) { + self.flush_sst_num_histogram.observe(sst_num as f64); + } + + pub fn observe_sst_size(&self, sst_size: u64) { + // Convert bytes to KB. 
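+        // The sst size histograms declare their buckets in KB (up to roughly
+        // 4 GB), so the raw byte count is scaled before being observed.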
+ self.flush_sst_size_histogram.observe(sst_size as f64 / KB); + } +} diff --git a/analytic_engine/src/table/mod.rs b/analytic_engine/src/table/mod.rs new file mode 100644 index 0000000000..0f5598f0c1 --- /dev/null +++ b/analytic_engine/src/table/mod.rs @@ -0,0 +1,270 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Table implementation + +use std::{collections::HashMap, fmt, sync::Arc}; + +use arrow_deps::datafusion::logical_plan::{Column, Expr}; +use async_trait::async_trait; +use common_types::{row::Row, schema::Schema, time::TimeRange}; +use futures::TryStreamExt; +use object_store::ObjectStore; +use snafu::{ensure, OptionExt, ResultExt}; +use table_engine::{ + predicate::Predicate, + stream::{PartitionedStreams, SendableRecordBatchStream}, + table::{ + AlterOptions, AlterSchema, AlterSchemaRequest, Compact, Flush, FlushRequest, Get, + GetInvalidPrimaryKey, GetNullPrimaryKey, GetRequest, ReadOptions, ReadOrder, ReadRequest, + Result, Scan, Table, TableId, TableStats, Write, WriteRequest, + }, +}; +use tokio::sync::oneshot; +use wal::manager::WalManager; + +use crate::{ + instance::{flush_compaction::TableFlushOptions, InstanceRef}, + meta::Manifest, + space::SpaceAndTable, + sst::factory::Factory, +}; + +pub mod data; +pub mod metrics; +pub mod sst_util; +pub mod version; +pub mod version_edit; + +// TODO(yingwen): How to handle drop table? + +/// Table trait implementation +pub struct TableImpl { + /// Space and table info + space_table: SpaceAndTable, + /// Instance + instance: InstanceRef, + /// Engine type + engine_type: String, +} + +impl TableImpl { + pub fn new( + space_table: SpaceAndTable, + instance: InstanceRef, + engine_type: String, + ) -> Self { + Self { + space_table, + instance, + engine_type, + } + } +} + +impl fmt::Debug for TableImpl { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("TableImpl") + .field("space_table", &self.space_table) + .finish() + } +} + +#[async_trait] +impl< + Wal: WalManager + Send + Sync + 'static, + Meta: Manifest + Send + Sync + 'static, + Store: ObjectStore, + Fa: Factory + Send + Sync + 'static, + > Table for TableImpl +{ + fn name(&self) -> &str { + &self.space_table.table_data().name + } + + fn id(&self) -> TableId { + self.space_table.table_data().id + } + + fn schema(&self) -> Schema { + self.space_table.table_data().schema() + } + + fn options(&self) -> HashMap { + self.space_table.table_data().table_options().to_raw_map() + } + + fn engine_type(&self) -> &str { + &self.engine_type + } + + fn stats(&self) -> TableStats { + let metrics = &self.space_table.table_data().metrics; + + TableStats { + num_write: metrics.write_request_counter.get(), + num_read: metrics.read_request_counter.get(), + num_flush: metrics.flush_duration_histogram.get_sample_count(), + } + } + + async fn write(&self, request: WriteRequest) -> Result { + let num_rows = self + .instance + .write_to_table(&self.space_table, request) + .await + .map_err(|e| Box::new(e) as _) + .context(Write { table: self.name() })?; + Ok(num_rows) + } + + async fn read(&self, mut request: ReadRequest) -> Result { + request.opts.read_parallelism = 1; + let mut streams = self + .instance + .partitioned_read_from_table(&self.space_table, request) + .await + .map_err(|e| Box::new(e) as _) + .context(Scan { table: self.name() })?; + + assert_eq!(streams.streams.len(), 1); + let stream = streams.streams.pop().unwrap(); + + Ok(stream) + } + + async fn get(&self, request: GetRequest) -> Result> { + let schema = 
request.projected_schema.to_record_schema_with_key(); + let primary_key_columns = schema.key_columns(); + ensure!( + primary_key_columns.len() == request.primary_key.len(), + GetInvalidPrimaryKey { + schema: schema.clone(), + primary_key_columns, + } + ); + + let mut primary_key_exprs: Vec = Vec::with_capacity(request.primary_key.len()); + for (primary_key_value, column_schema) in + request.primary_key.iter().zip(primary_key_columns.iter()) + { + let v = primary_key_value + .as_scalar_value() + .with_context(|| GetNullPrimaryKey { + schema: schema.clone(), + primary_key_columns, + })?; + primary_key_exprs.push( + Expr::Column(Column::from_qualified_name(&column_schema.name)).eq(Expr::Literal(v)), + ); + } + + let read_request = ReadRequest { + request_id: request.request_id, + opts: ReadOptions::default(), + projected_schema: request.projected_schema, + predicate: Arc::new(Predicate { + exprs: primary_key_exprs, + time_range: TimeRange::min_to_max(), + }), + order: ReadOrder::None, + }; + let mut batch_stream = self + .read(read_request) + .await + .map_err(|e| Box::new(e) as _) + .context(Scan { table: self.name() })?; + + let mut result_columns = Vec::with_capacity(schema.num_columns()); + + while let Some(batch) = batch_stream + .try_next() + .await + .map_err(|e| Box::new(e) as _) + .context(Get { table: self.name() })? + { + let row_num = batch.num_rows(); + if row_num == 0 { + return Ok(None); + } + for row_idx in 0..row_num { + for col_idx in 0..batch.num_columns() { + let col = batch.column(col_idx); + result_columns.push(col.datum(row_idx)); + } + + if request.primary_key == result_columns[..schema.num_key_columns()] { + return Ok(Some(Row::from_datums(result_columns))); + } + result_columns.clear(); + } + } + + Ok(None) + } + + async fn partitioned_read(&self, request: ReadRequest) -> Result { + let streams = self + .instance + .partitioned_read_from_table(&self.space_table, request) + .await + .map_err(|e| Box::new(e) as _) + .context(Scan { table: self.name() })?; + + Ok(streams) + } + + async fn alter_schema(&self, request: AlterSchemaRequest) -> Result { + self.instance + .alter_schema_of_table(&self.space_table, request) + .await + .map_err(|e| Box::new(e) as _) + .context(AlterSchema { table: self.name() })?; + Ok(1) + } + + async fn alter_options(&self, options: HashMap) -> Result { + self.instance + .alter_options_of_table(&self.space_table, options) + .await + .map_err(|e| Box::new(e) as _) + .context(AlterOptions { table: self.name() })?; + Ok(1) + } + + async fn flush(&self, request: FlushRequest) -> Result<()> { + let mut rx_opt = None; + let flush_opts = TableFlushOptions { + compact_after_flush: request.compact_after_flush, + // Never block write thread + block_on_write_thread: false, + res_sender: if request.sync { + let (tx, rx) = oneshot::channel(); + rx_opt = Some(rx); + Some(tx) + } else { + None + }, + }; + + self.instance + .flush_table(&self.space_table, flush_opts) + .await + .map_err(|e| Box::new(e) as _) + .context(Flush { table: self.name() })?; + if let Some(rx) = rx_opt { + rx.await + .map_err(|e| Box::new(e) as _) + .context(Flush { table: self.name() })??; + } + Ok(()) + } + + async fn compact(&self) -> Result<()> { + self.instance + .manual_compact_table(&self.space_table) + .await + .map_err(|e| Box::new(e) as _) + .context(Compact { table: self.name() })?; + Ok(()) + } +} diff --git a/analytic_engine/src/table/sst_util.rs b/analytic_engine/src/table/sst_util.rs new file mode 100644 index 0000000000..b5d760a079 --- /dev/null +++ 
b/analytic_engine/src/table/sst_util.rs @@ -0,0 +1,27 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! utilities for sst. + +use object_store::path::ObjectStorePath; +use table_engine::table::TableId; + +use crate::{space::SpaceId, sst::manager::FileId}; + +const SST_FILE_SUFFIX: &str = "sst"; + +#[inline] +/// Generate the sst file name. +pub fn sst_file_name(id: FileId) -> String { + format!("{}.{}", id, SST_FILE_SUFFIX) +} + +/// Set the sst file path. +pub fn set_sst_file_path( + space_id: SpaceId, + table_id: TableId, + file_id: FileId, + path: &mut P, +) { + path.push_all_dirs([space_id.to_string().as_str(), table_id.to_string().as_str()]); + path.set_file_name(sst_file_name(file_id)); +} diff --git a/analytic_engine/src/table/version.rs b/analytic_engine/src/table/version.rs new file mode 100644 index 0000000000..b0e4e2b977 --- /dev/null +++ b/analytic_engine/src/table/version.rs @@ -0,0 +1,1096 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Table version + +use std::{ + cmp, + collections::{BTreeMap, HashMap}, + fmt, + ops::Bound, + sync::{Arc, RwLock}, + time::Duration, +}; + +use common_types::{ + row::Row, + schema::{self, Schema}, + time::{TimeRange, Timestamp}, + SequenceNumber, +}; +use common_util::define_result; +use snafu::{ensure, Backtrace, ResultExt, Snafu}; + +use crate::{ + compaction::{ + picker::{self, CompactionPickerRef, PickerContext}, + CompactionTask, ExpiredFiles, + }, + instance::write_worker::WorkerLocal, + memtable::{self, key::KeySequence, MemTableRef, PutContext}, + sampler::{DefaultSampler, SamplerRef}, + sst::{ + file::{FileHandle, FilePurgeQueue}, + manager::{FileId, LevelsController, MAX_LEVEL}, + }, + table::{ + data::MemTableId, + version_edit::{AddFile, VersionEdit}, + }, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "Schema mismatch, memtable_version:{}, given:{}.\nBacktrace:\n{}", + memtable_version, + given, + backtrace + ))] + SchemaMismatch { + memtable_version: schema::Version, + given: schema::Version, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to put memtable, err:{}", source))] + PutMemTable { source: crate::memtable::Error }, + + #[snafu(display("Failed to collect timestamp, err:{}", source))] + CollectTimestamp { source: crate::sampler::Error }, +} + +define_result!(Error); + +/// Memtable for sampling timestamp. +#[derive(Clone)] +pub struct SamplingMemTable { + pub mem: MemTableRef, + pub id: MemTableId, + /// If freezed is true, the sampling is finished and no more data should be + /// inserted into this memtable. Otherwise, the memtable is active and all + /// data should ONLY write to this memtable instead of mutable memtable. + pub freezed: bool, + pub sampler: SamplerRef, +} + +impl SamplingMemTable { + pub fn new(memtable: MemTableRef, id: MemTableId) -> Self { + SamplingMemTable { + mem: memtable, + id, + freezed: false, + sampler: Arc::new(DefaultSampler::default()), + } + } + + fn memory_usage(&self) -> usize { + self.mem.approximate_memory_usage() + } + + /// Suggest segment duration, if there is no sampled timestamp, returns + /// default segment duration. 
+ fn suggest_segment_duration(&self) -> Duration { + self.sampler.suggest_duration() + } +} + +impl fmt::Debug for SamplingMemTable { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("SamplingMemTable") + .field("id", &self.id) + .field("freezed", &self.freezed) + .finish() + } +} + +/// Memtable with additional meta data +#[derive(Clone)] +pub struct MemTableState { + /// The mutable memtable + pub mem: MemTableRef, + /// The `time_range` is estimated via the time range of the first row group + /// write to this memtable and is aligned to segment size + pub time_range: TimeRange, + /// Id of the memtable, newer memtable has greater id + pub id: MemTableId, +} + +impl MemTableState { + #[inline] + pub fn last_sequence(&self) -> SequenceNumber { + self.mem.last_sequence() + } +} + +impl fmt::Debug for MemTableState { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("MemTableState") + .field("time_range", &self.time_range) + .field("id", &self.id) + .field("last_sequence", &self.mem.last_sequence()) + .finish() + } +} + +// TODO(yingwen): Replace by Either. +#[derive(Clone)] +pub enum MemTableForWrite { + Sampling(SamplingMemTable), + Normal(MemTableState), +} + +impl MemTableForWrite { + #[inline] + pub fn set_last_sequence(&self, seq: SequenceNumber) -> memtable::Result<()> { + self.memtable().set_last_sequence(seq) + } + + #[inline] + pub fn accept_timestamp(&self, timestamp: Timestamp) -> bool { + match self { + MemTableForWrite::Sampling(_) => true, + MemTableForWrite::Normal(v) => v.time_range.contains(timestamp), + } + } + + #[inline] + pub fn put( + &self, + ctx: &mut PutContext, + sequence: KeySequence, + row: &Row, + schema: &Schema, + timestamp: Timestamp, + ) -> Result<()> { + match self { + MemTableForWrite::Sampling(v) => { + v.mem.put(ctx, sequence, row, schema).context(PutMemTable)?; + + // Collect the timstamp of this row. + v.sampler.collect(timestamp).context(CollectTimestamp)?; + + Ok(()) + } + MemTableForWrite::Normal(v) => { + v.mem.put(ctx, sequence, row, schema).context(PutMemTable) + } + } + } + + #[inline] + fn memtable(&self) -> &MemTableRef { + match self { + MemTableForWrite::Sampling(v) => &v.mem, + MemTableForWrite::Normal(v) => &v.mem, + } + } + + #[cfg(test)] + pub fn as_sampling(&self) -> &SamplingMemTable { + match self { + MemTableForWrite::Sampling(v) => v, + MemTableForWrite::Normal(_) => panic!(), + } + } + + #[cfg(test)] + pub fn as_normal(&self) -> &MemTableState { + match self { + MemTableForWrite::Sampling(_) => panic!(), + MemTableForWrite::Normal(v) => v, + } + } +} + +#[derive(Debug, Default)] +pub struct FlushableMemTables { + pub sampling_mem: Option, + pub memtables: MemTableVec, +} + +impl FlushableMemTables { + #[inline] + pub fn is_empty(&self) -> bool { + self.sampling_mem.is_none() && self.memtables.is_empty() + } + + pub fn ids(&self) -> Vec { + let mut memtable_ids = Vec::with_capacity(self.memtables.len() + 1); + if let Some(v) = &self.sampling_mem { + memtable_ids.push(v.id); + } + for mem in &self.memtables { + memtable_ids.push(mem.id); + } + + memtable_ids + } +} + +/// Vec to store memtables +pub type MemTableVec = Vec; + +/// MemTableView holds all memtables of the table +#[derive(Debug)] +struct MemTableView { + /// The memtable for sampling timestamp to suggest segment duration. + /// + /// This memtable is special and may contains data in differnt segment, so + /// can not be moved into immutable memtable set. 
+    sampling_mem: Option<SamplingMemTable>,
+    /// Mutable memtables arranged by their time ranges.
+    mutables: MutableMemTableSet,
+    /// Immutable memtables set, lookup by memtable id is fast.
+    immutables: ImmutableMemTableSet,
+}
+
+impl MemTableView {
+    fn new() -> Self {
+        Self {
+            sampling_mem: None,
+            mutables: MutableMemTableSet::new(),
+            immutables: ImmutableMemTableSet(BTreeMap::new()),
+        }
+    }
+
+    /// Get the memory usage of mutable memtables.
+    fn mutable_memory_usage(&self) -> usize {
+        self.mutables.memory_usage()
+            + self
+                .sampling_mem
+                .as_ref()
+                .map(|v| v.memory_usage())
+                .unwrap_or(0)
+    }
+
+    /// Get the total memory usage of mutable and immutable memtables.
+    fn total_memory_usage(&self) -> usize {
+        let mutable_usage = self.mutable_memory_usage();
+        let immutable_usage = self.immutables.memory_usage();
+
+        mutable_usage + immutable_usage
+    }
+
+    /// Switch all memtables or just sample the segment duration.
+    ///
+    /// If the sampling memtable is still active, returns the suggested segment
+    /// duration. Otherwise (the sampling memtable is already frozen), moves all
+    /// mutable memtables into the immutable memtable set and returns None.
+    ///
+    /// Instead of replacing the old memtable with a new one, we just move the
+    /// old memtable to the immutable memtables and leave the mutable memtables
+    /// empty. A new mutable memtable will be constructed on the next put request.
+    fn switch_memtables_or_suggest_duration(&mut self) -> Option<Duration> {
+        if let Some(v) = &mut self.sampling_mem {
+            if !v.freezed {
+                // Other memtables should be empty during the sampling phase.
+                assert!(self.mutables.is_empty());
+                assert!(self.immutables.is_empty());
+
+                // The sampling memtable is still active, we need to compute the
+                // segment duration and then freeze the memtable.
+                let segment_duration = v.suggest_segment_duration();
+
+                // But we cannot freeze the sampling memtable now, because the
+                // segment duration may not have been persisted yet.
+                return Some(segment_duration);
+            }
+        }
+
+        self.mutables.move_to_inmem(&mut self.immutables);
+
+        None
+    }
+
+    fn freeze_sampling_memtable(&mut self) {
+        if let Some(v) = &mut self.sampling_mem {
+            v.freezed = true;
+        }
+    }
+
+    /// Returns the memtables that need to be flushed.
+    /// - Ids of the returned memtables are no greater than `max_memtable_id`.
+    /// - The last sequences of the returned memtables are continuous and can
+    /// be used as the flushed sequence.
+    /// - All memtables with the same last sequence must be picked into the same
+    /// MemTableVec, so we can update the flushed sequence safely (the
+    /// `max_memtable_id` should also guarantee this).
+    /// - If a frozen sampling memtable exists, it will be returned if its
+    /// memtable id is no greater than `max_memtable_id` (its id should always
+    /// be less than `max_memtable_id`).
+    ///
+    /// Now the returned memtables are also ordered by memtable id, but this may
+    /// change in the future.
+    fn pick_memtables_to_flush(&self, max_memtable_id: MemTableId, mems: &mut FlushableMemTables) {
+        if let Some(v) = &self.sampling_mem {
+            if v.id <= max_memtable_id {
+                mems.sampling_mem = Some(v.clone());
+            }
+        }
+
+        for mem in self.immutables.0.values() {
+            if mem.id <= max_memtable_id {
+                mems.memtables.push(mem.clone());
+            }
+        }
+    }
+
+    /// Remove memtable from immutables or sampling memtable.
+    #[inline]
+    fn remove_immutable_or_sampling(&mut self, id: MemTableId) {
+        if let Some(v) = &self.sampling_mem {
+            if v.id == id {
+                self.sampling_mem = None;
+                return;
+            }
+        }
+
+        self.immutables.0.remove(&id);
+    }
+
+    /// Collect memtables that intersect with `time_range`
+    fn memtables_for_read(
+        &self,
+        time_range: TimeRange,
+        mems: &mut MemTableVec,
+        sampling_mem: &mut Option<SamplingMemTable>,
+    ) {
+        self.mutables.memtables_for_read(time_range, mems);
+
+        self.immutables.memtables_for_read(time_range, mems);
+
+        *sampling_mem = self.sampling_mem.clone();
+    }
+}
+
+/// Mutable memtables
+///
+/// All mutable memtables are ordered by their end time (exclusive); their time
+/// ranges may overlap if `alter segment duration` is supported
+///
+/// We choose end time so we can use BTreeMap::range to find the first range
+/// that may contain a given timestamp (end >= timestamp)
+#[derive(Debug)]
+struct MutableMemTableSet(BTreeMap<Timestamp, MemTableState>);
+
+impl MutableMemTableSet {
+    fn new() -> Self {
+        Self(BTreeMap::new())
+    }
+
+    /// Get memtable by timestamp for write
+    fn memtable_for_write(&self, timestamp: Timestamp) -> Option<&MemTableState> {
+        // Find the first memtable whose end time (exclusive) > timestamp
+        if let Some((_, memtable)) = self
+            .0
+            .range((Bound::Excluded(timestamp), Bound::Unbounded))
+            .next()
+        {
+            if memtable.time_range.contains(timestamp) {
+                return Some(memtable);
+            }
+        }
+
+        None
+    }
+
+    /// Insert memtable, the caller should guarantee the key of memtable is not
+    /// present.
+    fn insert(&mut self, memtable: MemTableState) -> Option<MemTableState> {
+        // Use end time of time range as key
+        let end = memtable.time_range.exclusive_end();
+        self.0.insert(end, memtable)
+    }
+
+    fn memory_usage(&self) -> usize {
+        self.0
+            .values()
+            .map(|m| m.mem.approximate_memory_usage())
+            .sum()
+    }
+
+    /// Move all mutable memtables to immutable memtables.
+    fn move_to_inmem(&mut self, immem: &mut ImmutableMemTableSet) {
+        for m in self.0.values() {
+            let state = m.clone();
+
+            immem.0.insert(m.id, state);
+        }
+        self.0.clear();
+    }
+
+    fn memtables_for_read(&self, time_range: TimeRange, mems: &mut MemTableVec) {
+        // Seek to first memtable whose end time (exclusive) > time_range.start
+        let inclusive_start = time_range.inclusive_start();
+        let iter = self
+            .0
+            .range((Bound::Excluded(inclusive_start), Bound::Unbounded));
+        for (_end_ts, mem) in iter {
+            // We need to check every candidate memtable, as the key order says
+            // nothing about their start times.
+            if mem.time_range.intersect_with(time_range) {
+                mems.push(mem.clone());
+            }
+        }
+    }
+
+    fn is_empty(&self) -> bool {
+        self.0.is_empty()
+    }
+}
+
+/// Immutable memtables set
+///
+/// MemTables are ordered by memtable id, so lookup by memtable id is fast
+#[derive(Debug)]
+struct ImmutableMemTableSet(BTreeMap<MemTableId, MemTableState>);
+
+impl ImmutableMemTableSet {
+    /// Memory used by all immutable memtables
+    fn memory_usage(&self) -> usize {
+        self.0
+            .values()
+            .map(|m| m.mem.approximate_memory_usage())
+            .sum()
+    }
+
+    fn memtables_for_read(&self, time_range: TimeRange, mems: &mut MemTableVec) {
+        for mem in self.0.values() {
+            if mem.time_range.intersect_with(time_range) {
+                mems.push(mem.clone());
+            }
+        }
+    }
+
+    fn is_empty(&self) -> bool {
+        self.0.is_empty()
+    }
+}
+
+pub type LeveledFiles = Vec<Vec<FileHandle>>;
+
+/// Memtables/ssts to read for a given time range.
+pub struct ReadView {
+    pub sampling_mem: Option<SamplingMemTable>,
+    pub memtables: MemTableVec,
+    /// Ssts to read in each level.
+    ///
+    /// The `ReadView` MUST ensure the length of `leveled_ssts` >= MAX_LEVEL.
+ pub leveled_ssts: LeveledFiles, +} + +impl Default for ReadView { + fn default() -> Self { + Self { + sampling_mem: None, + memtables: Vec::new(), + leveled_ssts: vec![Vec::new(); MAX_LEVEL], + } + } +} + +impl ReadView { + pub fn contains_sampling(&self) -> bool { + self.sampling_mem.is_some() + } +} + +/// Data of TableVersion +struct TableVersionInner { + /// All memtables + memtable_view: MemTableView, + /// All ssts + levels: LevelsController, + + /// The earliest sequence number of the entries already flushed (inclusive). + /// All log entry with sequence <= `flushed_sequence` can be deleted + flushed_sequence: SequenceNumber, +} + +impl TableVersionInner { + fn memtable_for_write( + &self, + _write_lock: &WorkerLocal, + timestamp: Timestamp, + ) -> Option { + if let Some(mem) = self.memtable_view.sampling_mem.clone() { + if !mem.freezed { + // If sampling memtable is not freezed. + return Some(MemTableForWrite::Sampling(mem)); + } + } + + self.memtable_view + .mutables + .memtable_for_write(timestamp) + .cloned() + .map(MemTableForWrite::Normal) + } +} + +// TODO(yingwen): How to support snapshot? +/// Table version +/// +/// Holds memtables and sst meta data of a table +/// +/// Switching memtable, memtable to level 0 file, addition/deletion to files +/// should be done atomically. +pub struct TableVersion { + inner: RwLock, +} + +impl TableVersion { + /// Create an empty table version + pub fn new(purge_queue: FilePurgeQueue) -> Self { + Self { + inner: RwLock::new(TableVersionInner { + memtable_view: MemTableView::new(), + levels: LevelsController::new(purge_queue), + flushed_sequence: 0, + }), + } + } + + /// See [MemTableView::mutable_memory_usage] + pub fn mutable_memory_usage(&self) -> usize { + self.inner + .read() + .unwrap() + .memtable_view + .mutable_memory_usage() + } + + /// See [MemTableView::total_memory_usage] + pub fn total_memory_usage(&self) -> usize { + self.inner + .read() + .unwrap() + .memtable_view + .total_memory_usage() + } + + /// Switch all mutable memtables or just return the suggested segment + /// duration if sampling memtable is still active. + /// + /// Returns a duration if a sampled segment duration needs to be persisted. + /// + /// REQUIRE: Do in write worker + pub fn switch_memtables_or_suggest_duration( + &self, + _worker_local: &WorkerLocal, + ) -> Option { + self.inner + .write() + .unwrap() + .memtable_view + .switch_memtables_or_suggest_duration() + } + + /// Stop timestamp sampling and freezed the sampling memtable. + /// + /// REQUIRE: Do in write worker + pub fn freeze_sampling(&self, _worker_local: &WorkerLocal) { + self.inner + .write() + .unwrap() + .memtable_view + .freeze_sampling_memtable(); + } + + /// See [MemTableView::pick_memtables_to_flush] + pub fn pick_memtables_to_flush( + &self, + max_memtable_id: MemTableId, + mems: &mut FlushableMemTables, + ) { + self.inner + .read() + .unwrap() + .memtable_view + .pick_memtables_to_flush(max_memtable_id, mems); + } + + /// Get memtable by timestamp for write. + /// + /// The returned schema is guaranteed to have schema with same version as + /// `schema_version`. Return None if the schema of existing memtable has + /// different schema. 
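A self-contained sketch (std only) of the lookup trick used by `MutableMemTableSet::memtable_for_write` above: buckets are keyed by their exclusive end timestamp, so a `BTreeMap::range` query with an excluded lower bound finds the only candidate bucket, and a single containment check finishes the lookup. The `Bucket` type and plain `i64` timestamps are illustrative stand-ins for `MemTableState` and `Timestamp`.

use std::{collections::BTreeMap, ops::Bound};

#[derive(Clone, Copy, Debug, PartialEq)]
struct Bucket {
    start: i64, // inclusive
    end: i64,   // exclusive
}

fn bucket_for_write(buckets: &BTreeMap<i64, Bucket>, ts: i64) -> Option<Bucket> {
    // First bucket whose exclusive end is strictly greater than `ts`.
    buckets
        .range((Bound::Excluded(ts), Bound::Unbounded))
        .next()
        .map(|(_, b)| *b)
        .filter(|b| b.start <= ts && ts < b.end)
}

fn main() {
    let mut buckets = BTreeMap::new();
    // Two adjacent buckets keyed by exclusive end time.
    buckets.insert(100, Bucket { start: 0, end: 100 });
    buckets.insert(200, Bucket { start: 100, end: 200 });

    assert_eq!(Some(Bucket { start: 0, end: 100 }), bucket_for_write(&buckets, 99));
    assert_eq!(Some(Bucket { start: 100, end: 200 }), bucket_for_write(&buckets, 100));
    assert_eq!(None, bucket_for_write(&buckets, 250));
}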
+ pub fn memtable_for_write( + &self, + write_lock: &WorkerLocal, + timestamp: Timestamp, + schema_version: schema::Version, + ) -> Result> { + // Find memtable by timestamp + let mutable = { + let inner = self.inner.read().unwrap(); + match inner.memtable_for_write(write_lock, timestamp) { + Some(v) => v, + None => return Ok(None), + } + }; + + // We consider the schemas are same if they have the same version. + ensure!( + mutable.memtable().schema().version() == schema_version, + SchemaMismatch { + memtable_version: mutable.memtable().schema().version(), + given: schema_version, + } + ); + + Ok(Some(mutable)) + } + + /// Insert memtable into mutable memtable set. + pub fn insert_mutable(&self, mem_state: MemTableState) { + let mut inner = self.inner.write().unwrap(); + let old_memtable = inner.memtable_view.mutables.insert(mem_state.clone()); + assert!( + old_memtable.is_none(), + "Find a duplicate memtable, new_memtable:{:?}, old_memtable:{:?}, memtable_view:{:#?}", + mem_state, + old_memtable, + inner.memtable_view + ); + } + + /// Set sampling memtable. + /// + /// Panic if the sampling memtable of this version is not None. + pub fn set_sampling(&self, sampling_mem: SamplingMemTable) { + let mut inner = self.inner.write().unwrap(); + assert!(inner.memtable_view.sampling_mem.is_none()); + inner.memtable_view.sampling_mem = Some(sampling_mem); + } + + /// Atomically apply the edit to the version. + pub fn apply_edit(&self, edit: VersionEdit) { + let mut inner = self.inner.write().unwrap(); + + // TODO(yingwen): else, log warning + inner.flushed_sequence = cmp::max(inner.flushed_sequence, edit.flushed_sequence); + + // Add sst files to level first. + for add_file in edit.files_to_add { + inner.levels.add_sst_to_level(add_file.level, add_file.file); + } + + // Remove ssts from level. + for delete_file in edit.files_to_delete { + inner + .levels + .remove_ssts_from_level(delete_file.level, &[delete_file.file_id]); + } + + // Remove immutable memtables. + for mem_id in edit.mems_to_remove { + inner.memtable_view.remove_immutable_or_sampling(mem_id); + } + } + + /// Atomically apply the meta to the version, useful in recover. + pub fn apply_meta(&self, meta: TableVersionMeta) { + let mut inner = self.inner.write().unwrap(); + + inner.flushed_sequence = cmp::max(inner.flushed_sequence, meta.flushed_sequence); + + for add_file in meta.files.into_values() { + inner.levels.add_sst_to_level(add_file.level, add_file.file); + } + } + + pub fn pick_read_view(&self, time_range: TimeRange) -> ReadView { + let mut sampling_mem = None; + let mut memtables = MemTableVec::new(); + let mut leveled_ssts = vec![Vec::new(); MAX_LEVEL]; + + { + // Pick memtables for read. + let inner = self.inner.read().unwrap(); + + inner + .memtable_view + .memtables_for_read(time_range, &mut memtables, &mut sampling_mem); + + // Pick ssts for read. + inner.levels.pick_ssts(time_range, |level, ssts| { + leveled_ssts[level as usize].extend_from_slice(ssts) + }); + } + + ReadView { + sampling_mem, + memtables, + leveled_ssts, + } + } + + /// Pick ssts for compaction using given `picker`. 
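A hedged, self-contained sketch of the bookkeeping done by `apply_edit` and `apply_meta` above: the flushed sequence only moves forward, added files are registered and deleted files dropped, and any WAL entry with a sequence no greater than the resulting flushed sequence becomes safe to remove. `EditSketch` and `VersionSketch` are illustrative types, not the engine's `VersionEdit` or `TableVersion`.

use std::collections::HashMap;

struct EditSketch {
    flushed_sequence: u64,
    files_to_add: Vec<(u64, &'static str)>, // (file_id, label)
    files_to_delete: Vec<u64>,
}

#[derive(Default)]
struct VersionSketch {
    flushed_sequence: u64,
    files: HashMap<u64, &'static str>,
}

impl VersionSketch {
    fn apply_edit(&mut self, edit: EditSketch) {
        // The flushed sequence never goes backwards.
        self.flushed_sequence = self.flushed_sequence.max(edit.flushed_sequence);
        for (id, label) in edit.files_to_add {
            self.files.insert(id, label);
        }
        for id in edit.files_to_delete {
            self.files.remove(&id);
        }
    }
}

fn main() {
    let mut version = VersionSketch::default();
    // A flush adds two level-0 files and advances the flushed sequence.
    version.apply_edit(EditSketch {
        flushed_sequence: 10,
        files_to_add: vec![(1, "l0-a.sst"), (2, "l0-b.sst")],
        files_to_delete: vec![],
    });
    // A compaction-style edit adds an output file and drops an input file,
    // leaving the flushed sequence untouched.
    version.apply_edit(EditSketch {
        flushed_sequence: 0,
        files_to_add: vec![(3, "l1-c.sst")],
        files_to_delete: vec![1],
    });

    assert_eq!(10, version.flushed_sequence);
    assert_eq!(2, version.files.len());
    // WAL entries with sequence <= flushed_sequence are now safe to delete.
    let deletable = |seq: u64| seq <= version.flushed_sequence;
    assert!(deletable(10) && !deletable(11));
}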
+ pub fn pick_for_compaction( + &self, + picker_ctx: PickerContext, + picker: &CompactionPickerRef, + ) -> picker::Result { + let inner = self.inner.read().unwrap(); + + picker.pick_compaction(picker_ctx, &inner.levels) + } + + pub fn has_expired_sst(&self, expire_time: Option) -> bool { + let inner = self.inner.read().unwrap(); + + inner.levels.has_expired_sst(expire_time) + } + + pub fn expired_ssts(&self, expire_time: Option) -> Vec { + let inner = self.inner.read().unwrap(); + + inner.levels.expired_ssts(expire_time) + } +} + +/// During recovery, we apply all version edit to [TableVersionMeta] first, then +/// apply the version meta to the table, so we can avoid adding removed ssts to +/// the version. +#[derive(Debug, Default)] +pub struct TableVersionMeta { + pub flushed_sequence: SequenceNumber, + files: HashMap, + max_file_id: FileId, +} + +impl TableVersionMeta { + pub fn apply_edit(&mut self, edit: VersionEdit) { + self.flushed_sequence = cmp::max(self.flushed_sequence, edit.flushed_sequence); + + for add_file in edit.files_to_add { + self.max_file_id = cmp::max(self.max_file_id, add_file.file.id); + + self.files.insert(add_file.file.id, add_file); + } + + for delete_file in edit.files_to_delete { + self.files.remove(&delete_file.file_id); + } + } + + /// Returns the max file id in the files to add. + pub fn max_file_id_to_add(&self) -> FileId { + self.max_file_id + } + + pub fn ordered_files(&self) -> Vec { + let mut files_vec: Vec<_> = self.files.values().cloned().collect(); + files_vec.sort_unstable_by_key(|file| file.file.id); + + files_vec + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::{ + instance::write_worker::tests::WriteHandleMocker, + sst::file::tests::{FilePurgerMocker, SstMetaDataMocker}, + table::{data::tests::MemTableMocker, version_edit::tests::AddFileMocker}, + table_options, + tests::table, + }; + + fn new_table_version() -> TableVersion { + let purger = FilePurgerMocker::mock(); + let queue = purger.create_purge_queue(1, table::new_table_id(2, 2)); + TableVersion::new(queue) + } + + #[test] + fn test_empty_table_version() { + let mocked_write_handle = WriteHandleMocker::default().build(); + let worker_local = mocked_write_handle.worker_local; + let version = new_table_version(); + + let ts = Timestamp::now(); + assert!(!version.has_expired_sst(None)); + assert!(!version.has_expired_sst(Some(ts))); + + assert_eq!(0, version.mutable_memory_usage()); + assert_eq!(0, version.total_memory_usage()); + + { + let inner = version.inner.read().unwrap(); + let memtable_view = &inner.memtable_view; + assert!(memtable_view.sampling_mem.is_none()); + assert!(memtable_view.mutables.is_empty()); + assert!(memtable_view.immutables.is_empty()); + } + + let mut flushable_mems = FlushableMemTables::default(); + let max_memtable_id = 1000; + version.pick_memtables_to_flush(max_memtable_id, &mut flushable_mems); + assert!(flushable_mems.is_empty()); + + let read_view = version.pick_read_view(TimeRange::min_to_max()); + assert!(!read_view.contains_sampling()); + + assert!(read_view.sampling_mem.is_none()); + assert!(read_view.memtables.is_empty()); + for ssts in read_view.leveled_ssts { + assert!(ssts.is_empty()); + } + + let now = Timestamp::now(); + let mutable = version.memtable_for_write(&worker_local, now, 1).unwrap(); + assert!(mutable.is_none()); + + // Nothing to switch. 
+ assert!(version + .switch_memtables_or_suggest_duration(&worker_local) + .is_none()); + } + + fn check_flushable_mem_with_sampling( + flushable_mems: &FlushableMemTables, + memtable_id: MemTableId, + ) { + assert!(!flushable_mems.is_empty()); + assert_eq!( + memtable_id, + flushable_mems.sampling_mem.as_ref().unwrap().id + ); + assert!(flushable_mems.memtables.is_empty()); + } + + #[test] + fn test_table_version_sampling() { + let mocked_write_handle = WriteHandleMocker::default().build(); + let worker_local = mocked_write_handle.worker_local; + let version = new_table_version(); + + let memtable = MemTableMocker::default().build(); + let schema = memtable.schema().clone(); + + let memtable_id = 1; + let sampling_mem = SamplingMemTable::new(memtable, memtable_id); + + version.set_sampling(sampling_mem); + + // Should write to sampling memtable. + let now = Timestamp::now(); + let mutable = version + .memtable_for_write(&worker_local, now, schema.version()) + .unwrap() + .unwrap(); + let actual_memtable = mutable.as_sampling(); + assert_eq!(memtable_id, actual_memtable.id); + + let mutable = version + .memtable_for_write(&worker_local, Timestamp::new(1234), schema.version()) + .unwrap() + .unwrap(); + let actual_memtable = mutable.as_sampling(); + assert_eq!(memtable_id, actual_memtable.id); + + // Sampling memtable should always be read. + let read_view = version.pick_read_view(TimeRange::new(0.into(), 123.into()).unwrap()); + assert!(read_view.contains_sampling()); + assert_eq!(memtable_id, read_view.sampling_mem.unwrap().id); + + let mut flushable_mems = FlushableMemTables::default(); + version.pick_memtables_to_flush(memtable_id, &mut flushable_mems); + check_flushable_mem_with_sampling(&flushable_mems, memtable_id); + } + + #[test] + fn test_table_version_sampling_switch() { + let worker_local = WriteHandleMocker::default().build().worker_local; + let version = new_table_version(); + + let memtable = MemTableMocker::default().build(); + let schema = memtable.schema().clone(); + + let memtable_id = 1; + let sampling_mem = SamplingMemTable::new(memtable, memtable_id); + + version.set_sampling(sampling_mem); + + let duration = version + .switch_memtables_or_suggest_duration(&worker_local) + .unwrap(); + assert_eq!(table_options::DEFAULT_SEGMENT_DURATION, duration); + + // Flushable memtables only contains sampling memtable. + let mut flushable_mems = FlushableMemTables::default(); + version.pick_memtables_to_flush(memtable_id, &mut flushable_mems); + check_flushable_mem_with_sampling(&flushable_mems, memtable_id); + + // Write to memtable after switch and before freezed. + let now = Timestamp::now(); + let mutable = version + .memtable_for_write(&worker_local, now, schema.version()) + .unwrap() + .unwrap(); + // Still write to sampling memtable. + let actual_memtable = mutable.as_sampling(); + assert_eq!(memtable_id, actual_memtable.id); + + // Switch still return duration before freezed. + let duration = version + .switch_memtables_or_suggest_duration(&worker_local) + .unwrap(); + assert_eq!(table_options::DEFAULT_SEGMENT_DURATION, duration); + + // Flushable memtables only contains sampling memtable before sampling + // memtable is freezed. 
+ let mut flushable_mems = FlushableMemTables::default(); + version.pick_memtables_to_flush(memtable_id, &mut flushable_mems); + check_flushable_mem_with_sampling(&flushable_mems, memtable_id); + } + + #[test] + fn test_table_version_sampling_freeze() { + let worker_local = WriteHandleMocker::default().build().worker_local; + let version = new_table_version(); + + let memtable = MemTableMocker::default().build(); + let schema = memtable.schema().clone(); + + let memtable_id1 = 1; + let sampling_mem = SamplingMemTable::new(memtable, memtable_id1); + + version.set_sampling(sampling_mem); + assert_eq!( + table_options::DEFAULT_SEGMENT_DURATION, + version + .switch_memtables_or_suggest_duration(&worker_local) + .unwrap() + ); + + // Freeze the sampling memtable. + version.freeze_sampling(&worker_local); + + // No memtable after switch and freezed. + let now = Timestamp::now(); + assert!(version + .memtable_for_write(&worker_local, now, schema.version()) + .unwrap() + .is_none()); + + // Still flushable after freezed. + let mut flushable_mems = FlushableMemTables::default(); + version.pick_memtables_to_flush(memtable_id1, &mut flushable_mems); + assert!(flushable_mems.sampling_mem.unwrap().freezed); + + let time_range = + TimeRange::bucket_of(now, table_options::DEFAULT_SEGMENT_DURATION).unwrap(); + + // Sampling memtable still readable after freezed. + let read_view = version.pick_read_view(time_range); + assert!(read_view.contains_sampling()); + assert_eq!(memtable_id1, read_view.sampling_mem.as_ref().unwrap().id); + assert!(read_view.sampling_mem.unwrap().freezed); + + let memtable = MemTableMocker::default().build(); + let memtable_id2 = 2; + let mem_state = MemTableState { + mem: memtable, + time_range, + id: memtable_id2, + }; + // Insert a mutable memtable. + version.insert_mutable(mem_state); + + // Write to mutable memtable. + let mutable = version + .memtable_for_write(&worker_local, now, schema.version()) + .unwrap() + .unwrap(); + let mutable = mutable.as_normal(); + assert_eq!(time_range, mutable.time_range); + assert_eq!(memtable_id2, mutable.id); + + // Need to read sampling memtable and mutable memtable. + let read_view = version.pick_read_view(time_range); + assert_eq!(memtable_id1, read_view.sampling_mem.as_ref().unwrap().id); + assert_eq!(1, read_view.memtables.len()); + assert_eq!(memtable_id2, read_view.memtables[0].id); + + // Switch mutable memtable. + assert!(version + .switch_memtables_or_suggest_duration(&worker_local) + .is_none()); + // No memtable after switch. + let now = Timestamp::now(); + assert!(version + .memtable_for_write(&worker_local, now, schema.version()) + .unwrap() + .is_none()); + + // Two memtables to flush. + let mut flushable_mems = FlushableMemTables::default(); + version.pick_memtables_to_flush(memtable_id2, &mut flushable_mems); + assert!(flushable_mems.sampling_mem.unwrap().freezed); + assert_eq!(1, flushable_mems.memtables.len()); + assert_eq!(memtable_id2, flushable_mems.memtables[0].id); + } + + #[test] + fn test_table_version_sampling_apply_edit() { + let worker_local = WriteHandleMocker::default().build().worker_local; + let version = new_table_version(); + + let memtable = MemTableMocker::default().build(); + let schema = memtable.schema().clone(); + + let memtable_id1 = 1; + let sampling_mem = SamplingMemTable::new(memtable, memtable_id1); + + // Prepare sampling memtable. 
+ version.set_sampling(sampling_mem); + version.freeze_sampling(&worker_local); + + let now = Timestamp::now(); + let time_range = + TimeRange::bucket_of(now, table_options::DEFAULT_SEGMENT_DURATION).unwrap(); + + // Prepare mutable memtable. + let memtable = MemTableMocker::default().build(); + let memtable_id2 = 2; + let mem_state = MemTableState { + mem: memtable, + time_range, + id: memtable_id2, + }; + // Insert a mutable memtable. + version.insert_mutable(mem_state); + + // Switch memtable. + assert!(version + .switch_memtables_or_suggest_duration(&worker_local) + .is_none()); + + let max_sequence = 120; + let file_id = 13; + // TO simplify test, we only create one sst. + let sst_meta = SstMetaDataMocker::new(schema) + .time_range(time_range) + .max_sequence(max_sequence) + .build(); + let add_file = AddFileMocker::new(sst_meta).file_id(file_id).build(); + let edit = VersionEdit { + flushed_sequence: max_sequence, + mems_to_remove: vec![memtable_id1, memtable_id2], + files_to_add: vec![add_file], + files_to_delete: vec![], + }; + version.apply_edit(edit); + + // Only pick ssts after flushed. + let read_view = version.pick_read_view(time_range); + assert!(!read_view.contains_sampling()); + assert!(read_view.sampling_mem.is_none()); + assert!(read_view.memtables.is_empty()); + assert_eq!(1, read_view.leveled_ssts[0].len()); + assert_eq!(file_id, read_view.leveled_ssts[0][0].id()); + } +} diff --git a/analytic_engine/src/table/version_edit.rs b/analytic_engine/src/table/version_edit.rs new file mode 100644 index 0000000000..97f09e5454 --- /dev/null +++ b/analytic_engine/src/table/version_edit.rs @@ -0,0 +1,176 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Version edits + +use std::convert::{TryFrom, TryInto}; + +use common_types::{bytes::Bytes, schema::Schema, time::TimeRange, SequenceNumber}; +use common_util::define_result; +use proto::meta_update as meta_pb; +use snafu::{Backtrace, ResultExt, Snafu}; + +use crate::{ + sst::{ + file::{FileMeta, SstMetaData}, + manager::FileId, + }, + table::data::MemTableId, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Invalid level:{}, err:{}.\nBacktrace:\n{}", level, source, backtrace))] + InvalidLevel { + level: u32, + source: std::num::TryFromIntError, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to convert time range, err:{}", source))] + ConvertTimeRange { source: common_types::time::Error }, + + #[snafu(display("Fail to convert table schema, err:{}", source))] + ConvertTableSchema { source: common_types::schema::Error }, +} + +define_result!(Error); + +/// Meta data of a new file. +#[derive(Debug, Clone)] +pub struct AddFile { + /// The level of the file intended to add. + pub level: u16, + /// Meta data of the file to add. 
+    pub file: FileMeta,
+}
+
+impl AddFile {
+    /// Convert into protobuf struct
+    pub fn into_pb(self) -> meta_pb::AddFileMeta {
+        let mut target = meta_pb::AddFileMeta::new();
+        target.set_level(self.level.into());
+        target.set_file_id(self.file.id);
+        target.set_min_key(self.file.meta.min_key.to_vec());
+        target.set_max_key(self.file.meta.max_key.to_vec());
+        target.set_time_range(self.file.meta.time_range.into());
+        target.set_max_seq(self.file.meta.max_sequence);
+        target.set_schema(self.file.meta.schema.into());
+        target.set_size(self.file.meta.size);
+        target.set_row_num(self.file.meta.row_num);
+
+        target
+    }
+}
+
+impl TryFrom<meta_pb::AddFileMeta> for AddFile {
+    type Error = Error;
+
+    fn try_from(mut src: meta_pb::AddFileMeta) -> Result<Self> {
+        let time_range = TimeRange::try_from(src.take_time_range()).context(ConvertTimeRange)?;
+        let schema = Schema::try_from(src.take_schema()).context(ConvertTableSchema)?;
+        Ok(Self {
+            level: src
+                .level
+                .try_into()
+                .context(InvalidLevel { level: src.level })?,
+            file: FileMeta {
+                id: src.file_id,
+                meta: SstMetaData {
+                    min_key: Bytes::from(src.min_key),
+                    max_key: Bytes::from(src.max_key),
+                    time_range,
+                    max_sequence: src.max_seq,
+                    schema,
+                    size: src.size,
+                    row_num: src.row_num,
+                },
+            },
+        })
+    }
+}
+
+/// Meta data of the file to delete.
+#[derive(Debug, Clone)]
+pub struct DeleteFile {
+    /// The level of the file intended to delete.
+    pub level: u16,
+    /// Id of the file to delete.
+    pub file_id: FileId,
+}
+
+impl DeleteFile {
+    /// Convert into protobuf struct
+    pub fn into_pb(self) -> meta_pb::DeleteFileMeta {
+        let mut target = meta_pb::DeleteFileMeta::new();
+        target.set_level(self.level.into());
+        target.set_file_id(self.file_id);
+
+        target
+    }
+}
+
+impl TryFrom<meta_pb::DeleteFileMeta> for DeleteFile {
+    type Error = Error;
+
+    fn try_from(src: meta_pb::DeleteFileMeta) -> Result<Self> {
+        let level = src
+            .level
+            .try_into()
+            .context(InvalidLevel { level: src.level })?;
+
+        Ok(Self {
+            level,
+            file_id: src.file_id,
+        })
+    }
+}
+
+/// Edit to the [TableVersion], which should be done atomically
+#[derive(Debug)]
+pub struct VersionEdit {
+    /// The last sequence already flushed. This field is not guaranteed to be
+    /// set if the version edit is created by a non-flush operation (such as
+    /// compaction).
+    pub flushed_sequence: SequenceNumber,
+    /// Ids of memtables to remove from the immutable memtable list.
+    pub mems_to_remove: Vec<MemTableId>,
+    /// Sst files to add.
+    pub files_to_add: Vec<AddFile>,
+    /// Sst files to delete.
+    pub files_to_delete: Vec<DeleteFile>,
+}
+
+#[cfg(test)]
+pub mod tests {
+    use super::*;
+
+    #[must_use]
+    pub struct AddFileMocker {
+        file_id: FileId,
+        sst_meta: SstMetaData,
+    }
+
+    impl AddFileMocker {
+        pub fn new(sst_meta: SstMetaData) -> Self {
+            Self {
+                file_id: 1,
+                sst_meta,
+            }
+        }
+
+        pub fn file_id(mut self, file_id: FileId) -> Self {
+            self.file_id = file_id;
+            self
+        }
+
+        pub fn build(&self) -> AddFile {
+            AddFile {
+                level: 0,
+                file: FileMeta {
+                    id: self.file_id,
+                    meta: self.sst_meta.clone(),
+                },
+            }
+        }
+    }
+}
diff --git a/analytic_engine/src/table_options.rs b/analytic_engine/src/table_options.rs
new file mode 100644
index 0000000000..badac47830
--- /dev/null
+++ b/analytic_engine/src/table_options.rs
@@ -0,0 +1,553 @@
+// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0.
+
+//! Constants for table options.
+ +use std::{collections::HashMap, string::ToString, time::Duration}; + +use arrow_deps::datafusion::parquet::basic::Compression as ParquetCompression; +use common_types::time::Timestamp; +use common_util::{ + config::{ReadableDuration, ReadableSize}, + define_result, + time::DurationExt, +}; +use proto::analytic_common::{ + CompactionOptions as CompactionOptionsPb, CompactionStrategy as CompactionStrategyPb, + Compression as CompressionPb, TableOptions as TableOptionsPb, UpdateMode as UpdateModePb, +}; +use serde_derive::Deserialize; +use snafu::{Backtrace, GenerateBacktrace, ResultExt, Snafu}; +use table_engine::OPTION_KEY_ENABLE_TTL; + +use crate::compaction::{ + CompactionStrategy, SizeTieredCompactionOptions, TimeWindowCompactionOptions, +}; + +pub const SEGMENT_DURATION: &str = "segment_duration"; +pub const ENABLE_TTL: &str = OPTION_KEY_ENABLE_TTL; +pub const TTL: &str = "ttl"; +pub const ARENA_BLOCK_SIZE: &str = "arena_block_size"; +pub const WRITE_BUFFER_SIZE: &str = "write_buffer_size"; +pub const COMPACTION_STRATEGY: &str = "compaction_strategy"; +pub const NUM_ROWS_PER_ROW_GROUP: &str = "num_rows_per_row_group"; +pub const UPDATE_MODE: &str = "update_mode"; +pub const COMPRESSION: &str = "compression"; + +const UPDATE_MODE_OVERWRITE: &str = "OVERWRITE"; +const UPDATE_MODE_APPEND: &str = "APPEND"; +const COMPRESSION_UNCOMPRESSED: &str = "UNCOMPRESSED"; +const COMPRESSION_LZ4: &str = "LZ4"; +const COMPRESSION_SNAPPY: &str = "SNAPPY"; +const COMPRESSION_ZSTD: &str = "ZSTD"; +const AT_LEAST_OPTIONS_NUM: usize = 9; + +/// Default bucket duration (1d) +const BUCKET_DURATION_1D: Duration = Duration::from_secs(24 * 60 * 60); +/// Default duration of a segment (2h). +pub const DEFAULT_SEGMENT_DURATION: Duration = Duration::from_secs(60 * 60 * 2); +/// Default arena block size (2M). +const DEFAULT_ARENA_BLOCK_SIZE: u32 = 2 * 1024 * 1024; +/// Default write buffer size (32M). +const DEFAULT_WRITE_BUFFER_SIZE: u32 = 32 * 1024 * 1024; +/// Default ttl of table (7d). +const DEFAULT_TTL: Duration = Duration::from_secs(7 * 24 * 60 * 60); +/// Default row number of a row group. 
+const DEFAULT_NUM_ROW_PER_ROW_GROUP: usize = 8192; + +/// Max arena block size (2G) +const MAX_ARENA_BLOCK_SIZE: u32 = 2 * 1024 * 1024 * 1024; +/// Min arena block size (1K) +const MIN_ARENA_BLOCK_SIZE: u32 = 1024; +const MIN_NUM_ROWS_PER_ROW_GROUP: usize = 100; +const MAX_NUM_ROWS_PER_ROW_GROUP: usize = 10_000_000; + +#[derive(Debug, Snafu)] +#[allow(clippy::enum_variant_names)] +pub enum Error { + #[snafu(display("Failed to parse duration, err:{}.\nBacktrace:\n{}", err, backtrace))] + ParseDuration { err: String, backtrace: Backtrace }, + + #[snafu(display("Failed to parse size, err:{}.\nBacktrace:\n{}", err, backtrace))] + ParseSize { err: String, backtrace: Backtrace }, + + #[snafu(display("Failed to parse compaction strategy: {}, err: {}", value, source))] + ParseStrategy { + value: String, + source: crate::compaction::Error, + }, + #[snafu(display("Failed to parse int, err:{}.\nBacktrace:\n{}", source, backtrace))] + ParseInt { + source: std::num::ParseIntError, + backtrace: Backtrace, + }, + #[snafu(display("Failed to parse bool, err:{}.\nBacktrace:\n{}", source, backtrace))] + ParseBool { + source: std::str::ParseBoolError, + backtrace: Backtrace, + }, + #[snafu(display( + "Failed to parse update mode, raw str:{}.\nBacktrace:\n{}", + s, + backtrace + ))] + ParseUpdateMode { s: String, backtrace: Backtrace }, + #[snafu(display( + "Failed to parse compression, name:{}.\nBacktrace:\n{}", + name, + backtrace + ))] + ParseCompressionName { name: String, backtrace: Backtrace }, +} + +define_result!(Error); + +#[derive(Debug, Clone, Deserialize)] +pub enum UpdateMode { + Overwrite, + Append, +} + +impl UpdateMode { + pub fn parse_from(s: &str) -> Result { + if s.eq_ignore_ascii_case(UPDATE_MODE_OVERWRITE) { + Ok(UpdateMode::Overwrite) + } else if s.eq_ignore_ascii_case(UPDATE_MODE_APPEND) { + Ok(UpdateMode::Append) + } else { + ParseUpdateMode { s }.fail() + } + } +} + +impl ToString for UpdateMode { + fn to_string(&self) -> String { + match self { + UpdateMode::Append => UPDATE_MODE_APPEND.to_string(), + UpdateMode::Overwrite => UPDATE_MODE_OVERWRITE.to_string(), + } + } +} + +#[derive(Debug, Clone, Copy, Deserialize)] +pub enum Compression { + Uncompressed, + Lz4, + Snappy, + Zstd, +} + +impl Compression { + pub fn parse_from(name: &str) -> Result { + if name.eq_ignore_ascii_case(COMPRESSION_UNCOMPRESSED) { + Ok(Compression::Uncompressed) + } else if name.eq_ignore_ascii_case(COMPRESSION_LZ4) { + Ok(Compression::Lz4) + } else if name.eq_ignore_ascii_case(COMPRESSION_SNAPPY) { + Ok(Compression::Snappy) + } else if name.eq_ignore_ascii_case(COMPRESSION_ZSTD) { + Ok(Compression::Zstd) + } else { + ParseCompressionName { name }.fail() + } + } +} + +impl ToString for Compression { + fn to_string(&self) -> String { + match self { + Compression::Uncompressed => COMPRESSION_UNCOMPRESSED.to_string(), + Compression::Lz4 => COMPRESSION_LZ4.to_string(), + Compression::Snappy => COMPRESSION_SNAPPY.to_string(), + Compression::Zstd => COMPRESSION_ZSTD.to_string(), + } + } +} + +impl From for CompressionPb { + fn from(compression: Compression) -> Self { + match compression { + Compression::Uncompressed => CompressionPb::UNCOMPRESSED, + Compression::Lz4 => CompressionPb::LZ4, + Compression::Snappy => CompressionPb::SNAPPY, + Compression::Zstd => CompressionPb::ZSTD, + } + } +} + +impl From for Compression { + fn from(compression: CompressionPb) -> Self { + match compression { + CompressionPb::UNCOMPRESSED => Compression::Uncompressed, + CompressionPb::LZ4 => Compression::Lz4, + CompressionPb::SNAPPY => 
Compression::Snappy, + CompressionPb::ZSTD => Compression::Zstd, + } + } +} + +impl From for ParquetCompression { + fn from(compression: Compression) -> Self { + match compression { + Compression::Uncompressed => ParquetCompression::UNCOMPRESSED, + Compression::Lz4 => ParquetCompression::LZ4, + Compression::Snappy => ParquetCompression::SNAPPY, + Compression::Zstd => ParquetCompression::ZSTD, + } + } +} + +/// Options for a table. +#[derive(Debug, Clone, Deserialize)] +#[serde(default)] +pub struct TableOptions { + // The following options are immutable once table was created. + /// Segment duration of the table. + /// + /// `None` means the table is doing the segment duration sampling and + /// the actual duration is still unknown. + pub segment_duration: Option, + /// Table update mode, now support Overwrite(Default) and Append + pub update_mode: UpdateMode, + + // The following options can be altered. + /// Enable ttl + pub enable_ttl: bool, + /// Time-to-live of the data. + pub ttl: ReadableDuration, + /// Arena block size of memtable. + pub arena_block_size: u32, + /// Write buffer size of memtable. + pub write_buffer_size: u32, + /// Compaction strategy of the table. + pub compaction_strategy: CompactionStrategy, + /// Row number in a row group. + pub num_rows_per_row_group: usize, + /// Table Compression + pub compression: Compression, +} + +impl TableOptions { + #[inline] + pub fn segment_duration(&self) -> Option { + self.segment_duration.map(|v| v.0) + } + + #[inline] + pub fn ttl(&self) -> Option { + if self.enable_ttl { + Some(self.ttl) + } else { + None + } + } + + // for show create table + pub fn to_raw_map(&self) -> HashMap { + let mut m = HashMap::with_capacity(AT_LEAST_OPTIONS_NUM); + m.insert( + SEGMENT_DURATION.to_string(), + self.segment_duration + .map(|v| v.to_string()) + .unwrap_or_else(String::new), + ); + m.insert(UPDATE_MODE.to_string(), self.update_mode.to_string()); + m.insert(ENABLE_TTL.to_string(), self.enable_ttl.to_string()); + m.insert(TTL.to_string(), format!("{}", self.ttl)); + m.insert( + ARENA_BLOCK_SIZE.to_string(), + format!("{}", self.arena_block_size), + ); + m.insert( + WRITE_BUFFER_SIZE.to_string(), + format!("{}", self.write_buffer_size), + ); + self.compaction_strategy.fill_raw_map(&mut m); + m.insert( + NUM_ROWS_PER_ROW_GROUP.to_string(), + format!("{}", self.num_rows_per_row_group), + ); + m.insert(COMPRESSION.to_string(), self.compression.to_string()); + + assert!(m.len() >= AT_LEAST_OPTIONS_NUM); + + m + } + + /// Sanitize options silently. + pub fn sanitize(&mut self) { + let one_day_secs = BUCKET_DURATION_1D.as_secs(); + + if let Some(segment_duration) = self.segment_duration { + let mut segment_duration_secs = segment_duration.as_secs(); + if segment_duration_secs == 0 { + segment_duration_secs = DEFAULT_SEGMENT_DURATION.as_secs() + }; + self.segment_duration = Some(ReadableDuration::secs(segment_duration_secs)); + } + + let ttl_secs = self.ttl.as_secs(); + // Ttl must align to day. 
+ let ttl_secs = ttl_secs / one_day_secs * one_day_secs; + self.ttl = ReadableDuration::secs(ttl_secs); + + if self.arena_block_size < MIN_ARENA_BLOCK_SIZE { + self.arena_block_size = MIN_ARENA_BLOCK_SIZE; + } + + if self.arena_block_size > MAX_ARENA_BLOCK_SIZE { + self.arena_block_size = MAX_ARENA_BLOCK_SIZE; + } + + if self.num_rows_per_row_group < MIN_NUM_ROWS_PER_ROW_GROUP { + self.num_rows_per_row_group = MIN_NUM_ROWS_PER_ROW_GROUP; + } + + if self.num_rows_per_row_group > MAX_NUM_ROWS_PER_ROW_GROUP { + self.num_rows_per_row_group = MAX_NUM_ROWS_PER_ROW_GROUP; + } + } + + pub fn need_dedup(&self) -> bool { + match self.update_mode { + UpdateMode::Overwrite => true, + UpdateMode::Append => false, + } + } + + pub fn is_expired(&self, timestamp: Timestamp) -> bool { + self.enable_ttl && timestamp.is_expired(Timestamp::expire_time(self.ttl.0)) + } +} + +impl From for CompactionOptionsPb { + fn from(opts: SizeTieredCompactionOptions) -> Self { + let mut target = CompactionOptionsPb::new(); + target.set_bucket_low(opts.bucket_low); + target.set_bucket_high(opts.bucket_high); + target.set_min_sstable_size(opts.min_sstable_size.0 as u32); + target.set_max_threshold(opts.max_threshold as u32); + target.set_min_threshold(opts.min_threshold as u32); + + target + } +} + +impl From for SizeTieredCompactionOptions { + fn from(opts: CompactionOptionsPb) -> Self { + Self { + bucket_low: opts.bucket_low, + bucket_high: opts.bucket_high, + min_sstable_size: ReadableSize(opts.min_sstable_size.into()), + min_threshold: opts.min_threshold as usize, + max_threshold: opts.max_threshold as usize, + } + } +} + +impl From for CompactionOptionsPb { + fn from(opts: TimeWindowCompactionOptions) -> Self { + let mut target = CompactionOptionsPb::new(); + target.set_bucket_low(opts.size_tiered.bucket_low); + target.set_bucket_high(opts.size_tiered.bucket_high); + target.set_min_sstable_size(opts.size_tiered.min_sstable_size.0 as u32); + target.set_min_threshold(opts.size_tiered.min_threshold as u32); + target.set_max_threshold(opts.size_tiered.max_threshold as u32); + target.set_timestamp_resolution(opts.timestamp_resolution.into()); + + target + } +} + +impl From for TimeWindowCompactionOptions { + fn from(opts: CompactionOptionsPb) -> Self { + let size_tiered: SizeTieredCompactionOptions = opts.clone().into(); + + Self { + size_tiered, + timestamp_resolution: opts.timestamp_resolution.into(), + } + } +} + +impl From for TableOptionsPb { + fn from(opts: TableOptions) -> Self { + let mut target = TableOptionsPb::new(); + if let Some(segment_duration) = opts.segment_duration { + target.set_segment_duration(segment_duration.0.as_millis_u64()); + target.set_sampling_segment_duration(false); + } else { + // The segment duration is unknown. 
+ target.set_sampling_segment_duration(true); + } + target.set_enable_ttl(opts.enable_ttl); + target.set_ttl(opts.ttl.0.as_millis_u64()); + target.set_arena_block_size(opts.arena_block_size); + target.set_num_rows_per_row_group(opts.num_rows_per_row_group as u64); + + match opts.compaction_strategy { + CompactionStrategy::Default => { + target.set_compaction_strategy(CompactionStrategyPb::DEFAULT); + } + CompactionStrategy::SizeTiered(opts) => { + target.set_compaction_strategy(CompactionStrategyPb::SIZE_TIERED); + target.set_compaction_options(opts.into()); + } + CompactionStrategy::TimeWindow(opts) => { + target.set_compaction_strategy(CompactionStrategyPb::TIME_WINDOW); + target.set_compaction_options(opts.into()); + } + } + + match opts.update_mode { + UpdateMode::Overwrite => { + target.set_update_mode(UpdateModePb::Overwrite); + } + UpdateMode::Append => { + target.set_update_mode(UpdateModePb::Append); + } + } + + target.set_write_buffer_size(opts.write_buffer_size); + target.set_compression(opts.compression.into()); + + target + } +} + +impl From for TableOptions { + fn from(opts: TableOptionsPb) -> Self { + let compaction_strategy = match opts.compaction_strategy { + CompactionStrategyPb::DEFAULT => CompactionStrategy::default(), + CompactionStrategyPb::SIZE_TIERED => { + let opts = opts + .compaction_options + .map(SizeTieredCompactionOptions::from) + .unwrap_or_default(); + CompactionStrategy::SizeTiered(opts) + } + CompactionStrategyPb::TIME_WINDOW => { + let opts = opts + .compaction_options + .map(TimeWindowCompactionOptions::from) + .unwrap_or_default(); + CompactionStrategy::TimeWindow(opts) + } + }; + + let update_mode = match opts.update_mode { + UpdateModePb::Overwrite => UpdateMode::Overwrite, + UpdateModePb::Append => UpdateMode::Append, + }; + let segment_duration = if opts.sampling_segment_duration { + None + } else if opts.segment_duration == 0 { + // If segment duration is still zero. If the data had been used by an elder + // version release that not yet support sampling, the + // `sampling_segment_duration` flag would be truncated after + // manifest snapshot, but left segment duration zero. + Some(DEFAULT_SEGMENT_DURATION.into()) + } else { + Some(Duration::from_millis(opts.segment_duration).into()) + }; + + Self { + segment_duration, + enable_ttl: opts.enable_ttl, + ttl: Duration::from_millis(opts.ttl).into(), + arena_block_size: opts.arena_block_size, + compaction_strategy, + num_rows_per_row_group: opts.num_rows_per_row_group as usize, + update_mode, + write_buffer_size: opts.write_buffer_size, + compression: opts.compression.into(), + } + } +} + +impl Default for TableOptions { + fn default() -> Self { + Self { + segment_duration: None, + enable_ttl: true, + ttl: DEFAULT_TTL.into(), + arena_block_size: DEFAULT_ARENA_BLOCK_SIZE, + compaction_strategy: CompactionStrategy::default(), + num_rows_per_row_group: DEFAULT_NUM_ROW_PER_ROW_GROUP, + update_mode: UpdateMode::Overwrite, + write_buffer_size: DEFAULT_WRITE_BUFFER_SIZE, + compression: Compression::Zstd, + } + } +} + +pub fn merge_table_options_for_create( + options: &HashMap, + table_opts: &TableOptions, +) -> Result { + merge_table_options(options, table_opts, true) +} + +pub fn merge_table_options_for_alter( + options: &HashMap, + table_opts: &TableOptions, +) -> Result { + merge_table_options(options, table_opts, false) +} + +/// The options will override the old options. 
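A small, self-contained sketch of two of the sanitization rules implemented by `sanitize` above: the TTL is truncated down to a whole number of days, and the arena block size is clamped into its allowed range. The constants mirror the ones defined earlier in this file; the helper names are illustrative.

const ONE_DAY_SECS: u64 = 24 * 60 * 60;
const MIN_ARENA_BLOCK_SIZE: u32 = 1024;
const MAX_ARENA_BLOCK_SIZE: u32 = 2 * 1024 * 1024 * 1024;

/// Truncate a TTL (in seconds) down to a whole number of days.
fn align_ttl_to_day(ttl_secs: u64) -> u64 {
    ttl_secs / ONE_DAY_SECS * ONE_DAY_SECS
}

/// Clamp the arena block size into [MIN_ARENA_BLOCK_SIZE, MAX_ARENA_BLOCK_SIZE].
fn clamp_arena_block_size(size: u32) -> u32 {
    size.clamp(MIN_ARENA_BLOCK_SIZE, MAX_ARENA_BLOCK_SIZE)
}

fn main() {
    // 7 days plus 1 hour is truncated back to exactly 7 days.
    assert_eq!(7 * ONE_DAY_SECS, align_ttl_to_day(7 * ONE_DAY_SECS + 3600));
    // Too-small block sizes are raised to the minimum.
    assert_eq!(MIN_ARENA_BLOCK_SIZE, clamp_arena_block_size(16));
}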
+fn merge_table_options( + options: &HashMap, + table_old_opts: &TableOptions, + is_create: bool, +) -> Result { + let mut table_opts = table_old_opts.clone(); + if is_create { + if let Some(v) = options.get(SEGMENT_DURATION) { + table_opts.segment_duration = Some(parse_duration(v)?); + } + if let Some(v) = options.get(UPDATE_MODE) { + table_opts.update_mode = UpdateMode::parse_from(v)?; + } + } + + if let Some(v) = options.get(TTL) { + table_opts.ttl = parse_duration(v)?; + } + if let Some(v) = options.get(OPTION_KEY_ENABLE_TTL) { + table_opts.enable_ttl = v.parse::().context(ParseBool)?; + } + if let Some(v) = options.get(ARENA_BLOCK_SIZE) { + let size = parse_size(v)?; + table_opts.arena_block_size = size.0 as u32; + } + if let Some(v) = options.get(WRITE_BUFFER_SIZE) { + let size = parse_size(v)?; + table_opts.write_buffer_size = size.0 as u32; + } + if let Some(v) = options.get(COMPACTION_STRATEGY) { + table_opts.compaction_strategy = + CompactionStrategy::parse_from(v, options).context(ParseStrategy { value: v })?; + } + if let Some(v) = options.get(NUM_ROWS_PER_ROW_GROUP) { + table_opts.num_rows_per_row_group = v.parse().context(ParseInt)?; + } + if let Some(v) = options.get(COMPRESSION) { + table_opts.compression = Compression::parse_from(v)?; + } + Ok(table_opts) +} + +fn parse_duration(v: &str) -> Result { + v.parse::() + .map_err(|err| Error::ParseDuration { + err, + backtrace: Backtrace::generate(), + }) +} + +fn parse_size(v: &str) -> Result { + v.parse::().map_err(|err| Error::ParseSize { + err, + backtrace: Backtrace::generate(), + }) +} diff --git a/analytic_engine/src/tests/alter_test.rs b/analytic_engine/src/tests/alter_test.rs new file mode 100644 index 0000000000..2bdc74f50b --- /dev/null +++ b/analytic_engine/src/tests/alter_test.rs @@ -0,0 +1,449 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Alter test + +use std::collections::{BTreeMap, HashMap}; + +use common_types::{ + column_schema, + datum::DatumKind, + row::{RowGroup, RowGroupBuilder}, + schema::{self, Schema}, + time::Timestamp, +}; +use log::info; +use table_engine::table::AlterSchemaRequest; + +use crate::{ + table_options::TableOptions, + tests::{ + row_util, + table::{self, FixedSchemaTable}, + util::{Null, TestContext, TestEnv}, + }, +}; + +#[test] +fn test_alter_table_add_column() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + + env.block_on(async { + test_ctx.open().await; + + let test_table1 = "test_table1"; + let fixed_schema_table = test_ctx.create_fixed_schema_table(test_table1).await; + + let start_ms = test_ctx.start_ms(); + let rows = [ + ( + "key1", + Timestamp::new(start_ms), + "tag1-1", + 11.0, + 110.0, + "tag2-1", + ), + ( + "key2", + Timestamp::new(start_ms), + "tag1-2", + 12.0, + 110.0, + "tag2-2", + ), + ]; + + // Write data to table. + let row_group = fixed_schema_table.rows_to_row_group(&rows); + test_ctx.write_to_table(test_table1, row_group).await; + + alter_schema_same_schema_version_case(&test_ctx, test_table1).await; + + alter_schema_old_pre_version_case(&test_ctx, test_table1).await; + + alter_schema_add_column_case(&mut test_ctx, test_table1, start_ms, false).await; + + // Prepare another table for alter. 
+ let test_table2 = "test_table2"; + test_ctx.create_fixed_schema_table(test_table2).await; + let row_group = fixed_schema_table.rows_to_row_group(&rows); + test_ctx.write_to_table(test_table2, row_group).await; + + alter_schema_add_column_case(&mut test_ctx, test_table2, start_ms, true).await; + }); +} + +// Add two columns: +// - add_string +// - add_double +fn add_columns(schema_builder: schema::Builder) -> schema::Builder { + schema_builder + .add_normal_column( + column_schema::Builder::new("add_string".to_string(), DatumKind::String) + .is_nullable(true) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("add_double".to_string(), DatumKind::Double) + .is_nullable(true) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() +} + +async fn alter_schema_same_schema_version_case(test_ctx: &TestContext, table_name: &str) { + info!("test alter_schema_same_schema_version_case"); + + let mut schema_builder = FixedSchemaTable::default_schema_builder(); + schema_builder = add_columns(schema_builder); + let new_schema = schema_builder.build().unwrap(); + + let table = test_ctx.table(table_name); + let old_schema = table.schema(); + + let request = AlterSchemaRequest { + schema: new_schema, + pre_schema_version: old_schema.version(), + }; + + let res = test_ctx.try_alter_schema(table_name, request).await; + assert!(res.is_err()); +} + +async fn alter_schema_old_pre_version_case(test_ctx: &TestContext, table_name: &str) { + info!("test alter_schema_old_pre_version_case"); + + let mut schema_builder = FixedSchemaTable::default_schema_builder(); + schema_builder = add_columns(schema_builder); + + let table = test_ctx.table(table_name); + let old_schema = table.schema(); + + let new_schema = schema_builder + .version(old_schema.version() + 1) + .build() + .unwrap(); + + let request = AlterSchemaRequest { + schema: new_schema, + pre_schema_version: old_schema.version() - 1, + }; + + let res = test_ctx.try_alter_schema(table_name, request).await; + assert!(res.is_err()); +} + +async fn alter_schema_add_column_case( + test_ctx: &mut TestContext, + table_name: &str, + start_ms: i64, + flush: bool, +) { + info!( + "test alter_schema_add_column_case, table_name:{}", + table_name + ); + + let mut schema_builder = FixedSchemaTable::default_schema_builder(); + schema_builder = add_columns(schema_builder); + + let old_schema = test_ctx.table(table_name).schema(); + + let new_schema = schema_builder + .version(old_schema.version() + 1) + .build() + .unwrap(); + + let request = AlterSchemaRequest { + schema: new_schema.clone(), + pre_schema_version: old_schema.version(), + }; + + let affected = test_ctx + .try_alter_schema(table_name, request) + .await + .unwrap(); + assert_eq!(1, affected); + + let rows = [ + ( + "key1", + Timestamp::new(start_ms + 10), + "tag1-1", + 11.0, + 110.0, + "tag2-1", + "add1-1", + 210.0, + ), + ( + "key2", + Timestamp::new(start_ms + 10), + "tag1-2", + 12.0, + 110.0, + "tag2-2", + "add1-2", + 220.0, + ), + ]; + let rows_vec = row_util::new_rows_8(&rows); + let row_group = RowGroupBuilder::with_rows(new_schema.clone(), rows_vec) + .unwrap() + .build(); + + // Write data with new schema. + test_ctx.write_to_table(table_name, row_group).await; + + if flush { + test_ctx.flush_table(table_name).await; + } + + let new_schema_rows = [ + // We need to check null datum, so tuples have different types and we need to + // convert it into row first. 
+ row_util::new_row_8(( + "key1", + Timestamp::new(start_ms), + "tag1-1", + 11.0, + 110.0, + "tag2-1", + Null, + Null, + )), + row_util::new_row_8(( + "key1", + Timestamp::new(start_ms + 10), + "tag1-1", + 11.0, + 110.0, + "tag2-1", + "add1-1", + 210.0, + )), + row_util::new_row_8(( + "key2", + Timestamp::new(start_ms), + "tag1-2", + 12.0, + 110.0, + "tag2-2", + Null, + Null, + )), + row_util::new_row_8(( + "key2", + Timestamp::new(start_ms + 10), + "tag1-2", + 12.0, + 110.0, + "tag2-2", + "add1-2", + 220.0, + )), + ]; + let new_schema_row_group = + RowGroupBuilder::with_rows(new_schema.clone(), new_schema_rows.to_vec()) + .unwrap() + .build(); + + // Read data using new schema. + check_read_row_group( + test_ctx, + "Test read new schema after add columns", + table_name, + &new_schema, + &new_schema_row_group, + ) + .await; + + let old_schema_rows = [ + ( + "key1", + Timestamp::new(start_ms), + "tag1-1", + 11.0, + 110.0, + "tag2-1", + ), + ( + "key1", + Timestamp::new(start_ms + 10), + "tag1-1", + 11.0, + 110.0, + "tag2-1", + ), + ( + "key2", + Timestamp::new(start_ms), + "tag1-2", + 12.0, + 110.0, + "tag2-2", + ), + ( + "key2", + Timestamp::new(start_ms + 10), + "tag1-2", + 12.0, + 110.0, + "tag2-2", + ), + ]; + let old_schema_rows_vec = row_util::new_rows_6(&old_schema_rows); + let old_schema_row_group = RowGroupBuilder::with_rows(old_schema.clone(), old_schema_rows_vec) + .unwrap() + .build(); + + // Read data using old schema. + check_read_row_group( + test_ctx, + "Test read old schema after add columns", + table_name, + &old_schema, + &old_schema_row_group, + ) + .await; + + // Reopen db. + test_ctx.reopen_with_tables(&[table_name]).await; + + // Read again after reopen. + check_read_row_group( + test_ctx, + "Test read after reopen", + table_name, + &new_schema, + &new_schema_row_group, + ) + .await; +} + +async fn check_read_row_group( + test_ctx: &TestContext, + msg: &str, + table_name: &str, + schema: &Schema, + row_group: &RowGroup, +) { + for read_opts in table::read_opts_list() { + info!("{}, opts:{:?}", msg, read_opts); + + let record_batches = test_ctx + .read_table( + table_name, + table::new_read_all_request(schema.clone(), read_opts), + ) + .await; + + table::assert_batch_eq_to_row_group(&record_batches, row_group); + } +} + +#[test] +fn test_alter_table_options() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + + env.block_on(async { + test_ctx.open().await; + + let test_table1 = "test_table1"; + test_ctx.create_fixed_schema_table(test_table1).await; + + let opts = test_ctx.table(test_table1).options(); + + let default_opts_map = default_options(); + + assert_options_eq(&default_opts_map, &opts); + + alter_immutable_option_case(&test_ctx, test_table1, "segment_duration", "20d").await; + + alter_immutable_option_case(&test_ctx, test_table1, "bucket_duration", "20d").await; + + alter_immutable_option_case(&test_ctx, test_table1, "update_mode", "Append").await; + + alter_mutable_option_case(&mut test_ctx, test_table1, "enable_ttl", "false").await; + alter_mutable_option_case(&mut test_ctx, test_table1, "enable_ttl", "true").await; + + alter_mutable_option_case(&mut test_ctx, test_table1, "arena_block_size", "10240").await; + + alter_mutable_option_case(&mut test_ctx, test_table1, "write_buffer_size", "1024000").await; + + alter_mutable_option_case( + &mut test_ctx, + test_table1, + "num_rows_per_row_group", + "10000", + ) + .await; + }); +} + +async fn alter_immutable_option_case( + test_ctx: &TestContext, + table_name: &str, + opt_key: 
&str, + opt_value: &str, +) { + let old_opts = test_ctx.table(table_name).options(); + + let mut new_opts = HashMap::new(); + new_opts.insert(opt_key.to_string(), opt_value.to_string()); + + let affected = test_ctx + .try_alter_options(table_name, new_opts) + .await + .unwrap(); + assert_eq!(1, affected); + + let opts_after_alter = test_ctx.table(table_name).options(); + assert_options_eq(&old_opts, &opts_after_alter); +} + +async fn alter_mutable_option_case( + test_ctx: &mut TestContext, + table_name: &str, + opt_key: &str, + opt_value: &str, +) { + let mut expect_opts = test_ctx.table(table_name).options(); + expect_opts.insert(opt_key.to_string(), opt_value.to_string()); + + let mut new_opts = HashMap::new(); + new_opts.insert(opt_key.to_string(), opt_value.to_string()); + + let affected = test_ctx + .try_alter_options(table_name, new_opts) + .await + .unwrap(); + assert_eq!(1, affected); + + let opts_after_alter = test_ctx.table(table_name).options(); + assert_options_eq(&expect_opts, &opts_after_alter); + + // Reopen table. + test_ctx.reopen_with_tables(&[table_name]).await; + + let opts_after_alter = test_ctx.table(table_name).options(); + assert_options_eq(&expect_opts, &opts_after_alter); +} + +fn assert_options_eq(left: &HashMap, right: &HashMap) { + let sorted_left: BTreeMap<_, _> = left.iter().collect(); + let sorted_right: BTreeMap<_, _> = right.iter().collect(); + + assert_eq!(sorted_left, sorted_right); +} + +fn default_options() -> HashMap { + let table_opts = TableOptions::default(); + + table_opts.to_raw_map() +} diff --git a/analytic_engine/src/tests/compaction_test.rs b/analytic_engine/src/tests/compaction_test.rs new file mode 100644 index 0000000000..6a5b300eb3 --- /dev/null +++ b/analytic_engine/src/tests/compaction_test.rs @@ -0,0 +1,90 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Compaction integration tests. + +use common_types::time::Timestamp; +use table_engine::table::FlushRequest; + +use crate::{ + compaction::SizeTieredCompactionOptions, + tests::util::{self, TestEnv}, +}; + +#[test] +fn test_table_compact_current_segment() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + + env.block_on(async { + test_ctx.open().await; + + let test_table1 = "test_table1"; + let fixed_schema_table = test_ctx.create_fixed_schema_table(test_table1).await; + let default_opts = SizeTieredCompactionOptions::default(); + + let mut expect_rows = Vec::new(); + + let start_ms = test_ctx.start_ms(); + // Write more than ensure compaction will be triggered. + for offset in 0..default_opts.max_threshold as i64 * 2 { + let rows = [ + ( + "key1", + Timestamp::new(start_ms + offset), + "tag1-1", + 11.0, + 110.0, + "tag2-1", + ), + ( + "key2", + Timestamp::new(start_ms + offset), + "tag1-2", + 12.0, + 110.0, + "tag2-2", + ), + ]; + expect_rows.extend_from_slice(&rows); + let row_group = fixed_schema_table.rows_to_row_group(&rows); + + test_ctx.write_to_table(test_table1, row_group).await; + + // Flush table and generate sst. + test_ctx + .flush_table_with_request( + test_table1, + FlushRequest { + // Don't trigger a compaction. + compact_after_flush: false, + sync: true, + }, + ) + .await; + } + + expect_rows.sort_unstable_by_key(|row_tuple| (row_tuple.0, row_tuple.1)); + + util::check_read( + &test_ctx, + &fixed_schema_table, + "Test read after flush", + test_table1, + &expect_rows, + ) + .await; + + // Trigger a compaction. + test_ctx.compact_table(test_table1).await; + + // Check read after compaction. 
+ util::check_read( + &test_ctx, + &fixed_schema_table, + "Test read after compaction", + test_table1, + &expect_rows, + ) + .await; + }); +} diff --git a/analytic_engine/src/tests/drop_test.rs b/analytic_engine/src/tests/drop_test.rs new file mode 100644 index 0000000000..7d12baa536 --- /dev/null +++ b/analytic_engine/src/tests/drop_test.rs @@ -0,0 +1,231 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Drop table tests + +use std::collections::HashMap; + +use common_types::{column_schema, datum::DatumKind, time::Timestamp}; +use table_engine::table::AlterSchemaRequest; + +use crate::tests::{ + table::FixedSchemaTable, + util::{self, TestEnv}, +}; + +#[test] +fn test_drop_table_once() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + + env.block_on(async { + test_ctx.open().await; + + let test_table1 = "test_table1"; + test_ctx.create_fixed_schema_table(test_table1).await; + + assert!(test_ctx.drop_table(test_table1).await); + + let table_opt = test_ctx.try_open_table(test_table1).await.unwrap(); + assert!(table_opt.is_none()); + + test_ctx.reopen().await; + + let table_opt = test_ctx.try_open_table(test_table1).await.unwrap(); + assert!(table_opt.is_none()); + }); +} + +#[test] +fn test_drop_table_again() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + + env.block_on(async { + test_ctx.open().await; + + let test_table1 = "test_table1"; + test_ctx.create_fixed_schema_table(test_table1).await; + + assert!(test_ctx.drop_table(test_table1).await); + + assert!(!test_ctx.drop_table(test_table1).await); + + let table_opt = test_ctx.try_open_table(test_table1).await.unwrap(); + assert!(table_opt.is_none()); + }); +} + +#[test] +fn test_drop_create_table_mixed() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + + env.block_on(async { + test_ctx.open().await; + + let test_table1 = "test_table1"; + test_ctx.create_fixed_schema_table(test_table1).await; + + assert!(test_ctx.drop_table(test_table1).await); + + // Create another table after dropped. + let test_table2 = "test_table2"; + test_ctx.create_fixed_schema_table(test_table2).await; + + let table_opt = test_ctx.try_open_table(test_table1).await.unwrap(); + assert!(table_opt.is_none()); + + test_ctx.reopen().await; + + let table_opt = test_ctx.try_open_table(test_table1).await.unwrap(); + assert!(table_opt.is_none()); + // Table 2 is still exists. + assert!(test_ctx + .try_open_table(test_table2) + .await + .unwrap() + .is_some()); + }); +} + +fn test_drop_create_same_table_case(flush: bool) { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + + env.block_on(async { + test_ctx.open().await; + + let test_table1 = "test_table1"; + let fixed_schema_table = test_ctx.create_fixed_schema_table(test_table1).await; + + // Write data to table1. + let start_ms = test_ctx.start_ms(); + let rows = [( + "key1", + Timestamp::new(start_ms), + "tag1-1", + 11.0, + 110.0, + "tag2-1", + )]; + let row_group = fixed_schema_table.rows_to_row_group(&rows); + test_ctx.write_to_table(test_table1, row_group).await; + + if flush { + test_ctx.flush_table(test_table1).await; + } + + assert!(test_ctx.drop_table(test_table1).await); + + // Create same table again. + let test_table1 = "test_table1"; + test_ctx.create_fixed_schema_table(test_table1).await; + + // No data exists. 
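+ // The recreated table must not expose any rows that were written to the dropped table.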
+ util::check_read( + &test_ctx, + &fixed_schema_table, + "Test read table", + test_table1, + &[], + ) + .await; + + test_ctx.reopen_with_tables(&[test_table1]).await; + + // No data exists. + util::check_read( + &test_ctx, + &fixed_schema_table, + "Test read table after reopen", + test_table1, + &[], + ) + .await; + }); +} + +#[test] +fn test_drop_create_same_table() { + test_drop_create_same_table_case(false); + + test_drop_create_same_table_case(true); +} + +#[test] +fn test_alter_schema_drop_create() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + + env.block_on(async { + test_ctx.open().await; + + let test_table1 = "test_table1"; + test_ctx.create_fixed_schema_table(test_table1).await; + + // Alter schema. + let old_schema = test_ctx.table(test_table1).schema(); + let schema_builder = FixedSchemaTable::default_schema_builder() + .add_normal_column( + column_schema::Builder::new("add_double".to_string(), DatumKind::Double) + .is_nullable(true) + .build() + .unwrap(), + ) + .unwrap(); + let new_schema = schema_builder + .version(old_schema.version() + 1) + .build() + .unwrap(); + let request = AlterSchemaRequest { + schema: new_schema.clone(), + pre_schema_version: old_schema.version(), + }; + let affected = test_ctx + .try_alter_schema(test_table1, request) + .await + .unwrap(); + assert_eq!(1, affected); + + // Drop table. + assert!(test_ctx.drop_table(test_table1).await); + + // Create same table again. + let test_table1 = "test_table1"; + test_ctx.create_fixed_schema_table(test_table1).await; + + test_ctx.reopen_with_tables(&[test_table1]).await; + }); +} + +#[test] +fn test_alter_options_drop_create() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + + env.block_on(async { + test_ctx.open().await; + + let test_table1 = "test_table1"; + test_ctx.create_fixed_schema_table(test_table1).await; + + // Alter options. + let mut new_opts = HashMap::new(); + new_opts.insert("arena_block_size".to_string(), "10240".to_string()); + + let affected = test_ctx + .try_alter_options(test_table1, new_opts) + .await + .unwrap(); + assert_eq!(1, affected); + + // Drop table. + assert!(test_ctx.drop_table(test_table1).await); + + // Create same table again. + let test_table1 = "test_table1"; + test_ctx.create_fixed_schema_table(test_table1).await; + + test_ctx.reopen_with_tables(&[test_table1]).await; + }); +} diff --git a/analytic_engine/src/tests/mod.rs b/analytic_engine/src/tests/mod.rs new file mode 100644 index 0000000000..3ed5f527e0 --- /dev/null +++ b/analytic_engine/src/tests/mod.rs @@ -0,0 +1,17 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Test suits and intergration tests. + +#[cfg(test)] +mod alter_test; +#[cfg(test)] +mod compaction_test; +#[cfg(test)] +mod drop_test; +#[cfg(test)] +mod open_test; +#[cfg(test)] +mod read_write_test; +pub mod row_util; +pub mod table; +pub mod util; diff --git a/analytic_engine/src/tests/open_test.rs b/analytic_engine/src/tests/open_test.rs new file mode 100644 index 0000000000..6c3afc0578 --- /dev/null +++ b/analytic_engine/src/tests/open_test.rs @@ -0,0 +1,18 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Engine open test. + +use crate::tests::util::TestEnv; + +#[test] +fn test_open_engine() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + + env.block_on(async { + test_ctx.open().await; + + // Reopen engine. 
+ test_ctx.reopen().await; + }); +} diff --git a/analytic_engine/src/tests/read_write_test.rs b/analytic_engine/src/tests/read_write_test.rs new file mode 100644 index 0000000000..c190817470 --- /dev/null +++ b/analytic_engine/src/tests/read_write_test.rs @@ -0,0 +1,735 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Read write test. + +use std::{thread, time}; + +use common_types::time::Timestamp; +use log::info; +use table_engine::table::ReadOrder; + +use crate::{ + table_options, + tests::util::{self, TestEnv}, +}; + +#[test] +fn test_multi_table_read_write() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + + env.block_on(async { + test_ctx.open().await; + + let test_table1 = "test_multi_table_read_write1"; + let test_table2 = "test_multi_table_read_write2"; + let test_table3 = "test_multi_table_read_write3"; + + let fixed_schema_table = test_ctx.create_fixed_schema_table(test_table1).await; + let _ = test_ctx.create_fixed_schema_table(test_table2).await; + let _ = test_ctx.create_fixed_schema_table(test_table3).await; + + let start_ms = test_ctx.start_ms(); + let rows = [ + // One bucket. + ( + "key1", + Timestamp::new(start_ms), + "tag1-1", + 11.0, + 110.0, + "tag2-1", + ), + ( + "key2", + Timestamp::new(start_ms), + "tag1-2", + 12.0, + 110.0, + "tag2-2", + ), + ( + "key2", + Timestamp::new(start_ms + 1), + "tag1-3", + 13.0, + 110.0, + "tag2-3", + ), + ( + "key3", + Timestamp::new(start_ms + 2), + "tag1-4", + 13.0, + 110.0, + "tag2-4", + ), + ( + "key4", + Timestamp::new(start_ms + 3), + "tag1-5", + 13.0, + 110.0, + "tag2-5", + ), + // Next bucket. + ( + "key5", + Timestamp::new( + start_ms + 1 + 2 * table_options::DEFAULT_SEGMENT_DURATION.as_millis() as i64, + ), + "tag-5-3", + 33.0, + 310.0, + "tag-5-3", + ), + ]; + + // Write data to table. + let row_group1 = fixed_schema_table.rows_to_row_group(&rows); + let row_group2 = fixed_schema_table.rows_to_row_group(&rows); + let row_group3 = fixed_schema_table.rows_to_row_group(&rows); + test_ctx.write_to_table(test_table1, row_group1).await; + test_ctx.write_to_table(test_table2, row_group2).await; + test_ctx.write_to_table(test_table3, row_group3).await; + + // Read with different opts. + util::check_read( + &test_ctx, + &fixed_schema_table, + "Test read write table1", + test_table1, + &rows, + ) + .await; + + util::check_read( + &test_ctx, + &fixed_schema_table, + "Test read write table2", + test_table2, + &rows, + ) + .await; + + util::check_read( + &test_ctx, + &fixed_schema_table, + "Test read write table3", + test_table3, + &rows, + ) + .await; + + // Reopen db. + test_ctx + .reopen_with_tables(&[test_table1, test_table2, test_table3]) + .await; + + // Read with different opts again. 
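+ // All three tables must still serve the rows that were written before the reopen.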
+ util::check_read( + &test_ctx, + &fixed_schema_table, + "Test read write table1 after reopen", + test_table1, + &rows, + ) + .await; + util::check_read( + &test_ctx, + &fixed_schema_table, + "Test read write table2 after reopen", + test_table2, + &rows, + ) + .await; + util::check_read( + &test_ctx, + &fixed_schema_table, + "Test read write table3 after reopen", + test_table3, + &rows, + ) + .await; + }); +} + +#[test] +fn test_table_write_read() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + + env.block_on(async { + test_ctx.open().await; + + let test_table1 = "test_table1"; + let fixed_schema_table = test_ctx.create_fixed_schema_table(test_table1).await; + + let start_ms = test_ctx.start_ms(); + let rows = [ + ( + "key1", + Timestamp::new(start_ms), + "tag1-1", + 11.0, + 110.0, + "tag2-1", + ), + ( + "key2", + Timestamp::new(start_ms), + "tag1-2", + 12.0, + 110.0, + "tag2-2", + ), + ( + "key2", + Timestamp::new(start_ms + 1), + "tag1-3", + 13.0, + 110.0, + "tag2-3", + ), + ]; + let row_group = fixed_schema_table.rows_to_row_group(&rows); + + // Write data to table. + test_ctx.write_to_table(test_table1, row_group).await; + + // Read with different opts. + util::check_read( + &test_ctx, + &fixed_schema_table, + "Test read write table", + test_table1, + &rows, + ) + .await; + + // Reopen db. + test_ctx.reopen_with_tables(&[test_table1]).await; + + // Read with different opts again. + util::check_read( + &test_ctx, + &fixed_schema_table, + "Test read write table after reopen", + test_table1, + &rows, + ) + .await; + }); +} + +#[test] +fn test_table_write_get() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + + env.block_on(async { + test_ctx.open().await; + + let test_table1 = "test_table1"; + let fixed_schema_table = test_ctx.create_fixed_schema_table(test_table1).await; + + let start_ms = test_ctx.start_ms(); + let rows = [ + ( + "key1", + Timestamp::new(start_ms), + "tag1-1", + 11.0, + 110.0, + "tag2-1", + ), + ( + "key2", + Timestamp::new(start_ms), + "tag1-2", + 12.0, + 110.0, + "tag2-2", + ), + ( + "key2", + Timestamp::new(start_ms + 1), + "tag1-3", + 13.0, + 110.0, + "tag2-3", + ), + ]; + let row_group = fixed_schema_table.rows_to_row_group(&rows); + + // Write data to table. + test_ctx.write_to_table(test_table1, row_group).await; + + util::check_get( + &test_ctx, + &fixed_schema_table, + "Try to get row", + test_table1, + &rows, + ) + .await; + + // Reopen db. 
+ test_ctx.reopen_with_tables(&[test_table1]).await; + + util::check_get( + &test_ctx, + &fixed_schema_table, + "Try to get row after reopen", + test_table1, + &rows, + ) + .await; + }); +} + +#[test] +fn test_table_write_get_override() { + test_table_write_get_override_case(FlushPoint::NoFlush); + + test_table_write_get_override_case(FlushPoint::AfterFirstWrite); + + test_table_write_get_override_case(FlushPoint::AfterOverwrite); + + test_table_write_get_override_case(FlushPoint::FirstAndOverwrite); +} + +#[derive(Debug)] +enum FlushPoint { + NoFlush, + AfterFirstWrite, + AfterOverwrite, + FirstAndOverwrite, +} + +fn test_table_write_get_override_case(flush_point: FlushPoint) { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + + env.block_on(async { + info!( + "test_table_write_get_override_case, flush_point:{:?}", + flush_point + ); + + test_ctx.open().await; + + let test_table1 = "test_table1"; + let fixed_schema_table = test_ctx.create_fixed_schema_table(test_table1).await; + + let start_ms = test_ctx.start_ms(); + { + let rows = [ + ( + "key1", + Timestamp::new(start_ms), + "tag1-1", + 11.0, + 110.0, + "tag2-1", + ), + ( + "key2", + Timestamp::new(start_ms), + "tag1-2", + 12.0, + 110.0, + "tag2-2", + ), + ( + "key3", + Timestamp::new(start_ms + 10), + "tag1-3", + 13.0, + 110.0, + "tag2-3", + ), + ( + "key2", + Timestamp::new(start_ms + 1), + "tag1-3", + 13.0, + 110.0, + "tag2-3", + ), + ]; + let row_group = fixed_schema_table.rows_to_row_group(&rows); + + // Write data to table. + test_ctx.write_to_table(test_table1, row_group).await; + } + + if let FlushPoint::AfterFirstWrite | FlushPoint::FirstAndOverwrite = flush_point { + test_ctx.flush_table(test_table1).await; + } + + // Override some rows + { + let rows = [ + ( + "key2", + Timestamp::new(start_ms), + "tag1-2-copy", + 112.0, + 210.0, + "tag2-2-copy", + ), + ( + "key2", + Timestamp::new(start_ms + 1), + "tag1-3-copy", + 113.0, + 210.0, + "tag2-3-copy", + ), + ]; + let row_group = fixed_schema_table.rows_to_row_group(&rows); + + test_ctx.write_to_table(test_table1, row_group).await; + } + + if let FlushPoint::AfterOverwrite | FlushPoint::FirstAndOverwrite = flush_point { + test_ctx.flush_table(test_table1).await; + } + + let expect_rows = [ + ( + "key1", + Timestamp::new(start_ms), + "tag1-1", + 11.0, + 110.0, + "tag2-1", + ), + ( + "key2", + Timestamp::new(start_ms), + "tag1-2-copy", + 112.0, + 210.0, + "tag2-2-copy", + ), + ( + "key2", + Timestamp::new(start_ms + 1), + "tag1-3-copy", + 113.0, + 210.0, + "tag2-3-copy", + ), + ( + "key3", + Timestamp::new(start_ms + 10), + "tag1-3", + 13.0, + 110.0, + "tag2-3", + ), + ]; + + util::check_get( + &test_ctx, + &fixed_schema_table, + "Try to get row", + test_table1, + &expect_rows, + ) + .await; + + // Reopen db. 
+ test_ctx.reopen_with_tables(&[test_table1]).await; + + util::check_get( + &test_ctx, + &fixed_schema_table, + "Try to get row after reopen", + test_table1, + &expect_rows, + ) + .await; + }); +} + +#[test] +fn test_db_write_buffer_size() { + let mut env = TestEnv::builder().build(); + env.config.db_write_buffer_size = 1; + test_write_buffer_size_overflow("db_write_buffer_size_test", env); +} + +#[test] +fn test_space_write_buffer_size() { + let mut env = TestEnv::builder().build(); + env.config.space_write_buffer_size = 1; + test_write_buffer_size_overflow("space_write_buffer_size_test", env); +} + +fn test_write_buffer_size_overflow(test_table_name: &str, env: TestEnv) { + let mut test_ctx = env.new_context(); + + env.block_on(async { + test_ctx.open().await; + + let fixed_schema_table = test_ctx.create_fixed_schema_table(test_table_name).await; + + let table = test_ctx.table(test_table_name); + let old_stats = table.stats(); + + let start_ms = test_ctx.start_ms(); + let rows1 = [ + ( + "key1", + Timestamp::new(start_ms), + "tag1-1", + 11.0, + 110.0, + "tag2-1", + ), + ( + "key2", + Timestamp::new(start_ms), + "tag1-2", + 12.0, + 110.0, + "tag2-2", + ), + ( + "key2", + Timestamp::new(start_ms + 1), + "tag1-3", + 13.0, + 110.0, + "tag2-3", + ), + ]; + let row_group = fixed_schema_table.rows_to_row_group(&rows1); + // Write rows1 to table. + test_ctx.write_to_table(test_table_name, row_group).await; + + let stats = table.stats(); + assert_eq!(old_stats.num_read, stats.num_read); + assert_eq!(old_stats.num_write + 1, stats.num_write); + assert_eq!(old_stats.num_flush, stats.num_flush); + + let rows2 = [ + ( + "key4", + Timestamp::new(start_ms + 2), + "tag1-4", + 11.0, + 110.0, + "tag2-4", + ), + ( + "key5", + Timestamp::new(start_ms + 3), + "tag1-5", + 12.0, + 110.0, + "tag2-5", + ), + ]; + + let row_group = fixed_schema_table.rows_to_row_group(&rows2); + // Write rowss2 to table. + test_ctx.write_to_table(test_table_name, row_group).await; + + let mut rows = Vec::new(); + rows.extend_from_slice(&rows1); + rows.extend_from_slice(&rows2); + + // TODO(boyan) a better way to wait table flushing finishes. + thread::sleep(time::Duration::from_millis(500)); + + // Read with different opts. + util::check_read( + &test_ctx, + &fixed_schema_table, + "Test read write table", + test_table_name, + &rows, + ) + .await; + + let stats = table.stats(); + assert_eq!(old_stats.num_read + 5, stats.num_read); + assert_eq!(old_stats.num_write + 2, stats.num_write); + // Flush when reaches (db/space) write_buffer size limitation. + assert_eq!(old_stats.num_flush + 1, stats.num_flush); + + drop(table); + // Reopen db. + test_ctx.reopen_with_tables(&[test_table_name]).await; + + // Read with different opts again. 
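+ // Rows from both writes, including those flushed because of the write buffer limit, must survive the reopen.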
+ util::check_read( + &test_ctx, + &fixed_schema_table, + "Test read write table after reopen", + test_table_name, + &rows, + ) + .await; + }); +} + +#[test] +fn test_table_write_read_reverse() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + + env.block_on(async { + test_ctx.open().await; + + let test_table = "test_table"; + let fixed_schema_table = test_ctx.create_fixed_schema_table(test_table).await; + + let start_ms = test_ctx.start_ms(); + let rows = [ + ( + "key1", + Timestamp::new(start_ms), + "tag1-1", + 11.0, + 110.0, + "tag2-1", + ), + // update the first row + ( + "key1", + Timestamp::new(start_ms), + "tag1-2", + 12.0, + 110.0, + "tag2-2", + ), + ( + "key1", + Timestamp::new(start_ms + 1), + "tag1-2", + 12.0, + 110.0, + "tag2-2", + ), + ( + "key2", + Timestamp::new(start_ms), + "tag1-3", + 13.0, + 110.0, + "tag2-3", + ), + ( + "key2", + Timestamp::new(start_ms + 1), + "tag1-3", + 13.0, + 110.0, + "tag2-3", + ), + ]; + let expect_reversed_rows = vec![rows[4], rows[3], rows[2], rows[1]]; + let row_group = fixed_schema_table.rows_to_row_group(&rows); + + // Write data to table. + test_ctx.write_to_table(test_table, row_group).await; + + // Read reverse + util::check_read_with_order( + &test_ctx, + &fixed_schema_table, + "Test read write table", + test_table, + &expect_reversed_rows, + ReadOrder::Desc, + ) + .await; + }); +} + +#[test] +fn test_table_write_read_reverse_after_flush() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + + env.block_on(async { + test_ctx.open().await; + + let test_table = "test_table"; + let fixed_schema_table = test_ctx.create_fixed_schema_table(test_table).await; + + let start_ms = test_ctx.start_ms(); + let rows1 = [ + ( + "key1", + Timestamp::new(start_ms), + "tag1-1", + 11.0, + 110.0, + "tag2-1", + ), + ( + "key2", + Timestamp::new(start_ms), + "tag1-3", + 13.0, + 110.0, + "tag2-3", + ), + ( + "key2", + Timestamp::new(start_ms + 1), + "tag1-3", + 13.0, + 110.0, + "tag2-3", + ), + ]; + + let rows2 = vec![ + // update the first row + ( + "key1", + Timestamp::new(start_ms), + "tag1-2", + 12.0, + 110.0, + "tag2-2", + ), + ( + "key1", + Timestamp::new(start_ms + 1), + "tag1-2", + 12.0, + 110.0, + "tag2-2", + ), + ]; + + let expect_reversed_rows = vec![rows1[2], rows1[1], rows2[1], rows2[0]]; + let row_group1 = fixed_schema_table.rows_to_row_group(&rows1); + // Write data to table and flush + test_ctx.write_to_table(test_table, row_group1).await; + test_ctx.flush_table(test_table).await; + + let row_group2 = fixed_schema_table.rows_to_row_group(&rows2); + // Write data to table and not flush + test_ctx.write_to_table(test_table, row_group2).await; + + // Read reverse + util::check_read_with_order( + &test_ctx, + &fixed_schema_table, + "Test read write table", + test_table, + &expect_reversed_rows, + ReadOrder::Desc, + ) + .await; + }); +} diff --git a/analytic_engine/src/tests/row_util.rs b/analytic_engine/src/tests/row_util.rs new file mode 100644 index 0000000000..eaf7b592ed --- /dev/null +++ b/analytic_engine/src/tests/row_util.rs @@ -0,0 +1,93 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Row utils + +use common_types::{datum::Datum, row::Row}; + +pub fn new_row_6(data: (C0, C1, C2, C3, C4, C5)) -> Row +where + C0: Into, + C1: Into, + C2: Into, + C3: Into, + C4: Into, + C5: Into, +{ + let cols = vec![ + data.0.into(), + data.1.into(), + data.2.into(), + data.3.into(), + data.4.into(), + data.5.into(), + ]; + + Row::from_datums(cols) +} + +pub fn assert_row_eq_6(data: (C0, C1, C2, C3, C4, C5), row: Row) +where + C0: Into, + C1: Into, + C2: Into, + C3: Into, + C4: Into, + C5: Into, +{ + let expect_row = new_row_6(data); + assert_eq!(expect_row, row); +} + +pub fn new_row_8(data: (C0, C1, C2, C3, C4, C5, C6, C7)) -> Row +where + C0: Into, + C1: Into, + C2: Into, + C3: Into, + C4: Into, + C5: Into, + C6: Into, + C7: Into, +{ + let cols = vec![ + data.0.into(), + data.1.into(), + data.2.into(), + data.3.into(), + data.4.into(), + data.5.into(), + data.6.into(), + data.7.into(), + ]; + + Row::from_datums(cols) +} + +pub fn new_rows_6(data: &[(C0, C1, C2, C3, C4, C5)]) -> Vec +where + C0: Into + Clone, + C1: Into + Clone, + C2: Into + Clone, + C3: Into + Clone, + C4: Into + Clone, + C5: Into + Clone, +{ + data.iter().cloned().map(new_row_6).collect() +} + +#[allow(clippy::type_complexity)] +pub fn new_rows_8( + data: &[(C0, C1, C2, C3, C4, C5, C6, C7)], +) -> Vec +where + C0: Into + Clone, + C1: Into + Clone, + C2: Into + Clone, + C3: Into + Clone, + C4: Into + Clone, + C5: Into + Clone, + C6: Into + Clone, + C7: Into + Clone, +{ + data.iter().cloned().map(new_row_8).collect() +} diff --git a/analytic_engine/src/tests/table.rs b/analytic_engine/src/tests/table.rs new file mode 100644 index 0000000000..8d3d7a83e1 --- /dev/null +++ b/analytic_engine/src/tests/table.rs @@ -0,0 +1,331 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Utils to create table. 
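+//! +//! A minimal usage sketch (values are illustrative only; the helpers are the ones defined in this module): +//! ```ignore +//! let table = FixedSchemaTable::builder() +//!     .table_name("demo".to_string()) +//!     .build_fixed(); +//! let rows = [("key1", Timestamp::new(0), "tag1", 1.0, 2.0, "tag2")]; +//! let row_group = table.rows_to_row_group(&rows); +//! ```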
+ +use std::{collections::HashMap, sync::Arc}; + +use common_types::{ + column_schema, + datum::{Datum, DatumKind}, + projected_schema::ProjectedSchema, + record_batch::RecordBatch, + request_id::RequestId, + row::{Row, RowGroup, RowGroupBuilder}, + schema::{self, Schema}, + time::{TimeRange, Timestamp}, +}; +use common_util::config::ReadableDuration; +use table_engine::{ + self, + engine::{CreateTableRequest, TableState}, + predicate::Predicate, + table::{GetRequest, ReadOptions, ReadOrder, ReadRequest, SchemaId, TableId, TableSeq}, +}; + +use crate::{table_options, tests::row_util}; + +pub fn new_table_id(schema_id: u16, table_seq: u32) -> TableId { + TableId::new(SchemaId::from(schema_id), TableSeq::from(table_seq)) +} + +pub type RowTuple<'a> = (&'a str, Timestamp, &'a str, f64, f64, &'a str); +pub type RowTupleOpt<'a> = ( + &'a str, + Timestamp, + Option<&'a str>, + Option, + Option, + Option<&'a str>, +); +pub type KeyTuple<'a> = (&'a str, Timestamp); + +pub struct FixedSchemaTable { + create_request: CreateTableRequest, +} + +impl FixedSchemaTable { + pub fn builder() -> Builder { + Builder::default() + } + + fn default_schema() -> Schema { + Self::default_schema_builder().build().unwrap() + } + + pub fn default_schema_builder() -> schema::Builder { + create_schema_builder( + // Key columns + &[("key", DatumKind::String), ("ts", DatumKind::Timestamp)], + // Normal columns + &[ + ("string_tag", DatumKind::String), + ("double_field1", DatumKind::Double), + ("double_field2", DatumKind::Double), + ("string_field2", DatumKind::String), + ], + ) + } + + #[inline] + pub fn create_request(&self) -> &CreateTableRequest { + &self.create_request + } + + #[inline] + pub fn segment_duration_ms(&self) -> i64 { + table_options::DEFAULT_SEGMENT_DURATION.as_millis() as i64 + } + + // Format of data: (key string, timestamp, string_tag, double_field1, + // double_field2, string_field2) + fn new_row(data: RowTuple) -> Row { + row_util::new_row_6(data) + } + + pub fn rows_to_row_group(&self, data: &[RowTuple]) -> RowGroup { + let rows = data + .iter() + .copied() + .map(FixedSchemaTable::new_row) + .collect(); + + self.new_row_group(rows) + } + + pub fn rows_opt_to_row_group(&self, data: &[RowTupleOpt]) -> RowGroup { + let rows = data + .iter() + .copied() + .map(FixedSchemaTable::new_row_opt) + .collect(); + + self.new_row_group(rows) + } + + fn new_row_group(&self, rows: Vec) -> RowGroup { + RowGroupBuilder::with_rows(self.create_request.table_schema.clone(), rows) + .unwrap() + .build() + } + + fn new_row_opt(data: RowTupleOpt) -> Row { + row_util::new_row_6(data) + } + + pub fn new_read_all_request(&self, opts: ReadOptions, read_order: ReadOrder) -> ReadRequest { + new_read_all_request_with_order(self.create_request.table_schema.clone(), opts, read_order) + } + + pub fn new_get_request(&self, key: KeyTuple) -> GetRequest { + let primary_key = vec![key.0.into(), key.1.into()]; + + GetRequest { + request_id: RequestId::next_id(), + projected_schema: ProjectedSchema::no_projection( + self.create_request.table_schema.clone(), + ), + primary_key, + } + } + + pub fn new_get_request_from_row(&self, data: RowTuple) -> GetRequest { + self.new_get_request((data.0, data.1)) + } + + pub fn assert_batch_eq_to_rows(&self, record_batches: &[RecordBatch], rows: &[RowTuple]) { + let row_group = self.rows_to_row_group(rows); + assert_batch_eq_to_row_group(record_batches, &row_group); + } + + pub fn assert_row_eq(&self, data: RowTuple, row: Row) { + row_util::assert_row_eq_6(data, row); + } +} + +pub fn 
read_opts_list() -> Vec { + vec![ + ReadOptions::default(), + ReadOptions { + batch_size: 1, + read_parallelism: 1, + }, + ReadOptions { + batch_size: 1, + read_parallelism: 4, + }, + ReadOptions { + batch_size: 100, + read_parallelism: 1, + }, + ReadOptions { + batch_size: 100, + read_parallelism: 4, + }, + ] +} + +pub fn new_read_all_request_with_order( + schema: Schema, + opts: ReadOptions, + order: ReadOrder, +) -> ReadRequest { + ReadRequest { + request_id: RequestId::next_id(), + opts, + projected_schema: ProjectedSchema::no_projection(schema), + predicate: Arc::new(Predicate::new(TimeRange::min_to_max())), + order, + } +} + +pub fn new_read_all_request(schema: Schema, opts: ReadOptions) -> ReadRequest { + new_read_all_request_with_order(schema, opts, ReadOrder::None) +} + +pub fn assert_batch_eq_to_row_group(record_batches: &[RecordBatch], row_group: &RowGroup) { + if record_batches.is_empty() { + assert!(row_group.is_empty()); + } + + for record_batch in record_batches { + assert_eq!( + record_batch.schema().columns(), + row_group.schema().columns() + ); + } + + let mut cursor = RecordBatchesCursor::new(record_batches); + + for row in row_group.iter() { + for (column_idx, datum) in row.iter().enumerate() { + assert_eq!( + &cursor.datum(column_idx), + datum, + "record_batches:{:?}, row_group:{:?}", + record_batches, + row_group + ); + } + cursor.step(); + } +} + +struct RecordBatchesCursor<'a> { + record_batches: &'a [RecordBatch], + batch_idx: usize, + row_idx_in_batch: usize, +} + +impl<'a> RecordBatchesCursor<'a> { + fn new(record_batches: &[RecordBatch]) -> RecordBatchesCursor { + RecordBatchesCursor { + record_batches, + batch_idx: 0, + row_idx_in_batch: 0, + } + } + + fn step(&mut self) { + if self.batch_idx >= self.record_batches.len() { + return; + } + + self.row_idx_in_batch += 1; + if self.row_idx_in_batch >= self.record_batches[self.batch_idx].num_rows() { + self.batch_idx += 1; + self.row_idx_in_batch = 0; + } + } + + fn datum(&self, column_idx: usize) -> Datum { + let record_batch = &self.record_batches[self.batch_idx]; + let column_in_batch = record_batch.column(column_idx); + column_in_batch.datum(self.row_idx_in_batch) + } +} + +#[must_use] +pub struct Builder { + create_request: CreateTableRequest, +} + +impl Builder { + pub fn table_name(mut self, table_name: String) -> Self { + self.create_request.table_name = table_name; + self + } + + pub fn table_id(mut self, table_id: TableId) -> Self { + self.create_request.table_id = table_id; + self + } + + pub fn enable_ttl(mut self, enable_ttl: bool) -> Self { + self.create_request.options.insert( + table_engine::OPTION_KEY_ENABLE_TTL.to_string(), + enable_ttl.to_string(), + ); + self + } + + pub fn ttl(mut self, duration: ReadableDuration) -> Self { + self.create_request + .options + .insert(table_options::TTL.to_string(), duration.to_string()); + self + } + + pub fn build_fixed(self) -> FixedSchemaTable { + FixedSchemaTable { + create_request: self.create_request, + } + } +} + +impl Default for Builder { + fn default() -> Self { + Self { + create_request: CreateTableRequest { + catalog_name: "ceresdb".to_string(), + schema_name: "public".to_string(), + table_id: new_table_id(2, 1), + table_name: "test_table".to_string(), + table_schema: FixedSchemaTable::default_schema(), + partition_info: None, + engine: table_engine::ANALYTIC_ENGINE_TYPE.to_string(), + options: HashMap::new(), + state: TableState::Stable, + }, + } + } +} + +// Format of input slice: &[ ( column name, column type ) ] +pub fn create_schema_builder( + 
key_tuples: &[(&str, DatumKind)], + normal_tuples: &[(&str, DatumKind)], +) -> schema::Builder { + assert!(!key_tuples.is_empty()); + + let mut schema_builder = schema::Builder::with_capacity(key_tuples.len() + normal_tuples.len()) + .auto_increment_column_id(true); + + for tuple in key_tuples { + // Key column is not nullable. + let column_schema = column_schema::Builder::new(tuple.0.to_string(), tuple.1) + .is_nullable(false) + .build() + .expect("Should succeed to build key column schema"); + schema_builder = schema_builder.add_key_column(column_schema).unwrap(); + } + + for tuple in normal_tuples { + let column_schema = column_schema::Builder::new(tuple.0.to_string(), tuple.1) + .is_nullable(true) + .build() + .expect("Should succeed to build normal column schema"); + schema_builder = schema_builder.add_normal_column(column_schema).unwrap(); + } + + schema_builder +} diff --git a/analytic_engine/src/tests/util.rs b/analytic_engine/src/tests/util.rs new file mode 100644 index 0000000000..31afc1b582 --- /dev/null +++ b/analytic_engine/src/tests/util.rs @@ -0,0 +1,404 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Test utils. + +use std::{collections::HashMap, future::Future, sync::Arc}; + +use common_types::{ + datum::Datum, + record_batch::RecordBatch, + row::{Row, RowGroup}, + time::Timestamp, +}; +use common_util::{config::ReadableDuration, runtime}; +use futures::stream::StreamExt; +use log::info; +use table_engine::{ + engine::{ + CreateTableRequest, DropTableRequest, EngineRuntimes, OpenTableRequest, + Result as EngineResult, TableEngine, + }, + table::{ + AlterSchemaRequest, FlushRequest, GetRequest, ReadOrder, ReadRequest, Result, TableId, + TableRef, WriteRequest, + }, +}; +use tempfile::TempDir; + +use crate::{ + setup, + tests::table::{self, FixedSchemaTable, RowTuple}, + AnalyticTableEngine, Config, EngineInstance, +}; + +const DAY_MS: i64 = 24 * 60 * 60 * 1000; + +/// Helper struct to create a null datum. 
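+/// +/// For example, `row_util::new_row_8(("key1", ts, "tag1-1", 11.0, 110.0, "tag2-1", Null, Null))` +/// fills the two trailing nullable columns with `Datum::Null`.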
+pub struct Null; + +impl From for Datum { + fn from(_data: Null) -> Datum { + Datum::Null + } +} + +pub async fn check_read_with_order( + test_ctx: &TestContext, + fixed_schema_table: &FixedSchemaTable, + msg: &str, + table_name: &str, + rows: &[RowTuple<'_>], + read_order: ReadOrder, +) { + for read_opts in table::read_opts_list() { + info!("{}, opts:{:?}", msg, read_opts); + + let record_batches = test_ctx + .read_table( + table_name, + fixed_schema_table.new_read_all_request(read_opts, read_order), + ) + .await; + + fixed_schema_table.assert_batch_eq_to_rows(&record_batches, rows); + } +} + +pub async fn check_read( + test_ctx: &TestContext, + fixed_schema_table: &FixedSchemaTable, + msg: &str, + table_name: &str, + rows: &[RowTuple<'_>], +) { + check_read_with_order( + test_ctx, + fixed_schema_table, + msg, + table_name, + rows, + ReadOrder::None, + ) + .await +} + +pub async fn check_get( + test_ctx: &TestContext, + fixed_schema_table: &FixedSchemaTable, + msg: &str, + table_name: &str, + rows: &[RowTuple<'_>], +) { + for row_data in rows { + let request = fixed_schema_table.new_get_request_from_row(*row_data); + + info!("{}, request:{:?}, row_data:{:?}", msg, request, row_data); + + let row = test_ctx.get_from_table(table_name, request).await.unwrap(); + + fixed_schema_table.assert_row_eq(*row_data, row); + } +} + +pub struct TestContext { + pub config: Config, + runtimes: Arc, + pub engine: Option, + last_table_seq: u32, + + name_to_tables: HashMap, +} + +impl TestContext { + pub async fn open(&mut self) { + let engine = setup::open_analytic_table_engine(self.config.clone(), self.runtimes.clone()) + .await + .unwrap(); + + self.engine = Some(engine); + } + + pub async fn reopen(&mut self) { + { + // Close all tables. + self.name_to_tables.clear(); + + // Close engine. + let engine = self.engine.take().unwrap(); + engine.close().await.unwrap(); + } + + self.open().await; + } + + pub async fn reopen_with_tables(&mut self, tables: &[&str]) { + { + // Close all tables. + self.name_to_tables.clear(); + + // Close engine. 
+ let engine = self.engine.take().unwrap(); + engine.close().await.unwrap(); + } + + self.open().await; + + for name in tables { + self.open_table(name).await; + } + } + + async fn open_table(&mut self, table_name: &str) { + let table = self + .engine() + .open_table(OpenTableRequest { + catalog_name: "ceresdb".to_string(), + schema_name: "public".to_string(), + table_name: table_name.to_string(), + engine: table_engine::ANALYTIC_ENGINE_TYPE.to_string(), + }) + .await + .unwrap() + .unwrap(); + + self.name_to_tables.insert(table_name.to_string(), table); + } + + pub async fn try_open_table(&mut self, table_name: &str) -> EngineResult> { + let table_opt = self + .engine() + .open_table(OpenTableRequest { + catalog_name: "ceresdb".to_string(), + schema_name: "public".to_string(), + table_name: table_name.to_string(), + engine: table_engine::ANALYTIC_ENGINE_TYPE.to_string(), + }) + .await?; + + let table = match table_opt { + Some(v) => v, + None => return Ok(None), + }; + + self.name_to_tables + .insert(table_name.to_string(), table.clone()); + + Ok(Some(table)) + } + + pub async fn drop_table(&mut self, table_name: &str) -> bool { + let request = DropTableRequest { + catalog_name: "ceresdb".to_string(), + schema_name: "public".to_string(), + table_name: table_name.to_string(), + engine: table_engine::ANALYTIC_ENGINE_TYPE.to_string(), + }; + + let ret = self.engine().drop_table(request).await.unwrap(); + + self.name_to_tables.remove(table_name); + + ret + } + + /// 3 days ago. + pub fn start_ms(&self) -> i64 { + Timestamp::now().as_i64() - 3 * DAY_MS + } + + pub async fn create_fixed_schema_table(&mut self, table_name: &str) -> FixedSchemaTable { + let fixed_schema_table = FixedSchemaTable::builder() + .table_name(table_name.to_string()) + .table_id(self.next_table_id()) + .ttl("7d".parse::().unwrap()) + .build_fixed(); + + self.create_table(fixed_schema_table.create_request().clone()) + .await; + + fixed_schema_table + } + + async fn create_table(&mut self, create_request: CreateTableRequest) { + let table_name = create_request.table_name.clone(); + let table = self.engine().create_table(create_request).await.unwrap(); + + self.name_to_tables.insert(table_name.to_string(), table); + } + + pub async fn write_to_table(&self, table_name: &str, row_group: RowGroup) { + let table = self.table(table_name); + + table.write(WriteRequest { row_group }).await.unwrap(); + } + + pub async fn read_table( + &self, + table_name: &str, + read_request: ReadRequest, + ) -> Vec { + let table = self.table(table_name); + + let mut stream = table.read(read_request).await.unwrap(); + let mut record_batches = Vec::new(); + while let Some(batch) = stream.next().await { + let batch = batch.unwrap(); + + record_batches.push(batch); + } + + record_batches + } + + pub async fn partitioned_read_table( + &self, + table_name: &str, + read_request: ReadRequest, + ) -> Vec { + let table = self.table(table_name); + + let streams = table.partitioned_read(read_request).await.unwrap(); + let mut record_batches = Vec::new(); + + for mut stream in streams.streams { + while let Some(batch) = stream.next().await { + let batch = batch.unwrap(); + + record_batches.push(batch); + } + } + + record_batches + } + + pub async fn get_from_table(&self, table_name: &str, request: GetRequest) -> Option { + let table = self.table(table_name); + + table.get(request).await.unwrap() + } + + pub async fn flush_table(&self, table_name: &str) { + let table = self.table(table_name); + + table.flush(FlushRequest::default()).await.unwrap(); + } + + pub 
async fn flush_table_with_request(&self, table_name: &str, request: FlushRequest) { + let table = self.table(table_name); + + table.flush(request).await.unwrap(); + } + + pub async fn compact_table(&self, table_name: &str) { + let table = self.table(table_name); + + table.compact().await.unwrap(); + } + + pub async fn try_alter_schema( + &self, + table_name: &str, + request: AlterSchemaRequest, + ) -> Result { + let table = self.table(table_name); + + table.alter_schema(request).await + } + + pub async fn try_alter_options( + &self, + table_name: &str, + opts: HashMap, + ) -> Result { + let table = self.table(table_name); + + table.alter_options(opts).await + } + + pub fn table(&self, table_name: &str) -> TableRef { + self.name_to_tables.get(table_name).cloned().unwrap() + } + + #[inline] + pub fn engine(&self) -> AnalyticTableEngine { + self.engine.clone().unwrap() + } + + #[inline] + pub fn instance(&self) -> EngineInstance { + self.engine().instance() + } + + fn next_table_id(&mut self) -> TableId { + self.last_table_seq += 1; + table::new_table_id(2, self.last_table_seq) + } +} + +pub struct TestEnv { + _dir: TempDir, + pub config: Config, + pub runtimes: Arc, +} + +impl TestEnv { + pub fn builder() -> Builder { + Builder::default() + } + + pub fn new_context(&self) -> TestContext { + TestContext { + config: self.config.clone(), + runtimes: self.runtimes.clone(), + engine: None, + last_table_seq: 1, + name_to_tables: HashMap::new(), + } + } + + pub fn block_on(&self, future: F) -> F::Output { + self.runtimes.bg_runtime.block_on(future) + } +} + +pub struct Builder { + num_workers: usize, +} + +impl Builder { + pub fn build(self) -> TestEnv { + // Init log for test. + common_util::tests::init_log_for_test(); + + let dir = tempfile::tempdir().unwrap(); + + let config = Config { + data_path: dir.path().to_str().unwrap().to_string(), + ..Default::default() + }; + + let runtime = Arc::new( + runtime::Builder::default() + .worker_threads(self.num_workers) + .enable_all() + .build() + .unwrap(), + ); + + TestEnv { + _dir: dir, + config, + runtimes: Arc::new(EngineRuntimes { + read_runtime: runtime.clone(), + write_runtime: runtime.clone(), + bg_runtime: runtime, + }), + } + } +} + +impl Default for Builder { + fn default() -> Self { + Self { num_workers: 2 } + } +} diff --git a/arrow_deps/Cargo.toml b/arrow_deps/Cargo.toml new file mode 100644 index 0000000000..e7cac70aa2 --- /dev/null +++ b/arrow_deps/Cargo.toml @@ -0,0 +1,19 @@ +[package] +name = "arrow_deps" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +arrow = "7.0.0" +parquet = "7.0.0" + +[dependencies.uncover] +git = "https://github.com/matklad/uncover.git" +rev = "1d0770d997e29731b287e9e11e4ffbbea5f456da" + +[dependencies.datafusion] +git = "https://github.com/apache/arrow-datafusion.git" +rev = "444c153863520072ea22d4f8c498dee39437516d" diff --git a/arrow_deps/src/display.rs b/arrow_deps/src/display.rs new file mode 100644 index 0000000000..be037d882e --- /dev/null +++ b/arrow_deps/src/display.rs @@ -0,0 +1,428 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Functions for printing array values, as strings, for debugging +//! purposes. See the `pretty` crate for additional functions for +//! record batch pretty printing. + +// Copy from arrow + +use std::sync::Arc; + +use arrow::{ + array::{self, Array, DictionaryArray}, + datatypes::{ + ArrowNativeType, ArrowPrimitiveType, DataType, Int16Type, Int32Type, Int64Type, Int8Type, + IntervalUnit, TimeUnit, UInt16Type, UInt32Type, UInt64Type, UInt8Type, + }, + error::{ArrowError, Result}, +}; + +macro_rules! make_string { + ($array_type:ty, $column: ident, $row: ident) => {{ + let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); + + let s = if array.is_null($row) { + "".to_string() + } else { + array.value($row).to_string() + }; + + Ok(s) + }}; +} + +macro_rules! make_string_interval_year_month { + ($column: ident, $row: ident) => {{ + let array = $column + .as_any() + .downcast_ref::() + .unwrap(); + + let s = if array.is_null($row) { + "NULL".to_string() + } else { + let interval = array.value($row) as f64; + let years = (interval / 12_f64).floor(); + let month = interval - (years * 12_f64); + + format!( + "{} years {} mons 0 days 0 hours 0 mins 0.00 secs", + years, month, + ) + }; + + Ok(s) + }}; +} + +macro_rules! make_string_interval_day_time { + ($column: ident, $row: ident) => {{ + let array = $column + .as_any() + .downcast_ref::() + .unwrap(); + + let s = if array.is_null($row) { + "NULL".to_string() + } else { + let value: u64 = array.value($row) as u64; + + let days_parts: i32 = ((value & 0xFFFFFFFF00000000) >> 32) as i32; + let milliseconds_part: i32 = (value & 0xFFFFFFFF) as i32; + + let secs = milliseconds_part / 1000; + let mins = secs / 60; + let hours = mins / 60; + + let secs = secs - (mins * 60); + let mins = mins - (hours * 60); + + format!( + "0 years 0 mons {} days {} hours {} mins {}.{:02} secs", + days_parts, + hours, + mins, + secs, + (milliseconds_part % 1000), + ) + }; + + Ok(s) + }}; +} + +macro_rules! make_string_interval_month_day_nano { + ($column: ident, $row: ident) => {{ + let array = $column + .as_any() + .downcast_ref::() + .unwrap(); + + let s = if array.is_null($row) { + "NULL".to_string() + } else { + let value: u128 = array.value($row) as u128; + + let months_part: i32 = ((value & 0xFFFFFFFF000000000000000000000000) >> 96) as i32; + let days_part: i32 = ((value & 0xFFFFFFFF0000000000000000) >> 64) as i32; + let nanoseconds_part: i64 = (value & 0xFFFFFFFFFFFFFFFF) as i64; + + let secs = nanoseconds_part / 1000000000; + let mins = secs / 60; + let hours = mins / 60; + + let secs = secs - (mins * 60); + let mins = mins - (hours * 60); + + format!( + "0 years {} mons {} days {} hours {} mins {}.{:02} secs", + months_part, + days_part, + hours, + mins, + secs, + (nanoseconds_part % 1000000000), + ) + }; + + Ok(s) + }}; +} + +macro_rules! 
make_string_date { + ($array_type:ty, $column: ident, $row: ident) => {{ + let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); + + let s = if array.is_null($row) { + "".to_string() + } else { + array + .value_as_date($row) + .map(|d| d.to_string()) + .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string()) + }; + + Ok(s) + }}; +} + +macro_rules! make_string_time { + ($array_type:ty, $column: ident, $row: ident) => {{ + let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); + + let s = if array.is_null($row) { + "".to_string() + } else { + array + .value_as_time($row) + .map(|d| d.to_string()) + .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string()) + }; + + Ok(s) + }}; +} + +macro_rules! make_string_datetime { + ($array_type:ty, $column: ident, $row: ident) => {{ + let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); + + let s = if array.is_null($row) { + "".to_string() + } else { + array + .value_as_datetime($row) + .map(|d| d.to_string()) + .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string()) + }; + + Ok(s) + }}; +} + +// It's not possible to do array.value($row).to_string() for &[u8], let's format +// it as hex +macro_rules! make_string_hex { + ($array_type:ty, $column: ident, $row: ident) => {{ + let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); + + let s = if array.is_null($row) { + "".to_string() + } else { + let mut tmp = "".to_string(); + + for character in array.value($row) { + tmp += &format!("{:02x}", character); + } + + tmp + }; + + Ok(s) + }}; +} + +macro_rules! make_string_from_list { + ($column: ident, $row: ident) => {{ + let list = $column + .as_any() + .downcast_ref::() + .ok_or(ArrowError::InvalidArgumentError(format!( + "Repl error: could not convert list column to list array." + )))? + .value($row); + let string_values = (0..list.len()) + .map(|i| array_value_to_string(&list.clone(), i)) + .collect::>>()?; + Ok(format!("[{}]", string_values.join(", "))) + }}; +} + +macro_rules! make_string_from_fixed_size_list { + ($column: ident, $row: ident) => {{ + let list = $column + .as_any() + .downcast_ref::() + .ok_or(ArrowError::InvalidArgumentError(format!( + "Repl error: could not convert list column to list array." + )))? + .value($row); + let string_values = (0..list.len()) + .map(|i| array_value_to_string(&list.clone(), i)) + .collect::>>()?; + Ok(format!("[{}]", string_values.join(", "))) + }}; +} + +#[inline(always)] +pub fn make_string_from_decimal(column: &Arc, row: usize) -> Result { + let array = column + .as_any() + .downcast_ref::() + .unwrap(); + + let formatted_decimal = array.value_as_string(row); + Ok(formatted_decimal) +} + +fn append_struct_field_string( + target: &mut String, + name: &str, + field_col: &Arc, + row: usize, +) -> Result<()> { + target.push('"'); + target.push_str(name); + target.push_str("\": "); + + if field_col.is_null(row) { + target.push_str("null"); + } else { + match field_col.data_type() { + DataType::Utf8 | DataType::LargeUtf8 => { + target.push('"'); + target.push_str(array_value_to_string(field_col, row)?.as_str()); + target.push('"'); + } + _ => { + target.push_str(array_value_to_string(field_col, row)?.as_str()); + } + } + } + + Ok(()) +} + +/// Get the value at the given row in an array as a String. +/// +/// Note this function is quite inefficient and is unlikely to be +/// suitable for converting large arrays or record batches. 
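+/// +/// For example, a null entry in any column is rendered as an empty string, while a `Utf8` +/// value is returned via its `to_string` representation.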
+pub fn array_value_to_string(column: &array::ArrayRef, row: usize) -> Result { + if column.is_null(row) { + return Ok("".to_string()); + } + match column.data_type() { + DataType::Utf8 => make_string!(array::StringArray, column, row), + DataType::LargeUtf8 => make_string!(array::LargeStringArray, column, row), + DataType::Binary => make_string_hex!(array::BinaryArray, column, row), + DataType::LargeBinary => make_string_hex!(array::LargeBinaryArray, column, row), + DataType::FixedSizeBinary(_) => { + make_string_hex!(array::FixedSizeBinaryArray, column, row) + } + DataType::Boolean => make_string!(array::BooleanArray, column, row), + DataType::Int8 => make_string!(array::Int8Array, column, row), + DataType::Int16 => make_string!(array::Int16Array, column, row), + DataType::Int32 => make_string!(array::Int32Array, column, row), + DataType::Int64 => make_string!(array::Int64Array, column, row), + DataType::UInt8 => make_string!(array::UInt8Array, column, row), + DataType::UInt16 => make_string!(array::UInt16Array, column, row), + DataType::UInt32 => make_string!(array::UInt32Array, column, row), + DataType::UInt64 => make_string!(array::UInt64Array, column, row), + DataType::Float16 => make_string!(array::Float16Array, column, row), + DataType::Float32 => make_string!(array::Float32Array, column, row), + DataType::Float64 => make_string!(array::Float64Array, column, row), + DataType::Decimal(..) => make_string_from_decimal(column, row), + DataType::Timestamp(unit, _) if *unit == TimeUnit::Second => { + make_string_datetime!(array::TimestampSecondArray, column, row) + } + DataType::Timestamp(unit, _) if *unit == TimeUnit::Millisecond => { + make_string_datetime!(array::TimestampMillisecondArray, column, row) + } + DataType::Timestamp(unit, _) if *unit == TimeUnit::Microsecond => { + make_string_datetime!(array::TimestampMicrosecondArray, column, row) + } + DataType::Timestamp(unit, _) if *unit == TimeUnit::Nanosecond => { + make_string_datetime!(array::TimestampNanosecondArray, column, row) + } + DataType::Date32 => make_string_date!(array::Date32Array, column, row), + DataType::Date64 => make_string_date!(array::Date64Array, column, row), + DataType::Time32(unit) if *unit == TimeUnit::Second => { + make_string_time!(array::Time32SecondArray, column, row) + } + DataType::Time32(unit) if *unit == TimeUnit::Millisecond => { + make_string_time!(array::Time32MillisecondArray, column, row) + } + DataType::Time64(unit) if *unit == TimeUnit::Microsecond => { + make_string_time!(array::Time64MicrosecondArray, column, row) + } + DataType::Time64(unit) if *unit == TimeUnit::Nanosecond => { + make_string_time!(array::Time64NanosecondArray, column, row) + } + DataType::Interval(unit) => match unit { + IntervalUnit::DayTime => { + make_string_interval_day_time!(column, row) + } + IntervalUnit::YearMonth => { + make_string_interval_year_month!(column, row) + } + IntervalUnit::MonthDayNano => { + make_string_interval_month_day_nano!(column, row) + } + }, + DataType::List(_) => make_string_from_list!(column, row), + DataType::Dictionary(index_type, _value_type) => match **index_type { + DataType::Int8 => dict_array_value_to_string::(column, row), + DataType::Int16 => dict_array_value_to_string::(column, row), + DataType::Int32 => dict_array_value_to_string::(column, row), + DataType::Int64 => dict_array_value_to_string::(column, row), + DataType::UInt8 => dict_array_value_to_string::(column, row), + DataType::UInt16 => dict_array_value_to_string::(column, row), + DataType::UInt32 => 
dict_array_value_to_string::(column, row), + DataType::UInt64 => dict_array_value_to_string::(column, row), + _ => Err(ArrowError::InvalidArgumentError(format!( + "Pretty printing not supported for {:?} due to index type", + column.data_type() + ))), + }, + DataType::FixedSizeList(_, _) => make_string_from_fixed_size_list!(column, row), + DataType::Struct(_) => { + let st = column + .as_any() + .downcast_ref::() + .ok_or_else(|| { + ArrowError::InvalidArgumentError( + "Repl error: could not convert struct column to struct array.".to_string(), + ) + })?; + + let mut s = String::new(); + s.push('{'); + let mut kv_iter = st.columns().into_iter().zip(st.column_names().into_iter()); + if let Some((col, name)) = kv_iter.next() { + append_struct_field_string(&mut s, name, col, row)?; + } + for (col, name) in kv_iter { + s.push_str(", "); + append_struct_field_string(&mut s, name, col, row)?; + } + s.push('}'); + + Ok(s) + } + _ => Err(ArrowError::InvalidArgumentError(format!( + "Pretty printing not implemented for {:?} type", + column.data_type() + ))), + } +} + +/// Converts the value of the dictionary array at `row` to a String +fn dict_array_value_to_string( + colum: &array::ArrayRef, + row: usize, +) -> Result { + let dict_array = colum.as_any().downcast_ref::>().unwrap(); + + let keys_array = dict_array.keys(); + + if keys_array.is_null(row) { + return Ok(String::from("")); + } + + let dict_index = keys_array.value(row).to_usize().ok_or_else(|| { + ArrowError::InvalidArgumentError(format!( + "Can not convert value {:?} at index {:?} to usize for string conversion.", + keys_array.value(row), + row + )) + })?; + + array_value_to_string(dict_array.values(), dict_index) +} diff --git a/arrow_deps/src/lib.rs b/arrow_deps/src/lib.rs new file mode 100644 index 0000000000..b1ead9249e --- /dev/null +++ b/arrow_deps/src/lib.rs @@ -0,0 +1,14 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! This crate exists to add a dependency on (likely as yet +//! unpublished) versions of arrow / datafusion so we can +//! manage the version used by ceresdbx in a single crate. + +pub mod display; +pub mod util; + +// export arrow and datafusion publically so we can have a single +// reference in cargo +pub use arrow; +pub use datafusion; +pub use parquet; diff --git a/arrow_deps/src/util.rs b/arrow_deps/src/util.rs new file mode 100644 index 0000000000..661fa919dd --- /dev/null +++ b/arrow_deps/src/util.rs @@ -0,0 +1,133 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! utilities for manipulating arrow/parquet/datafusion data structures. + +use std::convert::TryFrom; + +use arrow::{ + array::UInt32Array, + compute, + error::{ArrowError, Result}, + record_batch::RecordBatch, +}; + +/// Select the data in the [`RecordBatch`] by read and copy from the source +/// `batch`. +pub fn select_record_batch(batch: &RecordBatch, selected_rows: &[bool]) -> Result { + assert_eq!(batch.num_rows(), selected_rows.len()); + let selected_columns = { + // ensure the the selected_rows.len() is not greater than u32::MAX. + let _ = u32::try_from(selected_rows.len()).map_err(|e| { + ArrowError::InvalidArgumentError(format!( + "too many rows in a batch, convert usize to u32 failed, num_rows:{}, err:{}", + batch.num_rows(), + e + )) + })?; + + let selected_index_iter = selected_rows + .iter() + .enumerate() + .filter_map(|(idx, selected)| if *selected { Some(idx as u32) } else { None }); + // TODO(xikai): avoid this memory allocation. 
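+ // Collect the positions of the selected rows so that `compute::take` below can gather them from every column.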
+ let indices = UInt32Array::from_iter_values(selected_index_iter); + + let mut cols = Vec::with_capacity(batch.num_columns()); + for orig_col_data in batch.columns() { + let new_col_data = compute::take(orig_col_data.as_ref(), &indices, None)?; + cols.push(new_col_data); + } + + cols + }; + + RecordBatch::try_new(batch.schema(), selected_columns) +} + +/// Reverse the data in the [`RecordBatch`] by read and copy from the source +/// `batch`. +pub fn reverse_record_batch(batch: &RecordBatch) -> Result { + let reversed_columns = { + let num_rows = u32::try_from(batch.num_rows()).map_err(|e| { + ArrowError::InvalidArgumentError(format!( + "too many rows in a batch, convert usize to u32 failed, num_rows:{}, err:{}", + batch.num_rows(), + e + )) + })?; + // TODO(xikai): avoid this memory allocation. + let indices = UInt32Array::from_iter_values((0..num_rows).into_iter().rev()); + + let mut cols = Vec::with_capacity(batch.num_columns()); + for orig_col_data in batch.columns() { + let new_col_data = compute::take(orig_col_data.as_ref(), &indices, None)?; + cols.push(new_col_data); + } + + cols + }; + + RecordBatch::try_new(batch.schema(), reversed_columns) +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use arrow::{ + array::Int32Array, + datatypes::{DataType, Field, Schema}, + }; + + use super::*; + + #[test] + fn test_reverse_record_batch() { + let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])); + let (ids, reverse_ids) = { + let mut source = vec![1, 2, 3, 4, 5]; + let arr = Int32Array::from(source.clone()); + source.reverse(); + let reversed_arr = Int32Array::from(source); + (arr, reversed_arr) + }; + + let batch = + RecordBatch::try_new(schema.clone(), vec![Arc::new(ids)]).expect("build record batch"); + let expect_reversed_batch = + RecordBatch::try_new(schema, vec![Arc::new(reverse_ids)]).expect("build record batch"); + let reversed_batch = reverse_record_batch(&batch).expect("reverse record batch"); + + assert_eq!(expect_reversed_batch, reversed_batch); + } + + #[test] + fn test_reverse_empty_record_batch() { + let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])); + let arr = Int32Array::from(Vec::::new()); + + let batch = RecordBatch::try_new(schema, vec![Arc::new(arr)]).expect("build record batch"); + let reversed_batch = reverse_record_batch(&batch).expect("reverse record batch"); + + assert_eq!(batch, reversed_batch); + } + + #[test] + fn test_select_record_batch() { + let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])); + let (ids, expect_selected_ids, selected_rows) = { + let arr = Int32Array::from(vec![1, 2, 3, 4, 5]); + let selected_arr = Int32Array::from(vec![2, 3, 5]); + (arr, selected_arr, vec![false, true, true, false, true]) + }; + let batch = + RecordBatch::try_new(schema.clone(), vec![Arc::new(ids)]).expect("build record batch"); + let selected_batch = + select_record_batch(&batch, &selected_rows).expect("select record batch"); + let expect_selected_batch = + RecordBatch::try_new(schema, vec![Arc::new(expect_selected_ids)]) + .expect("build record batch"); + + assert_eq!(selected_batch, expect_selected_batch); + } +} diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml new file mode 100644 index 0000000000..e453bd8eb6 --- /dev/null +++ b/benchmarks/Cargo.toml @@ -0,0 +1,34 @@ +[package] +name = "benchmarks" +version = "0.1.0" +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +arena 
= { path = "../components/arena" } +arrow2 = { version = "0.7.0", features = [ "io_parquet" ] } +arrow_deps = { path = "../arrow_deps" } +analytic_engine = { path = "../analytic_engine" } +clap = "2.0" +common_types = { path = "../common_types" } +common_util = { path = "../common_util" } +env_logger = "0.6" +futures = "0.3" +log = "0.4" +object_store = { path = "../components/object_store" } +parquet = { path = "../components/parquet"} +serde = "1.0" +serde_derive = "1.0" +table_engine = { path = "../table_engine" } +tokio = { version = "1.0", features = ["sync"] } + +[dev-dependencies] +criterion = "0.3" + +[[bench]] +name = "bench" +harness = false + +[[bin]] +name = "sst-tools" diff --git a/benchmarks/README.md b/benchmarks/README.md new file mode 100644 index 0000000000..65cc001e80 --- /dev/null +++ b/benchmarks/README.md @@ -0,0 +1,25 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +# Benchmarks + +## Test Data +todo + +## Config +A config template can be found in `config/bench.toml`. + +## Run benchmarks +In root directory of `ceresdbx` (not this directory `ceresdbx/benchmarks`), run the following command: +```bash +ANALYTIC_BENCH_CONFIG_PATH=/path/to/bench.toml cargo bench -p benchmarks +``` + +Print logs: +```bash +RUST_LOG=info ANALYTIC_BENCH_CONFIG_PATH=/path/to/bench.toml cargo bench -p benchmarks +``` + +Run specific bench: +```bash +ANALYTIC_BENCH_CONFIG_PATH=/path/to/bench.toml cargo bench -p benchmarks -- read_parquet +``` diff --git a/benchmarks/bench.toml b/benchmarks/bench.toml new file mode 100644 index 0000000000..e182151bdb --- /dev/null +++ b/benchmarks/bench.toml @@ -0,0 +1,45 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +[sst_bench] +store_path = "/Users/chunshao.rcs/Desktop/work/gitlab/ceresdbx/1/1" +# store_path = "/Users/yingwen.yyw/data/antmonitor_mid_table_4022" +sst_file_name = "37.sst" +runtime_thread_num = 1 +bench_measurement_time = "30s" +max_projections = 5 +read_batch_row_num = 500 +sst_meta_cache_cap = 1000 +sst_data_cache_cap = 10000 + +[sst_bench.predicate] +# start_time_ms = 0 +start_time_ms = 1632985200000 +# end_time_ms = 0 +end_time_ms = 1632985800000 + +[merge_sst_bench] +store_path = "/Users/chunshao.rcs/Desktop/work/gitlab/ceresdbx" +space_id = 1 +table_id = 1 +sst_file_ids = [ 34, 37 ] +runtime_thread_num = 1 +bench_measurement_time = "120s" +max_projections = 5 +read_batch_row_num = 500 + +[merge_sst_bench.predicate] +start_time_ms = 0 +# start_time_ms = 1632985200000 +end_time_ms = 0 +# end_time_ms = 1632985800000 + +[scan_memtable_bench] +store_path = "/Users/chunshao.rcs/Desktop/work/gitlab/ceresdbx/1/1" +sst_file_name = "37.sst" +runtime_thread_num = 1 +max_projections = 5 +arena_block_size = "64M" + +[wal_row_bench] +rows_num = 100_0000 +test_num = 3 \ No newline at end of file diff --git a/benchmarks/benches/bench.rs b/benchmarks/benches/bench.rs new file mode 100644 index 0000000000..26ee634424 --- /dev/null +++ b/benchmarks/benches/bench.rs @@ -0,0 +1,208 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Benchmarks + +use std::sync::Once; + +use benchmarks::{ + arrow2_bench::Arrow2Bench, + config::{self, BenchConfig}, + merge_memtable_bench::MergeMemTableBench, + merge_sst_bench::MergeSstBench, + parquet_bench::ParquetBench, + scan_memtable_bench::ScanMemTableBench, + sst_bench::SstBench, +}; +use criterion::*; + +static INIT_LOG: Once = Once::new(); + +pub fn init_bench() -> BenchConfig { + INIT_LOG.call_once(|| { + env_logger::init(); + }); + + config::bench_config_from_env() +} + +fn bench_read_sst_iter(b: &mut Bencher<'_>, bench: &SstBench) { + b.iter(|| { + bench.run_bench(); + }) +} + +fn bench_read_sst(c: &mut Criterion) { + let config = init_bench(); + + let mut group = c.benchmark_group("read_sst"); + group.measurement_time(config.sst_bench.bench_measurement_time.0); + group.sample_size(config.sst_bench.bench_sample_size); + + let mut bench = SstBench::new(config.sst_bench); + + for i in 0..bench.num_benches() { + bench.init_for_bench(i); + + group.bench_with_input( + BenchmarkId::new("read_sst", format!("{}/{}", bench.sst_file_name, i)), + &bench, + bench_read_sst_iter, + ); + } + + group.finish(); +} + +fn bench_merge_sst_iter(b: &mut Bencher<'_>, bench: &MergeSstBench) { + b.iter(|| bench.run_bench()) +} + +fn bench_merge_sst(c: &mut Criterion) { + let config = init_bench(); + + let mut group = c.benchmark_group("merge_sst"); + + group.measurement_time(config.merge_sst_bench.bench_measurement_time.0); + group.sample_size(config.sst_bench.bench_sample_size); + + let sst_file_ids = format!("{:?}", config.merge_sst_bench.sst_file_ids); + let mut bench = MergeSstBench::new(config.merge_sst_bench); + + for i in 0..bench.num_benches() { + bench.init_for_bench(i, true); + group.bench_with_input( + BenchmarkId::new("merge_sst", format!("{}/{}/dedup", sst_file_ids, i)), + &bench, + bench_merge_sst_iter, + ); + + bench.init_for_bench(i, false); + group.bench_with_input( + BenchmarkId::new("merge_sst", format!("{}/{}/no-dedup", sst_file_ids, i)), + &bench, + bench_merge_sst_iter, + ); + } + + group.finish(); +} + +fn bench_parquet_iter(b: &mut Bencher<'_>, bench: &ParquetBench) { + b.iter(|| bench.run_bench()) +} + +fn bench_parquet(c: &mut Criterion) { + let config = init_bench(); + + let mut group = c.benchmark_group("read_parquet"); + + group.measurement_time(config.sst_bench.bench_measurement_time.0); + group.sample_size(config.sst_bench.bench_sample_size); + + let mut bench = ParquetBench::new(config.sst_bench); + + for i in 0..bench.num_benches() { + bench.init_for_bench(i); + + group.bench_with_input( + BenchmarkId::new("read_parquet", format!("{}/{}", bench.sst_file_name, i)), + &bench, + bench_parquet_iter, + ); + } + + group.finish(); +} + +fn bench_scan_memtable_iter(b: &mut Bencher<'_>, bench: &ScanMemTableBench) { + b.iter(|| bench.run_bench()) +} + +fn bench_scan_memtable(c: &mut Criterion) { + let config = init_bench(); + + let mut group = c.benchmark_group("scan_memtable"); + + let mut bench = ScanMemTableBench::new(config.scan_memtable_bench); + + for i in 0..bench.num_benches() { + bench.init_for_bench(i); + + group.bench_with_input( + BenchmarkId::new("scan_memtable", i), + &bench, + bench_scan_memtable_iter, + ); + } + + group.finish(); +} + +fn bench_merge_memtable_iter(b: &mut Bencher<'_>, bench: &MergeMemTableBench) { + b.iter(|| bench.run_bench()) +} + +fn bench_merge_memtable(c: &mut Criterion) { + let config = init_bench(); + + let mut group = c.benchmark_group("merge_memtable"); + + let sst_file_ids = format!("{:?}", config.merge_memtable_bench.sst_file_ids); + 
let mut bench = MergeMemTableBench::new(config.merge_memtable_bench); + + for i in 0..bench.num_benches() { + bench.init_for_bench(i, true); + group.bench_with_input( + BenchmarkId::new("merge_memtable", format!("{}/{}/dedup", sst_file_ids, i)), + &bench, + bench_merge_memtable_iter, + ); + + bench.init_for_bench(i, false); + group.bench_with_input( + BenchmarkId::new("merge_memtable", format!("{}/{}/no-dedup", sst_file_ids, i)), + &bench, + bench_merge_memtable_iter, + ); + } + + group.finish(); +} + +fn bench_arrow2_iter(b: &mut Bencher<'_>, bench: &Arrow2Bench) { + b.iter(|| bench.run_bench()) +} + +fn bench_arrow2(c: &mut Criterion) { + let config = init_bench(); + + let mut group = c.benchmark_group("read_arrow2"); + + group.measurement_time(config.sst_bench.bench_measurement_time.0); + group.sample_size(config.sst_bench.bench_sample_size); + + let mut bench = Arrow2Bench::new(config.sst_bench); + + for i in 0..bench.num_benches() { + bench.init_for_bench(i); + + group.bench_with_input( + BenchmarkId::new("read_arrow2", format!("{}/{}", bench.sst_file_name, i)), + &bench, + bench_arrow2_iter, + ); + } + + group.finish(); +} + +criterion_group!( + benches, + bench_read_sst, + bench_merge_sst, + bench_parquet, + bench_scan_memtable, + bench_merge_memtable, + bench_arrow2, +); +criterion_main!(benches); diff --git a/benchmarks/config/bench.toml b/benchmarks/config/bench.toml new file mode 100644 index 0000000000..ba73090b77 --- /dev/null +++ b/benchmarks/config/bench.toml @@ -0,0 +1,50 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +[sst_bench] +store_path = "/path/to/data/1/1" +sst_file_name = "37.sst" +runtime_thread_num = 1 +bench_measurement_time = "30s" +bench_sample_size = 30 +max_projections = 5 +read_batch_row_num = 500 +reverse = false + +[sst_bench.predicate] +# start_time_ms = 0 +start_time_ms = 1632985200000 +# end_time_ms = 0 +end_time_ms = 1632985800000 + +[merge_sst_bench] +store_path = "/path/to/data" +space_id = 1 +table_id = 1 +sst_file_ids = [ 34, 37 ] +runtime_thread_num = 1 +bench_measurement_time = "30s" +bench_sample_size = 30 +max_projections = 5 +read_batch_row_num = 500 + +[merge_sst_bench.predicate] +start_time_ms = 0 +# start_time_ms = 1632985200000 +end_time_ms = 0 +# end_time_ms = 1632985800000 + +[scan_memtable_bench] +store_path = "/path/to/data/1/1" +sst_file_name = "37.sst" +runtime_thread_num = 1 +max_projections = 5 +arena_block_size = "64M" + +[merge_memtable_bench] +store_path = "/path/to/data" +space_id = 1 +table_id = 1 +sst_file_ids = [ 37 ] +runtime_thread_num = 1 +max_projections = 5 +arena_block_size = "64M" diff --git a/benchmarks/config/sst.toml b/benchmarks/config/sst.toml new file mode 100644 index 0000000000..5758df2459 --- /dev/null +++ b/benchmarks/config/sst.toml @@ -0,0 +1,33 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
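
A side note on the harness wiring in `benchmarks/benches/bench.rs` above: every bench follows the same shape, namely a `benchmark_group`, one `bench_with_input` call per projection index, and a final `criterion_group!`/`criterion_main!`. The following is a minimal, self-contained sketch of that shape; the `sum` workload and all names in it are invented for illustration and are not CeresDB code.

```rust
// Standalone Criterion sketch of the group/bench_with_input pattern used above.
use criterion::{criterion_group, criterion_main, Bencher, BenchmarkId, Criterion};

fn sum_iter(b: &mut Bencher<'_>, data: &Vec<u64>) {
    b.iter(|| data.iter().sum::<u64>())
}

fn bench_sum(c: &mut Criterion) {
    let mut group = c.benchmark_group("sum");
    for size in [100usize, 1_000, 10_000].iter() {
        let data: Vec<u64> = (0..*size as u64).collect();
        // Same calling convention as bench_read_sst above: a plain fn item as
        // the bench body, parameterized by the prepared input.
        group.bench_with_input(BenchmarkId::new("sum", size), &data, sum_iter);
    }
    group.finish();
}

criterion_group!(benches, bench_sum);
criterion_main!(benches);
```
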
+ +runtime_thread_num = 4 + + [rebuild_sst] + store_path = "/Users/chunshao.rcs/Desktop/work/gitlab/ceresdb/neo/ceresdb/ceresdbx/benchmarks" + input_file_name = "898.sst" + # read_batch_row_num = 500 + # read_batch_row_num = 4096 + read_batch_row_num = 8192 +# read_batch_row_num = 16384 + output_file_name = "tt_t.sst" + num_rows_per_row_group = 8192 +compression = "SNAPPY" + + [rebuild_sst.predicate] + start_time_ms = 0 + end_time_ms = 0 + +#[merge_sst] +#store_path = "/Users/chunshao.rcs/Desktop/work/gitlab/ceresdb/neo/ceresdb/ceresdbx/benchmarks/2199023255564" +#space_id = 1 +#table_id = 1 +#sst_file_ids = [1, 17, 19, 24, 31, 37, 43, 45, 9, 14, 18, 21, 27, 34, 40, 44, 5] +#dedup = true +#read_batch_row_num = 16384 +#output_store_path = "/Users/yingwen.yyw/data/1/1" +#output_file_name = "16384-all.sst" +#num_rows_per_row_group = 16384 +# +#[merge_sst.predicate] +#start_time_ms = 0 +#end_time_ms = 0 diff --git a/benchmarks/src/arrow2_bench.rs b/benchmarks/src/arrow2_bench.rs new file mode 100644 index 0000000000..e51e96fe4d --- /dev/null +++ b/benchmarks/src/arrow2_bench.rs @@ -0,0 +1,81 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Arrow 2 bench. + +use std::{fs::File, io::BufReader, path::Path, sync::Arc, time::Instant}; + +use arrow2::io::parquet::read; +use common_util::runtime::Runtime; +use log::info; + +use crate::{config::SstBenchConfig, util}; + +pub struct Arrow2Bench { + store_path: String, + pub sst_file_name: String, + max_projections: usize, + projection: Vec, + runtime: Arc, +} + +impl Arrow2Bench { + pub fn new(config: SstBenchConfig) -> Self { + let runtime = util::new_runtime(config.runtime_thread_num); + + Arrow2Bench { + store_path: config.store_path, + sst_file_name: config.sst_file_name, + max_projections: config.max_projections, + projection: Vec::new(), + runtime: Arc::new(runtime), + } + } + + pub fn num_benches(&self) -> usize { + // One test reads all columns and `max_projections` tests read with projection. + 1 + self.max_projections + } + + pub fn init_for_bench(&mut self, i: usize) { + let projection = if i < self.max_projections { + (0..i + 1).into_iter().collect() + } else { + Vec::new() + }; + + self.projection = projection; + } + + pub fn run_bench(&self) { + let sst_path = Path::new(&self.store_path).join(&self.sst_file_name); + + self.runtime.block_on(async { + let open_instant = Instant::now(); + let file = BufReader::new(File::open(sst_path).unwrap()); + + let record_reader = if self.projection.is_empty() { + read::RecordReader::try_new(file, None, None, None, None).unwrap() + } else { + read::RecordReader::try_new(file, Some(self.projection.clone()), None, None, None).unwrap() + }; + let open_cost = open_instant.elapsed(); + + let iter_begin_instant = Instant::now(); + let mut total_rows = 0; + let mut batch_num = 0; + for record_batch in record_reader { + let num_rows = record_batch.unwrap().num_rows(); + total_rows += num_rows; + batch_num += 1; + } + + info!( + "\nParquetBench total rows of sst: {}, total batch num: {}, open cost: {:?}, iter cost: {:?}", + total_rows, + batch_num, + open_cost, + iter_begin_instant.elapsed(), + ); + }); + } +} diff --git a/benchmarks/src/bin/sst-tools.rs b/benchmarks/src/bin/sst-tools.rs new file mode 100644 index 0000000000..ab1a6e91be --- /dev/null +++ b/benchmarks/src/bin/sst-tools.rs @@ -0,0 +1,70 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +use std::sync::Arc; + +use benchmarks::{ + sst_tools::{self, MergeSstConfig, RebuildSstConfig}, + util, +}; +use clap::{App, Arg}; +use common_util::toml; +use log::info; +use serde_derive::Deserialize; + +#[derive(Debug, Deserialize)] +#[serde(default)] +struct Config { + runtime_thread_num: usize, + rebuild_sst: Option, + merge_sst: Option, +} + +impl Default for Config { + fn default() -> Config { + Self { + runtime_thread_num: 1, + rebuild_sst: None, + merge_sst: None, + } + } +} + +fn config_from_path(path: &str) -> Config { + let mut toml_buf = String::new(); + toml::parse_toml_from_path(path, &mut toml_buf).expect("Failed to parse config.") +} + +fn main() { + env_logger::init(); + + let matches = App::new("SST Tools") + .arg( + Arg::with_name("config") + .short("c") + .long("config") + .required(true) + .takes_value(true) + .help("Set configuration file, eg: \"/path/server.toml\""), + ) + .get_matches(); + + let config_path = matches + .value_of("config") + .expect("Config file is required."); + let config = config_from_path(config_path); + + info!("sst tools start, config:{:?}", config); + + let runtime = Arc::new(util::new_runtime(config.runtime_thread_num)); + + let rt = runtime.clone(); + runtime.block_on(async { + if let Some(rebuild_sst) = config.rebuild_sst { + sst_tools::rebuild_sst(rebuild_sst, rt.clone()).await; + } + + if let Some(merge_sst) = config.merge_sst { + sst_tools::merge_sst(merge_sst, rt).await; + } + }); +} diff --git a/benchmarks/src/config.rs b/benchmarks/src/config.rs new file mode 100644 index 0000000000..a66cfa1163 --- /dev/null +++ b/benchmarks/src/config.rs @@ -0,0 +1,123 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Benchmark configs. + +use std::env; + +use analytic_engine::{space::SpaceId, sst::manager::FileId}; +use common_types::time::{TimeRange, Timestamp}; +use common_util::{ + config::{ReadableDuration, ReadableSize}, + toml, +}; +use serde_derive::Deserialize; +use table_engine::{predicate::Predicate, table::TableId}; + +const BENCH_CONFIG_PATH_KEY: &str = "ANALYTIC_BENCH_CONFIG_PATH"; + +#[derive(Deserialize)] +pub struct BenchConfig { + pub sst_bench: SstBenchConfig, + pub merge_sst_bench: MergeSstBenchConfig, + pub scan_memtable_bench: ScanMemTableBenchConfig, + pub merge_memtable_bench: MergeMemTableBenchConfig, +} + +// TODO(yingwen): Maybe we can use layze static to load config first. +pub fn bench_config_from_env() -> BenchConfig { + let path = match env::var(BENCH_CONFIG_PATH_KEY) { + Ok(v) => v, + Err(e) => panic!( + "Env {} is required to run benches, err:{}.", + BENCH_CONFIG_PATH_KEY, e + ), + }; + + let mut toml_buf = String::new(); + toml::parse_toml_from_path(&path, &mut toml_buf).expect("Failed to parse config.") +} + +#[derive(Deserialize)] +pub struct SstBenchConfig { + pub store_path: String, + pub sst_file_name: String, + pub runtime_thread_num: usize, + + pub bench_measurement_time: ReadableDuration, + pub bench_sample_size: usize, + + /// Max number of projection columns. + pub max_projections: usize, + pub read_batch_row_num: usize, + pub predicate: BenchPredicate, + pub sst_meta_cache_cap: Option, + pub sst_data_cache_cap: Option, + pub reverse: bool, +} + +#[derive(Deserialize)] +pub struct MergeSstBenchConfig { + pub store_path: String, + pub space_id: SpaceId, + pub table_id: TableId, + pub sst_file_ids: Vec, + pub runtime_thread_num: usize, + + pub bench_measurement_time: ReadableDuration, + pub bench_sample_size: usize, + + /// Max number of projection columns. 
+ pub max_projections: usize, + pub read_batch_row_num: usize, + pub predicate: BenchPredicate, +} + +#[derive(Deserialize)] +pub struct ScanMemTableBenchConfig { + pub store_path: String, + pub sst_file_name: String, + pub runtime_thread_num: usize, + + /// Max number of projection columns. + pub max_projections: usize, + + pub arena_block_size: ReadableSize, +} + +#[derive(Debug, Deserialize)] +pub struct BenchPredicate { + /// Inclusive start time in millis. + start_time_ms: i64, + /// Exclusive end time in millis. + /// + /// Set to current time millis if start_time_ms == end_time_ms. + end_time_ms: i64, +} + +impl BenchPredicate { + pub fn into_predicate(self) -> Predicate { + let start = Timestamp::new(self.start_time_ms); + let end = if self.start_time_ms == self.end_time_ms { + Timestamp::now() + } else { + Timestamp::new(self.end_time_ms) + }; + let time_range = TimeRange::new(start, end).unwrap(); + + Predicate::new(time_range) + } +} + +#[derive(Deserialize)] +pub struct MergeMemTableBenchConfig { + pub store_path: String, + pub space_id: SpaceId, + pub table_id: TableId, + pub sst_file_ids: Vec, + pub runtime_thread_num: usize, + + /// Max number of projection columns. + pub max_projections: usize, + + pub arena_block_size: ReadableSize, +} diff --git a/benchmarks/src/lib.rs b/benchmarks/src/lib.rs new file mode 100644 index 0000000000..526d028021 --- /dev/null +++ b/benchmarks/src/lib.rs @@ -0,0 +1,17 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Utilities for benchmarks. + +use common_types::SequenceNumber; + +pub mod arrow2_bench; +pub mod config; +pub mod merge_memtable_bench; +pub mod merge_sst_bench; +pub mod parquet_bench; +pub mod scan_memtable_bench; +pub mod sst_bench; +pub mod sst_tools; +pub mod util; + +pub(crate) const INIT_SEQUENCE: SequenceNumber = 1; diff --git a/benchmarks/src/merge_memtable_bench.rs b/benchmarks/src/merge_memtable_bench.rs new file mode 100644 index 0000000000..7596576aa6 --- /dev/null +++ b/benchmarks/src/merge_memtable_bench.rs @@ -0,0 +1,209 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Merge memtable bench. 
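
For context on `config.rs` above: `bench_config_from_env` is simply "read a file path from `ANALYTIC_BENCH_CONFIG_PATH`, then deserialize the TOML into serde structs", and the `BenchPredicate` section is later turned into a `Predicate`, substituting the current time for the end bound when `start_time_ms == end_time_ms`. Below is a minimal standalone sketch of the load-from-env pattern, assuming only the `serde` (with `derive`) and `toml` crates; the struct and its fields are made up here, and the project itself goes through the `common_util::toml` helper instead of `fs::read_to_string`.

```rust
use std::{env, fs};

use serde::Deserialize;

// Stand-in config with invented fields; the real BenchConfig nests the
// per-bench sections shown above.
#[derive(Debug, Deserialize)]
struct DemoConfig {
    runtime_thread_num: usize,
    read_batch_row_num: usize,
}

fn demo_config_from_env() -> DemoConfig {
    let path = env::var("ANALYTIC_BENCH_CONFIG_PATH")
        .expect("ANALYTIC_BENCH_CONFIG_PATH is required to run benches");
    let toml_buf = fs::read_to_string(&path).expect("failed to read config file");
    toml::from_str(&toml_buf).expect("failed to parse bench config")
}

fn main() {
    println!("{:?}", demo_config_from_env());
}
```
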
+ +use std::{cmp, sync::Arc, time::Instant}; + +use analytic_engine::{ + memtable::{ + factory::{Factory as MemTableFactory, Options}, + skiplist::factory::SkiplistMemTableFactory, + }, + row_iter::{ + dedup::DedupIterator, + merge::{MergeBuilder, MergeConfig}, + IterOptions, RecordBatchWithKeyIterator, + }, + space::SpaceId, + sst::factory::{FactoryImpl, SstReaderOptions, SstType}, + table::{ + sst_util, + version::{MemTableState, MemTableVec}, + }, +}; +use arena::NoopCollector; +use common_types::{ + projected_schema::ProjectedSchema, request_id::RequestId, schema::Schema, time::TimeRange, +}; +use common_util::runtime::Runtime; +use log::info; +use object_store::{disk::File, ObjectStore}; +use parquet::{DataCacheRef, MetaCacheRef}; +use table_engine::{predicate::Predicate, table::TableId}; + +use crate::{config::MergeMemTableBenchConfig, util}; + +pub struct MergeMemTableBench { + store: File, + memtables: MemTableVec, + max_projections: usize, + schema: Schema, + projected_schema: ProjectedSchema, + runtime: Arc, + space_id: SpaceId, + table_id: TableId, + dedup: bool, + sst_reader_options: SstReaderOptions, +} + +impl MergeMemTableBench { + pub fn new(config: MergeMemTableBenchConfig) -> Self { + assert!(!config.sst_file_ids.is_empty()); + + let store = File::new(config.store_path); + let runtime = Arc::new(util::new_runtime(config.runtime_thread_num)); + let space_id = config.space_id; + let table_id = config.table_id; + + let meta_cache: Option = None; + let data_cache: Option = None; + + // Use first sst's schema. + let mut sst_path = store.new_path(); + sst_util::set_sst_file_path(space_id, table_id, config.sst_file_ids[0], &mut sst_path); + let schema = runtime.block_on(util::schema_from_sst( + &store, + &sst_path, + &meta_cache, + &data_cache, + )); + + let projected_schema = ProjectedSchema::no_projection(schema.clone()); + let max_projections = cmp::min(config.max_projections, schema.num_columns()); + + let mut memtables = Vec::with_capacity(config.sst_file_ids.len()); + for id in &config.sst_file_ids { + let mut sst_path = store.new_path(); + sst_util::set_sst_file_path(space_id, table_id, *id, &mut sst_path); + + let memtable_factory = SkiplistMemTableFactory; + let memtable_opts = Options { + collector: Arc::new(NoopCollector {}), + schema: schema.clone(), + arena_block_size: config.arena_block_size.0 as u32, + creation_sequence: crate::INIT_SEQUENCE, + }; + let memtable = memtable_factory.create_memtable(memtable_opts).unwrap(); + + runtime.block_on(util::load_sst_to_memtable( + &store, + &sst_path, + &schema, + &memtable, + runtime.clone(), + )); + + info!( + "\nMergeMemTableBench memtable loaded, memory used: {}", + memtable.approximate_memory_usage() + ); + + memtables.push(MemTableState { + mem: memtable, + time_range: TimeRange::min_to_max(), + id: *id, + }); + } + let sst_reader_options = mock_sst_reader_options(projected_schema.clone(), runtime.clone()); + + MergeMemTableBench { + store, + memtables, + max_projections, + schema, + projected_schema, + runtime, + space_id, + table_id, + dedup: true, + sst_reader_options, + } + } + + pub fn num_benches(&self) -> usize { + // One test reads all columns and `max_projections` tests read with projection. 
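+ //
+ // For example, with max_projections = 3 the bench indices map to:
+ //   i = 0 -> columns [0]
+ //   i = 1 -> columns [0, 1]
+ //   i = 2 -> columns [0, 1, 2]
+ //   i = 3 -> all columns (no projection)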
+ 1 + self.max_projections + } + + pub fn init_for_bench(&mut self, i: usize, dedup: bool) { + let projected_schema = + util::projected_schema_by_number(&self.schema, i, self.max_projections); + + self.projected_schema = projected_schema; + self.dedup = dedup; + } + + // TODO(xikai): add benchmark for merge in reverse order. + pub fn run_bench(&self) { + let space_id = self.space_id; + let table_id = self.table_id; + let sequence = u64::MAX; + let iter_options = IterOptions::default(); + let projected_schema = self.projected_schema.clone(); + let sst_factory = FactoryImpl; + + let request_id = RequestId::next_id(); + let mut builder = MergeBuilder::new(MergeConfig { + request_id, + space_id, + table_id, + sequence, + projected_schema, + predicate: Arc::new(Predicate::empty()), + sst_factory, + sst_reader_options: self.sst_reader_options.clone(), + store: &self.store, + merge_iter_options: iter_options.clone(), + need_dedup: true, + reverse: false, + }); + + builder.mut_memtables().extend_from_slice(&self.memtables); + + self.runtime.block_on(async { + let begin_instant = Instant::now(); + + let mut merge_iter = builder.build().await.unwrap(); + let mut total_rows = 0; + let mut batch_num = 0; + + if self.dedup { + let mut dedup_iter = DedupIterator::new(request_id, merge_iter, iter_options); + while let Some(batch) = dedup_iter.next_batch().await.unwrap() { + let num_rows = batch.num_rows(); + total_rows += num_rows; + batch_num += 1; + } + } else { + while let Some(batch) = merge_iter.next_batch().await.unwrap() { + let num_rows = batch.num_rows(); + total_rows += num_rows; + batch_num += 1; + } + } + + info!( + "\nMergeMemTableBench total rows of sst: {}, total batch num: {}, cost: {:?}", + total_rows, + batch_num, + begin_instant.elapsed(), + ); + }); + } +} + +fn mock_sst_reader_options( + projected_schema: ProjectedSchema, + runtime: Arc, +) -> SstReaderOptions { + SstReaderOptions { + sst_type: SstType::Parquet, + read_batch_row_num: 500, + reverse: false, + projected_schema, + predicate: Arc::new(Predicate::new(TimeRange::min_to_max())), + meta_cache: None, + data_cache: None, + runtime, + } +} diff --git a/benchmarks/src/merge_sst_bench.rs b/benchmarks/src/merge_sst_bench.rs new file mode 100644 index 0000000000..a0ccab50d5 --- /dev/null +++ b/benchmarks/src/merge_sst_bench.rs @@ -0,0 +1,225 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Merge SST bench. 
+ +use std::{cmp, sync::Arc, time::Instant}; + +use analytic_engine::{ + row_iter::{ + chain, + chain::ChainConfig, + dedup::DedupIterator, + merge::{MergeBuilder, MergeConfig}, + IterOptions, RecordBatchWithKeyIterator, + }, + space::SpaceId, + sst::{ + factory::{FactoryImpl, SstReaderOptions, SstType}, + file::{FileHandle, FilePurgeQueue, Request}, + }, + table::sst_util, +}; +use common_types::{projected_schema::ProjectedSchema, request_id::RequestId, schema::Schema}; +use common_util::runtime::Runtime; +use log::info; +use object_store::{disk::File, ObjectStore}; +use parquet::{DataCacheRef, MetaCacheRef}; +use table_engine::{predicate::Predicate, table::TableId}; +use tokio::sync::mpsc::{self, UnboundedReceiver}; + +use crate::{config::MergeSstBenchConfig, util}; + +pub struct MergeSstBench { + store: File, + max_projections: usize, + schema: Schema, + sst_reader_options: SstReaderOptions, + runtime: Arc, + space_id: SpaceId, + table_id: TableId, + file_handles: Vec, + _receiver: UnboundedReceiver, + dedup: bool, +} + +impl MergeSstBench { + pub fn new(config: MergeSstBenchConfig) -> Self { + assert!(!config.sst_file_ids.is_empty()); + + let store = File::new(config.store_path); + let runtime = Arc::new(util::new_runtime(config.runtime_thread_num)); + let space_id = config.space_id; + let table_id = config.table_id; + + let mut sst_path = store.new_path(); + sst_util::set_sst_file_path(space_id, table_id, config.sst_file_ids[0], &mut sst_path); + let meta_cache: Option = None; + let data_cache: Option = None; + + let schema = runtime.block_on(util::schema_from_sst( + &store, + &sst_path, + &meta_cache, + &data_cache, + )); + + let predicate = config.predicate.into_predicate(); + let projected_schema = ProjectedSchema::no_projection(schema.clone()); + let sst_reader_options = SstReaderOptions { + sst_type: SstType::Parquet, + read_batch_row_num: config.read_batch_row_num, + reverse: false, + projected_schema, + predicate: Arc::new(predicate), + meta_cache: meta_cache.clone(), + data_cache: data_cache.clone(), + runtime: runtime.clone(), + }; + let max_projections = cmp::min(config.max_projections, schema.num_columns()); + + let (tx, rx) = mpsc::unbounded_channel(); + let purge_queue = FilePurgeQueue::new(space_id, table_id, tx); + + let file_handles = runtime.block_on(util::file_handles_from_ssts( + &store, + space_id, + table_id, + &config.sst_file_ids, + purge_queue, + &meta_cache, + &data_cache, + )); + + MergeSstBench { + store, + max_projections, + schema, + sst_reader_options, + runtime, + space_id, + table_id, + file_handles, + _receiver: rx, + dedup: true, + } + } + + pub fn num_benches(&self) -> usize { + // One test reads all columns and `max_projections` tests read with projection. 
+ 1 + self.max_projections + } + + pub fn init_for_bench(&mut self, i: usize, dedup: bool) { + let projected_schema = + util::projected_schema_by_number(&self.schema, i, self.max_projections); + + self.sst_reader_options.projected_schema = projected_schema; + self.dedup = dedup; + } + + fn run_dedup_bench(&self) { + let space_id = self.space_id; + let table_id = self.table_id; + let sequence = u64::MAX; + let iter_options = IterOptions::default(); + let projected_schema = self.sst_reader_options.projected_schema.clone(); + let sst_factory = FactoryImpl; + + let request_id = RequestId::next_id(); + let mut builder = MergeBuilder::new(MergeConfig { + request_id, + space_id, + table_id, + sequence, + projected_schema, + predicate: Arc::new(Predicate::empty()), + sst_factory, + sst_reader_options: self.sst_reader_options.clone(), + store: &self.store, + merge_iter_options: iter_options.clone(), + need_dedup: true, + reverse: false, + }); + + builder + .mut_ssts_of_level(0) + .extend_from_slice(&self.file_handles); + + self.runtime.block_on(async { + let begin_instant = Instant::now(); + + let merge_iter = builder.build().await.unwrap(); + let mut dedup_iter = DedupIterator::new(request_id, merge_iter, iter_options); + let mut total_rows = 0; + let mut batch_num = 0; + + while let Some(batch) = dedup_iter.next_batch().await.unwrap() { + let num_rows = batch.num_rows(); + total_rows += num_rows; + batch_num += 1; + } + + info!( + "\nMergeSstBench total rows of sst: {}, total batch num: {}, cost: {:?}", + total_rows, + batch_num, + begin_instant.elapsed(), + ); + }); + } + + fn run_no_dedup_bench(&self) { + let space_id = self.space_id; + let table_id = self.table_id; + let projected_schema = self.sst_reader_options.projected_schema.clone(); + let sst_factory = FactoryImpl; + + let request_id = RequestId::next_id(); + let builder = chain::Builder::new(ChainConfig { + request_id, + space_id, + table_id, + projected_schema, + predicate: Arc::new(Predicate::empty()), + sst_factory, + sst_reader_options: self.sst_reader_options.clone(), + store: &self.store, + }) + .ssts(vec![self.file_handles.clone()]); + + self.runtime.block_on(async { + let begin_instant = Instant::now(); + + let mut chain_iter = builder.build().await.unwrap(); + let mut total_rows = 0; + let mut batch_num = 0; + + while let Some(batch) = chain_iter.next_batch().await.unwrap() { + let num_rows = batch.num_rows(); + total_rows += num_rows; + batch_num += 1; + } + + info!( + "\nMergeSstBench total rows of sst: {}, total batch num: {}, cost: {:?}", + total_rows, + batch_num, + begin_instant.elapsed(), + ); + }); + } + + pub fn run_bench(&self) { + if self.dedup { + self.run_dedup_bench(); + } else { + self.run_no_dedup_bench(); + } + } +} + +impl Drop for MergeSstBench { + fn drop(&mut self) { + self.file_handles.clear(); + } +} diff --git a/benchmarks/src/parquet_bench.rs b/benchmarks/src/parquet_bench.rs new file mode 100644 index 0000000000..b52c84f7e1 --- /dev/null +++ b/benchmarks/src/parquet_bench.rs @@ -0,0 +1,137 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Parquet bench. 
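
One detail worth calling out in `MergeSstBench` above: the receiver half of the purge-queue channel is kept alive in the `_receiver` field. With tokio's unbounded channel, `send` starts failing as soon as the receiver is dropped, so holding the receiver presumably keeps the `FilePurgeQueue` usable when file handles are dropped during the bench (the motivation is an inference here, not stated in the code). A tiny standalone illustration of that channel behaviour, assuming `tokio` with the `sync` feature, which this crate already depends on:

```rust
use tokio::sync::mpsc;

fn main() {
    // Unbounded sends are synchronous, so no async runtime is needed here.
    let (tx, rx) = mpsc::unbounded_channel::<u32>();

    // While the receiver is alive, sends succeed.
    assert!(tx.send(1).is_ok());

    // Once the receiver is dropped, every further send fails.
    drop(rx);
    assert!(tx.send(2).is_err());
}
```
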
+ +use std::{sync::Arc, time::Instant}; + +use arrow_deps::parquet::{ + arrow::{ArrowReader, ParquetFileArrowReader}, + file::{ + metadata::RowGroupMetaData, reader::FileReader, serialized_reader::SerializedFileReader, + }, +}; +use common_types::schema::Schema; +use common_util::runtime::Runtime; +use log::info; +use object_store::{disk::File, path::ObjectStorePath, ObjectStore}; +use parquet::{DataCacheRef, MetaCacheRef}; +use table_engine::predicate::PredicateRef; + +use crate::{config::SstBenchConfig, util}; + +type RowGroupPredicate = Box bool + 'static>; + +pub struct ParquetBench { + store: File, + pub sst_file_name: String, + max_projections: usize, + projection: Vec, + schema: Schema, + predicate: PredicateRef, + batch_size: usize, + runtime: Arc, +} + +impl ParquetBench { + pub fn new(config: SstBenchConfig) -> Self { + let store = File::new(config.store_path); + + let runtime = util::new_runtime(config.runtime_thread_num); + + let mut sst_path = store.new_path(); + sst_path.set_file_name(&config.sst_file_name); + let meta_cache: Option = None; + let data_cache: Option = None; + + let schema = runtime.block_on(util::schema_from_sst( + &store, + &sst_path, + &meta_cache, + &data_cache, + )); + + let predicate = Arc::new(config.predicate.into_predicate()); + + ParquetBench { + store, + sst_file_name: config.sst_file_name, + max_projections: config.max_projections, + projection: Vec::new(), + schema, + predicate, + batch_size: config.read_batch_row_num, + runtime: Arc::new(runtime), + } + } + + pub fn num_benches(&self) -> usize { + // One test reads all columns and `max_projections` tests read with projection. + 1 + self.max_projections + } + + pub fn init_for_bench(&mut self, i: usize) { + let projection = if i < self.max_projections { + (0..i + 1).into_iter().collect() + } else { + Vec::new() + }; + + self.projection = projection; + } + + pub fn run_bench(&self) { + let mut sst_path = self.store.new_path(); + sst_path.set_file_name(&self.sst_file_name); + + self.runtime.block_on(async { + let open_instant = Instant::now(); + let file = self.store.get(&sst_path).await.unwrap(); + let mut file_reader = SerializedFileReader::new(file).unwrap(); + let open_cost = open_instant.elapsed(); + + let filter_begin_instant = Instant::now(); + let row_group_predicate = self.build_row_group_predicate(&file_reader); + let mut arrow_reader = { + file_reader.filter_row_groups(&row_group_predicate); + ParquetFileArrowReader::new(Arc::new(file_reader)) + }; + let filter_cost = filter_begin_instant.elapsed(); + + let record_reader = if self.projection.is_empty() { + arrow_reader.get_record_reader(self.batch_size).unwrap() + } else { + arrow_reader + .get_record_reader_by_columns(self.projection.clone(), self.batch_size) + .unwrap() + }; + + let iter_begin_instant = Instant::now(); + let mut total_rows = 0; + let mut batch_num = 0; + for record_batch in record_reader { + let num_rows = record_batch.unwrap().num_rows(); + total_rows += num_rows; + batch_num += 1; + } + + info!( + "\nParquetBench total rows of sst: {}, total batch num: {}, open cost: {:?}, filter cost: {:?}, iter cost: {:?}", + total_rows, + batch_num, + open_cost, + filter_cost, + iter_begin_instant.elapsed(), + ); + }); + } + + fn build_row_group_predicate( + &self, + file_reader: &SerializedFileReader, + ) -> RowGroupPredicate { + let row_groups = file_reader.metadata().row_groups(); + let filter_results = self.predicate.filter_row_groups(&self.schema, row_groups); + + Box::new(move |_, idx: usize| filter_results[idx]) + } +} diff 
--git a/benchmarks/src/scan_memtable_bench.rs b/benchmarks/src/scan_memtable_bench.rs new file mode 100644 index 0000000000..424e1886e8 --- /dev/null +++ b/benchmarks/src/scan_memtable_bench.rs @@ -0,0 +1,111 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Scan memtable bench. + +use std::{collections::Bound, sync::Arc}; + +use analytic_engine::memtable::{ + factory::{Factory as MemTableFactory, Options}, + skiplist::factory::SkiplistMemTableFactory, + MemTableRef, ScanContext, ScanRequest, +}; +use arena::NoopCollector; +use common_types::projected_schema::ProjectedSchema; +use log::info; +use object_store::{disk::File, path::ObjectStorePath, ObjectStore}; +use parquet::{DataCacheRef, MetaCacheRef}; + +use crate::{config::ScanMemTableBenchConfig, util}; + +pub struct ScanMemTableBench { + memtable: MemTableRef, + projected_schema: ProjectedSchema, + max_projections: usize, +} + +impl ScanMemTableBench { + pub fn new(config: ScanMemTableBenchConfig) -> Self { + let store = File::new(config.store_path); + + let runtime = Arc::new(util::new_runtime(config.runtime_thread_num)); + let meta_cache: Option = None; + let data_cache: Option = None; + let mut sst_path = store.new_path(); + sst_path.set_file_name(&config.sst_file_name); + let schema = runtime.block_on(util::schema_from_sst( + &store, + &sst_path, + &meta_cache, + &data_cache, + )); + + let projected_schema = ProjectedSchema::no_projection(schema.clone()); + + let memtable_factory = SkiplistMemTableFactory; + let memtable_opts = Options { + collector: Arc::new(NoopCollector {}), + schema: schema.clone(), + arena_block_size: config.arena_block_size.0 as u32, + creation_sequence: crate::INIT_SEQUENCE, + }; + let memtable = memtable_factory.create_memtable(memtable_opts).unwrap(); + + runtime.block_on(util::load_sst_to_memtable( + &store, + &sst_path, + &schema, + &memtable, + runtime.clone(), + )); + + info!( + "\nScanMemTableBench memtable loaded, memory used: {}", + memtable.approximate_memory_usage() + ); + + Self { + memtable, + projected_schema, + max_projections: config.max_projections, + } + } + + pub fn num_benches(&self) -> usize { + // One test reads all columns and `max_projections` tests read with projection. + 1 + self.max_projections + } + + pub fn init_for_bench(&mut self, i: usize) { + let projected_schema = + util::projected_schema_by_number(self.memtable.schema(), i, self.max_projections); + + self.projected_schema = projected_schema; + } + + pub fn run_bench(&self) { + let scan_ctx = ScanContext::default(); + let scan_req = ScanRequest { + start_user_key: Bound::Unbounded, + end_user_key: Bound::Unbounded, + sequence: common_types::MAX_SEQUENCE_NUMBER, + projected_schema: self.projected_schema.clone(), + need_dedup: true, + reverse: false, + }; + + let iter = self.memtable.scan(scan_ctx, scan_req).unwrap(); + + let mut total_rows = 0; + let mut batch_num = 0; + for batch in iter { + let num_rows = batch.unwrap().num_rows(); + total_rows += num_rows; + batch_num += 1; + } + + info!( + "\nScanMemTableBench total rows of memtable: {}, total batch num: {}", + total_rows, batch_num, + ); + } +} diff --git a/benchmarks/src/sst_bench.rs b/benchmarks/src/sst_bench.rs new file mode 100644 index 0000000000..882e40b1fa --- /dev/null +++ b/benchmarks/src/sst_bench.rs @@ -0,0 +1,123 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! SST bench. 
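
The `ScanRequest` built in `ScanMemTableBench::run_bench` above uses `Bound::Unbounded` for both user-key bounds, i.e. a full scan of the memtable. The bound semantics are the same as std's `RangeBounds`; a small standalone illustration with `BTreeMap::range`, unrelated to the memtable API itself:

```rust
use std::{collections::BTreeMap, ops::Bound};

fn main() {
    let mut map = BTreeMap::new();
    for k in 1..=5 {
        map.insert(k, k * 10);
    }

    // Keys in [2, 5): inclusive start, exclusive end.
    let bounded: Vec<i32> = map
        .range((Bound::Included(2), Bound::Excluded(5)))
        .map(|(k, _)| *k)
        .collect();
    assert_eq!(bounded, vec![2, 3, 4]);

    // Start bound only; with Unbounded on both sides (as in the ScanRequest
    // above) every entry is visited.
    let from_two: Vec<i32> = map
        .range((Bound::Included(2), Bound::Unbounded))
        .map(|(k, _)| *k)
        .collect();
    assert_eq!(from_two, vec![2, 3, 4, 5]);
}
```
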
+ +use std::{cmp, sync::Arc, time::Instant}; + +use analytic_engine::sst::factory::{Factory, FactoryImpl, SstReaderOptions, SstType}; +use common_types::{projected_schema::ProjectedSchema, schema::Schema}; +use common_util::runtime::Runtime; +use futures::stream::StreamExt; +use log::info; +use object_store::{disk::File, path::ObjectStorePath, ObjectStore}; +use parquet::{ + cache::{LruDataCache, LruMetaCache}, + DataCacheRef, MetaCacheRef, +}; + +use crate::{config::SstBenchConfig, util}; + +pub struct SstBench { + store: File, + pub sst_file_name: String, + max_projections: usize, + schema: Schema, + sst_reader_options: SstReaderOptions, + runtime: Arc, +} + +impl SstBench { + pub fn new(config: SstBenchConfig) -> Self { + let store = File::new(config.store_path); + + let runtime = Arc::new(util::new_runtime(config.runtime_thread_num)); + + let mut sst_path = store.new_path(); + sst_path.set_file_name(&config.sst_file_name); + let meta_cache: Option = + if let Some(sst_meta_cache_cap) = config.sst_meta_cache_cap { + Some(Arc::new(LruMetaCache::new(sst_meta_cache_cap))) + } else { + None + }; + + let data_cache: Option = + if let Some(sst_data_cache_cap) = config.sst_data_cache_cap { + Some(Arc::new(LruDataCache::new(sst_data_cache_cap))) + } else { + None + }; + + let schema = runtime.block_on(util::schema_from_sst( + &store, + &sst_path, + &meta_cache, + &data_cache, + )); + + let predicate = config.predicate.into_predicate(); + let projected_schema = ProjectedSchema::no_projection(schema.clone()); + let sst_reader_options = SstReaderOptions { + sst_type: SstType::Parquet, + read_batch_row_num: config.read_batch_row_num, + reverse: config.reverse, + projected_schema, + predicate: Arc::new(predicate), + meta_cache, + data_cache, + runtime: runtime.clone(), + }; + let max_projections = cmp::min(config.max_projections, schema.num_columns()); + + SstBench { + store, + sst_file_name: config.sst_file_name, + max_projections, + schema, + sst_reader_options, + runtime, + } + } + + pub fn num_benches(&self) -> usize { + // One test reads all columns and `max_projections` tests read with projection. + 1 + self.max_projections + } + + pub fn init_for_bench(&mut self, i: usize) { + let projected_schema = + util::projected_schema_by_number(&self.schema, i, self.max_projections); + + self.sst_reader_options.projected_schema = projected_schema; + } + + pub fn run_bench(&self) { + let mut sst_path = self.store.new_path(); + sst_path.set_file_name(&self.sst_file_name); + + let sst_factory = FactoryImpl; + let mut sst_reader = sst_factory + .new_sst_reader(&self.sst_reader_options, &sst_path, &self.store) + .unwrap(); + + self.runtime.block_on(async { + let begin_instant = Instant::now(); + let mut sst_stream = sst_reader.read().await.unwrap(); + + let mut total_rows = 0; + let mut batch_num = 0; + while let Some(batch) = sst_stream.next().await { + let num_rows = batch.unwrap().num_rows(); + total_rows += num_rows; + batch_num += 1; + } + + info!( + "\nSstBench total rows of sst: {}, total batch num: {}, cost: {:?}", + total_rows, + batch_num, + begin_instant.elapsed(), + ); + }); + } +} diff --git a/benchmarks/src/sst_tools.rs b/benchmarks/src/sst_tools.rs new file mode 100644 index 0000000000..666722d91b --- /dev/null +++ b/benchmarks/src/sst_tools.rs @@ -0,0 +1,257 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Tools to generate SST. 
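
A small style note on `SstBench::new` above: the two optional cache fields are built with an `if let Some(..) { Some(Arc::new(..)) } else { None }` block, which is exactly the shape `Option::map` expresses. A sketch of the equivalent form with a dummy stand-in type, since the real `LruMetaCache`/`LruDataCache` are project-internal:

```rust
use std::sync::Arc;

// Dummy stand-in for the project's LRU cache types.
struct DummyLruCache {
    capacity: usize,
}

impl DummyLruCache {
    fn new(capacity: usize) -> Self {
        Self { capacity }
    }
}

fn build_cache(cap: Option<usize>) -> Option<Arc<DummyLruCache>> {
    // Equivalent to the if-let/else construction, without the explicit None arm.
    cap.map(|cap| Arc::new(DummyLruCache::new(cap)))
}

fn main() {
    assert_eq!(build_cache(Some(1000)).unwrap().capacity, 1000);
    assert!(build_cache(None).is_none());
}
```
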
+ +use std::sync::Arc; + +use analytic_engine::{ + row_iter::{ + self, + dedup::DedupIterator, + merge::{MergeBuilder, MergeConfig}, + IterOptions, + }, + space::SpaceId, + sst::{ + builder::RecordBatchStream, + factory::{Factory, FactoryImpl, SstBuilderOptions, SstReaderOptions, SstType}, + file::{self, FilePurgeQueue, SstMetaData}, + manager::FileId, + }, + table::sst_util, + table_options::Compression, +}; +use common_types::{projected_schema::ProjectedSchema, request_id::RequestId}; +use common_util::runtime::Runtime; +use futures::TryStreamExt; +use log::info; +use object_store::{ + disk::File, + path::{file::FilePath, ObjectStorePath}, + ObjectStore, +}; +use serde_derive::Deserialize; +use table_engine::{predicate::Predicate, table::TableId}; +use tokio::sync::mpsc; + +use crate::{config::BenchPredicate, util}; + +#[derive(Debug)] +struct SstConfig { + sst_meta: SstMetaData, + store_path: String, + sst_file_name: String, + num_rows_per_row_group: usize, + compression: Compression, +} + +async fn create_sst_from_stream(config: SstConfig, record_batch_stream: RecordBatchStream) { + let sst_factory = FactoryImpl; + let sst_builder_options = SstBuilderOptions { + sst_type: SstType::Parquet, + num_rows_per_row_group: config.num_rows_per_row_group, + compression: config.compression, + }; + + info!( + "create sst from stream, config:{:?}, sst_builder_options:{:?}", + config, sst_builder_options + ); + + let store = File::new(config.store_path); + let mut sst_file_path = store.new_path(); + sst_file_path.set_file_name(&config.sst_file_name); + + let mut builder = sst_factory + .new_sst_builder(&sst_builder_options, &sst_file_path, &store) + .unwrap(); + builder + .build(RequestId::next_id(), &config.sst_meta, record_batch_stream) + .await + .unwrap(); +} + +#[derive(Debug, Deserialize)] +pub struct RebuildSstConfig { + store_path: String, + input_file_name: String, + read_batch_row_num: usize, + predicate: BenchPredicate, + + // Output sst config: + output_file_name: String, + num_rows_per_row_group: usize, + compression: Compression, +} + +pub async fn rebuild_sst(config: RebuildSstConfig, runtime: Arc) { + info!("Start rebuild sst, config:{:?}", config); + + let store = File::new(config.store_path.clone()); + + let mut input_path = store.new_path(); + input_path.set_file_name(&config.input_file_name); + + let sst_meta = util::meta_from_sst(&store, &input_path, &None, &None).await; + + let projected_schema = ProjectedSchema::no_projection(sst_meta.schema.clone()); + let sst_reader_options = SstReaderOptions { + sst_type: SstType::Parquet, + read_batch_row_num: config.read_batch_row_num, + reverse: false, + projected_schema, + predicate: Arc::new(config.predicate.into_predicate()), + meta_cache: None, + data_cache: None, + runtime, + }; + + let record_batch_stream = + sst_to_record_batch_stream(&sst_reader_options, &input_path, &store).await; + + let output_sst_config = SstConfig { + sst_meta, + store_path: config.store_path, + sst_file_name: config.output_file_name, + num_rows_per_row_group: config.num_rows_per_row_group, + compression: config.compression, + }; + + create_sst_from_stream(output_sst_config, record_batch_stream).await; + + info!("Start rebuild sst done"); +} + +async fn sst_to_record_batch_stream( + sst_reader_options: &SstReaderOptions, + input_path: &FilePath, + store: &File, +) -> RecordBatchStream { + let sst_factory = FactoryImpl; + let mut sst_reader = sst_factory + .new_sst_reader(sst_reader_options, input_path, store) + .unwrap(); + + let sst_stream = 
sst_reader.read().await.unwrap(); + + Box::new(sst_stream.map_err(|e| Box::new(e) as _)) +} + +#[derive(Debug, Deserialize)] +pub struct MergeSstConfig { + store_path: String, + space_id: SpaceId, + table_id: TableId, + sst_file_ids: Vec, + dedup: bool, + read_batch_row_num: usize, + predicate: BenchPredicate, + + // Output sst config: + output_store_path: String, + output_file_name: String, + num_rows_per_row_group: usize, + compression: Compression, +} + +pub async fn merge_sst(config: MergeSstConfig, runtime: Arc) { + if config.sst_file_ids.is_empty() { + info!("No input files to merge"); + return; + } + + info!("Merge sst begin, config:{:?}", config); + + let space_id = config.space_id; + let table_id = config.table_id; + let store = File::new(config.store_path.clone()); + let (tx, _rx) = mpsc::unbounded_channel(); + let purge_queue = FilePurgeQueue::new(space_id, table_id, tx); + + let file_handles = util::file_handles_from_ssts( + &store, + space_id, + table_id, + &config.sst_file_ids, + purge_queue, + &None, + &None, + ) + .await; + let max_sequence = file_handles + .iter() + .map(|file| file.max_sequence()) + .max() + .unwrap(); + + let mut first_sst_path = store.new_path(); + sst_util::set_sst_file_path( + space_id, + table_id, + config.sst_file_ids[0], + &mut first_sst_path, + ); + let schema = util::schema_from_sst(&store, &first_sst_path, &None, &None).await; + let iter_options = IterOptions { + batch_size: config.read_batch_row_num, + }; + + let request_id = RequestId::next_id(); + let iter = { + let space_id = config.space_id; + let table_id = config.table_id; + let sequence = max_sequence + 1; + let projected_schema = ProjectedSchema::no_projection(schema.clone()); + let sst_reader_options = SstReaderOptions { + sst_type: SstType::Parquet, + read_batch_row_num: config.read_batch_row_num, + reverse: false, + projected_schema: projected_schema.clone(), + predicate: Arc::new(config.predicate.into_predicate()), + meta_cache: None, + data_cache: None, + runtime: runtime.clone(), + }; + + let sst_factory = FactoryImpl; + let mut builder = MergeBuilder::new(MergeConfig { + request_id, + space_id, + table_id, + sequence, + projected_schema, + predicate: Arc::new(Predicate::empty()), + sst_factory, + sst_reader_options, + store: &store, + merge_iter_options: iter_options.clone(), + need_dedup: true, + reverse: false, + }); + builder + .mut_ssts_of_level(0) + .extend_from_slice(&file_handles); + + builder.build().await.unwrap() + }; + + let record_batch_stream = if config.dedup { + let iter = DedupIterator::new(request_id, iter, iter_options); + row_iter::record_batch_with_key_iter_to_stream(iter, &runtime) + } else { + row_iter::record_batch_with_key_iter_to_stream(iter, &runtime) + }; + + let sst_meta = file::merge_sst_meta(&file_handles, schema); + let output_sst_config = SstConfig { + sst_meta, + store_path: config.output_store_path, + sst_file_name: config.output_file_name, + num_rows_per_row_group: config.num_rows_per_row_group, + compression: config.compression, + }; + + create_sst_from_stream(output_sst_config, record_batch_stream).await; + + info!("Merge sst done"); +} diff --git a/benchmarks/src/util.rs b/benchmarks/src/util.rs new file mode 100644 index 0000000000..639c3da19b --- /dev/null +++ b/benchmarks/src/util.rs @@ -0,0 +1,146 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Utilities. 
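
`sst_to_record_batch_stream` above adapts the SST reader's stream into a `RecordBatchStream` by widening the error type: `map_err(|e| Box::new(e) as _)` converts the reader's concrete error into whatever boxed error type `RecordBatchStream` expects (that target type is defined elsewhere and not shown here). A standalone sketch of the same adaptation using plain `futures` types, with `io::Error` standing in for the reader error:

```rust
use futures::{executor::block_on, stream, TryStreamExt};

fn main() {
    // A try-stream with a concrete error type.
    let concrete = stream::iter(vec![Ok::<_, std::io::Error>(1), Ok(2), Ok(3)]);

    // Widen the error to a boxed trait object, mirroring the map_err above.
    let boxed =
        concrete.map_err(|e| Box::new(e) as Box<dyn std::error::Error + Send + Sync>);

    let items: Vec<i32> = block_on(boxed.try_collect()).unwrap();
    assert_eq!(items, vec![1, 2, 3]);
}
```
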
+ +use std::sync::Arc; + +use analytic_engine::{ + memtable::{key::KeySequence, MemTableRef, PutContext}, + space::SpaceId, + sst::{ + factory::{Factory, FactoryImpl, SstReaderOptions, SstType}, + file::{FileHandle, FileMeta, FilePurgeQueue, SstMetaData}, + manager::FileId, + parquet::reader, + }, + table::sst_util, +}; +use common_types::{ + projected_schema::ProjectedSchema, + schema::{IndexInWriterSchema, Schema}, + time::TimeRange, +}; +use common_util::runtime::{self, Runtime}; +use futures::stream::StreamExt; +use object_store::{disk::File, path::file::FilePath, ObjectStore}; +use parquet::{DataCacheRef, MetaCacheRef}; +use table_engine::{predicate::Predicate, table::TableId}; + +pub fn new_runtime(thread_num: usize) -> Runtime { + runtime::Builder::default() + .thread_name("engine_bench") + .worker_threads(thread_num) + .enable_all() + .build() + .unwrap() +} + +pub async fn meta_from_sst( + store: &File, + sst_path: &FilePath, + meta_cache: &Option, + data_cache: &Option, +) -> SstMetaData { + let (_, sst_meta) = reader::read_sst_meta(store, sst_path, meta_cache, data_cache) + .await + .unwrap(); + + sst_meta +} + +pub async fn schema_from_sst( + store: &File, + sst_path: &FilePath, + meta_cache: &Option, + data_cache: &Option, +) -> Schema { + let sst_meta = meta_from_sst(store, sst_path, meta_cache, data_cache).await; + + sst_meta.schema +} + +pub fn projected_schema_by_number( + schema: &Schema, + num_columns: usize, + max_projections: usize, +) -> ProjectedSchema { + if num_columns < max_projections { + let projection = (0..num_columns + 1).into_iter().collect(); + + ProjectedSchema::new(schema.clone(), Some(projection)).unwrap() + } else { + ProjectedSchema::no_projection(schema.clone()) + } +} + +pub async fn load_sst_to_memtable( + store: &File, + sst_path: &FilePath, + schema: &Schema, + memtable: &MemTableRef, + runtime: Arc, +) { + let sst_reader_options = SstReaderOptions { + sst_type: SstType::Parquet, + read_batch_row_num: 500, + reverse: false, + projected_schema: ProjectedSchema::no_projection(schema.clone()), + predicate: Arc::new(Predicate::new(TimeRange::min_to_max())), + meta_cache: None, + data_cache: None, + runtime, + }; + let sst_factory = FactoryImpl; + let mut sst_reader = sst_factory + .new_sst_reader(&sst_reader_options, sst_path, store) + .unwrap(); + + let mut sst_stream = sst_reader.read().await.unwrap(); + let index_in_writer = IndexInWriterSchema::for_same_schema(schema.num_columns()); + let mut ctx = PutContext::new(index_in_writer); + + let mut sequence = crate::INIT_SEQUENCE; + + while let Some(batch) = sst_stream.next().await { + let batch = batch.unwrap(); + + for i in 0..batch.num_rows() { + let row = batch.clone_row_at(i); + + let key_seq = KeySequence::new(sequence, i as u32); + + memtable.put(&mut ctx, key_seq, &row, schema).unwrap(); + + sequence += 1; + } + } +} + +pub async fn file_handles_from_ssts( + store: &File, + space_id: SpaceId, + table_id: TableId, + sst_file_ids: &[FileId], + purge_queue: FilePurgeQueue, + meta_cache: &Option, + data_cache: &Option, +) -> Vec { + let mut file_handles = Vec::with_capacity(sst_file_ids.len()); + + for file_id in sst_file_ids.iter() { + let mut path = store.new_path(); + sst_util::set_sst_file_path(space_id, table_id, *file_id, &mut path); + + let sst_meta = meta_from_sst(store, &path, meta_cache, data_cache).await; + let file_meta = FileMeta { + id: *file_id, + meta: sst_meta, + }; + + let handle = FileHandle::new(file_meta, purge_queue.clone()); + + file_handles.push(handle); + } + + file_handles 
+} diff --git a/build.rs b/build.rs new file mode 100644 index 0000000000..ce2a0fb668 --- /dev/null +++ b/build.rs @@ -0,0 +1,26 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Build script + +use std::env; + +use vergen::{vergen, Config, ShaKind}; + +fn main() { + // Generate the default 'cargo:' instruction output + let mut config = Config::default(); + // Change the SHA output to the short variant + *config.git_mut().sha_kind_mut() = ShaKind::Short; + // Override git branch by env if provided. + if let Some(branch) = env::var_os("GITBRANCH") { + let branch = branch + .into_string() + .expect("Convert git branch env to string"); + if !branch.is_empty() { + *config.git_mut().branch_mut() = false; + println!("cargo:rustc-env=VERGEN_GIT_BRANCH={}", branch); + } + } + + vergen(config).expect("Vergen failed to generate config"); +} diff --git a/catalog/Cargo.toml b/catalog/Cargo.toml new file mode 100644 index 0000000000..14e3eb5c67 --- /dev/null +++ b/catalog/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "catalog" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +# Workspace dependencies, in alphabetical order +async-trait = "0.1.41" +snafu = { version ="0.6.10", features = ["backtraces"]} +common_types = { path = "../common_types" } +common_util = { path = "../common_util" } +table_engine = { path = "../table_engine" } diff --git a/catalog/src/consts.rs b/catalog/src/consts.rs new file mode 100644 index 0000000000..ebac82873c --- /dev/null +++ b/catalog/src/consts.rs @@ -0,0 +1,12 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Catalog constants + +/// Default catalog name +pub const DEFAULT_CATALOG: &str = "ceresdb"; +/// Default schema name +pub const DEFAULT_SCHEMA: &str = "public"; +/// Catalog name of the sys catalog +pub const SYSTEM_CATALOG: &str = "system"; +/// Schema name of the sys catalog +pub const SYSTEM_CATALOG_SCHEMA: &str = "public"; diff --git a/catalog/src/lib.rs b/catalog/src/lib.rs new file mode 100644 index 0000000000..90799b9205 --- /dev/null +++ b/catalog/src/lib.rs @@ -0,0 +1,59 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Common traits and types about catalog (schema) + +#[macro_use] +extern crate common_util; + +pub mod consts; +pub mod manager; +pub mod schema; + +use std::sync::Arc; + +use async_trait::async_trait; +use snafu::{Backtrace, Snafu}; + +use crate::schema::{NameRef, SchemaRef}; + +#[derive(Debug, Snafu)] +#[snafu(visibility = "pub")] +pub enum Error { + #[snafu(display( + "Failed to create schema, catalog:{}, schema:{}, err:{}", + catalog, + schema, + source + ))] + CreateSchema { + catalog: String, + schema: String, + source: Box, + }, + + #[snafu(display("Unsupported method, msg:{}.\nBacktrace:\n{}", msg, backtrace))] + UnSupported { msg: String, backtrace: Backtrace }, +} + +define_result!(Error); + +/// Catalog manage schemas +// TODO(yingwen): Maybe use async trait? +// TODO(yingwen): Provide a context +// TODO(yingwen): Catalog id? 
+#[async_trait] +pub trait Catalog { + /// Get the catalog name + fn name(&self) -> NameRef; + + /// Find schema by name + fn schema_by_name(&self, name: NameRef) -> Result>; + + async fn create_schema<'a>(&'a self, name: NameRef<'a>) -> Result<()>; + + /// All schemas + fn all_schemas(&self) -> Result>; +} + +/// A reference counted catalog pointer +pub type CatalogRef = Arc; diff --git a/catalog/src/manager.rs b/catalog/src/manager.rs new file mode 100644 index 0000000000..fb10637750 --- /dev/null +++ b/catalog/src/manager.rs @@ -0,0 +1,32 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Catalog manager + +use snafu::Snafu; + +use crate::{schema::NameRef, CatalogRef}; + +#[derive(Debug, Snafu)] +pub struct Error; + +define_result!(Error); + +/// Catalog manager abstraction +/// +/// Tracks meta data of databases/tables +// TODO(yingwen): Maybe use async trait? +// TODO(yingwen): Provide a context + +pub trait Manager: Clone + Send + Sync { + /// Get the default catalog name + fn default_catalog_name(&self) -> NameRef; + + /// Get the default schema name + fn default_schema_name(&self) -> NameRef; + + /// Find the catalog by name + fn catalog_by_name(&self, name: NameRef) -> Result>; + + /// All catalogs + fn all_catalogs(&self) -> Result>; +} diff --git a/catalog/src/schema.rs b/catalog/src/schema.rs new file mode 100644 index 0000000000..49c2f6c462 --- /dev/null +++ b/catalog/src/schema.rs @@ -0,0 +1,169 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Schema contains one or more tables + +use std::sync::Arc; + +use async_trait::async_trait; +use common_types::column_schema::ColumnSchema; +use snafu::{Backtrace, Snafu}; +use table_engine::{ + engine::{CreateTableRequest, DropTableRequest, TableEngineRef}, + table::{TableId, TableRef}, +}; + +#[derive(Debug, Snafu)] +#[snafu(visibility(pub))] +pub enum Error { + #[snafu(display("Unsupported method, msg:{}.\nBacktrace:\n{}", msg, backtrace))] + UnSupported { msg: String, backtrace: Backtrace }, + + #[snafu(display("Failed to create table, err:{}", source))] + CreateTable { source: table_engine::engine::Error }, + + #[snafu(display( + "Failed to create table, table already exists, table:{}.\nBacktrace:\n{}", + table, + backtrace + ))] + CreateExistTable { table: String, backtrace: Backtrace }, + + #[snafu(display( + "Failed to create table, cannot persist meta, table:{}, err:{}", + table, + source + ))] + WriteTableMeta { + table: String, + source: Box, + }, + + #[snafu(display( + "Catalog mismatch, expect:{}, given:{}.\nBacktrace:\n{}", + expect, + given, + backtrace + ))] + CatalogMismatch { + expect: String, + given: String, + backtrace: Backtrace, + }, + + #[snafu(display( + "Schema mismatch, expect:{}, given:{}.\nBacktrace:\n{}", + expect, + given, + backtrace + ))] + SchemaMismatch { + expect: String, + given: String, + backtrace: Backtrace, + }, + + #[snafu(display( + "Invalid table id, msg:{}, table_id:{}.\nBacktrace:\n{}", + msg, + table_id, + backtrace + ))] + InvalidTableId { + msg: &'static str, + table_id: TableId, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to find table, table:{}.\nBacktrace:\n{}", table, backtrace))] + TableNotFound { table: String, backtrace: Backtrace }, + + #[snafu(display("Failed to alter table, err:{}", source))] + AlterTable { + source: Box, + }, + + #[snafu(display("Failed to drop table, err:{}", source))] + DropTable { source: table_engine::engine::Error }, + + #[snafu(display( + "Too many table, cannot create table, 
schema:{}, table:{}.\nBacktrace:\n{}", + schema, + table, + backtrace + ))] + TooManyTable { + schema: String, + table: String, + backtrace: Backtrace, + }, +} + +define_result!(Error); + +/// Create table options. +#[derive(Clone)] +pub struct CreateOptions { + /// Table engine + // FIXME(yingwen): We have engine type in create request, remove this + pub table_engine: TableEngineRef, + /// Create if not exists, if table already exists, wont return error + // TODO(yingwen): Maybe remove this? + pub create_if_not_exists: bool, +} + +/// Drop table options. +#[derive(Clone)] +pub struct DropOptions { + /// Table engine + pub table_engine: TableEngineRef, +} + +/// Alter table operations. +#[derive(Debug)] +pub enum AlterTableOperation { + /// Add column operation, the column id in [ColumnSchema] will be ignored. + /// Primary key column is not allowed to be added, so all columns will + /// be added as normal columns. + AddColumn(ColumnSchema), +} + +/// Alter table request. +#[derive(Debug)] +pub struct AlterTableRequest { + pub table_name: String, + pub operations: Vec, +} + +/// Schema manage tables. +#[async_trait] +pub trait Schema { + /// Get schema name. + fn name(&self) -> NameRef; + + /// Find table by name. + fn table_by_name(&self, name: NameRef) -> Result>; + + /// Allocate a table id for given table. + fn alloc_table_id(&self, name: NameRef) -> Result; + + /// Create table according to `request`. + async fn create_table( + &self, + request: CreateTableRequest, + opts: CreateOptions, + ) -> Result; + + /// Drop table according to `request`. + /// + /// Returns true if the table is really dropped. + async fn drop_table(&self, request: DropTableRequest, opts: DropOptions) -> Result; + + /// All tables + fn all_tables(&self) -> Result>; +} + +/// A name reference +pub type NameRef<'a> = &'a str; +/// A reference counted schema pointer +// TODO(yingwen): This name is conflict with [table_engine::schema::SchemaRef]. +pub type SchemaRef = Arc; diff --git a/catalog_impls/Cargo.toml b/catalog_impls/Cargo.toml new file mode 100644 index 0000000000..ddcbdcdeec --- /dev/null +++ b/catalog_impls/Cargo.toml @@ -0,0 +1,23 @@ +[package] +name = "catalog_impls" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +# Workspace dependencies, in alphabetical order +async-trait = "0.1.41" +catalog = { path = "../catalog" } +common_types = { path = "../common_types" } +common_util = { path = "../common_util" } +log = "0.4" +snafu = { version ="0.6.10", features = ["backtraces"]} +system_catalog = { path = "../system_catalog" } +table_engine = { path = "../table_engine" } +tokio = { version = "1.0", features = ["sync"] } + +[dev-dependencies] +analytic_engine = { path = "../analytic_engine", features = ["test"] } +server = { path = "../server" } diff --git a/catalog_impls/src/lib.rs b/catalog_impls/src/lib.rs new file mode 100644 index 0000000000..6f4ca69947 --- /dev/null +++ b/catalog_impls/src/lib.rs @@ -0,0 +1,52 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
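+// Illustrative helper (an assumption of this write-up, not part of the
+// original patch): it shows how the `Schema` trait from the `catalog` crate is
+// typically driven, pairing a `CreateTableRequest` with `CreateOptions` so the
+// call succeeds even if the table already exists.
+#[allow(dead_code)]
+async fn create_table_idempotently(
+    schema: &dyn catalog::schema::Schema,
+    request: table_engine::engine::CreateTableRequest,
+    table_engine: table_engine::engine::TableEngineRef,
+) -> catalog::schema::Result<table_engine::table::TableRef> {
+    // With `create_if_not_exists` set, an existing table is returned instead
+    // of failing with `CreateExistTable`.
+    let opts = catalog::schema::CreateOptions {
+        table_engine,
+        create_if_not_exists: true,
+    };
+    schema.create_table(request, opts).await
+}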
+ +use std::sync::Arc; + +use catalog::{consts::SYSTEM_CATALOG, manager::Manager, schema::NameRef, CatalogRef}; +use system_catalog::{tables::Tables, SystemTableAdapter}; + +use crate::system_tables::{SystemTables, SystemTablesBuilder}; + +pub mod memory; +mod system_tables; +pub mod table_based; + +/// CatalogManagerImpl is a wrapper for system and user tables +#[derive(Clone)] +pub struct CatalogManagerImpl { + system_tables: SystemTables, + user_catalog_manager: M, +} + +impl CatalogManagerImpl { + pub fn new(manager: M) -> Self { + let mut system_tables_builder = SystemTablesBuilder::new(); + system_tables_builder = system_tables_builder + .insert_table(SystemTableAdapter::new(Tables::new(manager.clone()))); + Self { + system_tables: system_tables_builder.build(), + user_catalog_manager: manager, + } + } +} + +impl Manager for CatalogManagerImpl { + fn default_catalog_name(&self) -> NameRef { + self.user_catalog_manager.default_catalog_name() + } + + fn default_schema_name(&self) -> NameRef { + self.user_catalog_manager.default_schema_name() + } + + fn catalog_by_name(&self, name: NameRef) -> catalog::manager::Result> { + match name { + SYSTEM_CATALOG => Ok(Some(Arc::new(self.system_tables.clone()))), + _ => self.user_catalog_manager.catalog_by_name(name), + } + } + + fn all_catalogs(&self) -> catalog::manager::Result> { + self.user_catalog_manager.all_catalogs() + } +} diff --git a/catalog_impls/src/memory.rs b/catalog_impls/src/memory.rs new file mode 100644 index 0000000000..e8ab37bb26 --- /dev/null +++ b/catalog_impls/src/memory.rs @@ -0,0 +1,260 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! A memory catalog implementation +//! +//! Mainly for test + +use std::{ + collections::HashMap, + sync::{Arc, RwLock}, +}; + +use async_trait::async_trait; +use catalog::{ + self, consts, + manager::{self, Manager}, + schema::{ + self, CatalogMismatch, CreateOptions, CreateTable, DropOptions, NameRef, Schema, + SchemaMismatch, SchemaRef, TooManyTable, UnSupported, + }, + Catalog, CatalogRef, +}; +use log::info; +use snafu::{ensure, OptionExt, ResultExt}; +use table_engine::{ + engine::{CreateTableRequest, DropTableRequest}, + table::{SchemaId, SchemaIdGenerator, TableId, TableRef, TableSeqGenerator}, +}; + +struct ManagerImplInner { + catalogs: HashMap, +} + +/// In-memory catalog manager +#[derive(Clone)] +pub struct ManagerImpl { + inner: Arc, +} + +impl Default for ManagerImpl { + fn default() -> Self { + let schema_id_generator = SchemaIdGenerator::default(); + let schema_id = schema_id_generator.alloc_schema_id().unwrap(); + + // Register default schema + let default_schema: SchemaRef = Arc::new(SchemaImpl::new( + consts::DEFAULT_CATALOG.to_string(), + consts::DEFAULT_SCHEMA.to_string(), + schema_id, + )); + let mut schemas = HashMap::new(); + schemas.insert(consts::DEFAULT_SCHEMA.to_string(), default_schema); + + // Use above schemas to create a default catalog + let default_catalog: CatalogRef = Arc::new(CatalogImpl { + name: consts::DEFAULT_CATALOG.to_string(), + schemas: RwLock::new(schemas), + schema_id_generator: Arc::new(schema_id_generator), + }); + // Register default catalog + let mut catalogs = HashMap::new(); + catalogs.insert(consts::DEFAULT_CATALOG.to_string(), default_catalog); + + Self { + inner: Arc::new(ManagerImplInner { catalogs }), + } + } +} + +impl Manager for ManagerImpl { + fn default_catalog_name(&self) -> NameRef { + consts::DEFAULT_CATALOG + } + + fn default_schema_name(&self) -> NameRef { + consts::DEFAULT_SCHEMA + } + + fn 
catalog_by_name(&self, name: NameRef) -> manager::Result> { + let catalog = self.inner.catalogs.get(name).cloned(); + Ok(catalog) + } + + fn all_catalogs(&self) -> manager::Result> { + Ok(self.inner.catalogs.iter().map(|(_, v)| v.clone()).collect()) + } +} + +/// In-memory catalog +struct CatalogImpl { + /// Catalog name + name: String, + /// Schemas of catalog + schemas: RwLock>, + /// Global schema id generator, Each schema has a unique schema id. + schema_id_generator: Arc, +} + +#[async_trait] +impl Catalog for CatalogImpl { + fn name(&self) -> NameRef { + &self.name + } + + fn schema_by_name(&self, name: NameRef) -> catalog::Result> { + let schema = self.schemas.read().unwrap().get(name).cloned(); + Ok(schema) + } + + async fn create_schema<'a>(&'a self, name: NameRef<'a>) -> catalog::Result<()> { + let mut schemas = self.schemas.write().unwrap(); + + if schemas.get(name).is_some() { + return Ok(()); + } + + let schema_id = self.schema_id_generator.alloc_schema_id().unwrap(); + + let schema: SchemaRef = Arc::new(SchemaImpl::new( + self.name.to_string(), + name.to_string(), + schema_id, + )); + + schemas.insert(name.to_string(), schema); + info!( + "create schema success, catalog:{}, schema:{}", + &self.name, name + ); + Ok(()) + } + + fn all_schemas(&self) -> catalog::Result> { + Ok(self + .schemas + .read() + .unwrap() + .iter() + .map(|(_, v)| v.clone()) + .collect()) + } +} + +/// In-memory schema +struct SchemaImpl { + /// Catalog name + catalog_name: String, + /// Schema name + schema_name: String, + /// Tables of schema + tables: RwLock>, + schema_id: SchemaId, + table_seq_generator: TableSeqGenerator, +} + +impl SchemaImpl { + fn new(catalog_name: String, schema_name: String, schema_id: SchemaId) -> Self { + Self { + catalog_name, + schema_name, + tables: RwLock::new(HashMap::new()), + schema_id, + table_seq_generator: TableSeqGenerator::default(), + } + } +} + +#[async_trait] +impl Schema for SchemaImpl { + fn name(&self) -> NameRef { + &self.schema_name + } + + fn table_by_name(&self, name: NameRef) -> schema::Result> { + let table = self.tables.read().unwrap().get(name).cloned(); + Ok(table) + } + + fn alloc_table_id(&self, name: NameRef) -> schema::Result { + let table_seq = self + .table_seq_generator + .alloc_table_seq() + .context(TooManyTable { + schema: &self.schema_name, + table: name, + })?; + + Ok(TableId::new(self.schema_id, table_seq)) + } + + // In memory schema does not support persisting table info + async fn create_table( + &self, + request: CreateTableRequest, + opts: CreateOptions, + ) -> schema::Result { + ensure!( + self.catalog_name == request.catalog_name, + CatalogMismatch { + expect: &self.catalog_name, + given: request.catalog_name, + } + ); + ensure!( + self.schema_name == request.schema_name, + SchemaMismatch { + expect: &self.schema_name, + given: request.schema_name, + } + ); + + { + // Check table existence + let tables = self.tables.read().unwrap(); + if let Some(table) = tables.get(&request.table_name) { + return Ok(table.clone()); + } + } + + // Table engine handles duplicate table creation + let table_name = request.table_name.clone(); + let table = opts + .table_engine + .create_table(request) + .await + .context(CreateTable)?; + + { + // Now the table engine have create the table, but we may not be the + // creator thread + let mut tables = self.tables.write().unwrap(); + tables.entry(table_name).or_insert_with(|| table.clone()); + } + + Ok(table) + } + + async fn drop_table( + &self, + request: DropTableRequest, + _opts: DropOptions, + ) 
-> schema::Result { + UnSupported { + msg: format!( + "Dropping table is not supported by memory catalog, request:{:?}", + request + ), + } + .fail() + } + + fn all_tables(&self) -> schema::Result> { + Ok(self + .tables + .read() + .unwrap() + .iter() + .map(|(_, v)| v.clone()) + .collect()) + } +} diff --git a/catalog_impls/src/system_tables.rs b/catalog_impls/src/system_tables.rs new file mode 100644 index 0000000000..672f3fa8f6 --- /dev/null +++ b/catalog_impls/src/system_tables.rs @@ -0,0 +1,131 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Contains System tables, such as system.public.tables + +use std::{collections::HashMap, sync::Arc}; + +use async_trait::async_trait; +use catalog::{ + consts::{SYSTEM_CATALOG, SYSTEM_CATALOG_SCHEMA}, + schema::{CreateOptions, DropOptions, NameRef, Schema, SchemaRef}, + Catalog, +}; +use system_catalog::SystemTableAdapter; +use table_engine::{ + engine::{CreateTableRequest, DropTableRequest}, + table::{Table, TableId, TableRef}, +}; + +const UNSUPPORTED_MSG: &str = "system tables not supported"; + +pub struct SystemTablesBuilder { + tables: HashMap>, +} + +impl SystemTablesBuilder { + pub fn new() -> Self { + Self { + tables: HashMap::new(), + } + } + + pub fn insert_table(mut self, table: SystemTableAdapter) -> Self { + self.tables + .insert(table.name().to_string(), Arc::new(table)); + self + } + + pub fn build(self) -> SystemTables { + SystemTables::new(self.tables) + } +} + +#[derive(Clone)] +pub struct SystemTables { + tables: Arc>>, +} + +impl SystemTables { + pub fn new(tables: HashMap>) -> Self { + Self { + tables: Arc::new(tables), + } + } +} + +#[async_trait] +impl Schema for SystemTables { + fn name(&self) -> NameRef { + SYSTEM_CATALOG_SCHEMA + } + + fn table_by_name(&self, name: NameRef) -> catalog::schema::Result> { + Ok(self.tables.get(name).map(|v| v.clone() as TableRef)) + } + + fn alloc_table_id(&self, _name: NameRef) -> catalog::schema::Result { + catalog::schema::UnSupported { + msg: UNSUPPORTED_MSG, + } + .fail() + } + + async fn create_table( + &self, + _request: CreateTableRequest, + _opts: CreateOptions, + ) -> catalog::schema::Result { + catalog::schema::UnSupported { + msg: UNSUPPORTED_MSG, + } + .fail() + } + + async fn drop_table( + &self, + _request: DropTableRequest, + _opts: DropOptions, + ) -> catalog::schema::Result { + catalog::schema::UnSupported { + msg: UNSUPPORTED_MSG, + } + .fail() + } + + fn all_tables(&self) -> catalog::schema::Result> { + Ok(self + .tables + .iter() + .map(|(_, v)| v.clone() as TableRef) + .collect()) + } +} + +#[async_trait] +impl Catalog for SystemTables { + fn name(&self) -> NameRef { + SYSTEM_CATALOG + } + + fn schema_by_name(&self, name: NameRef) -> catalog::Result> { + if name == SYSTEM_CATALOG_SCHEMA { + Ok(Some(Arc::new(self.clone()))) + } else { + Ok(None) + } + } + + async fn create_schema<'a>(&'a self, _name: NameRef<'a>) -> catalog::Result<()> { + catalog::UnSupported { + msg: UNSUPPORTED_MSG, + } + .fail() + } + + fn all_schemas(&self) -> catalog::Result> { + catalog::UnSupported { + msg: UNSUPPORTED_MSG, + } + .fail() + } +} diff --git a/catalog_impls/src/table_based.rs b/catalog_impls/src/table_based.rs new file mode 100644 index 0000000000..60c578a530 --- /dev/null +++ b/catalog_impls/src/table_based.rs @@ -0,0 +1,1126 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Table based catalog implementation + +use std::{ + collections::HashMap, + sync::{Arc, RwLock}, +}; + +use async_trait::async_trait; +use catalog::{ + self, consts, + manager::{self, Manager}, + schema::{ + self, CatalogMismatch, CreateExistTable, CreateOptions, CreateTable, DropOptions, + DropTable, InvalidTableId, NameRef, Schema, SchemaMismatch, SchemaRef, TooManyTable, + WriteTableMeta, + }, + Catalog, CatalogRef, +}; +use common_util::define_result; +use log::{debug, error, info}; +use snafu::{ensure, Backtrace, OptionExt, ResultExt, Snafu}; +use system_catalog::sys_catalog_table::{ + self, CreateCatalogRequest, CreateSchemaRequest, SysCatalogTable, Visitor, + VisitorCatalogNotFound, VisitorOpenTable, VisitorSchemaNotFound, +}; +use table_engine::{ + engine::{ + CreateTableRequest, DropTableRequest, OpenTableRequest, TableEngine, TableEngineRef, + TableState, + }, + table::{ + ReadOptions, SchemaId, SchemaIdGenerator, TableId, TableInfo, TableRef, TableSeqGenerator, + }, +}; +use tokio::sync::Mutex; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to build sys catalog table, err:{}", source))] + BuildSysCatalog { + source: system_catalog::sys_catalog_table::Error, + }, + + #[snafu(display("Failed to visit sys catalog table, err:{}", source))] + VisitSysCatalog { + source: system_catalog::sys_catalog_table::Error, + }, + + #[snafu(display( + "Failed to find table to update, name:{}.\nBacktrace:\n{}", + name, + backtrace + ))] + UpdateTableNotFound { name: String, backtrace: Backtrace }, + + #[snafu(display("Failed to create catalog, catalog:{}, err:{}", catalog, source))] + CreateCatalog { + catalog: String, + source: system_catalog::sys_catalog_table::Error, + }, + + #[snafu(display( + "Failed to create schema, catalog:{}, schema:{}, err:{}", + catalog, + schema, + source + ))] + CreateSchema { + catalog: String, + schema: String, + source: system_catalog::sys_catalog_table::Error, + }, +} + +define_result!(Error); + +/// Table based catalog manager +#[derive(Clone)] +pub struct TableBasedManager { + inner: Arc, +} + +impl Manager for TableBasedManager { + fn default_catalog_name(&self) -> NameRef { + consts::DEFAULT_CATALOG + } + + fn default_schema_name(&self) -> NameRef { + consts::DEFAULT_SCHEMA + } + + fn catalog_by_name(&self, name: NameRef) -> manager::Result> { + let catalog = self.inner.catalogs.get(name).cloned().map(|v| v as _); + Ok(catalog) + } + + fn all_catalogs(&self) -> manager::Result> { + Ok(self + .inner + .catalogs + .iter() + .map(|(_, v)| v.clone() as _) + .collect()) + } +} + +impl TableBasedManager { + /// Create and init the TableBasedManager. + // TODO(yingwen): Define all constants in catalog crate. + pub async fn new(backend: &T, engine_proxy: TableEngineRef) -> Result { + // Create or open sys_catalog table, will also create a space (catalog + schema) + // for system catalog. 
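+        // `backend` is itself a table engine: the catalog metadata is stored
+        // in an ordinary table owned by that engine, which is what makes this
+        // manager "table based".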
+ let catalog_table = SysCatalogTable::new(backend) + .await + .context(BuildSysCatalog)?; + + let mut inner = Inner { + catalog_table: Arc::new(catalog_table), + catalogs: HashMap::new(), + engine_proxy, + schema_id_generator: Arc::new(SchemaIdGenerator::default()), + }; + + inner.init().await?; + + Ok(Self { + inner: Arc::new(inner), + }) + } + + #[cfg(test)] + pub fn get_engine_proxy(&self) -> TableEngineRef { + self.inner.engine_proxy.clone() + } +} + +type CatalogMap = HashMap>; + +/// Inner state of TableBasedManager +struct Inner { + /// Sys catalog table + catalog_table: Arc, + catalogs: CatalogMap, + /// Table engine proxy + engine_proxy: TableEngineRef, + /// Global schema id generator, Each schema has a unique schema id. + schema_id_generator: Arc, +} + +impl Inner { + /// Load all data from sys catalog table. + async fn init(&mut self) -> Result<()> { + // The system catalog and schema in it is not persisted, so we add it manually. + self.load_system_catalog(); + + let mut visitor = VisitorImpl { + catalog_table: self.catalog_table.clone(), + catalogs: &mut self.catalogs, + engine_proxy: self.engine_proxy.clone(), + schema_id_generator: self.schema_id_generator.clone(), + }; + + // Load all existent catalog/schema/tables from catalog_table. + let opts = ReadOptions::default(); + self.catalog_table + .visit(opts, &mut visitor) + .await + .context(VisitSysCatalog)?; + + // Create default catalog if it is not exists. + self.maybe_create_default_catalog().await?; + + Ok(()) + } + + fn load_system_catalog(&mut self) { + // Get the `sys_catalog` table and add it to tables. + let table = self.catalog_table.inner_table(); + let mut tables = SchemaTables::default(); + tables.insert(self.catalog_table.table_id(), table); + + // Use schema id of schema `system/public` as last schema id. + let schema_id = sys_catalog_table::SCHEMA_ID; + self.schema_id_generator.set_last_schema_id(schema_id); + + // Create the default schema in system catalog. + let schema = Arc::new(SchemaImpl { + catalog_name: consts::SYSTEM_CATALOG.to_string(), + schema_name: consts::SYSTEM_CATALOG_SCHEMA.to_string(), + schema_id, + tables: RwLock::new(tables), + mutex: Mutex::new(()), + catalog_table: self.catalog_table.clone(), + table_seq_generator: TableSeqGenerator::default(), + }); + // Use table seq of `sys_catalog` table as last table seq. + schema + .table_seq_generator + .set_last_table_seq(sys_catalog_table::TABLE_SEQ); + + let mut schemas = HashMap::new(); + schemas.insert(schema.name().to_string(), schema); + + let schema_id_generator = self.schema_id_generator.clone(); + let catalog_table = self.catalog_table.clone(); + // Create the system catalog. + let catalog = Arc::new(CatalogImpl { + name: consts::SYSTEM_CATALOG.to_string(), + schemas: RwLock::new(schemas), + schema_id_generator, + catalog_table, + mutex: Mutex::new(()), + }); + + self.catalogs.insert(catalog.name().to_string(), catalog); + } + + async fn maybe_create_default_catalog(&mut self) -> Result<()> { + // Try to get default catalog, create it if not exists. + let catalog = match self.catalogs.get(consts::DEFAULT_CATALOG) { + Some(v) => v.clone(), + None => { + // Only system catalog should exists. + assert_eq!(1, self.catalogs.len()); + + // Default catalog is not exists, create and store it. + let default_catalog = self + .create_catalog(CreateCatalogRequest { + catalog_name: consts::DEFAULT_CATALOG.to_string(), + }) + .await?; + + default_catalog + } + }; + + // Create default schema if not exists. 
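+        // A catalog created above starts with an empty schema map, so the
+        // default schema still has to be allocated an id and persisted below.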
+ if catalog.find_schema(consts::DEFAULT_SCHEMA).is_none() { + // Allocate schema id. + let schema_id = self + .schema_id_generator + .alloc_schema_id() + .expect("Schema id of default catalog should be valid"); + + self.add_schema_to_catalog( + CreateSchemaRequest { + catalog_name: consts::DEFAULT_CATALOG.to_string(), + schema_name: consts::DEFAULT_SCHEMA.to_string(), + schema_id, + }, + &*catalog, + ) + .await?; + } + + Ok(()) + } + + async fn create_catalog(&mut self, request: CreateCatalogRequest) -> Result> { + let catalog_name = request.catalog_name.clone(); + + self.catalog_table + .create_catalog(request) + .await + .context(CreateCatalog { + catalog: &catalog_name, + })?; + + let schema_id_generator = self.schema_id_generator.clone(); + let catalog_table = self.catalog_table.clone(); + let catalog = Arc::new(CatalogImpl { + name: catalog_name.clone(), + schemas: RwLock::new(HashMap::new()), + schema_id_generator, + catalog_table, + mutex: Mutex::new(()), + }); + + self.catalogs.insert(catalog_name, catalog.clone()); + + Ok(catalog) + } + + async fn add_schema_to_catalog( + &mut self, + request: CreateSchemaRequest, + catalog: &CatalogImpl, + ) -> Result> { + let schema_name = request.schema_name.clone(); + let schema_id = request.schema_id; + + self.catalog_table + .create_schema(request) + .await + .context(CreateSchema { + catalog: &catalog.name, + schema: &schema_name, + })?; + + let schema = Arc::new(SchemaImpl::new( + &catalog.name, + &schema_name, + schema_id, + self.catalog_table.clone(), + )); + + catalog.insert_schema_into_memory(schema.clone()); + + Ok(schema) + } +} + +/// Sys catalog visitor implementation, used to load catalog info +struct VisitorImpl<'a> { + catalog_table: Arc, + catalogs: &'a mut CatalogMap, + engine_proxy: TableEngineRef, + schema_id_generator: Arc, +} + +#[async_trait] +impl<'a> Visitor for VisitorImpl<'a> { + fn visit_catalog(&mut self, request: CreateCatalogRequest) -> sys_catalog_table::Result<()> { + debug!("Visitor visit catalog, request:{:?}", request); + let schema_id_generator = self.schema_id_generator.clone(); + let catalog_table = self.catalog_table.clone(); + + let catalog = CatalogImpl { + name: request.catalog_name.to_string(), + schemas: RwLock::new(HashMap::new()), + schema_id_generator, + catalog_table, + mutex: Mutex::new(()), + }; + + // Register catalog. + self.catalogs + .insert(request.catalog_name, Arc::new(catalog)); + + Ok(()) + } + + fn visit_schema(&mut self, request: CreateSchemaRequest) -> sys_catalog_table::Result<()> { + debug!("Visitor visit schema, request:{:?}", request); + + let catalog = + self.catalogs + .get_mut(&request.catalog_name) + .context(VisitorCatalogNotFound { + catalog: &request.catalog_name, + })?; + + let schema_id = request.schema_id; + let schema = Arc::new(SchemaImpl::new( + &request.catalog_name, + &request.schema_name, + schema_id, + self.catalog_table.clone(), + )); + + // If schema exists, we overwrite it. + catalog.insert_schema_into_memory(schema); + + // Update last schema id. 
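+        // Keep the generator ahead of every recovered schema id so that ids
+        // allocated later never collide with schemas already persisted.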
+ if self.schema_id_generator.last_schema_id_u32() < schema_id.as_u32() { + self.schema_id_generator.set_last_schema_id(schema_id); + } + + Ok(()) + } + + async fn visit_tables(&mut self, table_info: TableInfo) -> sys_catalog_table::Result<()> { + debug!("Visitor visit tables, table_info:{:?}", table_info); + + let catalog = + self.catalogs + .get_mut(&table_info.catalog_name) + .context(VisitorCatalogNotFound { + catalog: &table_info.catalog_name, + })?; + let schema = + catalog + .find_schema(&table_info.schema_name) + .context(VisitorSchemaNotFound { + catalog: &table_info.catalog_name, + schema: &table_info.schema_name, + })?; + + // Update max table sequence of the schema. + let table_id = table_info.table_id; + let table_seq = table_id.table_seq(); + if table_seq.as_u64() >= schema.table_seq_generator.last_table_seq_u64() { + schema.table_seq_generator.set_last_table_seq(table_seq); + } + + // Only the stable/altering table can be opened. + if !matches!(table_info.state, TableState::Stable) { + debug!( + "Visitor visit a unstable table, table_info:{:?}", + table_info + ); + return Ok(()); + } + + let open_request = OpenTableRequest::from(table_info); + let table_name = open_request.table_name.clone(); + let table_opt = self + .engine_proxy + .open_table(open_request) + .await + .context(VisitorOpenTable)?; + + match table_opt { + Some(table) => { + schema.insert_table_into_memory(table_id, table); + } + None => { + // Now we ignore the error that table not in engine but in catalog. + error!( + "Visitor found table not in engine, table_name:{:?}", + table_name + ); + } + } + + Ok(()) + } +} + +type SchemaMap = HashMap>; + +/// Table based catalog +struct CatalogImpl { + /// Catalog name + name: String, + /// Schemas of catalog + // Now the Schema trait does not support create schema, so we use impl type here + schemas: RwLock, + /// Global schema id generator, Each schema has a unique schema id. + schema_id_generator: Arc, + /// Sys catalog table + catalog_table: Arc, + /// Mutex + /// + /// Protects: + /// - create schema + /// - persist to default catalog + mutex: Mutex<()>, +} + +impl CatalogImpl { + /// Insert schema + fn insert_schema_into_memory(&self, schema: Arc) { + let mut schemas = self.schemas.write().unwrap(); + schemas.insert(schema.name().to_string(), schema); + } + + fn find_schema(&self, schema_name: &str) -> Option> { + let schemas = self.schemas.read().unwrap(); + schemas.get(schema_name).cloned() + } +} + +// TODO(yingwen): Support add schema (with options to control schema +// persistence) +#[async_trait] +impl Catalog for CatalogImpl { + fn name(&self) -> NameRef { + &self.name + } + + fn schema_by_name(&self, name: NameRef) -> catalog::Result> { + let schemas = self.schemas.read().unwrap(); + let schema = schemas.get(name).cloned().map(|v| v as _); + Ok(schema) + } + + async fn create_schema<'a>(&'a self, name: NameRef<'a>) -> catalog::Result<()> { + // Check schema existence + if self.schema_by_name(name)?.is_some() { + return Ok(()); + } + + // Lock schema and persist schema to default catalog + let _lock = self.mutex.lock().await; + // Check again + if self.schema_by_name(name)?.is_some() { + return Ok(()); + } + + // Allocate schema id. 
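+        // The id comes from the catalog-wide generator that is also shared
+        // with recovery, so a new schema never reuses a recovered id.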
+ let schema_id = self + .schema_id_generator + .alloc_schema_id() + .expect("Schema id of default catalog should be valid"); + + let request = CreateSchemaRequest { + catalog_name: self.name.to_string(), + schema_name: name.to_string(), + schema_id, + }; + + let schema_id = request.schema_id; + + self.catalog_table + .create_schema(request) + .await + .map_err(|e| Box::new(e) as _) + .context(catalog::CreateSchema { + catalog: &self.name, + schema: &name.to_string(), + })?; + + let schema = Arc::new(SchemaImpl::new( + &self.name, + name, + schema_id, + self.catalog_table.clone(), + )); + + self.insert_schema_into_memory(schema); + info!( + "create schema success, catalog:{}, schema:{}", + &self.name, name + ); + Ok(()) + } + + fn all_schemas(&self) -> catalog::Result> { + Ok(self + .schemas + .read() + .unwrap() + .iter() + .map(|(_, v)| v.clone() as _) + .collect()) + } +} + +/// Table based schema +struct SchemaImpl { + /// Catalog name + catalog_name: String, + /// Schema name + schema_name: String, + /// Schema id + schema_id: SchemaId, + /// Tables of schema + tables: RwLock, + /// Mutex + /// + /// Protects: + /// - add/drop/alter table + /// - persist to sys catalog table + mutex: Mutex<()>, + /// Sys catalog table + catalog_table: Arc, + table_seq_generator: TableSeqGenerator, +} + +impl SchemaImpl { + fn new( + catalog_name: &str, + schema_name: &str, + schema_id: SchemaId, + catalog_table: Arc, + ) -> Self { + Self { + catalog_name: catalog_name.to_string(), + schema_name: schema_name.to_string(), + schema_id, + tables: RwLock::new(SchemaTables::default()), + mutex: Mutex::new(()), + catalog_table, + table_seq_generator: TableSeqGenerator::default(), + } + } + + /// Insert table into memory, wont check existence + fn insert_table_into_memory(&self, table_id: TableId, table: TableRef) { + let mut tables = self.tables.write().unwrap(); + tables.insert(table_id, table); + } + + /// Check table existence in read lock + /// + /// If table exists: + /// - if create_if_not_exists is true, return Ok + /// - if create_if_not_exists is false, return Error + fn check_create_table_read( + &self, + request: &CreateTableRequest, + create_if_not_exists: bool, + ) -> schema::Result> { + let table_id = request.table_id; + ensure!( + self.schema_id == table_id.schema_id(), + InvalidTableId { + msg: "schema id unmatch", + table_id, + } + ); + + let tables = self.tables.read().unwrap(); + if let Some(table) = tables.tables_by_name.get(&request.table_name) { + // Already exists + if create_if_not_exists { + // Create if not exists is set + return Ok(Some(table.clone())); + } + // Create if not exists is not set, need to return error + return CreateExistTable { + table: &request.table_name, + } + .fail(); + } + + // Table is not exists, check whether table id is unique under this schema. 
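+        // The name is free; also reject a request whose id is already mapped
+        // so `tables_by_id` and `tables_by_name` stay consistent.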
+ let table_by_id = tables.tables_by_id.get(&request.table_id); + ensure!( + table_by_id.is_none(), + InvalidTableId { + msg: "table with given id already exists", + table_id, + } + ); + + Ok(None) + } + + fn find_table_by_name(&self, name: NameRef) -> Option { + self.tables + .read() + .unwrap() + .tables_by_name + .get(name) + .cloned() + } +} + +#[derive(Default)] +struct SchemaTables { + tables_by_name: HashMap, + tables_by_id: HashMap, +} + +impl SchemaTables { + fn insert(&mut self, table_id: TableId, table: TableRef) { + self.tables_by_name + .insert(table.name().to_string(), table.clone()); + self.tables_by_id.insert(table_id, table); + } + + fn remove(&mut self, name: NameRef) { + if let Some(table) = self.tables_by_name.remove(name) { + self.tables_by_id.remove(&table.id()); + } + } +} + +#[async_trait] +impl Schema for SchemaImpl { + fn name(&self) -> NameRef { + &self.schema_name + } + + fn table_by_name(&self, name: NameRef) -> schema::Result> { + let table = self + .tables + .read() + .unwrap() + .tables_by_name + .get(name) + .cloned(); + Ok(table) + } + + fn alloc_table_id(&self, name: NameRef) -> schema::Result { + let table_seq = self + .table_seq_generator + .alloc_table_seq() + .context(TooManyTable { + schema: &self.schema_name, + table: name, + })?; + + Ok(TableId::new(self.schema_id, table_seq)) + } + + // TODO(yingwen): Do not persist if engine is memory engine. + async fn create_table( + &self, + request: CreateTableRequest, + opts: CreateOptions, + ) -> schema::Result { + info!( + "Table based catalog manager create table, request:{:?}", + request + ); + + ensure!( + self.catalog_name == request.catalog_name, + CatalogMismatch { + expect: &self.catalog_name, + given: request.catalog_name, + } + ); + ensure!( + self.schema_name == request.schema_name, + SchemaMismatch { + expect: &self.schema_name, + given: request.schema_name, + } + ); + // TODO(yingwen): Validate table id is unique. + + // Check table existence + if let Some(table) = self.check_create_table_read(&request, opts.create_if_not_exists)? { + return Ok(table); + } + + // Lock schema and persist table to sys catalog table + let _lock = self.mutex.lock().await; + // Check again + if let Some(table) = self.check_create_table_read(&request, opts.create_if_not_exists)? { + return Ok(table); + } + + // Create table + let table_name = request.table_name.clone(); + let table = opts + .table_engine + .create_table(request.clone()) + .await + .context(CreateTable)?; + assert_eq!(table_name, table.name()); + + self.catalog_table + .create_table(request.clone().into()) + .await + .map_err(|e| Box::new(e) as _) + .context(WriteTableMeta { + table: &request.table_name, + })?; + + { + // Insert into memory + let mut tables = self.tables.write().unwrap(); + tables.insert(request.table_id, table.clone()); + } + + Ok(table) + } + + async fn drop_table( + &self, + mut request: DropTableRequest, + opts: DropOptions, + ) -> schema::Result { + info!( + "Table based catalog manager drop table, request:{:?}", + request + ); + + if self.find_table_by_name(&request.table_name).is_none() { + return Ok(false); + }; + + let _lock = self.mutex.lock().await; + // double check whether the table to drop exists. + let table = match self.find_table_by_name(&request.table_name) { + Some(v) => v, + None => return Ok(false), + }; + + // Determine the real engine type of the table to drop. + // FIXME(xikai): the engine should not be part of the DropRequest. 
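+        // Until that refactor, copy the engine type recorded on the resolved
+        // table into the request before handing it to the table engine.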
+ request.engine = table.engine_type().to_string(); + + // Prepare to drop table info in the sys_catalog. + self.catalog_table + .prepare_drop_table(request.clone()) + .await + .map_err(|e| Box::new(e) as _) + .context(WriteTableMeta { + table: &request.table_name, + })?; + + let dropped = opts + .table_engine + .drop_table(request.clone()) + .await + .context(DropTable)?; + + info!( + "Table engine drop table successfully, request:{:?}, dropped:{}", + request, dropped + ); + + // Update the drop table record into the sys_catalog_table. + self.catalog_table + .drop_table(request.clone()) + .await + .map_err(|e| Box::new(e) as _) + .context(WriteTableMeta { + table: &request.table_name, + })?; + + { + let mut tables = self.tables.write().unwrap(); + tables.remove(&request.table_name); + }; + + info!( + "Table based catalog manager drop table successfully, request:{:?}", + request + ); + + return Ok(true); + } + + fn all_tables(&self) -> schema::Result> { + Ok(self + .tables + .read() + .unwrap() + .tables_by_name + .iter() + .map(|(_, v)| v.clone()) + .collect()) + } +} + +#[cfg(any(test, feature = "test"))] +mod tests { + use std::{collections::HashMap, sync::Arc}; + + use analytic_engine::{tests::util::TestEnv, AnalyticTableEngine}; + use catalog::{ + consts::{DEFAULT_CATALOG, DEFAULT_SCHEMA}, + manager::Manager, + schema::{CreateOptions, DropOptions, SchemaRef}, + }; + use server::table_engine::{MemoryTableEngine, TableEngineProxy}; + use table_engine::{ + engine::{CreateTableRequest, DropTableRequest, TableState}, + ANALYTIC_ENGINE_TYPE, + }; + + use crate::table_based::TableBasedManager; + + async fn build_catalog_manager(analytic: AnalyticTableEngine) -> TableBasedManager { + // Create table engine proxy + let memory = MemoryTableEngine; + + let engine_proxy = Arc::new(TableEngineProxy { + memory, + analytic: analytic.clone(), + }); + + // Create catalog manager, use analytic table as backend + TableBasedManager::new(&analytic, engine_proxy.clone()) + .await + .unwrap_or_else(|e| { + panic!("Failed to create catalog manager, err:{}", e); + }) + } + + async fn build_default_schema_with_catalog(catalog_manager: &TableBasedManager) -> SchemaRef { + let catalog_name = catalog_manager.default_catalog_name(); + let schema_name = catalog_manager.default_schema_name(); + let catalog = catalog_manager.catalog_by_name(catalog_name); + assert!(catalog.is_ok()); + assert!(catalog.as_ref().unwrap().is_some()); + catalog + .as_ref() + .unwrap() + .as_ref() + .unwrap() + .schema_by_name(schema_name) + .unwrap() + .unwrap() + } + + async fn build_default_schema(analytic: AnalyticTableEngine) -> SchemaRef { + let catalog_manager = build_catalog_manager(analytic).await; + let catalog_name = catalog_manager.default_catalog_name(); + let schema_name = catalog_manager.default_schema_name(); + let catalog = catalog_manager.catalog_by_name(catalog_name); + assert!(catalog.is_ok()); + assert!(catalog.as_ref().unwrap().is_some()); + catalog + .as_ref() + .unwrap() + .as_ref() + .unwrap() + .schema_by_name(schema_name) + .unwrap() + .unwrap() + } + + fn build_create_table_req(table_name: &str, schema: SchemaRef) -> CreateTableRequest { + CreateTableRequest { + catalog_name: DEFAULT_CATALOG.to_string(), + schema_name: DEFAULT_SCHEMA.to_string(), + table_id: schema.alloc_table_id(table_name).unwrap(), + table_name: table_name.to_string(), + table_schema: common_types::tests::build_schema(), + partition_info: None, + engine: ANALYTIC_ENGINE_TYPE.to_string(), + options: HashMap::new(), + state: TableState::Stable, 
+ } + } + + #[tokio::test] + async fn test_catalog_by_name_schema_by_name() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + test_ctx.open().await; + + let catalog_manager = build_catalog_manager(test_ctx.engine()).await; + let catalog_name = catalog_manager.default_catalog_name(); + let schema_name = catalog_manager.default_schema_name(); + let catalog = catalog_manager.catalog_by_name(catalog_name); + assert!(catalog.is_ok()); + assert!(catalog.as_ref().unwrap().is_some()); + let schema = catalog + .as_ref() + .unwrap() + .as_ref() + .unwrap() + .schema_by_name(schema_name); + assert!(schema.is_ok()); + assert!(schema.as_ref().unwrap().is_some()); + + let schema_name2 = "test"; + let schema = catalog + .as_ref() + .unwrap() + .as_ref() + .unwrap() + .schema_by_name(schema_name2); + assert!(schema.is_ok()); + assert!(schema.as_ref().unwrap().is_none()); + + let catalog_name2 = "test"; + let catalog = catalog_manager.catalog_by_name(catalog_name2); + assert!(catalog.is_ok()); + assert!(catalog.as_ref().unwrap().is_none()); + } + + #[tokio::test] + async fn test_maybe_create_schema_by_name() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + test_ctx.open().await; + + let catalog_manager = build_catalog_manager(test_ctx.engine()).await; + let catalog_name = catalog_manager.default_catalog_name(); + let catalog = catalog_manager.catalog_by_name(catalog_name); + assert!(catalog.is_ok()); + assert!(catalog.as_ref().unwrap().is_some()); + + let schema_name = "test"; + let catalog_ref = catalog.as_ref().unwrap().as_ref().unwrap(); + let mut schema = catalog_ref.schema_by_name(schema_name); + assert!(schema.is_ok()); + assert!(schema.as_ref().unwrap().is_none()); + + catalog_ref.create_schema(schema_name).await.unwrap(); + schema = catalog_ref.schema_by_name(schema_name); + assert!(schema.is_ok()); + assert!(schema.as_ref().unwrap().is_some()); + } + + #[tokio::test] + async fn test_alloc_table_id() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + test_ctx.open().await; + + let schema = build_default_schema(test_ctx.engine()).await; + let table_id = schema.alloc_table_id("test").unwrap(); + let expected_id = 2u64 << 40 | 1u64; + assert_eq!(table_id.as_u64(), expected_id); + } + + #[tokio::test] + async fn test_create_table() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + test_ctx.open().await; + + let catalog_manager = build_catalog_manager(test_ctx.engine()).await; + let schema = build_default_schema_with_catalog(&catalog_manager).await; + + let table_name = "test"; + let request = build_create_table_req(table_name, schema.clone()); + + let opts = CreateOptions { + table_engine: catalog_manager.get_engine_proxy(), + create_if_not_exists: true, + }; + + schema + .create_table(request.clone(), opts.clone()) + .await + .unwrap(); + assert!(schema.table_by_name(table_name).unwrap().is_some()); + + // create again + schema.create_table(request.clone(), opts).await.unwrap(); + assert!(schema.table_by_name(table_name).unwrap().is_some()); + + let opts2 = CreateOptions { + table_engine: catalog_manager.get_engine_proxy(), + create_if_not_exists: false, + }; + assert!(schema.create_table(request.clone(), opts2).await.is_err()); + } + + #[tokio::test] + async fn test_drop_table() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + test_ctx.open().await; + + let catalog_manager = build_catalog_manager(test_ctx.engine()).await; + let 
schema = build_default_schema_with_catalog(&catalog_manager).await; + + let table_name = "test"; + let engine_name = "test_engine"; + let drop_table_request = DropTableRequest { + catalog_name: DEFAULT_CATALOG.to_string(), + schema_name: DEFAULT_SCHEMA.to_string(), + table_name: table_name.to_string(), + engine: engine_name.to_string(), + }; + let drop_table_opts = DropOptions { + table_engine: catalog_manager.get_engine_proxy(), + }; + + assert!(!schema + .drop_table(drop_table_request.clone(), drop_table_opts.clone()) + .await + .unwrap()); + + let create_table_request = build_create_table_req(table_name, schema.clone()); + let create_table_opts = CreateOptions { + table_engine: catalog_manager.get_engine_proxy(), + create_if_not_exists: true, + }; + + // create table + { + schema + .create_table(create_table_request.clone(), create_table_opts.clone()) + .await + .unwrap(); + // check table exists + assert!(schema.table_by_name(table_name).unwrap().is_some()); + } + + // drop table + { + assert!(schema + .drop_table(drop_table_request.clone(), drop_table_opts.clone()) + .await + .unwrap()); + // check table not exists + assert!(schema.table_by_name(table_name).unwrap().is_none()); + } + + // create table again + { + schema + .create_table(create_table_request.clone(), create_table_opts.clone()) + .await + .unwrap(); + // check table exists + assert!(schema.table_by_name(table_name).unwrap().is_some()); + } + + // drop table again + { + assert!(schema + .drop_table(drop_table_request.clone(), drop_table_opts.clone()) + .await + .unwrap()); + // check table not exists + assert!(schema.table_by_name(table_name).unwrap().is_none()); + } + + // create two tables + { + let table_name2 = "test2"; + let create_table_request2 = build_create_table_req(table_name2, schema.clone()); + schema + .create_table(create_table_request2.clone(), create_table_opts.clone()) + .await + .unwrap(); + // check table exists + assert!(schema.table_by_name(table_name2).unwrap().is_some()); + + schema + .create_table(create_table_request, create_table_opts) + .await + .unwrap(); + // check table exists + assert!(schema.table_by_name(table_name).unwrap().is_some()); + } + + // drop table again + { + assert!(schema + .drop_table(drop_table_request, drop_table_opts) + .await + .unwrap()); + // check table not exists + assert!(schema.table_by_name(table_name).unwrap().is_none()); + } + } +} diff --git a/cluster/Cargo.toml b/cluster/Cargo.toml new file mode 100644 index 0000000000..d75d30a86d --- /dev/null +++ b/cluster/Cargo.toml @@ -0,0 +1,21 @@ +[package] +name = "cluster" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +analytic_engine = { path = "../analytic_engine" } +async-trait = "0.1.41" +catalog = { path = "../catalog" } +common_types = { path = "../common_types" } +common_util = { path = "../common_util" } +log = "0.4" +meta_client_v2 = { path = "../meta_client_v2" } +rust-fsm = "0.6.0" +serde = "1.0" +serde_derive = "1.0" +serde_json = "1.0.60" +snafu = { version ="0.6.10", features = ["backtraces"]} +tokio = { version = "1.0", features = ["full"] } \ No newline at end of file diff --git a/cluster/src/config.rs b/cluster/src/config.rs new file mode 100644 index 0000000000..2afb0bee57 --- /dev/null +++ b/cluster/src/config.rs @@ -0,0 +1,18 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +use meta_client_v2::MetaClientConfig; +use serde_derive::Deserialize; + +#[derive(Default, Clone, Deserialize, Debug)] +pub struct ClusterConfig { + /// Local ip address of this node, used as endpoint ip in meta. + pub node: String, + /// Grpc port of this node, also used as endpoint port in meta. + pub port: u16, + pub zone: String, + pub idc: String, + pub binary_version: String, + pub cmd_channel_buffer_size: usize, + + pub meta_client_config: MetaClientConfig, +} diff --git a/cluster/src/lib.rs b/cluster/src/lib.rs new file mode 100644 index 0000000000..9fe5916dc9 --- /dev/null +++ b/cluster/src/lib.rs @@ -0,0 +1,263 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::{sync::Arc, time::Duration}; + +use async_trait::async_trait; +use catalog::manager::Manager; +use common_util::{define_result, runtime::Runtime}; +use log::{error, info}; +use meta_client_v2::{ + build_meta_client, ActionCmd, AllocSchemaIdRequest, AllocTableIdRequest, DropTableRequest, + GetTablesRequest, MetaClient, NodeMetaInfo, SchemaId, ShardId, ShardInfo, TableId, +}; +use snafu::{Backtrace, ResultExt, Snafu}; +use tokio::{ + sync::{mpsc::Receiver, RwLock}, + time, +}; + +use crate::{config::ClusterConfig, table_manager::TableManager}; + +pub mod config; +mod table_manager; + +#[derive(Debug, Snafu)] +#[snafu(visibility = "pub")] +pub enum Error { + #[snafu(display("Build meta client failed, err:{}.", source))] + BuildMetaClient { + source: Box, + }, + + #[snafu(display("Meta client start failed, err:{}.", source))] + StartMetaClient { + source: Box, + }, + + #[snafu(display("Meta client start failed, err:{}.", source))] + MetaClientFailure { + source: Box, + }, + + #[snafu(display( + "Shard not found in current node, shard_id:{}.\nBacktrace:\n{}", + shard_id, + backtrace + ))] + ShardNotFound { + shard_id: ShardId, + backtrace: Backtrace, + }, +} + +define_result!(Error); + +#[async_trait] +pub trait Cluster { + async fn alloc_schema_id(&self, _schema_name: String) -> Result; + + async fn alloc_table_id(&self, _schema_name: String, _table_name: String) -> Result; + + async fn drop_table(&self, _schema_name: String, _table_name: String) -> Result<()>; +} + +pub struct ClusterImpl { + inner: Arc>, + runtime: Arc, +} + +impl ClusterImpl { + pub fn new(config: ClusterConfig, catalog_manager: M, runtime: Arc) -> Result { + Ok(Self { + inner: Arc::new(ClusterImplInner::new( + config, + catalog_manager, + runtime.clone(), + )?), + runtime, + }) + } + + pub async fn start(&self) -> Result<()> { + let inner = self.inner.clone(); + inner + .meta_client + .start() + .await + .map_err(|e| Box::new(e) as _) + .context(StartMetaClient)?; + self.runtime.spawn(async move { + inner.start_heartbeat().await; + }); + + Ok(()) + } +} + +#[async_trait] +impl Cluster for ClusterImpl { + async fn alloc_schema_id(&self, schema_name: String) -> Result { + self.inner.alloc_schema_id(schema_name).await + } + + async fn alloc_table_id(&self, schema_name: String, table_name: String) -> Result { + self.inner.alloc_table_id(schema_name, table_name).await + } + + async fn drop_table(&self, schema_name: String, table_name: String) -> Result<()> { + self.inner.drop_table(schema_name, table_name).await + } +} + +struct ClusterImplInner { + meta_client: Arc, + catalog_manager: M, + table_manager: TableManager, + action_cmd_receiver: RwLock>, + + config: ClusterConfig, +} + +impl ClusterImplInner { + pub fn new(config: ClusterConfig, catalog_manager: M, runtime: Arc) -> Result { + let (sender, receiver) = 
tokio::sync::mpsc::channel(config.cmd_channel_buffer_size); + let node_meta_info = NodeMetaInfo { + node: config.node.clone(), + zone: config.zone.clone(), + idc: config.idc.clone(), + binary_version: config.binary_version.clone(), + }; + Ok(Self { + meta_client: build_meta_client( + config.meta_client_config.clone(), + node_meta_info, + runtime, + Some(sender), + ) + .map_err(|e| Box::new(e) as _) + .context(BuildMetaClient)?, + catalog_manager, + table_manager: TableManager::new(), + action_cmd_receiver: RwLock::new(receiver), + config: config, + }) + } + + // heartbeat + async fn start_heartbeat(&self) { + let mut interval = time::interval(self.heartbeat_interval()); + + loop { + let shards_info = self.get_shards_info(); + info!("Node heartbeat to meta, shards info:{:?}", shards_info); + let resp = self.meta_client.send_heartbeat(shards_info).await; + match resp { + Ok(()) => { + interval.tick().await; + } + Err(e) => { + error!("Node heartbeat to meta failed, error:{}", e); + time::sleep(self.error_wait_lease()).await; + } + } + } + } + + async fn start_node_action_cmd(&self) { + let action_cmd_receiver = &mut *self.action_cmd_receiver.write().await; + // todo: handle error + while let Some(action_cmd) = action_cmd_receiver.recv().await { + info!( + "Node action cmd from meta received, action_cmd:{:?}", + action_cmd + ); + match action_cmd { + ActionCmd::OpenCmd(open_cmd) => { + let ret = self + .meta_client + .get_tables(GetTablesRequest { + shard_ids: open_cmd.shard_ids, + }) + .await; + match ret { + Err(ref e) => error!("Get shard tables failed, ret:{:?}, err:{}", ret, e), + Ok(v) => { + self.table_manager.update_table_info(v.tables_map); + // todo: self.catalog_manager.open tables + } + } + } + // todo: other action cmd + _ => todo!(), + } + } + info!("Node action cmd receiver exit"); + } + + fn get_shards_info(&self) -> Vec { + self.table_manager.get_shards_info() + } + + // Register node every 2/3 lease + fn heartbeat_interval(&self) -> Duration { + Duration::from_secs(self.config.meta_client_config.lease.as_secs() * 2 / 3) + } + + fn error_wait_lease(&self) -> Duration { + Duration::from_secs(self.config.meta_client_config.lease.as_secs() / 2) + } + + async fn alloc_schema_id(&self, schema_name: String) -> Result { + if let Some(v) = self.table_manager.get_schema_id(&schema_name) { + Ok(v) + } else { + Ok(self + .meta_client + .alloc_schema_id(AllocSchemaIdRequest { + name: schema_name.clone(), + }) + .await + .map_err(|e| Box::new(e) as _) + .context(MetaClientFailure)? 
+ .id) + } + } + + async fn alloc_table_id(&self, schema_name: String, table_name: String) -> Result { + if let Some(v) = self.table_manager.get_table_id(&schema_name, &table_name) { + Ok(v) + } else { + let resp = self + .meta_client + .alloc_table_id(AllocTableIdRequest { + schema_name, + name: table_name, + }) + .await + .map_err(|e| Box::new(e) as _) + .context(MetaClientFailure)?; + self.table_manager.add_table( + resp.shard_id, + resp.schema_name, + resp.name, + resp.schema_id, + resp.id, + )?; + Ok(resp.id) + } + } + + async fn drop_table(&self, schema_name: String, table_name: String) -> Result<()> { + let _resp = self + .meta_client + .drop_table(DropTableRequest { + schema_name: schema_name.clone(), + name: table_name.clone(), + }) + .await + .map_err(|e| Box::new(e) as _) + .context(MetaClientFailure)?; + self.table_manager.drop_table(schema_name, table_name); + Ok(()) + } +} diff --git a/cluster/src/table_manager.rs b/cluster/src/table_manager.rs new file mode 100644 index 0000000000..738df85db6 --- /dev/null +++ b/cluster/src/table_manager.rs @@ -0,0 +1,163 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::{ + collections::{BTreeMap, HashMap}, + sync::RwLock, +}; + +use meta_client_v2::{SchemaId, ShardId, ShardInfo, ShardTables, TableId, TableInfo}; + +use super::Result; +use crate::ShardNotFound; + +struct SchemaInfo { + name: String, + id: SchemaId, +} + +pub struct TableManager { + inner: RwLock, +} + +impl TableManager { + pub fn new() -> Self { + Self { + inner: RwLock::new(TableManagerInner { + shards_info: Vec::new(), + schemas_info: HashMap::new(), + tables: BTreeMap::new(), + }), + } + } + + pub fn get_shards_info(&self) -> Vec { + self.inner.read().unwrap().get_shards_info() + } + + pub fn add_table( + &self, + shard_id: ShardId, + schema_name: String, + table_name: String, + schema_id: SchemaId, + table_id: TableId, + ) -> Result<()> { + self.inner.write().unwrap().add_table( + shard_id, + schema_name, + table_name, + schema_id, + table_id, + ) + } + + pub fn drop_table(&self, schema_name: String, table_name: String) { + self.inner + .write() + .unwrap() + .drop_table(schema_name, table_name) + } + + pub fn update_table_info(&self, shard_table: HashMap) { + self.inner.write().unwrap().update_table_info(shard_table) + } + + pub fn get_schema_id(&self, schema_name: &str) -> Option { + self.inner.read().unwrap().get_schema_id(schema_name) + } + + pub fn get_table_id(&self, schema_name: &str, table_name: &str) -> Option { + self.inner + .read() + .unwrap() + .get_table_id(schema_name, table_name) + } +} + +struct TableManagerInner { + shards_info: Vec, + schemas_info: HashMap, + // schema_name -> table_name -> (shard_info, table_info) + tables: BTreeMap>, +} + +impl TableManagerInner { + fn get_shards_info(&self) -> Vec { + self.shards_info.clone() + } + + fn update_table_info(&mut self, shard_table: HashMap) { + for (shard_id, shard_tables) in shard_table { + let shard_info = ShardInfo { + shard_id, + role: shard_tables.role, + }; + for table in shard_tables.tables { + self.schemas_info + .entry(table.schema_name.clone()) + .or_insert(SchemaInfo { + name: table.schema_name.clone(), + id: table.schema_id, + }); + self.tables + .entry(table.schema_name.clone()) + .or_insert_with(BTreeMap::new) + .insert(table.name.clone(), (shard_info.clone(), table)); + } + } + } + + fn add_table( + &mut self, + shard_id: ShardId, + schema_name: String, + table_name: String, + schema_id: SchemaId, + table_id: TableId, + ) -> Result<()> { + let mut 
shard_info = None; + for shard in &self.shards_info { + if shard.shard_id == shard_id { + shard_info = Some(shard.clone()); + break; + } + } + match shard_info { + None => ShardNotFound { shard_id }.fail(), + Some(v) => { + self.tables + .entry(schema_name.clone()) + .or_insert_with(BTreeMap::new) + .insert( + table_name.clone(), + ( + v, + TableInfo { + id: table_id, + name: table_name, + schema_id, + schema_name, + }, + ), + ); + Ok(()) + } + } + } + + fn drop_table(&mut self, schema_name: String, table_name: String) { + self.tables + .get_mut(&schema_name) + .map(|v| v.remove(&table_name)); + } + + fn get_schema_id(&self, schema_name: &str) -> Option { + self.schemas_info.get(schema_name).map(|v| v.id) + } + + fn get_table_id(&self, schema_name: &str, table_name: &str) -> Option { + self.tables + .get(schema_name) + .and_then(|schema| schema.get(table_name).map(|v| v.1.id)) + } +} diff --git a/cluster/src/util.rs b/cluster/src/util.rs new file mode 100644 index 0000000000..e69de29bb2 diff --git a/common_types/Cargo.toml b/common_types/Cargo.toml new file mode 100644 index 0000000000..1bb477e3f3 --- /dev/null +++ b/common_types/Cargo.toml @@ -0,0 +1,25 @@ +[package] +name = "common_types" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[features] +test = [] + +[dependencies] +# In alphabetical order +arrow_deps = { path = "../arrow_deps" } +byteorder = "1.2" +bytes = { path = "../components/bytes" } +chrono = "0.4" +murmur3 = "0.4.1" +paste = "1.0" +proto = { path = "../proto" } +snafu = { version ="0.6.10", features = ["backtraces"]} +# TODO(yingwen): Make sqlparser support a feature +sqlparser = "0.13.0" +serde = "1.0.81" +serde_derive = "1.0.81" diff --git a/common_types/src/bytes.rs b/common_types/src/bytes.rs new file mode 100644 index 0000000000..5a545d7b14 --- /dev/null +++ b/common_types/src/bytes.rs @@ -0,0 +1,5 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Bytes type. + +pub use bytes::*; diff --git a/common_types/src/column.rs b/common_types/src/column.rs new file mode 100644 index 0000000000..44908687bd --- /dev/null +++ b/common_types/src/column.rs @@ -0,0 +1,868 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Column +use std::sync::Arc; + +use arrow_deps::arrow::array::{ + Array, ArrayBuilder, ArrayRef, BinaryArray, BinaryBuilder, BooleanArray, BooleanBuilder, + Float32Array as FloatArray, Float32Builder as FloatBuilder, Float64Array as DoubleArray, + Float64Builder as DoubleBuilder, Int16Array, Int16Builder, Int32Array, Int32Builder, + Int64Array, Int64Builder, Int8Array, Int8Builder, NullArray, StringArray, StringBuilder, + TimestampMillisecondArray, TimestampMillisecondBuilder, UInt16Array, UInt16Builder, + UInt32Array, UInt32Builder, UInt64Array, UInt64Builder, UInt8Array, UInt8Builder, +}; +use paste::paste; +use snafu::{Backtrace, OptionExt, ResultExt, Snafu}; + +use crate::{ + bytes::Bytes, + datum::{Datum, DatumKind, DatumView}, + string::StringBytes, + time::Timestamp, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "Invalid array type, datum_kind:{:?}, data_type:{:?}.\nBacktrace:\n{}", + datum_kind, + data_type, + backtrace + ))] + InvalidArrayType { + datum_kind: DatumKind, + data_type: arrow_deps::arrow::datatypes::DataType, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to append value, err:{}.\nBacktrace:\n{}", source, backtrace))] + Append { + source: arrow_deps::arrow::error::ArrowError, + backtrace: Backtrace, + }, + + #[snafu(display( + "Data type conflict, expect:{:?}, given:{:?}.\nBacktrace:\n{}", + expect, + given, + backtrace + ))] + ConflictType { + expect: DatumKind, + given: DatumKind, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to convert arrow data type, data_type:{}.\nBacktrace:\n{}", + data_type, + backtrace + ))] + UnsupportedArray { + data_type: arrow_deps::arrow::datatypes::DataType, + backtrace: Backtrace, + }, +} + +pub type Result = std::result::Result; + +#[derive(Debug)] +pub struct NullColumn(NullArray); + +impl NullColumn { + fn new_null(rows: usize) -> Self { + Self(NullArray::new(rows)) + } + + /// Only the first datum of NullColumn is considered not duplicated. + #[inline] + pub fn dedup(&self, selected: &mut [bool]) { + if !self.0.is_empty() { + selected[0] = true; + } + } +} + +// TODO(yingwen): Builder for columns. + +macro_rules! define_numeric_column { + ($($Kind: ident), *) => { + $(paste! 
{ + #[derive(Debug)] + pub struct [<$Kind Column>]([<$Kind Array>]); + + #[inline] + fn [](array: &[<$Kind Array>], index: usize) -> Datum { + let value = array.value(index); + Datum::$Kind(value) + } + + #[inline] + fn [](array: &[<$Kind Array>], index: usize) -> DatumView { + let value = array.value(index); + DatumView::$Kind(value) + } + })* + } +} + +define_numeric_column!( + Float, Double, UInt64, UInt32, UInt16, UInt8, Int64, Int32, Int16, Int8, Boolean +); + +#[derive(Debug)] +pub struct TimestampColumn(TimestampMillisecondArray); + +#[derive(Debug)] +pub struct VarbinaryColumn(BinaryArray); + +#[derive(Debug)] +pub struct StringColumn(StringArray); + +#[inline] +fn get_null_datum_view(_array: &NullArray, _index: usize) -> DatumView { + DatumView::Null +} + +#[inline] +fn get_timestamp_datum_view(array: &TimestampMillisecondArray, index: usize) -> DatumView { + let value = array.value(index); + DatumView::Timestamp(Timestamp::new(value)) +} + +#[inline] +fn get_varbinary_datum_view(array: &BinaryArray, index: usize) -> DatumView { + let value = array.value(index); + DatumView::Varbinary(value) +} + +#[inline] +fn get_string_datum_view(array: &StringArray, index: usize) -> DatumView { + let value = array.value(index); + DatumView::String(value) +} + +#[inline] +fn get_null_datum(_array: &NullArray, _index: usize) -> Datum { + Datum::Null +} + +#[inline] +fn get_timestamp_datum(array: &TimestampMillisecondArray, index: usize) -> Datum { + let value = array.value(index); + Datum::Timestamp(Timestamp::new(value)) +} + +// TODO(yingwen): Avoid clone of data. +// Require a clone. +#[inline] +fn get_varbinary_datum(array: &BinaryArray, index: usize) -> Datum { + let value = array.value(index); + Datum::Varbinary(Bytes::copy_from_slice(value)) +} + +// TODO(yingwen): Avoid clone of data. +// Require a clone. +#[inline] +fn get_string_datum(array: &StringArray, index: usize) -> Datum { + let value = array.value(index); + Datum::String(StringBytes::copy_from_str(value)) +} + +macro_rules! impl_column { + ($Column: ident, $get_datum: expr, $get_datum_view: expr) => { + impl $Column { + /// Get datum by index. + pub fn datum_opt(&self, index: usize) -> Option { + // Do bound check. + if index >= self.0.len() { + return None; + } + + Some(self.datum(index)) + } + + pub fn datum_view(&self, index: usize) -> DatumView { + // If this datum is null. + if self.0.is_null(index) { + return DatumView::Null; + } + + $get_datum_view(&self.0, index) + } + + pub fn datum(&self, index: usize) -> Datum { + // If this datum is null. + if self.0.is_null(index) { + return Datum::Null; + } + + $get_datum(&self.0, index) + } + + #[inline] + pub fn num_rows(&self) -> usize { + self.0.len() + } + + #[inline] + pub fn is_empty(&self) -> bool { + self.num_rows() == 0 + } + } + }; +} + +macro_rules! impl_dedup { + ($Column: ident) => { + impl $Column { + /// If datum i is not equal to previous datum i - 1, mark `selected[i]` to + /// true. + /// + /// The first datum is marked to true. + /// + /// The size of selected must equal to the size of this column and + /// initialized to false. + #[allow(clippy::float_cmp)] + pub fn dedup(&self, selected: &mut [bool]) { + if self.0.is_empty() { + return; + } + + selected[0] = true; + for i in 1..self.0.len() { + let current = self.0.value(i); + let prev = self.0.value(i - 1); + + if current != prev { + selected[i] = true; + } + } + } + } + }; +} + +macro_rules! 
impl_new_null { + ($Column: ident, $Builder: ident) => { + impl $Column { + /// Create a column that all values are null. + fn new_null(num_rows: usize) -> Result { + let mut builder = $Builder::new(num_rows); + for _ in 0..num_rows { + builder.append_null().context(Append)?; + } + let array = builder.finish(); + + Ok(Self(array)) + } + } + }; +} + +macro_rules! impl_from_array_and_slice { + ($Column: ident, $ArrayType: ident) => { + impl From<$ArrayType> for $Column { + fn from(array: $ArrayType) -> Self { + Self(array) + } + } + + impl From<&$ArrayType> for $Column { + fn from(array_ref: &$ArrayType) -> Self { + // We need to clone the [arrow_deps::arrow::array::ArrayData], which clones + // the underlying vector of [arrow_deps::arrow::buffer::Buffer] and Bitmap (also + // holds a Buffer), thus require some allocation. However, the Buffer is + // managed by Arc, so cloning the buffer is not too expensive. + let array_data = array_ref.data().clone(); + let array = $ArrayType::from(array_data); + + Self(array) + } + } + + impl $Column { + fn to_arrow_array(&self) -> $ArrayType { + // Clone the array data. + let array_data = self.0.data().clone(); + $ArrayType::from(array_data) + } + + /// Returns a zero-copy slice of this array with the indicated offset and + /// length. + /// + /// Panics if offset with length is greater than column length. + fn slice(&self, offset: usize, length: usize) -> Self { + let array_slice = self.0.slice(offset, length); + // Clone the slice data. + let array_data = array_slice.data().clone(); + let array = $ArrayType::from(array_data); + + Self(array) + } + } + }; +} + +macro_rules! impl_iter { + ($Column: ident, $Value: ident) => { + impl $Column { + /// Iter column values. + pub fn iter(&self) -> impl Iterator> + '_ { + self.0.iter() + } + } + }; +} + +macro_rules! impl_iter_map { + ($Column: ident, $Value: ident) => { + impl $Column { + /// Iter column values. + pub fn iter(&self) -> impl Iterator> + '_ { + self.0.iter().map(|v| v.map($Value::from)) + } + } + }; +} + +impl_column!(NullColumn, get_null_datum, get_null_datum_view); +impl_column!( + TimestampColumn, + get_timestamp_datum, + get_timestamp_datum_view +); +impl_column!( + VarbinaryColumn, + get_varbinary_datum, + get_varbinary_datum_view +); +impl_column!(StringColumn, get_string_datum, get_string_datum_view); + +impl_new_null!(TimestampColumn, TimestampMillisecondBuilder); +impl_new_null!(VarbinaryColumn, BinaryBuilder); +impl_new_null!(StringColumn, StringBuilder); + +impl_from_array_and_slice!(NullColumn, NullArray); +impl_from_array_and_slice!(TimestampColumn, TimestampMillisecondArray); +impl_from_array_and_slice!(VarbinaryColumn, BinaryArray); +impl_from_array_and_slice!(StringColumn, StringArray); + +impl_iter_map!(TimestampColumn, Timestamp); + +impl_dedup!(TimestampColumn); +impl_dedup!(VarbinaryColumn); +impl_dedup!(StringColumn); + +macro_rules! impl_numeric_column { + ($(($Kind: ident, $type: ty)), *) => { + $( + paste! { + impl_column!([<$Kind Column>], [], []); + impl_from_array_and_slice!([<$Kind Column>], [<$Kind Array>]); + impl_new_null!([<$Kind Column>], [<$Kind Builder>]); + impl_iter!([<$Kind Column>], $type); + impl_dedup!([<$Kind Column>]); + } + )* + } +} + +impl_numeric_column!( + (Double, f64), + (Float, f32), + (UInt64, u64), + (UInt32, u32), + (UInt16, u16), + (UInt8, u8), + (Int64, i64), + (Int32, i32), + (Int16, i16), + (Int8, i8), + (Boolean, bool) +); + +macro_rules! 
impl_numeric_value { + ($Column: ident, $Value: ident) => { + impl $Column { + /// Get value at index. + pub fn value(&self, index: usize) -> Option<$Value> { + if self.0.is_valid(index) { + unsafe { Some(self.0.value_unchecked(index)) } + } else { + None + } + } + } + }; +} + +macro_rules! batch_impl_numeric_value { + ($(($Kind: ident, $type: ty)), *) => { + $( + paste! { + impl_numeric_value!([<$Kind Column>], $type); + } + )* + } +} + +batch_impl_numeric_value!( + (Timestamp, i64), + (Double, f64), + (Float, f32), + (UInt64, u64), + (UInt32, u32), + (UInt16, u16), + (UInt8, u8), + (Int64, i64), + (Int32, i32), + (Int16, i16), + (Int8, i8), + (Boolean, bool) +); + +impl VarbinaryColumn { + pub fn iter(&self) -> impl Iterator> + '_ { + self.0.iter() + } + + pub fn value(&self, index: usize) -> Option<&[u8]> { + if self.0.is_valid(index) { + unsafe { Some(self.0.value_unchecked(index)) } + } else { + None + } + } +} + +impl StringColumn { + pub fn iter(&self) -> impl Iterator> + '_ { + self.0.iter() + } + + pub fn value(&self, index: usize) -> Option<&str> { + if self.0.is_valid(index) { + unsafe { Some(self.0.value_unchecked(index)) } + } else { + None + } + } +} + +macro_rules! impl_column_block { + ($($Kind: ident), *) => { + impl ColumnBlock { + pub fn datum_kind(&self) -> DatumKind { + match self { + $(ColumnBlock::$Kind(_) => DatumKind::$Kind,)* + } + } + + pub fn datum_opt(&self, index: usize) -> Option { + match self { + $(ColumnBlock::$Kind(col) => col.datum_opt(index),)* + } + } + + /// Panic if index is out fo bound. + pub fn datum_view(&self, index: usize) -> DatumView { + match self { + $(ColumnBlock::$Kind(col) => col.datum_view(index),)* + } + } + + /// Panic if index is out fo bound. + pub fn datum(&self, index: usize) -> Datum { + match self { + $(ColumnBlock::$Kind(col) => col.datum(index),)* + } + } + + pub fn num_rows(&self) -> usize { + match self { + $(ColumnBlock::$Kind(col) => col.num_rows(),)* + } + } + + pub fn to_arrow_array_ref(&self) -> ArrayRef { + match self { + $(ColumnBlock::$Kind(col) => Arc::new(col.to_arrow_array()),)* + } + } + + /// If datum i is not equal to previous datum i - 1, mark `selected[i]` to true. + /// + /// The first datum is not marked to true. + pub fn dedup(&self, selected: &mut [bool]) { + match self { + $(ColumnBlock::$Kind(col) => col.dedup(selected),)* + } + } + + /// Returns a zero-copy slice of this array with the indicated offset and length. + /// + /// Panics if offset with length is greater than column length. + #[must_use] + pub fn slice(&self, offset: usize, length: usize) -> Self { + match self { + $(ColumnBlock::$Kind(col) => ColumnBlock::$Kind(col.slice(offset, length)),)* + } + } + } + + $(paste! { + impl From<[<$Kind Column>]> for ColumnBlock { + fn from(column: [<$Kind Column>]) -> Self { + Self::$Kind(column) + } + } + })* + }; +} + +// TODO(yingwen): We can add a unsafe function that don't do bound check. + +macro_rules! define_column_block { + ($($Kind: ident), *) => { + paste! 
{ + #[derive(Debug)] + pub enum ColumnBlock { + Null(NullColumn), + $( + $Kind([<$Kind Column>]), + )* + } + + impl ColumnBlock { + pub fn try_from_arrow_array_ref(datum_kind: &DatumKind, array: &ArrayRef) -> Result { + let column = match datum_kind { + DatumKind::Null => ColumnBlock::Null(NullColumn::new_null(array.len())), + $( + DatumKind::$Kind => { + let column = cast_array(datum_kind, array)?; + ColumnBlock::$Kind([<$Kind Column>]::from(column)) + } + )* + }; + Ok(column) + } + + pub fn new_null_with_type(kind: &DatumKind, rows: usize) -> Result { + let block = match kind { + DatumKind::Null => ColumnBlock::Null(NullColumn::new_null(rows)), + $( + DatumKind::$Kind => ColumnBlock::$Kind([<$Kind Column>]::new_null(rows)?), + )* + }; + + Ok(block) + } + } + } + } +} + +// Define column blocks, Null is defined explicitly in macro. +define_column_block!( + Timestamp, Double, Float, Varbinary, String, UInt64, UInt32, UInt16, UInt8, Int64, Int32, + Int16, Int8, Boolean +); + +impl ColumnBlock { + pub fn try_cast_arrow_array_ref(array: &ArrayRef) -> Result { + let datum_kind = + DatumKind::from_data_type(array.data_type()).with_context(|| UnsupportedArray { + data_type: array.data_type().clone(), + })?; + + Self::try_from_arrow_array_ref(&datum_kind, array) + } + + pub fn new_null(rows: usize) -> Self { + Self::Null(NullColumn::new_null(rows)) + } + + pub fn as_timestamp(&self) -> Option<&TimestampColumn> { + match self { + ColumnBlock::Timestamp(c) => Some(c), + _ => None, + } + } +} + +impl_column_block!( + Null, Timestamp, Double, Float, Varbinary, String, UInt64, UInt32, UInt16, UInt8, Int64, Int32, + Int16, Int8, Boolean +); + +fn cast_array<'a, T: 'static>(datum_kind: &DatumKind, array: &'a ArrayRef) -> Result<&'a T> { + array + .as_any() + .downcast_ref::() + .with_context(|| InvalidArrayType { + datum_kind: *datum_kind, + data_type: array.data_type().clone(), + }) +} + +macro_rules! append_datum { + ($Kind: ident, $builder: ident, $DatumType: ident, $datum: ident) => { + match $datum { + $DatumType::Null => $builder.append_null().context(Append), + $DatumType::$Kind(v) => $builder.append_value(v).context(Append), + _ => ConflictType { + expect: DatumKind::$Kind, + given: $datum.kind(), + } + .fail(), + } + }; +} + +macro_rules! append_datum_into { + ($Kind: ident, $builder: ident, $DatumType: ident, $datum: ident) => { + match $datum { + $DatumType::Null => $builder.append_null().context(Append), + $DatumType::$Kind(v) => $builder.append_value(v.into()).context(Append), + _ => ConflictType { + expect: DatumKind::$Kind, + given: $datum.kind(), + } + .fail(), + } + }; +} + +macro_rules! append_block { + ($Kind: ident, $builder: ident, $BlockType: ident, $block: ident, $start: ident, $len: ident) => { + match $block { + $BlockType::Null(v) => { + let end = std::cmp::min($start + $len, v.num_rows()); + for _ in $start..end { + $builder.append_null().context(Append)?; + } + Ok(()) + } + $BlockType::$Kind(v) => { + // There is no convenient api to copy a range of data from array to builder, so + // we still need to clone value one by one using a for loop. + let end = std::cmp::min($start + $len, v.num_rows()); + for i in $start..end { + let value_opt = v.value(i); + match value_opt { + Some(value) => { + $builder.append_value(value).context(Append)?; + } + None => { + $builder.append_null().context(Append)?; + } + } + } + Ok(()) + } + _ => ConflictType { + expect: DatumKind::$Kind, + given: $block.datum_kind(), + } + .fail(), + } + }; +} + +macro_rules! 
define_column_block_builder { + ($(($Kind: ident, $Builder: ident)), *) => { + paste! { + #[derive(Debug)] + pub enum ColumnBlockBuilder { + Null { rows: usize }, + Timestamp(TimestampMillisecondBuilder), + $( + $Kind($Builder), + )* + } + + impl ColumnBlockBuilder { + /// Create by data type with initial capacity + pub fn with_capacity(data_type: &DatumKind, capacity: usize) -> Self { + match data_type { + DatumKind::Null => Self::Null { rows: 0 }, + DatumKind::Timestamp => Self::Timestamp(TimestampMillisecondBuilder::new(capacity)), + $( + DatumKind::$Kind => Self::$Kind($Builder::new(capacity)), + )* + } + } + + /// Append the datum into the builder, the datum should have same the data + /// type of builder + pub fn append(&mut self, datum: Datum) -> Result<()> { + let given = datum.kind(); + match self { + Self::Null { rows } => match datum { + Datum::Null => { + *rows += 1; + Ok(()) + } + _ => ConflictType { + expect: DatumKind::Null, + given, + } + .fail(), + }, + Self::Timestamp(builder) => append_datum_into!(Timestamp, builder, Datum, datum), + $( + Self::$Kind(builder) => append_datum!($Kind, builder, Datum, datum), + )* + } + } + + /// Append the [DatumView] into the builder, the datum view should have same the data + /// type of builder + pub fn append_view<'a>(&mut self, datum: DatumView<'a>) -> Result<()> { + let given = datum.kind(); + match self { + Self::Null { rows } => match datum { + DatumView::Null => { + *rows += 1; + Ok(()) + } + _ => ConflictType { + expect: DatumKind::Null, + given, + } + .fail(), + }, + Self::Timestamp(builder) => append_datum_into!(Timestamp, builder, DatumView, datum), + $( + Self::$Kind(builder) => append_datum!($Kind, builder, DatumView, datum), + )* + } + } + + /// Append rows in [start..start + len) from `block` to the builder. + /// + /// Returns rows actually appended. + pub fn append_block_range(&mut self, block: &ColumnBlock, start: usize, len: usize) -> Result<()> { + match self { + Self::Null { rows } => { + if start + len >= block.num_rows() { + *rows += block.num_rows() - start; + } else { + *rows += len; + } + Ok(()) + }, + Self::Timestamp(builder) => append_block!(Timestamp, builder, ColumnBlock, block, start, len), + $( + Self::$Kind(builder) => append_block!($Kind, builder, ColumnBlock, block, start, len), + )* + } + } + + pub fn len(&self) -> usize { + match &self { + Self::Null { rows } => *rows, + Self::Timestamp(builder) => builder.len(), + $( + Self::$Kind(builder) => builder.len(), + )* + } + } + + // Build and reset the builder. + pub fn build(&mut self) -> ColumnBlock { + match self { + Self::Null { rows } => { + let block = ColumnBlock::new_null(*rows); + *rows = 0; + block + } + Self::Timestamp(builder) => TimestampColumn::from(builder.finish()).into(), + $( + Self::$Kind(builder) => [<$Kind Column>]::from(builder.finish()).into(), + )* + } + } + } + } + } +} + +// Define column block builders, Null and Timestamp are defined explicitly in +// macro. 
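+//
+// A minimal append/build sketch for the generated `ColumnBlockBuilder` (the
+// builder enum is produced by the macro invocation just below; see also
+// `test_column_block_builder` in the tests at the end of this file):
+//
+//     let mut builder = ColumnBlockBuilder::with_capacity(&DatumKind::UInt64, 2);
+//     builder.append(Datum::UInt64(1))?;
+//     builder.append(Datum::Null)?;
+//     let block = builder.build();
+//     assert_eq!(block.num_rows(), 2);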
+define_column_block_builder!( + (Double, DoubleBuilder), + (Float, FloatBuilder), + (Varbinary, BinaryBuilder), + (String, StringBuilder), + (UInt64, UInt64Builder), + (UInt32, UInt32Builder), + (UInt16, UInt16Builder), + (UInt8, UInt8Builder), + (Int64, Int64Builder), + (Int32, Int32Builder), + (Int16, Int16Builder), + (Int8, Int8Builder), + (Boolean, BooleanBuilder) +); + +impl ColumnBlockBuilder { + /// Create by data type + pub fn new(data_type: &DatumKind) -> Self { + Self::with_capacity(data_type, 0) + } + + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Clear the builder by calling `build()` and drop the built result. + pub fn clear(&mut self) { + let _ = self.build(); + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::tests::{build_rows, build_schema}; + + #[test] + fn test_column_block_builder() { + let schema = build_schema(); + let rows = build_rows(); + // DatumKind::Varbinary + let column = schema.column(0); + let mut builder = ColumnBlockBuilder::with_capacity(&column.data_type, 2); + + // append + builder.append(rows[0][0].clone()).unwrap(); + let ret = builder.append(rows[0][1].clone()); + assert!(ret.is_err()); + + // append_view + builder.append_view(rows[1][0].as_view()).unwrap(); + let ret = builder.append_view(rows[0][1].as_view()); + assert!(ret.is_err()); + + let column_block = builder.build(); + assert_eq!(column_block.num_rows(), 2); + let mut builder = ColumnBlockBuilder::with_capacity(&column.data_type, 2); + + // append_block_range + builder.append_block_range(&column_block, 0, 1).unwrap(); + builder.append_block_range(&column_block, 1, 1).unwrap(); + + let column_block = builder.build(); + assert_eq!(column_block.num_rows(), 2); + assert_eq!( + column_block.datum(0), + Datum::Varbinary(Bytes::copy_from_slice(b"binary key")) + ); + assert_eq!( + column_block.datum(1), + Datum::Varbinary(Bytes::copy_from_slice(b"binary key1")) + ); + } +} diff --git a/common_types/src/column_schema.rs b/common_types/src/column_schema.rs new file mode 100644 index 0000000000..eecf4303eb --- /dev/null +++ b/common_types/src/column_schema.rs @@ -0,0 +1,477 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Schema of column + +use std::{collections::BTreeMap, convert::TryFrom, str::FromStr}; + +use arrow_deps::arrow::datatypes::{DataType, Field}; +use proto::common as common_pb; +use snafu::{ensure, Backtrace, OptionExt, ResultExt, Snafu}; + +use crate::datum::DatumKind; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "Unsupported arrow data type, type:{}.\nBacktrace:\n{}", + data_type, + backtrace + ))] + UnsupportedDataType { + data_type: DataType, + backtrace: Backtrace, + }, + + #[snafu(display("Invalid tag type:{}.\nBacktrace:\n{}", data_type, backtrace))] + InvalidTagType { + data_type: DataType, + backtrace: Backtrace, + }, + + #[snafu(display( + "Arrow field meta data is missing, field name:{}.\nBacktrace:\n{}", + field_name, + backtrace + ))] + ArrowFieldMetaDataMissing { + field_name: String, + backtrace: Backtrace, + }, + + #[snafu(display( + "Arrow field meta key is not found, key:{:?}.\nBacktrace:\n{}", + key, + backtrace + ))] + ArrowFieldMetaKeyNotFound { + key: ArrowFieldMetaKey, + backtrace: Backtrace, + }, + + #[snafu(display( + "Arrow field meta value is invalid, key:{:?}, raw_value:{}, err:{}.\nBacktrace:\n{}", + key, + raw_value, + source, + backtrace + ))] + InvalidArrowFieldMetaValue { + key: ArrowFieldMetaKey, + raw_value: String, + source: Box, + backtrace: Backtrace, + }, +} + +pub type Result = std::result::Result; + +/// Error of compatibility check +#[derive(Debug, Snafu)] +pub enum CompatError { + #[snafu(display( + "Incompatible data type of column, name:{}, expect:{:?}, given:{:?}.\nBacktrace:\n{}", + name, + expect, + given, + backtrace, + ))] + IncompatDataType { + name: String, + expect: DatumKind, + given: DatumKind, + backtrace: Backtrace, + }, + + #[snafu(display("Column is not nullable, name:{}.\nBacktrace:\n{}", name, backtrace))] + NotNullable { name: String, backtrace: Backtrace }, +} + +/// Id of column +pub type ColumnId = u32; + +/// A ColumnId used to indicate that the column id is uninitialized +pub const COLUMN_ID_UNINIT: ColumnId = 0; + +/// Read operation of a column +#[derive(Debug)] +pub enum ReadOp { + /// Use the column exactly + Exact, + /// Fill the column by null + FillNull, +} + +/// Meta data of the arrow field. +#[derive(Clone, Debug, Default)] +struct ArrowFieldMeta { + id: u32, + is_tag: bool, + comment: String, +} + +#[derive(Copy, Clone, Debug)] +pub enum ArrowFieldMetaKey { + Id, + IsTag, + Comment, +} + +impl ArrowFieldMetaKey { + fn as_str(&self) -> &str { + match self { + ArrowFieldMetaKey::Id => "field::id", + ArrowFieldMetaKey::IsTag => "field::is_tag", + ArrowFieldMetaKey::Comment => "field::comment", + } + } +} + +impl ToString for ArrowFieldMetaKey { + fn to_string(&self) -> String { + self.as_str().to_string() + } +} + +/// Schema of column +#[derive(Debug, Clone, PartialEq)] +pub struct ColumnSchema { + /// Id of column + pub id: ColumnId, + /// Column name + pub name: String, + /// Data type of the column + pub data_type: DatumKind, + /// Is nullable + pub is_nullable: bool, + /// Is tag, tag is just a hint for a column, there is no restriction that a + /// tag column must be a part of primary key + pub is_tag: bool, + /// Comment of the column + pub comment: String, +} + +impl ColumnSchema { + /// Check whether a type is valid tag type. 
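+    ///
+    /// For example, `DatumKind::String`, `DatumKind::UInt64` and
+    /// `DatumKind::Timestamp` are valid tag types, while `DatumKind::Null`,
+    /// `DatumKind::Float` and `DatumKind::Double` are rejected (see the match
+    /// below).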
+ pub fn is_valid_tag_type(typ: DatumKind) -> bool { + match typ { + DatumKind::Null => false, + DatumKind::Timestamp => true, + DatumKind::Double => false, + DatumKind::Float => false, + DatumKind::Varbinary => true, + DatumKind::String => true, + DatumKind::UInt64 => true, + DatumKind::UInt32 => true, + DatumKind::UInt16 => true, + DatumKind::UInt8 => true, + DatumKind::Int64 => true, + DatumKind::Int32 => true, + DatumKind::Int16 => true, + DatumKind::Int8 => true, + DatumKind::Boolean => true, + } + } + + /// Convert `self` to [proto::common::ColumnSchema] + /// + /// The `is_key` is needed because it is maintained by + /// [crate::schema::Schema] + pub fn to_pb(&self) -> common_pb::ColumnSchema { + let mut column_schema = common_pb::ColumnSchema::new(); + column_schema.set_name(self.name.clone()); + column_schema.set_data_type(self.data_type.into()); + column_schema.set_is_nullable(self.is_nullable); + column_schema.set_id(self.id); + column_schema.set_is_tag(self.is_tag); + column_schema.set_comment(self.comment.clone()); + + column_schema + } + + /// Convert `self` to [arrow_deps::arrow::datatypes::Field] + pub fn to_arrow_field(&self) -> Field { + From::from(self) + } + + /// Returns Ok if column with `writer_schema` can write to column with the + /// same schema as `self`. + pub fn compatible_for_write( + &self, + writer_schema: &ColumnSchema, + ) -> std::result::Result<(), CompatError> { + ensure!( + self.data_type == writer_schema.data_type, + IncompatDataType { + name: &self.name, + expect: writer_schema.data_type, + given: self.data_type, + } + ); + + // This column is not nullable but writer is nullable + ensure!( + self.is_nullable || !writer_schema.is_nullable, + NotNullable { name: &self.name } + ); + + Ok(()) + } + + /// Returns `Ok` if the source schema can read by this schema, now we won't + /// validate data type of column + pub fn compatible_for_read( + &self, + source_schema: &ColumnSchema, + ) -> std::result::Result { + if self.is_nullable { + // Column is nullable + if self.id == source_schema.id { + // Same column + Ok(ReadOp::Exact) + } else { + // Not the same column, maybe dropped, fill by null. + Ok(ReadOp::FillNull) + } + } else { + // Column is not null. We consider the old column was dropped if they have + // different column id and also try to fill by null, so we + // also check column id. + ensure!( + self.id == source_schema.id && !source_schema.is_nullable, + NotNullable { + name: &source_schema.name, + } + ); + + Ok(ReadOp::Exact) + } + } +} + +impl From for ColumnSchema { + fn from(column_schema: common_pb::ColumnSchema) -> Self { + Self { + id: column_schema.id, + name: column_schema.name, + data_type: DatumKind::from(column_schema.data_type), + is_nullable: column_schema.is_nullable, + is_tag: column_schema.is_tag, + comment: column_schema.comment, + } + } +} + +impl TryFrom<&Field> for ColumnSchema { + type Error = Error; + + fn try_from(field: &Field) -> Result { + let meta_data = field.metadata().as_ref(); + let ArrowFieldMeta { + id, + is_tag, + comment, + } = if let Some(meta_data) = meta_data { + decode_arrow_field_meta_data(meta_data)? + } else { + // FIXME(xikai): Now we have to tolerate the decoding failure because of the bug + // of datafusion (fixed by: https://github.com/apache/arrow-datafusion/commit/1448d9752ab3a38f02732274f91136a6a6ad3db4). + // (The bug may cause the meta data of the field meta lost duration plan + // execution.) 
+ ArrowFieldMeta::default() + }; + Ok(Self { + id, + name: field.name().clone(), + data_type: DatumKind::from_data_type(field.data_type()).context( + UnsupportedDataType { + data_type: field.data_type().clone(), + }, + )?, + is_nullable: field.is_nullable(), + is_tag, + comment, + }) + } +} + +impl From<&ColumnSchema> for Field { + fn from(col_schema: &ColumnSchema) -> Self { + let metadata = encode_arrow_field_meta_data(col_schema); + let mut field = Field::new( + &col_schema.name, + col_schema.data_type.into(), + col_schema.is_nullable, + ); + field.set_metadata(Some(metadata)); + + field + } +} + +fn parse_arrow_field_meta_value( + meta: &BTreeMap, + key: ArrowFieldMetaKey, +) -> Result +where + T: FromStr, + T::Err: std::error::Error + Send + Sync + 'static, +{ + let raw_value = meta + .get(key.as_str()) + .context(ArrowFieldMetaKeyNotFound { key })?; + T::from_str(raw_value.as_str()) + .map_err(|e| Box::new(e) as _) + .context(InvalidArrowFieldMetaValue { key, raw_value }) +} + +fn decode_arrow_field_meta_data(meta: &BTreeMap) -> Result { + Ok(ArrowFieldMeta { + id: parse_arrow_field_meta_value(meta, ArrowFieldMetaKey::Id)?, + is_tag: parse_arrow_field_meta_value(meta, ArrowFieldMetaKey::IsTag)?, + comment: parse_arrow_field_meta_value(meta, ArrowFieldMetaKey::Comment)?, + }) +} + +fn encode_arrow_field_meta_data(col_schema: &ColumnSchema) -> BTreeMap { + let mut meta = BTreeMap::new(); + + meta.insert(ArrowFieldMetaKey::Id.to_string(), col_schema.id.to_string()); + meta.insert( + ArrowFieldMetaKey::IsTag.to_string(), + col_schema.is_tag.to_string(), + ); + meta.insert( + ArrowFieldMetaKey::Comment.to_string(), + col_schema.comment.clone(), + ); + + meta +} + +/// ColumnSchema builder +#[must_use] +pub struct Builder { + id: ColumnId, + name: String, + data_type: DatumKind, + is_nullable: bool, + is_tag: bool, + comment: String, +} + +impl Builder { + /// Create a new builder + pub fn new(name: String, data_type: DatumKind) -> Self { + Self { + id: COLUMN_ID_UNINIT, + name, + data_type, + is_nullable: false, + is_tag: false, + comment: String::new(), + } + } + + pub fn id(mut self, id: ColumnId) -> Self { + self.id = id; + self + } + + /// Set this column is nullable, default is true (not nullable). + pub fn is_nullable(mut self, is_nullable: bool) -> Self { + self.is_nullable = is_nullable; + self + } + + /// Set this column is tag, default is false (not a tag). 
+ pub fn is_tag(mut self, is_tag: bool) -> Self { + self.is_tag = is_tag; + self + } + + pub fn comment(mut self, comment: String) -> Self { + self.comment = comment; + self + } + + pub fn validate(&self) -> Result<()> { + if self.is_tag { + ensure!( + ColumnSchema::is_valid_tag_type(self.data_type), + InvalidTagType { + data_type: self.data_type + } + ); + } + + Ok(()) + } + + pub fn build(self) -> Result { + self.validate()?; + + Ok(ColumnSchema { + id: self.id, + name: self.name, + data_type: self.data_type, + is_nullable: self.is_nullable, + is_tag: self.is_tag, + comment: self.comment, + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + /// Create a column schema for test, each field is filled with non-default + /// value + fn new_test_column_schema() -> ColumnSchema { + Builder::new("test_column_schema".to_string(), DatumKind::Boolean) + .id(18) + .is_nullable(true) + .is_tag(true) + .comment("Comment of this column".to_string()) + .build() + .expect("should succeed to build column schema") + } + + #[test] + fn test_builder() { + let lhs = new_test_column_schema(); + let rhs = ColumnSchema { + id: 18, + name: "test_column_schema".to_string(), + data_type: DatumKind::Boolean, + is_nullable: true, + is_tag: true, + comment: "Comment of this column".to_string(), + }; + + assert_eq!(&lhs, &rhs); + } + + #[test] + fn test_pb_convert() { + let column_schema = new_test_column_schema(); + let pb_schema = column_schema.to_pb(); + // Check pb specific fields + assert!(pb_schema.is_tag); + + let schema_from_pb = ColumnSchema::from(pb_schema); + assert_eq!(&schema_from_pb, &column_schema); + } + + #[test] + fn test_valid_tag_type() { + let invalid_tag_types = vec![DatumKind::Null, DatumKind::Float, DatumKind::Double]; + + for v in &DatumKind::VALUES { + assert_eq!( + ColumnSchema::is_valid_tag_type(*v), + !invalid_tag_types.contains(v) + ); + } + } +} diff --git a/common_types/src/datum.rs b/common_types/src/datum.rs new file mode 100644 index 0000000000..4ae6a8124b --- /dev/null +++ b/common_types/src/datum.rs @@ -0,0 +1,887 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Datum holds different kind of data + +use std::{convert::TryFrom, fmt, str}; + +use arrow_deps::{ + arrow::datatypes::{DataType, TimeUnit}, + datafusion::scalar::ScalarValue, +}; +use chrono::{Local, TimeZone}; +use proto::common::DataType as DataTypePb; +use serde::ser::{Serialize, Serializer}; +use snafu::{Backtrace, ResultExt, Snafu}; +use sqlparser::ast::{DataType as SqlDataType, Value}; + +use crate::{bytes::Bytes, hash::hash64, string::StringBytes, time::Timestamp}; +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "Unsupported SQL data type, type:{}.\nBacktrace:\n{}", + sql_type, + backtrace + ))] + UnsupportedDataType { + sql_type: SqlDataType, + backtrace: Backtrace, + }, + + #[snafu(display("Invalid double or float, err:{}.\nBacktrace:\n{}", source, backtrace))] + InvalidDouble { + source: std::num::ParseFloatError, + backtrace: Backtrace, + }, + + #[snafu(display( + "Invalid insert value, kind:{}, value:{:?}.\nBacktrace:\n{}", + kind, + value, + backtrace + ))] + InvalidValueType { + kind: DatumKind, + value: Value, + backtrace: Backtrace, + }, + #[snafu(display("Invalid timestamp, err:{}.\nBacktrace:\n{}", source, backtrace))] + InvalidTimestamp { + source: std::num::ParseIntError, + backtrace: Backtrace, + }, + + #[snafu(display("Invalid integer, err:{}.\nBacktrace:\n{}", source, backtrace))] + InvalidInt { + source: std::num::ParseIntError, + backtrace: Backtrace, + }, + + #[snafu(display("Invalid datum byte, byte:{}.\nBacktrace:\n{}", value, backtrace))] + InvalidDatumByte { value: u8, backtrace: Backtrace }, +} + +pub type Result = std::result::Result; + +// FIXME(yingwen): How to handle timezone? + +/// Data type of datum +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum DatumKind { + Null = 0, + Timestamp, + Double, + Float, + Varbinary, + String, + UInt64, + UInt32, + UInt16, + UInt8, + Int64, + Int32, + Int16, + Int8, + // DatumKind::Boolean as usize = 14 + Boolean, +} + +impl DatumKind { + pub const VALUES: [Self; 15] = [ + Self::Null, + Self::Timestamp, + Self::Double, + Self::Float, + Self::Varbinary, + Self::String, + Self::UInt64, + Self::UInt32, + Self::UInt16, + Self::UInt8, + Self::Int64, + Self::Int32, + Self::Int16, + Self::Int8, + Self::Boolean, + ]; + + /// Return true if this is DatumKind::Timestamp + pub fn is_timestamp(&self) -> bool { + matches!(self, DatumKind::Timestamp) + } + + pub fn is_f64_castable(&self) -> bool { + matches!( + self, + Self::Double + | Self::Float + | Self::UInt64 + | Self::UInt32 + | Self::UInt16 + | Self::UInt8 + | Self::Int64 + | Self::Int32 + | Self::Int16 + | Self::Int8 + ) + } + + /// Can column of this datum kind used as key column + pub fn is_key_kind(&self) -> bool { + matches!( + self, + DatumKind::Timestamp + | DatumKind::Varbinary + | DatumKind::String + | DatumKind::UInt64 + | DatumKind::UInt32 + | DatumKind::UInt16 + | DatumKind::UInt8 + | DatumKind::Int64 + | DatumKind::Int32 + | DatumKind::Int16 + | DatumKind::Int8 + | DatumKind::Boolean + ) + } + + pub fn unsign_kind(&self) -> Option { + match self { + Self::Int64 | Self::UInt64 => Some(Self::UInt64), + Self::Int32 | Self::UInt32 => Some(Self::UInt32), + Self::Int16 | Self::UInt16 => Some(Self::UInt16), + Self::Int8 | Self::UInt8 => Some(Self::UInt8), + _ => None, + } + } + + /// Create DatumKind from [arrow_deps::arrow::datatypes::DataType], if the + /// type is not supported, returns None + pub fn from_data_type(data_type: &DataType) -> Option { + match data_type { + DataType::Null => Some(Self::Null), + 
DataType::Timestamp(TimeUnit::Millisecond, None) => Some(Self::Timestamp), + DataType::Float64 => Some(Self::Double), + DataType::Float32 => Some(Self::Float), + DataType::Binary => Some(Self::Varbinary), + DataType::Utf8 => Some(Self::String), + DataType::UInt64 => Some(Self::UInt64), + DataType::UInt32 => Some(Self::UInt32), + DataType::UInt16 => Some(Self::UInt16), + DataType::UInt8 => Some(Self::UInt8), + DataType::Int64 => Some(Self::Int64), + DataType::Int32 => Some(Self::Int32), + DataType::Int16 => Some(Self::Int16), + DataType::Int8 => Some(Self::Int8), + DataType::Boolean => Some(Self::Boolean), + DataType::Float16 + | DataType::LargeUtf8 + | DataType::LargeBinary + | DataType::FixedSizeBinary(_) + | DataType::Struct(_) + | DataType::Union(_, _) + | DataType::List(_) + | DataType::LargeList(_) + | DataType::FixedSizeList(_, _) + | DataType::Time32(_) + | DataType::Time64(_) + | DataType::Timestamp(_, _) + | DataType::Date32 + | DataType::Date64 + | DataType::Interval(_) + | DataType::Duration(_) + | DataType::Dictionary(_, _) + | DataType::Decimal(_, _) + | DataType::Map(_, _) => None, + } + } + + /// Get name of this kind. + fn as_str(&self) -> &str { + match self { + DatumKind::Null => "null", + DatumKind::Timestamp => "timestamp", + DatumKind::Double => "double", + DatumKind::Float => "float", + DatumKind::Varbinary => "varbinary", + DatumKind::String => "string", + DatumKind::UInt64 => "uint64", + DatumKind::UInt32 => "uint32", + DatumKind::UInt16 => "uint16", + DatumKind::UInt8 => "uint8", + DatumKind::Int64 => "bigint", + DatumKind::Int32 => "int", + DatumKind::Int16 => "smallint", + DatumKind::Int8 => "tinyint", + DatumKind::Boolean => "boolean", + } + } + + /// Convert into a byte. + #[inline] + pub fn into_u8(self) -> u8 { + self as u8 + } +} + +impl From for DataType { + fn from(kind: DatumKind) -> Self { + match kind { + DatumKind::Null => DataType::Null, + DatumKind::Timestamp => DataType::Timestamp(TimeUnit::Millisecond, None), + DatumKind::Double => DataType::Float64, + DatumKind::Float => DataType::Float32, + DatumKind::Varbinary => DataType::Binary, + DatumKind::String => DataType::Utf8, + DatumKind::UInt64 => DataType::UInt64, + DatumKind::UInt32 => DataType::UInt32, + DatumKind::UInt16 => DataType::UInt16, + DatumKind::UInt8 => DataType::UInt8, + DatumKind::Int64 => DataType::Int64, + DatumKind::Int32 => DataType::Int32, + DatumKind::Int16 => DataType::Int16, + DatumKind::Int8 => DataType::Int8, + DatumKind::Boolean => DataType::Boolean, + } + } +} + +impl TryFrom<&SqlDataType> for DatumKind { + type Error = Error; + + fn try_from(sql_type: &SqlDataType) -> Result { + match sql_type { + // TODO(yingwen): Consider timezone + SqlDataType::Timestamp => Ok(Self::Timestamp), + SqlDataType::Real | SqlDataType::Float(_) => Ok(Self::Float), + SqlDataType::Double => Ok(Self::Double), + SqlDataType::Boolean => Ok(Self::Boolean), + SqlDataType::BigInt(_) => Ok(Self::Int64), + SqlDataType::Int(_) => Ok(Self::Int32), + SqlDataType::SmallInt(_) => Ok(Self::Int16), + SqlDataType::String => Ok(Self::String), + SqlDataType::Custom(objects) if objects.0.len() == 1 => { + match objects.0[0].value.as_str() { + "UINT64" | "uint64" => Ok(Self::UInt64), + "UINT32" | "uint32" => Ok(Self::UInt32), + "UINT16" | "uint16" => Ok(Self::UInt16), + "UINT8" | "uint8" => Ok(Self::UInt8), + "INT64" | "int64" => Ok(Self::Int64), + "INT32" | "int32" => Ok(Self::Int32), + "INT16" | "int16" => Ok(Self::Int16), + "TINYINT" | "INT8" | "tinyint" | "int8" => Ok(Self::Int8), + "VARBINARY" | "varbinary" => 
Ok(Self::Varbinary), + _ => UnsupportedDataType { + sql_type: sql_type.clone(), + } + .fail(), + } + } + + // Unlike datafusion, Decimal is not supported now + _ => UnsupportedDataType { + sql_type: sql_type.clone(), + } + .fail(), + } + } +} + +impl TryFrom for DatumKind { + type Error = Error; + + fn try_from(v: u8) -> Result { + match v { + v if DatumKind::Null.into_u8() == v => Ok(DatumKind::Null), + v if DatumKind::Timestamp.into_u8() == v => Ok(DatumKind::Timestamp), + v if DatumKind::Double.into_u8() == v => Ok(DatumKind::Double), + v if DatumKind::Float.into_u8() == v => Ok(DatumKind::Float), + v if DatumKind::Varbinary.into_u8() == v => Ok(DatumKind::Varbinary), + v if DatumKind::String.into_u8() == v => Ok(DatumKind::String), + v if DatumKind::UInt64.into_u8() == v => Ok(DatumKind::UInt64), + v if DatumKind::UInt32.into_u8() == v => Ok(DatumKind::UInt32), + v if DatumKind::UInt16.into_u8() == v => Ok(DatumKind::UInt16), + v if DatumKind::UInt8.into_u8() == v => Ok(DatumKind::UInt8), + v if DatumKind::Int64.into_u8() == v => Ok(DatumKind::Int64), + v if DatumKind::Int32.into_u8() == v => Ok(DatumKind::Int32), + v if DatumKind::Int16.into_u8() == v => Ok(DatumKind::Int16), + v if DatumKind::Int8.into_u8() == v => Ok(DatumKind::Int8), + v if DatumKind::Boolean.into_u8() == v => Ok(DatumKind::Boolean), + _ => InvalidDatumByte { value: v }.fail(), + } + } +} + +impl From for DataTypePb { + fn from(kind: DatumKind) -> Self { + match kind { + DatumKind::Null => Self::NULL, + DatumKind::Timestamp => Self::TIMESTAMP, + DatumKind::Double => Self::DOUBLE, + DatumKind::Float => Self::FLOAT, + DatumKind::Varbinary => Self::VARBINARY, + DatumKind::String => Self::STRING, + DatumKind::UInt64 => Self::UINT64, + DatumKind::UInt32 => Self::UINT32, + DatumKind::UInt16 => Self::UINT16, + DatumKind::UInt8 => Self::UINT8, + DatumKind::Int64 => Self::INT64, + DatumKind::Int32 => Self::INT32, + DatumKind::Int16 => Self::INT16, + DatumKind::Int8 => Self::INT8, + DatumKind::Boolean => Self::BOOL, + } + } +} + +impl From for DatumKind { + fn from(data_type: DataTypePb) -> Self { + match data_type { + DataTypePb::NULL => DatumKind::Null, + DataTypePb::TIMESTAMP => DatumKind::Timestamp, + DataTypePb::DOUBLE => DatumKind::Double, + DataTypePb::FLOAT => DatumKind::Float, + DataTypePb::VARBINARY => DatumKind::Varbinary, + DataTypePb::STRING => DatumKind::String, + DataTypePb::UINT64 => DatumKind::UInt64, + DataTypePb::UINT32 => DatumKind::UInt32, + DataTypePb::UINT16 => DatumKind::UInt16, + DataTypePb::UINT8 => DatumKind::UInt8, + DataTypePb::INT64 => DatumKind::Int64, + DataTypePb::INT32 => DatumKind::Int32, + DataTypePb::INT16 => DatumKind::Int16, + DataTypePb::INT8 => DatumKind::Int8, + DataTypePb::BOOL => DatumKind::Boolean, + } + } +} + +impl fmt::Display for DatumKind { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", self.as_str()) + } +} + +// FIXME(yingwen): Validate the length of string and varbinary. 
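+
+// A minimal sketch of the `DatumKind` <-> `u8` round trip defined above; the
+// byte value simply follows the declaration order of the enum, so `Boolean`
+// maps to 14 (see `test_into_u8` in the tests below):
+//
+//     let b = DatumKind::Boolean.into_u8();
+//     assert_eq!(b, 14);
+//     assert_eq!(DatumKind::try_from(b).unwrap(), DatumKind::Boolean);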
+/// A data box holds different kind of data +#[derive(Debug, Clone, PartialEq, PartialOrd)] +pub enum Datum { + Null, + /// Millisecond precision + /// + /// Map to arrow::datatypes::DataType::Timestamp(TimeUnit::Millisecond, + /// None) + Timestamp(Timestamp), + /// Map to arrow::datatypes::DataType::Float64 + Double(f64), + /// Map to arrow::datatypes::DataType::Float32 + Float(f32), + /// Map to arrow::datatypes::DateType::Binary + /// + /// No more than 2G (size of i32) + Varbinary(Bytes), + /// Map to arrow::datatypes::DataType::String + /// + /// No more than 2G (size of i32) + String(StringBytes), + /// Map to arrow::datatypes::DataType::UInt64 + UInt64(u64), + UInt32(u32), + UInt16(u16), + UInt8(u8), + Int64(i64), + Int32(i32), + Int16(i16), + Int8(i8), + Boolean(bool), +} + +impl Datum { + /// Creates an empty datum by given datum kind + pub fn empty(kind: &DatumKind) -> Self { + match kind { + DatumKind::Null => Self::Null, + DatumKind::Timestamp => Self::Timestamp(Timestamp::new(0)), + DatumKind::Double => Self::Double(0.0), + DatumKind::Float => Self::Float(0.0), + DatumKind::Varbinary => Self::Varbinary(Bytes::new()), + DatumKind::String => Self::String(StringBytes::new()), + DatumKind::UInt64 => Self::UInt64(0), + DatumKind::UInt32 => Self::UInt32(0), + DatumKind::UInt16 => Self::UInt16(0), + DatumKind::UInt8 => Self::UInt8(0), + DatumKind::Int64 => Self::Int64(0), + DatumKind::Int32 => Self::Int32(0), + DatumKind::Int16 => Self::Int16(0), + DatumKind::Int8 => Self::Int8(0), + DatumKind::Boolean => Self::Boolean(false), + } + } + + /// Return the kind of datum + pub fn kind(&self) -> DatumKind { + match self { + Datum::Null => DatumKind::Null, + Datum::Timestamp(_) => DatumKind::Timestamp, + Datum::Double(_) => DatumKind::Double, + Datum::Float(_) => DatumKind::Float, + Datum::Varbinary(_) => DatumKind::Varbinary, + Datum::String(_) => DatumKind::String, + Datum::UInt64(_) => DatumKind::UInt64, + Datum::UInt32(_) => DatumKind::UInt32, + Datum::UInt16(_) => DatumKind::UInt16, + Datum::UInt8(_) => DatumKind::UInt8, + Datum::Int64(_) => DatumKind::Int64, + Datum::Int32(_) => DatumKind::Int32, + Datum::Int16(_) => DatumKind::Int16, + Datum::Int8(_) => DatumKind::Int8, + Datum::Boolean(_) => DatumKind::Boolean, + } + } + + // TODO: handle error + pub fn convert_to_uint64(&self) -> u64 { + match self { + Datum::Null => 0, + Datum::Timestamp(v) => v.as_i64() as u64, + Datum::Double(v) => *v as u64, + Datum::Float(v) => *v as u64, + Datum::Varbinary(v) => hash64(v), + Datum::String(v) => hash64(v.as_bytes()), + Datum::UInt64(v) => *v, + Datum::UInt32(v) => *v as u64, + Datum::UInt16(v) => *v as u64, + Datum::UInt8(v) => *v as u64, + Datum::Int64(v) => *v as u64, + Datum::Int32(v) => *v as u64, + Datum::Int16(v) => *v as u64, + Datum::Int8(v) => *v as u64, + Datum::Boolean(v) => *v as u64, + } + } + + pub fn is_null(&self) -> bool { + matches!(self, Datum::Null) + } + + /// Cast datum to timestamp. + pub fn as_timestamp(&self) -> Option { + match self { + Datum::Timestamp(v) => Some(*v), + _ => None, + } + } + + /// Cast datum to &str. + pub fn as_str(&self) -> Option<&str> { + match self { + Datum::String(v) => Some(v), + _ => None, + } + } + + /// Cast datum to uint64. 
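+    ///
+    /// Only integer-like datums (including `Boolean`) yield a value; other
+    /// kinds return `None`. For example, `Datum::UInt32(5).as_u64()` is
+    /// `Some(5)` while `Datum::Double(5.0).as_u64()` is `None`.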
+ pub fn as_u64(&self) -> Option { + match self { + Datum::UInt64(v) => Some(*v), + Datum::UInt32(v) => Some(*v as u64), + Datum::UInt16(v) => Some(*v as u64), + Datum::UInt8(v) => Some(*v as u64), + Datum::Int64(v) => Some(*v as u64), + Datum::Int32(v) => Some(*v as u64), + Datum::Int16(v) => Some(*v as u64), + Datum::Int8(v) => Some(*v as u64), + Datum::Boolean(v) => Some(*v as u64), + _ => None, + } + } + + /// Cast datum to Bytes. + pub fn as_varbinary(&self) -> Option<&Bytes> { + match self { + Datum::Varbinary(v) => Some(v), + _ => None, + } + } + + pub fn as_f32(&self) -> Option { + match self { + Datum::Float(v) => Some(*v), + _ => None, + } + } + + pub fn as_f64(&self) -> Option { + match self { + Datum::Double(v) => Some(*v), + Datum::Float(v) => Some(*v as f64), + Datum::UInt64(v) => Some(*v as f64), + Datum::UInt32(v) => Some(*v as f64), + Datum::UInt16(v) => Some(*v as f64), + Datum::UInt8(v) => Some(*v as f64), + Datum::Int64(v) => Some(*v as f64), + Datum::Int32(v) => Some(*v as f64), + Datum::Int16(v) => Some(*v as f64), + Datum::Int8(v) => Some(*v as f64), + Datum::Boolean(_) + | Datum::Null + | Datum::Timestamp(_) + | Datum::Varbinary(_) + | Datum::String(_) => None, + } + } + + pub fn display_string(&self) -> String { + match self { + Datum::Null => "null".to_string(), + Datum::Timestamp(v) => Local.timestamp_millis(v.as_i64()).to_rfc3339(), + Datum::Double(v) => v.to_string(), + Datum::Float(v) => v.to_string(), + Datum::Varbinary(v) => format!("{:?}", v), + Datum::String(v) => v.to_string(), + Datum::UInt64(v) => v.to_string(), + Datum::UInt32(v) => v.to_string(), + Datum::UInt16(v) => v.to_string(), + Datum::UInt8(v) => v.to_string(), + Datum::Int64(v) => v.to_string(), + Datum::Int32(v) => v.to_string(), + Datum::Int16(v) => v.to_string(), + Datum::Int8(v) => v.to_string(), + Datum::Boolean(v) => v.to_string(), + } + } + + pub fn try_from_sql_value(kind: &DatumKind, value: Value) -> Result { + match (kind, value) { + (DatumKind::Null, Value::Null) => Ok(Datum::Null), + (DatumKind::Timestamp, Value::Number(n, _long)) => { + let n = n.parse::().context(InvalidTimestamp)?; + Ok(Datum::Timestamp(Timestamp::new(n))) + } + (DatumKind::Double, Value::Number(n, _long)) => { + let n = n.parse::().context(InvalidDouble)?; + Ok(Datum::Double(n)) + } + (DatumKind::Float, Value::Number(n, _long)) => { + let n = n.parse::().context(InvalidDouble)?; + Ok(Datum::Float(n)) + } + // TODO(yingwen): Support hex string. 
+ (DatumKind::Varbinary, Value::SingleQuotedString(s)) => { + Ok(Datum::Varbinary(Bytes::from(s))) + } + (DatumKind::String, Value::SingleQuotedString(s)) => { + Ok(Datum::String(StringBytes::from(s))) + } + (DatumKind::UInt64, Value::Number(n, _long)) => { + let n = n.parse::().context(InvalidInt)?; + Ok(Datum::UInt64(n)) + } + (DatumKind::UInt32, Value::Number(n, _long)) => { + let n = n.parse::().context(InvalidInt)?; + Ok(Datum::UInt32(n)) + } + (DatumKind::UInt16, Value::Number(n, _long)) => { + let n = n.parse::().context(InvalidInt)?; + Ok(Datum::UInt16(n)) + } + (DatumKind::UInt8, Value::Number(n, _long)) => { + let n = n.parse::().context(InvalidInt)?; + Ok(Datum::UInt8(n)) + } + (DatumKind::Int64, Value::Number(n, _long)) => { + let n = n.parse::().context(InvalidInt)?; + Ok(Datum::Int64(n)) + } + (DatumKind::Int32, Value::Number(n, _long)) => { + let n = n.parse::().context(InvalidInt)?; + Ok(Datum::Int32(n)) + } + (DatumKind::Int16, Value::Number(n, _long)) => { + let n = n.parse::().context(InvalidInt)?; + Ok(Datum::Int16(n)) + } + (DatumKind::Int8, Value::Number(n, _long)) => { + let n = n.parse::().context(InvalidInt)?; + Ok(Datum::Int8(n)) + } + (DatumKind::Boolean, Value::Boolean(b)) => Ok(Datum::Boolean(b)), + (_, value) => InvalidValueType { kind: *kind, value }.fail(), + } + } + + pub fn as_scalar_value(&self) -> Option { + match self { + Datum::Null => None, + Datum::Timestamp(v) => { + Some(ScalarValue::TimestampMillisecond(Some((*v).as_i64()), None)) + } + Datum::Double(v) => Some(ScalarValue::Float64(Some(*v))), + Datum::Float(v) => Some(ScalarValue::Float32(Some(*v))), + Datum::Varbinary(v) => Some(ScalarValue::Binary(Some(v.to_vec()))), + Datum::String(v) => Some(ScalarValue::Utf8(Some(v.to_string()))), + Datum::UInt64(v) => Some(ScalarValue::UInt64(Some(*v))), + Datum::UInt32(v) => Some(ScalarValue::UInt32(Some(*v))), + Datum::UInt16(v) => Some(ScalarValue::UInt16(Some(*v))), + Datum::UInt8(v) => Some(ScalarValue::UInt8(Some(*v))), + Datum::Int64(v) => Some(ScalarValue::Int64(Some(*v))), + Datum::Int32(v) => Some(ScalarValue::Int32(Some(*v))), + Datum::Int16(v) => Some(ScalarValue::Int16(Some(*v))), + Datum::Int8(v) => Some(ScalarValue::Int8(Some(*v))), + Datum::Boolean(v) => Some(ScalarValue::Boolean(Some(*v))), + } + } + + #[cfg(test)] + pub fn as_view(&self) -> DatumView { + match self { + Datum::Null => DatumView::Null, + Datum::Timestamp(v) => DatumView::Timestamp(*v), + Datum::Double(v) => DatumView::Double(*v), + Datum::Float(v) => DatumView::Float(*v), + Datum::Varbinary(v) => DatumView::Varbinary(v), + Datum::String(v) => DatumView::String(v), + Datum::UInt64(v) => DatumView::UInt64(*v), + Datum::UInt32(v) => DatumView::UInt32(*v), + Datum::UInt16(v) => DatumView::UInt16(*v), + Datum::UInt8(v) => DatumView::UInt8(*v), + Datum::Int64(v) => DatumView::Int64(*v), + Datum::Int32(v) => DatumView::Int32(*v), + Datum::Int16(v) => DatumView::Int16(*v), + Datum::Int8(v) => DatumView::Int8(*v), + Datum::Boolean(v) => DatumView::Boolean(*v), + } + } +} + +macro_rules! 
impl_from { + ($Kind: ident, $FromType: ident) => { + impl From<$FromType> for Datum { + fn from(value: $FromType) -> Self { + Self::$Kind(value) + } + } + + impl From> for Datum { + fn from(value_opt: Option<$FromType>) -> Self { + match value_opt { + Some(value) => Self::$Kind(value), + None => Self::Null, + } + } + } + }; +} + +impl_from!(Timestamp, Timestamp); +impl_from!(Double, f64); +impl_from!(Float, f32); +impl_from!(Varbinary, Bytes); +impl_from!(String, StringBytes); +impl_from!(UInt64, u64); +impl_from!(UInt32, u32); +impl_from!(UInt16, u16); +impl_from!(UInt8, u8); +impl_from!(Int64, i64); +impl_from!(Int32, i32); +impl_from!(Int16, i16); +impl_from!(Int8, i8); +impl_from!(Boolean, bool); + +impl From<&str> for Datum { + fn from(value: &str) -> Datum { + Datum::String(StringBytes::copy_from_str(value)) + } +} + +impl From> for Datum { + fn from(value_opt: Option<&str>) -> Datum { + match value_opt { + Some(value) => Datum::String(StringBytes::copy_from_str(value)), + None => Datum::Null, + } + } +} + +impl From<&[u8]> for Datum { + fn from(value: &[u8]) -> Datum { + Datum::Varbinary(Bytes::copy_from_slice(value)) + } +} + +impl From> for Datum { + fn from(value_opt: Option<&[u8]>) -> Datum { + match value_opt { + Some(value) => Datum::Varbinary(Bytes::copy_from_slice(value)), + None => Datum::Null, + } + } +} + +/// impl serde serialize for Datum +impl Serialize for Datum { + fn serialize(&self, serializer: S) -> std::result::Result + where + S: Serializer, + { + match self { + Datum::Null => serializer.serialize_none(), + Datum::Timestamp(v) => serializer.serialize_i64(v.as_i64()), + Datum::Double(v) => serializer.serialize_f64(*v), + Datum::Float(v) => serializer.serialize_f32(*v), + Datum::Varbinary(v) => serializer.serialize_bytes(v), + Datum::String(v) => serializer.serialize_str(v), + Datum::UInt64(v) => serializer.serialize_u64(*v), + Datum::UInt32(v) => serializer.serialize_u32(*v), + Datum::UInt16(v) => serializer.serialize_u16(*v), + Datum::UInt8(v) => serializer.serialize_u8(*v), + Datum::Int64(v) => serializer.serialize_i64(*v), + Datum::Int32(v) => serializer.serialize_i32(*v), + Datum::Int16(v) => serializer.serialize_i16(*v), + Datum::Int8(v) => serializer.serialize_i8(*v), + Datum::Boolean(v) => serializer.serialize_bool(*v), + } + } +} + +/// A view to a datum. +/// +/// Holds copy of integer like datum and reference of string like datum. 
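+///
+/// For example, `DatumView::String("host")` borrows its `&str`, while the
+/// owned counterpart is `Datum::String(StringBytes::copy_from_str("host"))`;
+/// fixed-size variants such as `DatumView::UInt64(1)` are plain copies.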
+#[derive(Debug, PartialEq, PartialOrd)] +pub enum DatumView<'a> { + Null, + Timestamp(Timestamp), + Double(f64), + Float(f32), + Varbinary(&'a [u8]), + String(&'a str), + UInt64(u64), + UInt32(u32), + UInt16(u16), + UInt8(u8), + Int64(i64), + Int32(i32), + Int16(i16), + Int8(i8), + Boolean(bool), +} + +impl<'a> DatumView<'a> { + /// Return the kind of datum + pub fn kind(&self) -> DatumKind { + match self { + DatumView::Null => DatumKind::Null, + DatumView::Timestamp(_) => DatumKind::Timestamp, + DatumView::Double(_) => DatumKind::Double, + DatumView::Float(_) => DatumKind::Float, + DatumView::Varbinary(_) => DatumKind::Varbinary, + DatumView::String(_) => DatumKind::String, + DatumView::UInt64(_) => DatumKind::UInt64, + DatumView::UInt32(_) => DatumKind::UInt32, + DatumView::UInt16(_) => DatumKind::UInt16, + DatumView::UInt8(_) => DatumKind::UInt8, + DatumView::Int64(_) => DatumKind::Int64, + DatumView::Int32(_) => DatumKind::Int32, + DatumView::Int16(_) => DatumKind::Int16, + DatumView::Int8(_) => DatumKind::Int8, + DatumView::Boolean(_) => DatumKind::Boolean, + } + } + + pub fn from_scalar_value(val: &'a ScalarValue) -> Option { + match val { + ScalarValue::Boolean(v) => v.map(DatumView::Boolean), + ScalarValue::Float32(v) => v.map(DatumView::Float), + ScalarValue::Float64(v) => v.map(DatumView::Double), + ScalarValue::Int8(v) => v.map(DatumView::Int8), + ScalarValue::Int16(v) => v.map(DatumView::Int16), + ScalarValue::Int32(v) => v.map(DatumView::Int32), + ScalarValue::Int64(v) => v.map(DatumView::Int64), + ScalarValue::UInt8(v) => v.map(DatumView::UInt8), + ScalarValue::UInt16(v) => v.map(DatumView::UInt16), + ScalarValue::UInt32(v) => v.map(DatumView::UInt32), + ScalarValue::UInt64(v) => v.map(DatumView::UInt64), + ScalarValue::Utf8(v) | ScalarValue::LargeUtf8(v) => { + v.as_ref().map(|v| DatumView::String(v.as_str())) + } + ScalarValue::Binary(v) | ScalarValue::LargeBinary(v) => { + v.as_ref().map(|v| DatumView::Varbinary(v.as_slice())) + } + ScalarValue::TimestampMillisecond(v, _) => { + v.map(|v| DatumView::Timestamp(Timestamp::new(v))) + } + ScalarValue::List(_, _) + | ScalarValue::Date32(_) + | ScalarValue::Date64(_) + | ScalarValue::TimestampSecond(_, _) + | ScalarValue::TimestampMicrosecond(_, _) + | ScalarValue::TimestampNanosecond(_, _) + | ScalarValue::IntervalYearMonth(_) + | ScalarValue::IntervalDayTime(_) + | ScalarValue::Struct(_, _) + | ScalarValue::Decimal128(_, _, _) => None, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_is_key_kind() { + assert!(!DatumKind::Null.is_key_kind()); + assert!(DatumKind::Timestamp.is_key_kind()); + assert!(!DatumKind::Double.is_key_kind()); + assert!(!DatumKind::Float.is_key_kind()); + assert!(DatumKind::Varbinary.is_key_kind()); + assert!(DatumKind::String.is_key_kind()); + assert!(DatumKind::UInt64.is_key_kind()); + assert!(DatumKind::UInt32.is_key_kind()); + assert!(DatumKind::UInt16.is_key_kind()); + assert!(DatumKind::UInt8.is_key_kind()); + assert!(DatumKind::Int64.is_key_kind()); + assert!(DatumKind::Int32.is_key_kind()); + assert!(DatumKind::Int16.is_key_kind()); + assert!(DatumKind::Int8.is_key_kind()); + assert!(DatumKind::Boolean.is_key_kind()); + } + + #[test] + fn test_unsign_kind() { + assert_eq!(DatumKind::UInt64.unsign_kind(), Some(DatumKind::UInt64)); + assert_eq!(DatumKind::Int64.unsign_kind(), Some(DatumKind::UInt64)); + assert_eq!(DatumKind::UInt32.unsign_kind(), Some(DatumKind::UInt32)); + assert_eq!(DatumKind::Int32.unsign_kind(), Some(DatumKind::UInt32)); + 
assert_eq!(DatumKind::UInt16.unsign_kind(), Some(DatumKind::UInt16)); + assert_eq!(DatumKind::Int16.unsign_kind(), Some(DatumKind::UInt16)); + assert_eq!(DatumKind::UInt8.unsign_kind(), Some(DatumKind::UInt8)); + assert_eq!(DatumKind::Int8.unsign_kind(), Some(DatumKind::UInt8)); + + assert!(DatumKind::Null.unsign_kind().is_none()); + assert!(DatumKind::Timestamp.unsign_kind().is_none()); + assert!(DatumKind::String.unsign_kind().is_none()); + assert!(DatumKind::Boolean.unsign_kind().is_none()); + assert!(DatumKind::Varbinary.unsign_kind().is_none()); + assert!(DatumKind::Double.unsign_kind().is_none()); + assert!(DatumKind::Float.unsign_kind().is_none()); + } + + #[test] + fn test_into_u8() { + assert_eq!(0, DatumKind::Null.into_u8()); + assert_eq!(1, DatumKind::Timestamp.into_u8()); + assert_eq!(2, DatumKind::Double.into_u8()); + assert_eq!(3, DatumKind::Float.into_u8()); + assert_eq!(4, DatumKind::Varbinary.into_u8()); + assert_eq!(5, DatumKind::String.into_u8()); + assert_eq!(6, DatumKind::UInt64.into_u8()); + assert_eq!(7, DatumKind::UInt32.into_u8()); + assert_eq!(8, DatumKind::UInt16.into_u8()); + assert_eq!(9, DatumKind::UInt8.into_u8()); + assert_eq!(10, DatumKind::Int64.into_u8()); + assert_eq!(11, DatumKind::Int32.into_u8()); + assert_eq!(12, DatumKind::Int16.into_u8()); + assert_eq!(13, DatumKind::Int8.into_u8()); + assert_eq!(14, DatumKind::Boolean.into_u8()); + } +} diff --git a/common_types/src/hash.rs b/common_types/src/hash.rs new file mode 100644 index 0000000000..9edc8c69cb --- /dev/null +++ b/common_types/src/hash.rs @@ -0,0 +1,39 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +// custom hash mod +use byteorder::{ByteOrder, LittleEndian}; +use murmur3::murmur3_x64_128; + +pub fn hash64(mut bytes: &[u8]) -> u64 { + let mut out = [0; 16]; + murmur3_x64_128(&mut bytes, 0, &mut out); + // in most cases we run on little endian target + LittleEndian::read_u64(&out[0..8]) +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn empty_hash_test() { + let res1 = hash64(&[]); + let res2 = hash64(&[]); + assert_eq!(res1, res2); + } + + #[test] + fn hash_test() { + let test_bytes_1 = b"cse_engine_hash_mod_test_bytes1".to_vec(); + let test_bytes_2 = b"cse_engine_hash_mod_test_bytes2".to_vec(); + { + // hash64 testing + let res1 = hash64(&test_bytes_1); + let res1_1 = hash64(&test_bytes_1); + assert_eq!(res1, res1_1); + + let res2 = hash64(&test_bytes_2); + assert_ne!(res1, res2); + } + } +} diff --git a/common_types/src/lib.rs b/common_types/src/lib.rs new file mode 100644 index 0000000000..3da29b0a52 --- /dev/null +++ b/common_types/src/lib.rs @@ -0,0 +1,24 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Contains common types + +pub mod bytes; +pub mod column; +pub mod column_schema; +pub mod datum; +pub mod hash; +pub mod projected_schema; +pub mod record_batch; +pub mod request_id; +pub mod row; +pub mod schema; +pub mod string; +pub mod time; + +/// Sequence number +pub type SequenceNumber = u64; +pub const MAX_SEQUENCE_NUMBER: u64 = u64::MAX; +pub const MIN_SEQUENCE_NUMBER: u64 = 0; + +#[cfg(any(test, feature = "test"))] +pub mod tests; diff --git a/common_types/src/projected_schema.rs b/common_types/src/projected_schema.rs new file mode 100644 index 0000000000..8fa17f2848 --- /dev/null +++ b/common_types/src/projected_schema.rs @@ -0,0 +1,292 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Projected schema + +use std::{fmt, sync::Arc}; + +use snafu::{ensure, Backtrace, ResultExt, Snafu}; + +use crate::{ + column_schema::{ColumnSchema, ReadOp}, + datum::Datum, + row::Row, + schema::{ArrowSchemaRef, RecordSchema, RecordSchemaWithKey, Schema}, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "Invalid projection index, index:{}.\nBacktrace:\n{}", + index, + backtrace + ))] + InvalidProjectionIndex { index: usize, backtrace: Backtrace }, + + #[snafu(display("Incompatible column schema for read, err:{}", source))] + IncompatReadColumn { + source: crate::column_schema::CompatError, + }, + + #[snafu(display("Failed to build projected schema, err:{}", source))] + BuildProjectedSchema { source: crate::schema::Error }, + + #[snafu(display( + "Missing not null column for read, name:{}.\nBacktrace:\n{}", + name, + backtrace + ))] + MissingReadColumn { name: String, backtrace: Backtrace }, +} + +pub type Result = std::result::Result; + +#[derive(Debug)] +pub struct RowProjector { + schema_with_key: RecordSchemaWithKey, + source_schema: Schema, + /// The Vec stores the column index in source, and `None` means this column + /// is not in source but required by reader, and need to filled by null. + /// The length of Vec is the same as the number of columns reader intended + /// to read. + source_projection: Vec>, +} + +impl RowProjector { + /// The projected indexes of existed columns in the source schema. + pub fn existed_source_projection(&self) -> Vec { + self.source_projection + .iter() + .filter_map(|index| *index) + .collect() + } + + /// The projected indexes of all columns(existed and not exist) in the + /// source schema. + pub fn source_projection(&self) -> &[Option] { + &self.source_projection + } + + pub fn schema_with_key(&self) -> &RecordSchemaWithKey { + &self.schema_with_key + } + + /// Project the row. + /// + /// REQUIRE: The schema of row is the same as source schema. + pub fn project_row(&self, row: &Row, mut datums_buffer: Vec) -> Row { + assert_eq!(self.source_schema.num_columns(), row.num_columns()); + + datums_buffer.reserve(self.schema_with_key.num_columns()); + + for p in &self.source_projection { + let datum = match p { + Some(index_in_source) => row[*index_in_source].clone(), + None => Datum::Null, + }; + + datums_buffer.push(datum); + } + + Row::from_datums(datums_buffer) + } +} + +#[derive(Clone)] +pub struct ProjectedSchema(Arc); + +impl fmt::Debug for ProjectedSchema { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("ProjectedSchema") + .field("original_schema", &self.0.original_schema) + .field("projection", &self.0.projection) + .finish() + } +} + +impl ProjectedSchema { + pub fn no_projection(schema: Schema) -> Self { + let inner = ProjectedSchemaInner::no_projection(schema); + Self(Arc::new(inner)) + } + + pub fn new(schema: Schema, projection: Option>) -> Result { + let inner = ProjectedSchemaInner::new(schema, projection)?; + Ok(Self(Arc::new(inner))) + } + + pub fn is_all_projection(&self) -> bool { + self.0.is_all_projection() + } + + /// Returns the [RowProjector] to project the rows with source schema to + /// rows with [RecordSchemaWithKey]. + /// + /// REQUIRE: The key columns are the same as this schema. + #[inline] + pub fn try_project_with_key(&self, source_schema: &Schema) -> Result { + self.0.try_project_with_key(source_schema) + } + + // Returns the record schema after projection with key. 
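+    // The schema with key is computed once when the `ProjectedSchema` is
+    // built, so this method simply clones the cached value.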
+ pub fn to_record_schema_with_key(&self) -> RecordSchemaWithKey { + self.0.schema_with_key.clone() + } + + pub(crate) fn as_record_schema_with_key(&self) -> &RecordSchemaWithKey { + &self.0.schema_with_key + } + + // Returns the record schema after projection. + pub fn to_record_schema(&self) -> RecordSchema { + self.0.record_schema.clone() + } + + /// Returns the arrow schema after projection. + pub fn to_projected_arrow_schema(&self) -> ArrowSchemaRef { + self.0.record_schema.to_arrow_schema_ref() + } +} + +/// Schema with projection informations +struct ProjectedSchemaInner { + /// The schema before projection that the reader intended to read, may + /// differ from current schema of the table. + original_schema: Schema, + /// Index of the projected columns in `self.schema`, `None` if + /// all columns are needed. + projection: Option>, + + /// The record schema from `self.schema` with key columns after projection. + schema_with_key: RecordSchemaWithKey, + /// The record schema from `self.schema` after projection. + record_schema: RecordSchema, +} + +impl ProjectedSchemaInner { + fn no_projection(schema: Schema) -> Self { + let schema_with_key = schema.to_record_schema_with_key(); + let record_schema = schema.to_record_schema(); + + Self { + original_schema: schema, + projection: None, + schema_with_key, + record_schema, + } + } + + fn new(schema: Schema, projection: Option>) -> Result { + if let Some(p) = &projection { + // Projection is provided, validate the projection is valid. This is necessary + // to avoid panic when creating RecordSchema and + // RecordSchemaWithKey. + if let Some(max_idx) = p.iter().max() { + ensure!( + *max_idx < schema.num_columns(), + InvalidProjectionIndex { index: *max_idx } + ); + } + + let schema_with_key = schema.project_record_schema_with_key(p); + let record_schema = schema.project_record_schema(p); + + Ok(Self { + original_schema: schema, + projection, + schema_with_key, + record_schema, + }) + } else { + Ok(Self::no_projection(schema)) + } + } + + /// Selecting all the columns is the all projection. + fn is_all_projection(&self) -> bool { + self.projection.is_none() + } + + // TODO(yingwen): We can fill missing not null column with default value instead + // of returning error. + fn try_project_with_key(&self, source_schema: &Schema) -> Result { + debug_assert_eq!( + self.schema_with_key.key_columns(), + source_schema.key_columns() + ); + // We consider the two schema is equal if they have same version. 
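+        // Versions are treated as the source of truth here; full structural
+        // equality is only verified via the debug assertion below.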
+ if self.original_schema.version() == source_schema.version() { + debug_assert_eq!(self.original_schema, *source_schema); + } + + let mut source_projection = Vec::with_capacity(self.schema_with_key.num_columns()); + // For each column in `schema_with_key` + for column_schema in self.schema_with_key.columns() { + self.try_project_column(column_schema, source_schema, &mut source_projection)?; + } + + Ok(RowProjector { + schema_with_key: self.schema_with_key.clone(), + source_schema: source_schema.clone(), + source_projection, + }) + } + + fn try_project_column( + &self, + column: &ColumnSchema, + source_schema: &Schema, + source_projection: &mut Vec>, + ) -> Result<()> { + match source_schema.index_of(&column.name) { + Some(source_idx) => { + // Column is in source + if self.original_schema.version() == source_schema.version() { + // Same version, just use that column in source + source_projection.push(Some(source_idx)); + } else { + // Different version, need to check column schema + let source_column = source_schema.column(source_idx); + // TODO(yingwen): Data type is not checked here because we do not support alter + // data type now. + match column + .compatible_for_read(source_column) + .context(IncompatReadColumn)? + { + ReadOp::Exact => { + source_projection.push(Some(source_idx)); + } + ReadOp::FillNull => { + source_projection.push(None); + } + } + } + } + None => { + // Column is not in source + ensure!(column.is_nullable, MissingReadColumn { name: &column.name }); + // Column is nullable, fill this column by null + source_projection.push(None); + } + } + + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use crate::{projected_schema::ProjectedSchema, tests::build_schema}; + + #[test] + fn test_projected_schema() { + let schema = build_schema(); + assert!(schema.num_columns() > 1); + let projection: Vec = (0..schema.num_columns() - 1).collect(); + let projected_schema = ProjectedSchema::new(schema.clone(), Some(projection)).unwrap(); + assert_eq!( + projected_schema.0.schema_with_key.num_columns(), + schema.num_columns() - 1 + ); + assert!(!projected_schema.is_all_projection()); + } +} diff --git a/common_types/src/record_batch.rs b/common_types/src/record_batch.rs new file mode 100644 index 0000000000..1b7ca99d98 --- /dev/null +++ b/common_types/src/record_batch.rs @@ -0,0 +1,695 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Record batch + +use std::{cmp, convert::TryFrom, mem}; + +use arrow_deps::{ + arrow::{ + datatypes::SchemaRef as ArrowSchemaRef, record_batch::RecordBatch as ArrowRecordBatch, + }, + util, +}; +use snafu::{ensure, Backtrace, OptionExt, ResultExt, Snafu}; + +use crate::{ + column::{ColumnBlock, ColumnBlockBuilder}, + datum::DatumKind, + projected_schema::{ProjectedSchema, RowProjector}, + row::{ + contiguous::{ContiguousRow, ProjectedContiguousRow}, + Row, RowViewOnBatch, + }, + schema::{RecordSchema, RecordSchemaWithKey}, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Invalid schema len to build RecordBatch.\nBacktrace:\n{}", backtrace))] + SchemaLen { backtrace: Backtrace }, + + #[snafu(display("Failed to create column block, err:{}", source))] + CreateColumnBlock { source: crate::column::Error }, + + #[snafu(display( + "Failed to create arrow record batch, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + CreateArrow { + source: arrow_deps::arrow::error::ArrowError, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to iterate datum, err:{}", source))] + IterateDatum { source: crate::row::Error }, + + #[snafu(display("Failed to append datum, err:{}", source))] + AppendDatum { source: crate::column::Error }, + + #[snafu(display( + "Column not in schema with key, column_name:{}.\nBacktrace:\n{}", + name, + backtrace + ))] + ColumnNotInSchemaWithKey { name: String, backtrace: Backtrace }, + + #[snafu(display("Failed to convert arrow schema, err:{}", source))] + ConvertArrowSchema { source: crate::schema::Error }, + + #[snafu(display("Mismatch record schema to build RecordBatch, column_name:{}, schema_type:{:?}, column_type:{:?}.\nBacktrace:\n{}", column_name, schema_type, column_type, backtrace))] + MismatchRecordSchema { + column_name: String, + schema_type: DatumKind, + column_type: DatumKind, + backtrace: Backtrace, + }, + + #[snafu(display( + "Projection is out of the index, source_projection:{:?}, arrow_schema:{}.\nBacktrace:\n{}", + source_projection, + arrow_schema, + backtrace + ))] + OutOfIndexProjection { + source_projection: Vec>, + arrow_schema: ArrowSchemaRef, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to reverse record batch data, err:{:?}.\nBacktrace:\n{}", + source, + backtrace + ))] + ReverseRecordBatchData { + source: Box, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to select record batch data, err:{:?}.\nBacktrace:\n{}", + source, + backtrace + ))] + SelectRecordBatchData { + source: Box, + backtrace: Backtrace, + }, +} + +pub type Result = std::result::Result; + +#[derive(Debug)] +pub struct RecordBatchData { + arrow_record_batch: ArrowRecordBatch, + column_blocks: Vec, +} + +impl RecordBatchData { + fn new(arrow_schema: ArrowSchemaRef, column_blocks: Vec) -> Result { + let arrays = column_blocks + .iter() + .map(|column| column.to_arrow_array_ref()) + .collect(); + + let arrow_record_batch = + ArrowRecordBatch::try_new(arrow_schema, arrays).context(CreateArrow)?; + + Ok(RecordBatchData { + arrow_record_batch, + column_blocks, + }) + } + + fn num_rows(&self) -> usize { + self.column_blocks + .first() + .map(|column| column.num_rows()) + .unwrap_or(0) + } + + fn take_column_block(&mut self, index: usize) -> ColumnBlock { + let num_rows = self.num_rows(); + mem::replace( + &mut self.column_blocks[index], + ColumnBlock::new_null(num_rows), + ) + } + + /// Returns a zero-copy slice of this array with the indicated offset and + /// length. + /// + /// Panics if offset with length is greater than column length. 
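+    ///
+    /// For example (hypothetical sizes), slicing a 4-row batch with
+    /// `slice(1, 2)` yields a 2-row view over rows 1 and 2 without copying
+    /// the underlying arrow arrays.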
+ fn slice(&self, offset: usize, length: usize) -> Self { + let column_blocks = self + .column_blocks + .iter() + .map(|col| col.slice(offset, length)) + .collect(); + + Self { + arrow_record_batch: self.arrow_record_batch.slice(offset, length), + column_blocks, + } + } +} + +fn build_column_blocks_from_arrow_record_batch( + arrow_record_batch: &ArrowRecordBatch, + record_schema: &RecordSchema, +) -> Result> { + let mut column_blocks = Vec::with_capacity(arrow_record_batch.num_columns()); + for (column_schema, array) in record_schema + .columns() + .iter() + .zip(arrow_record_batch.columns()) + { + let column = ColumnBlock::try_from_arrow_array_ref(&column_schema.data_type, array) + .context(CreateColumnBlock)?; + column_blocks.push(column); + } + + Ok(column_blocks) +} + +impl TryFrom for RecordBatchData { + type Error = Error; + + fn try_from(arrow_record_batch: ArrowRecordBatch) -> Result { + let record_schema = + RecordSchema::try_from(arrow_record_batch.schema()).context(ConvertArrowSchema)?; + let column_blocks = + build_column_blocks_from_arrow_record_batch(&arrow_record_batch, &record_schema)?; + Ok(Self { + arrow_record_batch, + column_blocks, + }) + } +} + +// TODO(yingwen): The schema in RecordBatch should be much simple because it may +// lack some information. +#[derive(Debug)] +pub struct RecordBatch { + schema: RecordSchema, + data: RecordBatchData, +} + +impl RecordBatch { + pub fn new_empty(schema: RecordSchema) -> Self { + let arrow_schema = schema.to_arrow_schema_ref(); + let arrow_record_batch = ArrowRecordBatch::new_empty(arrow_schema); + + Self { + schema, + data: RecordBatchData { + arrow_record_batch, + column_blocks: Vec::new(), + }, + } + } + + pub fn new(schema: RecordSchema, column_blocks: Vec) -> Result { + ensure!(schema.num_columns() == column_blocks.len(), SchemaLen); + + // Validate schema and column_blocks. 
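+        // Every column block must carry the same datum kind as its column
+        // schema; otherwise building the arrow record batch below could fail
+        // with a less descriptive error.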
+ for (column_schema, column_block) in schema.columns().iter().zip(column_blocks.iter()) { + ensure!( + column_schema.data_type == column_block.datum_kind(), + MismatchRecordSchema { + column_name: &column_schema.name, + schema_type: column_schema.data_type, + column_type: column_block.datum_kind(), + } + ); + } + + let arrow_schema = schema.to_arrow_schema_ref(); + let data = RecordBatchData::new(arrow_schema, column_blocks)?; + + Ok(Self { schema, data }) + } + + pub fn schema(&self) -> &RecordSchema { + &self.schema + } + + // REQUIRE: index is valid + pub fn column(&self, index: usize) -> &ColumnBlock { + &self.data.column_blocks[index] + } + + pub fn num_columns(&self) -> usize { + self.schema.num_columns() + } + + pub fn num_rows(&self) -> usize { + self.data.num_rows() + } + + pub fn into_arrow_record_batch(self) -> ArrowRecordBatch { + self.data.arrow_record_batch + } +} + +impl TryFrom for RecordBatch { + type Error = Error; + + fn try_from(arrow_record_batch: ArrowRecordBatch) -> Result { + let record_schema = + RecordSchema::try_from(arrow_record_batch.schema()).context(ConvertArrowSchema)?; + + let column_blocks = + build_column_blocks_from_arrow_record_batch(&arrow_record_batch, &record_schema)?; + + Ok(Self { + schema: record_schema, + data: RecordBatchData { + arrow_record_batch, + column_blocks, + }, + }) + } +} + +#[derive(Debug)] +pub struct RecordBatchWithKey { + schema_with_key: RecordSchemaWithKey, + data: RecordBatchData, +} + +impl RecordBatchWithKey { + pub fn num_rows(&self) -> usize { + self.data.num_rows() + } + + pub fn num_columns(&self) -> usize { + self.data.arrow_record_batch.num_columns() + } + + pub fn columns(&self) -> &[ColumnBlock] { + &self.data.column_blocks + } + + pub fn clone_row_at(&self, index: usize) -> Row { + let datums = self + .data + .column_blocks + .iter() + .map(|column_block| column_block.datum(index)) + .collect(); + + Row::from_datums(datums) + } + + /// Project the [RecordBatchWithKey] into a [RecordBatch] according to + /// [ProjectedSchema]. + /// + /// REQUIRE: The schema_with_key of the [RecordBatchWithKey] is the same as + /// the schema_with_key of [ProjectedSchema]. + pub fn try_project(mut self, projected_schema: &ProjectedSchema) -> Result { + debug_assert_eq!( + &self.schema_with_key, + projected_schema.as_record_schema_with_key() + ); + + // Get the schema after projection. + let record_schema = projected_schema.to_record_schema(); + let mut column_blocks = Vec::with_capacity(record_schema.num_columns()); + + for column_schema in record_schema.columns() { + let column_index = self.schema_with_key.index_of(&column_schema.name).context( + ColumnNotInSchemaWithKey { + name: &column_schema.name, + }, + )?; + + // Take the column block out. + let column_block = self.data.take_column_block(column_index); + column_blocks.push(column_block); + } + + let data = RecordBatchData::new(record_schema.to_arrow_schema_ref(), column_blocks)?; + + Ok(RecordBatch { + schema: record_schema, + data, + }) + } + + pub fn into_record_batch(self) -> RecordBatch { + RecordBatch { + schema: self.schema_with_key.into_record_schema(), + data: self.data, + } + } + + #[inline] + pub fn schema_with_key(&self) -> &RecordSchemaWithKey { + &self.schema_with_key + } + + #[inline] + pub fn column(&self, index: usize) -> &ColumnBlock { + &self.data.column_blocks[index] + } + + /// Reverse the rows in the data. + /// + /// The data retains intact if failed. 
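+    ///
+    /// For example (hypothetical rows), reversing a batch holding rows
+    /// `[a, b, c]` produces `[c, b, a]` with the schema unchanged.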
+ pub fn reverse_data(&mut self) -> Result<()> { + let reversed_record_batch = util::reverse_record_batch(&self.data.arrow_record_batch) + .map_err(|e| Box::new(e) as _) + .context(ReverseRecordBatchData)?; + + self.data = RecordBatchData::try_from(reversed_record_batch) + .map_err(|e| Box::new(e) as _) + .context(ReverseRecordBatchData)?; + + Ok(()) + } + + #[inline] + pub fn is_empty(&self) -> bool { + self.num_rows() == 0 + } + + /// Returns a zero-copy slice of this array with the indicated offset and + /// length. + /// + /// Panics if offset with length is greater than column length. + #[must_use] + pub fn slice(&self, offset: usize, length: usize) -> Self { + Self { + schema_with_key: self.schema_with_key.clone(), + data: self.data.slice(offset, length), + } + } + + /// Select the rows according to the `selected_rows`. + /// + /// The data retains intact if failed. + pub fn select_data(&mut self, selected_rows: &[bool]) -> Result<()> { + assert_eq!(self.num_rows(), selected_rows.len()); + + let selected_record_batch = + util::select_record_batch(&self.data.arrow_record_batch, selected_rows) + .map_err(|e| Box::new(e) as _) + .context(SelectRecordBatchData)?; + let selected_data = RecordBatchData::try_from(selected_record_batch) + .map_err(|e| Box::new(e) as _) + .context(SelectRecordBatchData)?; + + self.data = selected_data; + + Ok(()) + } +} + +pub struct RecordBatchWithKeyBuilder { + schema_with_key: RecordSchemaWithKey, + builders: Vec, +} + +impl RecordBatchWithKeyBuilder { + pub fn new(schema_with_key: RecordSchemaWithKey) -> Self { + let builders = schema_with_key + .columns() + .iter() + .map(|column_schema| ColumnBlockBuilder::with_capacity(&column_schema.data_type, 0)) + .collect(); + Self { + schema_with_key, + builders, + } + } + + pub fn with_capacity(schema_with_key: RecordSchemaWithKey, capacity: usize) -> Self { + let builders = schema_with_key + .columns() + .iter() + .map(|column_schema| { + ColumnBlockBuilder::with_capacity(&column_schema.data_type, capacity) + }) + .collect(); + Self { + schema_with_key, + builders, + } + } + + /// Append row into builder. + /// + /// REQUIRE: The row and the builder must have the same schema. + pub fn append_row(&mut self, row: Row) -> Result<()> { + for (builder, datum) in self.builders.iter_mut().zip(row) { + builder.append(datum).context(AppendDatum)?; + } + + Ok(()) + } + + /// Append projected contiguous row into builder. + /// + /// REQUIRE: + /// - The schema of `row` is the same as the source schema of the + /// `projector`. + /// - The projected schema (with key) is the same as the schema of the + /// builder. + pub fn append_projected_contiguous_row( + &mut self, + row: &ProjectedContiguousRow, + ) -> Result<()> { + assert_eq!(row.num_datum_views(), self.builders.len()); + + for (index, builder) in self.builders.iter_mut().enumerate() { + let datum_view = row.datum_view_at(index); + builder.append_view(datum_view).context(AppendDatum)?; + } + + Ok(()) + } + + /// Append the row from the [RowView] to the builder. + /// + /// REQUIRE: The `row_view` and the builder must have the same schema. + pub fn append_row_view(&mut self, row_view: &RowViewOnBatch) -> Result<()> { + for (builder, datum) in self.builders.iter_mut().zip(row_view.iter_columns()) { + let datum = datum.context(IterateDatum)?; + builder.append(datum).context(AppendDatum)?; + } + + Ok(()) + } + + /// Append `len` from `start` (inclusive) to this builder. + /// + /// REQUIRE: + /// - The `record_batch` and the builder must have the same schema. 
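+    ///
+    /// Returns the number of rows actually appended, which may be less than
+    /// `len` when the range runs past the end of `record_batch`. A minimal
+    /// sketch (hypothetical `builder` and `batch`):
+    ///
+    /// ```ignore
+    /// // Copy at most 2 rows starting at row 0.
+    /// let appended = builder.append_batch_range(&batch, 0, 2)?;
+    /// assert!(appended <= 2);
+    /// ```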
+ pub fn append_batch_range( + &mut self, + record_batch: &RecordBatchWithKey, + start: usize, + len: usize, + ) -> Result { + let num_rows = record_batch.num_rows(); + if start >= num_rows { + return Ok(0); + } + + let added = cmp::min(num_rows - start, len); + + for (builder, column) in self.builders.iter_mut().zip(record_batch.columns().iter()) { + builder + .append_block_range(column, start, added) + .context(AppendDatum)?; + } + + Ok(added) + } + + /// The number of the appended rows. + pub fn len(&self) -> usize { + self.builders + .first() + .map(|builder| builder.len()) + .unwrap_or(0) + } + + /// Returns true if the builder is empty. + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Reset the builders for reuse. + pub fn clear(&mut self) { + for builder in &mut self.builders { + builder.clear(); + } + } + + /// Build [RecordBatchWithKey] and reset the builder. + pub fn build(&mut self) -> Result { + let column_blocks: Vec<_> = self + .builders + .iter_mut() + .map(|builder| builder.build()) + .collect(); + let arrow_schema = self.schema_with_key.to_arrow_schema_ref(); + + Ok(RecordBatchWithKey { + schema_with_key: self.schema_with_key.clone(), + data: RecordBatchData::new(arrow_schema, column_blocks)?, + }) + } +} + +#[derive(Debug)] +pub struct ArrowRecordBatchProjector { + row_projector: RowProjector, +} + +impl From for ArrowRecordBatchProjector { + fn from(row_projector: RowProjector) -> Self { + Self { row_projector } + } +} + +impl ArrowRecordBatchProjector { + /// Project the [arrow::RecordBatch] to [RecordBatchWithKey] and these + /// things is to be done: + /// - Insert the null column if the projected column does not appear in the + /// source schema. + /// - Convert the [arrow::RecordBatch] to [RecordBatchWithKey]. + /// + /// REQUIRE: Schema of the `arrow_record_batch` is the same as the + /// projection of existing column in the source schema. + pub fn project_to_record_batch_with_key( + &self, + arrow_record_batch: ArrowRecordBatch, + ) -> Result { + let schema_with_key = self.row_projector.schema_with_key().clone(); + let source_projection = self.row_projector.source_projection(); + let mut column_blocks = Vec::with_capacity(schema_with_key.num_columns()); + + let num_rows = arrow_record_batch.num_rows(); + // ensure next_arrow_column_idx < num_columns + let mut next_arrow_column_idx = 0; + let num_columns = arrow_record_batch.num_columns(); + + for (source_idx, column_schema) in source_projection.iter().zip(schema_with_key.columns()) { + match source_idx { + Some(_) => { + ensure!( + next_arrow_column_idx < num_columns, + OutOfIndexProjection { + source_projection, + arrow_schema: arrow_record_batch.schema() + } + ); + + let array = arrow_record_batch.column(next_arrow_column_idx); + next_arrow_column_idx += 1; + + let column_block = + ColumnBlock::try_from_arrow_array_ref(&column_schema.data_type, array) + .context(CreateColumnBlock)?; + + column_blocks.push(column_block); + } + None => { + // Need to push row with specific type. 
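+                    // A typed null column block keeps the projected batch's
+                    // column layout consistent even though the source schema
+                    // lacks this column.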
+ let null_block = + ColumnBlock::new_null_with_type(&column_schema.data_type, num_rows) + .context(CreateColumnBlock)?; + column_blocks.push(null_block); + } + } + } + + let data = RecordBatchData::new(schema_with_key.to_arrow_schema_ref(), column_blocks)?; + + Ok(RecordBatchWithKey { + schema_with_key, + data, + }) + } +} + +#[cfg(test)] +mod tests { + use crate::{ + record_batch::{RecordBatchWithKey, RecordBatchWithKeyBuilder}, + row::RowViewOnBatch, + tests::{ + build_projected_schema, build_record_batch_with_key_by_rows, build_rows, + check_record_batch_with_key_with_rows, + }, + }; + + fn build_record_batch_with_key() -> RecordBatchWithKey { + let rows = build_rows(); + build_record_batch_with_key_by_rows(rows) + } + + fn check_record_batch_with_key( + record_batch_with_key: RecordBatchWithKey, + row_num: usize, + column_num: usize, + ) -> bool { + let rows = build_rows(); + check_record_batch_with_key_with_rows(&record_batch_with_key, row_num, column_num, rows) + } + + #[test] + fn test_append_projected_contiguous_row() { + let record_batch_with_key = build_record_batch_with_key(); + assert_eq!(record_batch_with_key.num_rows(), 5); + assert_eq!(record_batch_with_key.num_columns(), 3); + + check_record_batch_with_key(record_batch_with_key, 5, 3); + } + + #[test] + fn test_append_row_view() { + let projected_schema = build_projected_schema(); + + let record_batch_with_key = build_record_batch_with_key(); + + let mut builder = RecordBatchWithKeyBuilder::with_capacity( + projected_schema.to_record_schema_with_key(), + 2, + ); + let view = RowViewOnBatch { + record_batch: &record_batch_with_key, + row_idx: 1, + }; + builder.append_row_view(&view).unwrap(); + let record_batch_with_key = builder.build().unwrap(); + assert_eq!(record_batch_with_key.num_rows(), 1); + assert_eq!(record_batch_with_key.num_columns(), 3); + + check_record_batch_with_key(record_batch_with_key, 1, 3); + } + + #[test] + fn test_append_batch_range() { + let projected_schema = build_projected_schema(); + + let record_batch_with_key = build_record_batch_with_key(); + + let mut builder = RecordBatchWithKeyBuilder::with_capacity( + projected_schema.to_record_schema_with_key(), + 2, + ); + builder + .append_batch_range(&record_batch_with_key, 0, 2) + .unwrap(); + let record_batch_with_key = builder.build().unwrap(); + assert_eq!(record_batch_with_key.num_rows(), 2); + assert_eq!(record_batch_with_key.num_columns(), 3); + + check_record_batch_with_key(record_batch_with_key, 2, 3); + } +} diff --git a/common_types/src/request_id.rs b/common_types/src/request_id.rs new file mode 100644 index 0000000000..6990839818 --- /dev/null +++ b/common_types/src/request_id.rs @@ -0,0 +1,43 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Request id. + +use std::{ + fmt, + sync::atomic::{AtomicU64, Ordering}, +}; + +#[derive(Debug, Clone, Copy)] +pub struct RequestId(u64); + +impl RequestId { + /// Acquire next request id. 
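+    ///
+    /// Ids come from a process-wide counter and increase monotonically, e.g.
+    /// (illustrative):
+    ///
+    /// ```ignore
+    /// let first = RequestId::next_id();
+    /// let second = RequestId::next_id();
+    /// // `second` displays as a larger number than `first`.
+    /// println!("{} then {}", first, second);
+    /// ```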
+ pub fn next_id() -> Self { + static NEXT_ID: AtomicU64 = AtomicU64::new(1); + + let id = NEXT_ID.fetch_add(1, Ordering::Relaxed); + + Self(id) + } +} + +impl fmt::Display for RequestId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.0) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_request_id() { + let id = RequestId::next_id(); + assert_eq!(1, id.0); + let id = RequestId::next_id(); + assert_eq!(2, id.0); + + assert_eq!("2", id.to_string()); + } +} diff --git a/common_types/src/row/contiguous.rs b/common_types/src/row/contiguous.rs new file mode 100644 index 0000000000..dd35f6ecb0 --- /dev/null +++ b/common_types/src/row/contiguous.rs @@ -0,0 +1,501 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Contiguous row. + +use std::{ + convert::{TryFrom, TryInto}, + fmt, mem, + ops::{Deref, DerefMut}, + str, +}; + +use snafu::{ensure, Backtrace, Snafu}; + +use crate::{ + datum::{Datum, DatumKind, DatumView}, + projected_schema::RowProjector, + row::Row, + schema::{IndexInWriterSchema, Schema}, + time::Timestamp, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "String is too long to encode into row (max is {}), len:{}.\nBacktrace:\n{}", + MAX_STRING_LEN, + len, + backtrace + ))] + StringTooLong { len: usize, backtrace: Backtrace }, +} + +pub type Result = std::result::Result; + +/// Size to store the offset of string buffer. +type OffsetSize = usize; + +/// Max allowed string length of datum to store in a contiguous row (16 MB). +const MAX_STRING_LEN: usize = 1024 * 1024 * 16; + +/// Row encoded in a contiguous buffer. +pub trait ContiguousRow { + /// Returns the number of datums. + fn num_datum_views(&self) -> usize; + + /// Returns [DatumView] of column in given index, and returns null if the + /// datum kind is unknown. + /// + /// Panic if index or buffer is out of bound. + fn datum_view_at(&self, index: usize) -> DatumView; +} + +pub struct ContiguousRowReader<'a, T> { + inner: &'a T, + byte_offsets: &'a [usize], + string_buffer_offset: usize, +} + +impl<'a, T> ContiguousRowReader<'a, T> { + pub fn with_schema(inner: &'a T, schema: &'a Schema) -> Self { + Self { + inner, + byte_offsets: schema.byte_offsets(), + string_buffer_offset: schema.string_buffer_offset(), + } + } +} + +impl<'a, T: Deref> ContiguousRow for ContiguousRowReader<'a, T> { + fn num_datum_views(&self) -> usize { + self.byte_offsets.len() + } + + fn datum_view_at(&self, index: usize) -> DatumView<'a> { + let offset = self.byte_offsets[index]; + let buf = &self.inner[offset..]; + + // Get datum kind, if the datum kind is unknown, returns null. + let datum_kind = match DatumKind::try_from(buf[0]) { + Ok(v) => v, + Err(_) => return DatumView::Null, + }; + + // Advance 1 byte to skip the header byte. + let datum_buf = &buf[1..]; + // If no string column in this schema, the string buffer offset should + // equal to the buffer len, and string buf is an empty slice. + let string_buf = &self.inner[self.string_buffer_offset..]; + + must_read_view(&datum_kind, datum_buf, string_buf) + } +} + +/// Contiguous row with projection information. +/// +/// The caller must ensure the source schema of projector is the same as the +/// schema of source row. 
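+///
+/// A minimal sketch (assuming a projector whose `source_projection()` is
+/// `[Some(2), Some(0), None]`): `datum_view_at(0)` reads source column 2,
+/// `datum_view_at(1)` reads source column 0, and `datum_view_at(2)` yields
+/// `DatumView::Null` because that column is missing from the source.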
+pub struct ProjectedContiguousRow<'a, T> { + source_row: T, + projector: &'a RowProjector, +} + +impl<'a, T: ContiguousRow> ProjectedContiguousRow<'a, T> { + pub fn new(source_row: T, projector: &'a RowProjector) -> Self { + Self { + source_row, + projector, + } + } +} + +impl<'a, T: ContiguousRow> ContiguousRow for ProjectedContiguousRow<'a, T> { + fn num_datum_views(&self) -> usize { + self.projector.source_projection().len() + } + + fn datum_view_at(&self, index: usize) -> DatumView { + let p = self.projector.source_projection()[index]; + + match p { + Some(index_in_source) => self.source_row.datum_view_at(index_in_source), + None => DatumView::Null, + } + } +} + +impl<'a, T: ContiguousRow> fmt::Debug for ProjectedContiguousRow<'a, T> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let mut list = f.debug_list(); + for i in 0..self.num_datum_views() { + let view = self.datum_view_at(i); + list.entry(&view); + } + list.finish() + } +} + +/// In memory buffer to hold data of a contiguous row. +pub trait RowBuffer: DerefMut { + /// Clear and resize the buffer size to `new_len` with given `value`. + fn reset(&mut self, new_len: usize, value: u8); + + /// Append slice into the buffer, resize the buffer automatically. + fn append_slice(&mut self, src: &[u8]); +} + +/// A writer to build a contiguous row. +pub struct ContiguousRowWriter<'a, T> { + inner: &'a mut T, + /// The schema the row group need to be encoded into, the schema + /// of the row need to be write compatible for the table schema. + table_schema: &'a Schema, + /// The index mapping from table schema to column in the + /// schema of row group. + index_in_writer: &'a IndexInWriterSchema, +} + +// TODO(yingwen): Try to replace usage of row by contiguous row. +impl<'a, T: RowBuffer + 'a> ContiguousRowWriter<'a, T> { + pub fn new( + inner: &'a mut T, + table_schema: &'a Schema, + index_in_writer: &'a IndexInWriterSchema, + ) -> Self { + Self { + inner, + table_schema, + index_in_writer, + } + } + + fn write_datum( + inner: &mut T, + datum: &Datum, + byte_offset: usize, + next_string_offset: &mut usize, + ) -> Result<()> { + let datum_offset = byte_offset + 1; + + match datum { + // Already filled by null, nothing to do. + Datum::Null => (), + Datum::Timestamp(v) => { + Self::write_byte_to_offset(inner, byte_offset, DatumKind::Timestamp.into_u8()); + let value_buf = v.as_i64().to_ne_bytes(); + Self::write_slice_to_offset(inner, datum_offset, &value_buf); + } + Datum::Double(v) => { + Self::write_byte_to_offset(inner, byte_offset, DatumKind::Double.into_u8()); + let value_buf = v.to_ne_bytes(); + Self::write_slice_to_offset(inner, datum_offset, &value_buf); + } + Datum::Float(v) => { + Self::write_byte_to_offset(inner, byte_offset, DatumKind::Float.into_u8()); + let value_buf = v.to_ne_bytes(); + Self::write_slice_to_offset(inner, datum_offset, &value_buf); + } + Datum::Varbinary(v) => { + Self::write_byte_to_offset(inner, byte_offset, DatumKind::Varbinary.into_u8()); + let value_buf = next_string_offset.to_ne_bytes(); + Self::write_slice_to_offset(inner, datum_offset, &value_buf); + // Use u32 to store length of string. 
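+                // The variable-length payload is appended to the string
+                // buffer as `[u32 length][raw bytes]`, while the fixed-width
+                // slot written above only stores its starting offset.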
+ *next_string_offset += mem::size_of::() + v.len(); + + ensure!(v.len() <= MAX_STRING_LEN, StringTooLong { len: v.len() }); + + let string_len = v.len() as u32; + inner.append_slice(&string_len.to_ne_bytes()); + inner.append_slice(v); + } + Datum::String(v) => { + Self::write_byte_to_offset(inner, byte_offset, DatumKind::String.into_u8()); + let value_buf = next_string_offset.to_ne_bytes(); + Self::write_slice_to_offset(inner, datum_offset, &value_buf); + // Use u32 to store length of string. + *next_string_offset += mem::size_of::() + v.len(); + + ensure!(v.len() <= MAX_STRING_LEN, StringTooLong { len: v.len() }); + + let string_len = v.len() as u32; + inner.append_slice(&string_len.to_ne_bytes()); + inner.append_slice(v.as_bytes()); + } + Datum::UInt64(v) => { + Self::write_byte_to_offset(inner, byte_offset, DatumKind::UInt64.into_u8()); + let value_buf = v.to_ne_bytes(); + Self::write_slice_to_offset(inner, datum_offset, &value_buf); + } + Datum::UInt32(v) => { + Self::write_byte_to_offset(inner, byte_offset, DatumKind::UInt32.into_u8()); + let value_buf = v.to_ne_bytes(); + Self::write_slice_to_offset(inner, datum_offset, &value_buf); + } + Datum::UInt16(v) => { + Self::write_byte_to_offset(inner, byte_offset, DatumKind::UInt16.into_u8()); + let value_buf = v.to_ne_bytes(); + Self::write_slice_to_offset(inner, datum_offset, &value_buf); + } + Datum::UInt8(v) => { + Self::write_byte_to_offset(inner, byte_offset, DatumKind::UInt8.into_u8()); + Self::write_slice_to_offset(inner, datum_offset, &[*v]); + } + Datum::Int64(v) => { + Self::write_byte_to_offset(inner, byte_offset, DatumKind::Int64.into_u8()); + let value_buf = v.to_ne_bytes(); + Self::write_slice_to_offset(inner, datum_offset, &value_buf); + } + Datum::Int32(v) => { + Self::write_byte_to_offset(inner, byte_offset, DatumKind::Int32.into_u8()); + let value_buf = v.to_ne_bytes(); + Self::write_slice_to_offset(inner, datum_offset, &value_buf); + } + Datum::Int16(v) => { + Self::write_byte_to_offset(inner, byte_offset, DatumKind::Int16.into_u8()); + let value_buf = v.to_ne_bytes(); + Self::write_slice_to_offset(inner, datum_offset, &value_buf); + } + Datum::Int8(v) => { + Self::write_byte_to_offset(inner, byte_offset, DatumKind::Int8.into_u8()); + Self::write_slice_to_offset(inner, datum_offset, &[*v as u8]); + } + Datum::Boolean(v) => { + Self::write_byte_to_offset(inner, byte_offset, DatumKind::Boolean.into_u8()); + Self::write_slice_to_offset(inner, datum_offset, &[*v as u8]); + } + } + + Ok(()) + } + + /// Write a row to the buffer, the buffer will be reset first. + pub fn write_row(&mut self, row: &Row) -> Result<()> { + let datum_buffer_len = self.table_schema.string_buffer_offset(); + // Reset the buffer and fill the buffer by null, now new slice will be + // appended to the string buffer. + self.inner + .reset(datum_buffer_len, DatumKind::Null.into_u8()); + + assert_eq!(row.num_columns(), self.table_schema.num_columns()); + + // Offset to next string in string buffer. + let mut next_string_offset: OffsetSize = 0; + for index_in_table in 0..self.table_schema.num_columns() { + if let Some(writer_index) = self.index_in_writer.column_index_in_writer(index_in_table) + { + let datum = &row[writer_index]; + let byte_offset = self.table_schema.byte_offset(index_in_table); + + // Write datum bytes to the buffer. + Self::write_datum(self.inner, datum, byte_offset, &mut next_string_offset)?; + } + // Column not in row is already filled by null. 
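+            // (the whole datum buffer was pre-filled with the
+            // `DatumKind::Null` marker byte by `reset` above).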
+ } + + Ok(()) + } + + #[inline] + fn write_byte_to_offset(inner: &mut T, offset: usize, value: u8) { + inner[offset] = value; + } + + #[inline] + fn write_slice_to_offset(inner: &mut T, offset: usize, value_buf: &[u8]) { + let dst = &mut inner[offset..offset + value_buf.len()]; + dst.copy_from_slice(value_buf); + } +} + +/// The byte size to encode the datum of this kind in memory. +/// +/// Returns the (datum size + 1) for header. For integer types, the datum +/// size is the memory size of the interger type. For string types, the +/// datum size is the memory size to hold the offset. +pub(crate) fn byte_size_of_datum(kind: &DatumKind) -> usize { + let datum_size = match kind { + DatumKind::Null => 1, + DatumKind::Timestamp => mem::size_of::(), + DatumKind::Double => mem::size_of::(), + DatumKind::Float => mem::size_of::(), + // The size of offset. + DatumKind::Varbinary | DatumKind::String => mem::size_of::(), + DatumKind::UInt64 => mem::size_of::(), + DatumKind::UInt32 => mem::size_of::(), + DatumKind::UInt16 => mem::size_of::(), + DatumKind::UInt8 => mem::size_of::(), + DatumKind::Int64 => mem::size_of::(), + DatumKind::Int32 => mem::size_of::(), + DatumKind::Int16 => mem::size_of::(), + DatumKind::Int8 => mem::size_of::(), + DatumKind::Boolean => mem::size_of::(), + }; + + datum_size + 1 +} + +/// Read datum view from given datum buf, and may reference the string in +/// `string_buf`. +/// +/// Panic if out of bound. +/// +/// ## Safety +/// The string in buffer must be valid utf8. +fn must_read_view<'a>( + datum_kind: &DatumKind, + datum_buf: &'a [u8], + string_buf: &'a [u8], +) -> DatumView<'a> { + match datum_kind { + DatumKind::Null => DatumView::Null, + DatumKind::Timestamp => { + let value_buf = datum_buf[..mem::size_of::()].try_into().unwrap(); + let ts = Timestamp::new(i64::from_ne_bytes(value_buf)); + DatumView::Timestamp(ts) + } + DatumKind::Double => { + let value_buf = datum_buf[..mem::size_of::()].try_into().unwrap(); + let v = f64::from_ne_bytes(value_buf); + DatumView::Double(v) + } + DatumKind::Float => { + let value_buf = datum_buf[..mem::size_of::()].try_into().unwrap(); + let v = f32::from_ne_bytes(value_buf); + DatumView::Float(v) + } + DatumKind::Varbinary => { + let bytes = must_read_bytes(datum_buf, string_buf); + DatumView::Varbinary(bytes) + } + DatumKind::String => { + let bytes = must_read_bytes(datum_buf, string_buf); + let v = unsafe { str::from_utf8_unchecked(bytes) }; + DatumView::String(v) + } + DatumKind::UInt64 => { + let value_buf = datum_buf[..mem::size_of::()].try_into().unwrap(); + let v = u64::from_ne_bytes(value_buf); + DatumView::UInt64(v) + } + DatumKind::UInt32 => { + let value_buf = datum_buf[..mem::size_of::()].try_into().unwrap(); + let v = u32::from_ne_bytes(value_buf); + DatumView::UInt32(v) + } + DatumKind::UInt16 => { + let value_buf = datum_buf[..mem::size_of::()].try_into().unwrap(); + let v = u16::from_ne_bytes(value_buf); + DatumView::UInt16(v) + } + DatumKind::UInt8 => DatumView::UInt8(datum_buf[0]), + DatumKind::Int64 => { + let value_buf = datum_buf[..mem::size_of::()].try_into().unwrap(); + let v = i64::from_ne_bytes(value_buf); + DatumView::Int64(v) + } + DatumKind::Int32 => { + let value_buf = datum_buf[..mem::size_of::()].try_into().unwrap(); + let v = i32::from_ne_bytes(value_buf); + DatumView::Int32(v) + } + DatumKind::Int16 => { + let value_buf = datum_buf[..mem::size_of::()].try_into().unwrap(); + let v = i16::from_ne_bytes(value_buf); + DatumView::Int16(v) + } + DatumKind::Int8 => DatumView::Int8(datum_buf[0] as i8), + 
DatumKind::Boolean => DatumView::Boolean(datum_buf[0] != 0), + } +} + +fn must_read_bytes<'a>(datum_buf: &'a [u8], string_buf: &'a [u8]) -> &'a [u8] { + // Read offset of string in string buf. + let value_buf = datum_buf[..mem::size_of::()] + .try_into() + .unwrap(); + let offset = OffsetSize::from_ne_bytes(value_buf); + let string_buf = &string_buf[offset..]; + + // Read len of the string. + let len_buf = string_buf[..mem::size_of::()].try_into().unwrap(); + let string_len = u32::from_ne_bytes(len_buf) as usize; + let string_buf = &string_buf[mem::size_of::()..]; + + // Read string. + &string_buf[..string_len] +} + +impl RowBuffer for Vec { + fn reset(&mut self, new_len: usize, value: u8) { + self.clear(); + + self.resize(new_len, value); + } + + fn append_slice(&mut self, src: &[u8]) { + self.extend_from_slice(src); + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::{ + projected_schema::ProjectedSchema, + tests::{build_rows, build_schema}, + }; + + fn check_contiguous_row(row: &Row, reader: impl ContiguousRow, projection: Option>) { + let range = if let Some(projection) = projection { + projection + } else { + (0..reader.num_datum_views()).collect() + }; + for i in range { + let datum = &row[i]; + let view = reader.datum_view_at(i); + + assert_eq!(datum.as_view(), view); + } + } + + #[test] + fn test_contiguous_read_write() { + let schema = build_schema(); + let rows = build_rows(); + let index_in_writer = IndexInWriterSchema::for_same_schema(schema.num_columns()); + + let mut buf = Vec::new(); + for row in rows { + let mut writer = ContiguousRowWriter::new(&mut buf, &schema, &index_in_writer); + + writer.write_row(&row).unwrap(); + + let reader = ContiguousRowReader::with_schema(&buf, &schema); + check_contiguous_row(&row, reader, None); + } + } + + #[test] + fn test_project_contiguous_read_write() { + let schema = build_schema(); + assert!(schema.num_columns() > 1); + let projection: Vec = (0..schema.num_columns() - 1).collect(); + let projected_schema = + ProjectedSchema::new(schema.clone(), Some(projection.clone())).unwrap(); + let row_projected_schema = projected_schema.try_project_with_key(&schema).unwrap(); + let rows = build_rows(); + let index_in_writer = IndexInWriterSchema::for_same_schema(schema.num_columns()); + + let mut buf = Vec::new(); + for row in rows { + let mut writer = ContiguousRowWriter::new(&mut buf, &schema, &index_in_writer); + + writer.write_row(&row).unwrap(); + + let source_row = ContiguousRowReader::with_schema(&buf, &schema); + let projected_row = ProjectedContiguousRow::new(source_row, &row_projected_schema); + check_contiguous_row(&row, projected_row, Some(projection.clone())); + } + } +} diff --git a/common_types/src/row/mod.rs b/common_types/src/row/mod.rs new file mode 100644 index 0000000000..600052cfcc --- /dev/null +++ b/common_types/src/row/mod.rs @@ -0,0 +1,590 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Row type + +use std::{ + cmp, + ops::{Index, IndexMut}, +}; + +use snafu::{ensure, Backtrace, OptionExt, Snafu}; + +use crate::{ + column_schema::ColumnSchema, + datum::{Datum, DatumKind}, + record_batch::RecordBatchWithKey, + schema::{RecordSchemaWithKey, Schema}, + time::Timestamp, +}; + +pub mod contiguous; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "Column out of bound, len:{}, given:{}.\nBacktrace:\n{}", + len, + given, + backtrace + ))] + ColumnOutOfBound { + len: usize, + given: usize, + backtrace: Backtrace, + }, + + #[snafu(display( + "Invalid column num of row, expect:{}, given:{}.\nBacktrace:\n{}", + expect, + given, + backtrace + ))] + InvalidColumnNum { + expect: usize, + given: usize, + backtrace: Backtrace, + }, + + #[snafu(display("Column cannot be null, name:{}.\nBacktrace:\n{}", column, backtrace))] + NullColumn { + column: String, + backtrace: Backtrace, + }, + + #[snafu(display( + "Column type mismatch, name:{}, expect:{:?}, given:{:?}.\nBacktrace:\n{}", + column, + expect, + given, + backtrace + ))] + TypeMismatch { + column: String, + expect: DatumKind, + given: DatumKind, + backtrace: Backtrace, + }, + + #[snafu(display("Missing columns to build row.\nBacktrace:\n{}", backtrace))] + MissingColumns { backtrace: Backtrace }, + + #[snafu(display("Convert column failed, column:{}, err:{}", column, source))] + ConvertColumn { + column: String, + source: crate::datum::Error, + }, + + #[snafu(display("Column in the schema is not found, column_name:{}", column,))] + ColumnNameNotFound { column: String }, + + #[snafu(display( + "Column in the schema is not found, column_name:{}.\nBacktrace:\n{}", + column, + backtrace + ))] + ColumnNotFoundInSchema { + column: String, + backtrace: Backtrace, + }, +} + +// Do not depend on common_util crates +pub type Result = std::result::Result; + +// TODO(yingwen): +// - Memory pooling (or Arena) and statistics +// - Custom Debug format +// - Add a type RowWithSchema so we can ensure the row always matches the schema +// - Maybe add a type RowOperation like kudu + +/// Row contains multiple columns, each column is represented by a datum +/// The internal representation of row is not specific +#[derive(Debug, Clone, PartialEq)] +pub struct Row { + cols: Vec, +} + +impl Row { + /// Convert vec of Datum into Row + pub fn from_datums(cols: Vec) -> Self { + Self { cols } + } + + /// Returns the column num + pub fn num_columns(&self) -> usize { + self.cols.len() + } + + /// Iterate all datums + pub fn iter(&self) -> IterDatum { + IterDatum { + iter: self.cols.iter(), + } + } + + /// Get the timestamp column + pub fn timestamp(&self, schema: &Schema) -> Option { + let timestamp_index = schema.timestamp_index(); + + self.cols[timestamp_index].as_timestamp() + } +} + +#[derive(Debug)] +pub struct IterDatum<'a> { + iter: std::slice::Iter<'a, Datum>, +} + +impl<'a> Iterator for IterDatum<'a> { + type Item = &'a Datum; + + fn next(&mut self) -> Option { + self.iter.next() + } +} + +impl Index for Row { + type Output = Datum; + + fn index(&self, index: usize) -> &Self::Output { + &self.cols[index] + } +} + +impl IndexMut for Row { + fn index_mut(&mut self, index: usize) -> &mut Self::Output { + &mut self.cols[index] + } +} + +impl<'a> IntoIterator for &'a Row { + type IntoIter = std::slice::Iter<'a, Datum>; + type Item = &'a Datum; + + fn into_iter(self) -> Self::IntoIter { + self.cols.iter() + } +} + +impl IntoIterator for Row { + type IntoIter = std::vec::IntoIter; + type Item = Datum; + + fn into_iter(self) -> Self::IntoIter { 
+ self.cols.into_iter() + } +} + +/// Check whether the schema of the row equals to given `schema` +pub fn check_row_schema(row: &Row, schema: &Schema) -> Result<()> { + ensure!( + schema.num_columns() == row.num_columns(), + InvalidColumnNum { + expect: schema.num_columns(), + given: row.num_columns(), + } + ); + + for (index, datum) in row.iter().enumerate() { + let column = schema.column(index); + check_datum_type(datum, column)?; + } + + Ok(()) +} + +// TODO(yingwen): For multiple rows that share the same schema, no need to store +// Datum for each row element, we can store the whole row as a binary and +// provide more efficent way to convert rows into columns +/// RowGroup +/// +/// The min/max timestamp of an empty RowGroup is 0. +/// +/// Rows in the RowGroup have the same schema. The internal representation of +/// rows is not specific. +#[derive(Debug)] +pub struct RowGroup { + /// Schema of the row group, all rows in the row group should have same + /// schema + schema: Schema, + /// Rows in the row group + rows: Vec, + // TODO(yingwen): Maybe remove min/max timestamp + /// Min timestamp of all the rows + min_timestamp: Timestamp, + /// Max timestamp of all the rows + max_timestamp: Timestamp, +} + +impl RowGroup { + /// Returns true if the row group is empty + #[inline] + pub fn is_empty(&self) -> bool { + self.rows.is_empty() + } + + /// Returns number of rows in the row group + #[inline] + pub fn num_rows(&self) -> usize { + self.rows.len() + } + + /// Returns the idx-th row in the row group + #[inline] + pub fn get_row(&self, idx: usize) -> Option<&Row> { + self.rows.get(idx) + } + + /// Returns the idx-th mutable row in the row group + #[inline] + pub fn get_row_mut(&mut self, idx: usize) -> Option<&mut Row> { + self.rows.get_mut(idx) + } + + /// Iter all datum of the column + /// + /// Will panic if col_index is out of bound + pub fn iter_column(&self, col_index: usize) -> IterCol { + IterCol { + rows: &self.rows, + row_index: 0, + col_index, + } + } + + /// The schema of the row group + #[inline] + pub fn schema(&self) -> &Schema { + &self.schema + } + + /// Iter the row group by rows + // TODO(yingwen): Add a iter_with_schema + pub fn iter(&self) -> IterRow { + IterRow { + iter: self.rows.iter(), + } + } + + /// Get the min timestamp of rows + #[inline] + pub fn min_timestamp(&self) -> Timestamp { + self.min_timestamp + } + + /// Get the max timestamp of rows + #[inline] + pub fn max_timestmap(&self) -> Timestamp { + self.max_timestamp + } +} + +impl<'a> IntoIterator for &'a RowGroup { + type IntoIter = std::slice::Iter<'a, Row>; + type Item = &'a Row; + + fn into_iter(self) -> Self::IntoIter { + self.rows.iter() + } +} + +impl IntoIterator for RowGroup { + type IntoIter = std::vec::IntoIter; + type Item = Row; + + fn into_iter(self) -> Self::IntoIter { + self.rows.into_iter() + } +} + +#[derive(Debug)] +pub struct IterRow<'a> { + iter: std::slice::Iter<'a, Row>, +} + +impl<'a> Iterator for IterRow<'a> { + type Item = &'a Row; + + fn next(&mut self) -> Option { + self.iter.next() + } +} + +#[derive(Debug)] +pub struct IterCol<'a> { + rows: &'a Vec, + row_index: usize, + col_index: usize, +} + +impl<'a> Iterator for IterCol<'a> { + type Item = &'a Datum; + + fn next(&mut self) -> Option { + if self.rows.is_empty() { + return None; + } + + if self.row_index >= self.rows.len() { + return None; + } + + let row = &self.rows[self.row_index]; + self.row_index += 1; + + Some(&row[self.col_index]) + } + + fn size_hint(&self) -> (usize, Option) { + let remaining = self.rows.len() 
- self.row_index; + (remaining, Some(remaining)) + } +} + +/// RowGroup builder +#[derive(Debug)] +pub struct RowGroupBuilder { + schema: Schema, + rows: Vec, + min_timestamp: Option, + max_timestmap: Timestamp, +} + +impl RowGroupBuilder { + /// Create a new builder + pub fn new(schema: Schema) -> Self { + Self::with_capacity(schema, 0) + } + + /// Create a new builder with given capacity + pub fn with_capacity(schema: Schema, capacity: usize) -> Self { + Self { + schema, + rows: Vec::with_capacity(capacity), + min_timestamp: None, + max_timestmap: Timestamp::new(0), + } + } + + /// Create a new builder with schema and rows + /// + /// Return error if the `rows` do not matched the `schema` + pub fn with_rows(schema: Schema, rows: Vec) -> Result { + let mut row_group = Self::new(schema); + + // Check schema and update min/max timestamp + for row in &rows { + check_row_schema(row, &row_group.schema)?; + row_group.update_timestamps(row); + } + + row_group.rows = rows; + + Ok(row_group) + } + + /// Add a schema checked row + /// + /// REQUIRE: Caller should ensure the schema of row must equal to the schema + /// of this builder + pub fn push_checked_row(&mut self, row: Row) { + self.update_timestamps(&row); + + self.rows.push(row); + } + + /// Acquire builder to build next row of the row group + pub fn row_builder(&mut self) -> RowBuilder { + RowBuilder { + // schema: &self.schema, + cols: Vec::with_capacity(self.schema.num_columns()), + // rows: &mut self.rows, + group_builder: self, + } + } + + /// Build the row group + pub fn build(self) -> RowGroup { + RowGroup { + schema: self.schema, + rows: self.rows, + min_timestamp: self.min_timestamp.unwrap_or_else(|| Timestamp::new(0)), + max_timestamp: self.max_timestmap, + } + } + + /// Update min/max timestamp of the row group + fn update_timestamps(&mut self, row: &Row) { + // check_row_schema() ensures this datum is a timestamp, so we just unwrap here + let row_timestamp = row.timestamp(&self.schema).unwrap(); + + self.min_timestamp = match self.min_timestamp { + Some(min_timestamp) => Some(cmp::min(min_timestamp, row_timestamp)), + None => Some(row_timestamp), + }; + self.max_timestmap = cmp::max(self.max_timestmap, row_timestamp); + } +} + +/// Check whether the datum kind matches the column schema +pub fn check_datum_type(datum: &Datum, column_schema: &ColumnSchema) -> Result<()> { + // Check null datum + if let Datum::Null = datum { + ensure!( + column_schema.is_nullable, + NullColumn { + column: &column_schema.name, + } + ); + } else { + ensure!( + datum.kind() == column_schema.data_type, + TypeMismatch { + column: &column_schema.name, + expect: column_schema.data_type, + given: datum.kind(), + } + ); + } + + Ok(()) +} + +// TODO(yingwen): This builder is used to build RowGroup, need to provide a +// builder to build one row +/// Row builder for the row group +#[derive(Debug)] +pub struct RowBuilder<'a> { + group_builder: &'a mut RowGroupBuilder, + cols: Vec, +} + +impl<'a> RowBuilder<'a> { + /// Append a datum into the row + pub fn append_datum(mut self, datum: Datum) -> Result { + self.check_datum(&datum)?; + + self.cols.push(datum); + + Ok(self) + } + + /// Check whether the datum is valid + fn check_datum(&self, datum: &Datum) -> Result<()> { + let index = self.cols.len(); + let schema = &self.group_builder.schema; + ensure!( + index < schema.num_columns(), + ColumnOutOfBound { + len: schema.num_columns(), + given: index, + } + ); + + let column = schema.column(index); + check_datum_type(datum, column) + } + + /// Finish building this 
row and append this row into the row group + pub fn finish(self) -> Result<()> { + ensure!( + self.cols.len() == self.group_builder.schema.num_columns(), + MissingColumns + ); + + self.group_builder.push_checked_row(Row { cols: self.cols }); + Ok(()) + } +} + +pub trait RowView { + fn try_get_column_by_name(&self, column_name: &str) -> Result>; + + fn column_by_idx(&self, column_idx: usize) -> Datum; +} + +// TODO(yingwen): Add a method to get row view on RecordBatchWithKey. +/// A row view on the [RecordBatchWithKey]. +/// +/// `row_idx < record_batch.num_rows()` is ensured. +#[derive(Debug)] +pub struct RowViewOnBatch<'a> { + pub record_batch: &'a RecordBatchWithKey, + pub row_idx: usize, +} + +impl<'a> RowViewOnBatch<'a> { + pub fn iter_columns(&self) -> RowViewOnBatchColumnIter { + RowViewOnBatchColumnIter { + next_column_idx: 0, + row_idx: self.row_idx, + record_batch: self.record_batch, + } + } +} + +pub struct RowViewOnBatchColumnIter<'a> { + next_column_idx: usize, + row_idx: usize, + record_batch: &'a RecordBatchWithKey, +} + +impl<'a> RowView for RowViewOnBatch<'a> { + fn try_get_column_by_name(&self, column_name: &str) -> Result> { + let column_idx = self + .record_batch + .schema_with_key() + .index_of(column_name) + .context(ColumnNameNotFound { + column: column_name, + })?; + Ok(Some(self.column_by_idx(column_idx))) + } + + #[inline] + fn column_by_idx(&self, column_idx: usize) -> Datum { + let column = self.record_batch.column(column_idx); + column.datum(self.row_idx) + } +} + +impl<'a> Iterator for RowViewOnBatchColumnIter<'a> { + type Item = Result; + + fn next(&mut self) -> Option { + if self.next_column_idx >= self.record_batch.num_columns() { + return None; + } + + let curr_column_idx = self.next_column_idx; + let column = self.record_batch.column(curr_column_idx); + let datum = column.datum_opt(self.row_idx).map(Ok); + + self.next_column_idx += 1; + + datum + } +} + +#[derive(Debug, Clone)] +pub struct RowWithMeta<'a> { + pub row: &'a Row, + pub schema: &'a RecordSchemaWithKey, +} + +impl<'a> RowView for RowWithMeta<'a> { + fn try_get_column_by_name(&self, column_name: &str) -> Result> { + let idx = self + .schema + .index_of(column_name) + .context(ColumnNotFoundInSchema { + column: column_name, + })?; + Ok(Some(self.column_by_idx(idx))) + } + + #[inline] + fn column_by_idx(&self, column_idx: usize) -> Datum { + self.row.cols[column_idx].clone() + } +} diff --git a/common_types/src/schema.rs b/common_types/src/schema.rs new file mode 100644 index 0000000000..4172886057 --- /dev/null +++ b/common_types/src/schema.rs @@ -0,0 +1,1554 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Schema of table + +use std::{ + cmp::{self, Ordering}, + collections::{HashMap, HashSet}, + convert::TryFrom, + fmt, + str::FromStr, + sync::Arc, +}; + +// Just re-use arrow's types +// TODO(yingwen): No need to support all schema that arrow supports, we can +// use a new type pattern to wrap Schema/SchemaRef and not allow to use +// the data type we not supported +pub use arrow_deps::arrow::datatypes::{ + DataType, Field, Schema as ArrowSchema, SchemaRef as ArrowSchemaRef, +}; +use proto::common as common_pb; +use snafu::{ensure, Backtrace, OptionExt, ResultExt, Snafu}; + +use crate::{ + column_schema::{self, ColumnId, ColumnSchema}, + datum::DatumKind, + row::{contiguous, RowView}, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "Projection too long, max:{}, given:{}.\nBacktrace:\n{}", + max, + given, + backtrace + ))] + ProjectionTooLong { + max: usize, + given: usize, + backtrace: Backtrace, + }, + + #[snafu(display( + "Invalid projection index, max:{}, given:{}.\nBacktrace:\n{}", + max, + given, + backtrace + ))] + InvalidProjectionIndex { + max: usize, + given: usize, + backtrace: Backtrace, + }, + + #[snafu(display("Projection must have timestamp column.\nBacktrace:\n{}", backtrace))] + ProjectionMissTimestamp { backtrace: Backtrace }, + + #[snafu(display( + "Column name already exists, name:{}.\nBacktrace:\n{}", + name, + backtrace + ))] + ColumnNameExists { name: String, backtrace: Backtrace }, + + #[snafu(display( + "Column id already exists, name:{}, id:{}.\nBacktrace:\n{}", + name, + id, + backtrace + ))] + ColumnIdExists { + name: String, + id: ColumnId, + backtrace: Backtrace, + }, + + #[snafu(display( + "Unsupported key column type, name:{}, type:{:?}.\nBacktrace:\n{}", + name, + kind, + backtrace + ))] + KeyColumnType { + name: String, + kind: DatumKind, + backtrace: Backtrace, + }, + + #[snafu(display( + "Timestamp key column already exists, timestamp_column:{}, given:{}.\nBacktrace:\n{}", + timestamp_column, + given_column, + backtrace + ))] + TimestampKeyExists { + timestamp_column: String, + given_column: String, + backtrace: Backtrace, + }, + + #[snafu(display("Timestamp key not exists.\nBacktrace:\n{}", backtrace))] + MissingTimestampKey { backtrace: Backtrace }, + + #[snafu(display( + "Key column cannot be nullable, name:{}.\nBacktrace:\n{}", + name, + backtrace + ))] + NullKeyColumn { name: String, backtrace: Backtrace }, + + #[snafu(display( + "Invalid arrow field, field_name:{}, arrow_schema:{:?}, err:{}", + field_name, + arrow_schema, + source + ))] + InvalidArrowField { + field_name: String, + arrow_schema: ArrowSchemaRef, + source: crate::column_schema::Error, + }, + + #[snafu(display( + "Invalid schema to generate tsid primary key.\nBacktrace:\n{}", + backtrace + ))] + InvalidTsidSchema { backtrace: Backtrace }, + + #[snafu(display( + "Invalid arrow schema key, key:{:?}, raw_value:{}, err:{:?}.\nBacktrace:\n{}", + key, + raw_value, + source, + backtrace + ))] + InvalidArrowSchemaMetaValue { + key: ArrowSchemaMetaKey, + raw_value: String, + source: Box, + backtrace: Backtrace, + }, + + #[snafu(display( + "Arrow schema meta key not found, key:{:?}.\nBacktrace:\n{}", + key, + backtrace + ))] + ArrowSchemaMetaKeyNotFound { + key: ArrowSchemaMetaKey, + backtrace: Backtrace, + }, +} + +// TODO(boyan) make these constants configurable +pub const TSID_COLUMN: &str = "tsid"; +pub const TIMESTAMP_COLUMN: &str = "timestamp"; + +pub type Result = std::result::Result; + +const DEFAULT_SCHEMA_VERSION: Version = 1; + +#[derive(Debug, Snafu)] +pub enum 
CompatError { + #[snafu(display("Incompatible column schema for write, err:{}", source))] + IncompatWriteColumn { + source: crate::column_schema::CompatError, + }, + + #[snafu(display("Missing column, name:{}", name))] + MissingWriteColumn { name: String }, + + #[snafu(display("Columns to write not found in table, names:{:?}", names))] + WriteMoreColumn { names: Vec }, +} + +/// Meta data of the arrow schema +struct ArrowSchemaMeta { + num_key_columns: usize, + timestamp_index: usize, + enable_tsid_primary_key: bool, + version: u32, +} + +#[derive(Copy, Clone, Debug)] +pub enum ArrowSchemaMetaKey { + NumKeyColumns, + TimestampIndex, + EnableTsidPrimaryKey, + Version, +} + +impl ArrowSchemaMetaKey { + fn as_str(&self) -> &str { + match self { + ArrowSchemaMetaKey::NumKeyColumns => "schema:num_key_columns", + ArrowSchemaMetaKey::TimestampIndex => "schema::timestamp_index", + ArrowSchemaMetaKey::EnableTsidPrimaryKey => "schema::enable_tsid_primary_key", + ArrowSchemaMetaKey::Version => "schema::version", + } + } +} + +impl ToString for ArrowSchemaMetaKey { + fn to_string(&self) -> String { + self.as_str().to_string() + } +} + +/// Schema version +pub type Version = u32; + +/// Mapping column index in table schema to column index in writer schema +#[derive(Default)] +pub struct IndexInWriterSchema(Vec>); + +impl IndexInWriterSchema { + /// Create a index mapping for same schema with `num_columns` columns. + pub fn for_same_schema(num_columns: usize) -> Self { + let indexes = (0..num_columns).into_iter().map(Some).collect(); + Self(indexes) + } + + /// Returns the column index in writer schema of the column with index + /// `index_in_table` in the table schema where the writer prepared to + /// write to. + /// + /// If the column is not in writer schema, returns None, which means that + /// this column should be filled by null. + /// + /// Panic if the index_in_table is out of bound + pub fn column_index_in_writer(&self, index_in_table: usize) -> Option { + self.0[index_in_table] + } +} + +// TODO(yingwen): No need to compare all elements in ColumnSchemas, Schema, +// RecordSchema, custom PartialEq for them. + +/// Data of column schemas +#[derive(PartialEq)] +pub(crate) struct ColumnSchemas { + /// Column schemas + columns: Vec, + /// Column name to index of that column schema in `columns`, the index is + /// guaranteed to be valid + name_to_index: HashMap, + /// Byte offsets of each column in contiguous row. + byte_offsets: Vec, + /// String buffer offset in contiguous row. 
+ string_buffer_offset: usize, +} + +impl ColumnSchemas { + fn new(columns: Vec) -> Self { + let name_to_index = columns + .iter() + .enumerate() + .map(|(idx, c)| (c.name.to_string(), idx)) + .collect(); + + let mut current_offset = 0; + let mut byte_offsets = Vec::with_capacity(columns.len()); + for column_schema in &columns { + byte_offsets.push(current_offset); + current_offset += contiguous::byte_size_of_datum(&column_schema.data_type); + } + + Self { + columns, + name_to_index, + byte_offsets, + string_buffer_offset: current_offset, + } + } +} + +impl ColumnSchemas { + pub fn num_columns(&self) -> usize { + self.columns().len() + } + + pub fn columns(&self) -> &[ColumnSchema] { + &self.columns + } + + pub fn column(&self, i: usize) -> &ColumnSchema { + &self.columns[i] + } + + pub fn index_of(&self, name: &str) -> Option { + self.name_to_index.get(name).copied() + } +} + +impl fmt::Debug for ColumnSchemas { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("ColumnSchemas") + // name_to_index is ignored. + .field("columns", &self.columns) + .finish() + } +} + +/// Schema of [crate::record_batch::RecordBatch] +/// +/// Should be cheap to clone. +/// +/// Note: Only `name`, `data_type`, `is_nullable` is valid after converting from +/// arrow's schema, the additional fields like `id`/`is_tag`/`comment` is always +/// unset. Now we only convert arrow's schema into our record before we output +/// the final query result, where the additional fields is never used. +#[derive(Debug, Clone, PartialEq)] +pub struct RecordSchema { + arrow_schema: ArrowSchemaRef, + column_schemas: Arc, +} + +impl RecordSchema { + fn from_column_schemas(column_schemas: ColumnSchemas) -> Self { + // Convert to arrow fields. + let fields = column_schemas + .columns + .iter() + .map(|col| col.to_arrow_field()) + .collect(); + // Build arrow schema. 
+ let arrow_schema = Arc::new(ArrowSchema::new(fields)); + + Self { + arrow_schema, + column_schemas: Arc::new(column_schemas), + } + } + + pub fn num_columns(&self) -> usize { + self.column_schemas.num_columns() + } + + pub fn columns(&self) -> &[ColumnSchema] { + self.column_schemas.columns() + } + + pub fn index_of(&self, name: &str) -> Option { + self.column_schemas.index_of(name) + } + + pub fn column(&self, i: usize) -> &ColumnSchema { + self.column_schemas.column(i) + } + + pub fn to_arrow_schema_ref(&self) -> ArrowSchemaRef { + self.arrow_schema.clone() + } +} + +impl TryFrom for RecordSchema { + type Error = Error; + + fn try_from(arrow_schema: ArrowSchemaRef) -> Result { + let fields = arrow_schema.fields(); + let mut columns = Vec::with_capacity(fields.len()); + + for field in fields { + let column_schema = + ColumnSchema::try_from(field).with_context(|| InvalidArrowField { + arrow_schema: arrow_schema.clone(), + field_name: field.name(), + })?; + columns.push(column_schema); + } + + let column_schemas = ColumnSchemas::new(columns); + + Ok(Self::from_column_schemas(column_schemas)) + } +} + +#[derive(Debug, Clone, PartialEq)] +pub struct RecordSchemaWithKey { + record_schema: RecordSchema, + num_key_columns: usize, +} + +impl RecordSchemaWithKey { + pub fn num_columns(&self) -> usize { + self.record_schema.num_columns() + } + + pub fn compare_row(&self, lhs: &LR, rhs: &RR) -> Ordering { + compare_row(self.num_key_columns, lhs, rhs) + } + + pub fn index_of(&self, name: &str) -> Option { + self.record_schema.index_of(name) + } + + pub fn columns(&self) -> &[ColumnSchema] { + self.record_schema.columns() + } + + /// Returns an immutable reference of the key column vector. + pub fn key_columns(&self) -> &[ColumnSchema] { + &self.columns()[..self.num_key_columns] + } + + pub(crate) fn into_record_schema(self) -> RecordSchema { + self.record_schema + } + + pub(crate) fn to_arrow_schema_ref(&self) -> ArrowSchemaRef { + self.record_schema.to_arrow_schema_ref() + } + + #[inline] + pub fn num_key_columns(&self) -> usize { + self.num_key_columns + } +} + +/// Compare the two rows. +/// +/// REQUIRES: the two rows must have the same number of key columns as +/// `num_key_columns`. +pub fn compare_row( + num_key_columns: usize, + lhs: &LR, + rhs: &RR, +) -> Ordering { + for column_idx in 0..num_key_columns { + // caller should ensure the row view is valid. + // TODO(xikai): unwrap may not a good way to handle the error. + let left_datum = lhs.column_by_idx(column_idx); + let right_datum = rhs.column_by_idx(column_idx); + // the two datums must be of the same kind type. + match left_datum.partial_cmp(&right_datum).unwrap() { + Ordering::Equal => continue, + v @ Ordering::Less | v @ Ordering::Greater => return v, + } + } + + Ordering::Equal +} + +// TODO(yingwen): Maybe rename to TableSchema. 
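// Editorial note (usage sketch, not part of the original patch): the
// `IndexInWriterSchema` mapping introduced earlier in this file is filled in by
// `Schema::compatible_for_write`, defined further down. A `None` entry marks a
// table column the writer did not supply; such a column must be nullable and is
// filled with null. The helper below and its printed output are illustrative only.

use common_types::schema::{IndexInWriterSchema, Schema};

/// Illustrative only: validate `writer_schema` against `table_schema` and report
/// where each table column comes from during a write.
fn describe_write_mapping(table_schema: &Schema, writer_schema: &Schema) {
    let mut index_in_writer = IndexInWriterSchema::default();
    table_schema
        .compatible_for_write(writer_schema, &mut index_in_writer)
        .expect("writer schema should be compatible with the table schema");

    for table_idx in 0..table_schema.num_columns() {
        match index_in_writer.column_index_in_writer(table_idx) {
            // The writer provides this column at `writer_idx`.
            Some(writer_idx) => {
                println!("table column {} <- writer column {}", table_idx, writer_idx)
            }
            // Not provided by the writer: the column is nullable and gets filled with null.
            None => println!("table column {} <- NULL", table_idx),
        }
    }
}

// If the writer supplies a column the table does not know about,
// `compatible_for_write` fails with `WriteMoreColumn` instead of producing a mapping.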
+/// Schema of a table +/// +/// - Should be immutable +/// - Each schema must have a timestamp column +/// - Should be immutable and cheap to clone, though passing by reference is +/// preferred +/// - The prefix of columns makes up the primary key (similar to kudu's schema) +/// - The Schema should built by builder +#[derive(Clone, PartialEq)] +pub struct Schema { + /// The underlying arrow schema, data type of fields must be supported by + /// datum + arrow_schema: ArrowSchemaRef, + /// The number of primary key columns + num_key_columns: usize, + /// Index of timestamp key column + // TODO(yingwen): Maybe we can remove the restriction that timestamp column must exists in + // schema (mainly for projected schema) + timestamp_index: usize, + /// Index of tsid key column and None denotes the `enable_tsid_primary_key` + /// is not set. + tsid_index: Option, + /// Control whether to generate tsid as primary key + enable_tsid_primary_key: bool, + /// Column schemas, only holds arc pointer so the Schema can be cloned + /// without much overhead. + column_schemas: Arc, + /// Version of the schema, schemas with same version should be identical. + version: Version, +} + +impl fmt::Debug for Schema { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("Schema") + // arrow_schema is ignored. + .field("num_key_columns", &self.num_key_columns) + .field("timestamp_index", &self.timestamp_index) + .field("tsid_index", &self.tsid_index) + .field("enable_tsid_primary_key", &self.enable_tsid_primary_key) + .field("column_schemas", &self.column_schemas) + .field("version", &self.version) + .finish() + } +} + +impl TryFrom for Schema { + type Error = Error; + + fn try_from(arrow_schema: ArrowSchemaRef) -> Result { + Builder::build_from_arrow_schema(arrow_schema) + } +} + +impl TryFrom for Schema { + type Error = Error; + + fn try_from(record_schema: RecordSchema) -> Result { + Builder::build_from_arrow_schema(record_schema.to_arrow_schema_ref()) + } +} + +impl Schema { + /// Returns an immutable reference of the vector of [ColumnSchema]. + pub fn columns(&self) -> &[ColumnSchema] { + self.column_schemas.columns() + } + + /// Returns an immutable reference of the key column vector. + pub fn key_columns(&self) -> &[ColumnSchema] { + &self.columns()[..self.num_key_columns] + } + + /// Returns an immutable reference of the normal column vector. + pub fn normal_columns(&self) -> &[ColumnSchema] { + &self.columns()[self.num_key_columns..] + } + + /// Returns index of the tsid column. + pub fn index_of_tsid(&self) -> Option { + self.tsid_index + } + + /// Returns tsid column index and immutable reference of tsid column + pub fn tsid_column(&self) -> Option<&ColumnSchema> { + if let Some(idx) = self.index_of_tsid() { + Some(&self.column_schemas.columns[idx]) + } else { + None + } + } + + /// Returns total number of columns + pub fn num_columns(&self) -> usize { + self.column_schemas.num_columns() + } + + /// Returns an immutable reference of a specific [ColumnSchema] selected by + /// name. + pub fn column_with_name(&self, name: &str) -> Option<&ColumnSchema> { + let index = self.column_schemas.name_to_index.get(name)?; + Some(&self.column_schemas.columns[*index]) + } + + /// Returns an immutable reference of a specific [ColumnSchema] selected + /// using an offset within the internal vector. 
+ /// + /// Panic if i is out of bound + pub fn column(&self, i: usize) -> &ColumnSchema { + self.column_schemas.column(i) + } + + /// Return the ref to [arrow_deps::arrow::datatypes::SchemaRef] + pub fn as_arrow_schema_ref(&self) -> &ArrowSchemaRef { + &self.arrow_schema + } + + /// Return the cloned [arrow_deps::arrow::datatypes::SchemaRef] + pub fn to_arrow_schema_ref(&self) -> ArrowSchemaRef { + self.arrow_schema.clone() + } + + /// Into [arrow_deps::arrow::datatypes::SchemaRef] + pub fn into_arrow_schema_ref(self) -> ArrowSchemaRef { + self.arrow_schema + } + + /// Find the index of the column with the given name. + pub fn index_of(&self, name: &str) -> Option { + self.column_schemas.index_of(name) + } + + /// Returns the number of columns in primary key + #[inline] + pub fn num_key_columns(&self) -> usize { + self.num_key_columns + } + + /// Get the name of the timestamp column + #[inline] + pub fn timestamp_name(&self) -> &str { + &self.column(self.timestamp_index()).name + } + + /// Get the index of the timestamp column + #[inline] + pub fn timestamp_index(&self) -> usize { + self.timestamp_index + } + + /// Get the version of this schema + #[inline] + pub fn version(&self) -> Version { + self.version + } + + /// Compare the two rows. + /// + /// REQUIRES: the two rows must have the key columns defined by the schema. + pub fn compare_row(&self, lhs: &R, rhs: &R) -> Ordering { + compare_row(self.num_key_columns, lhs, rhs) + } + + /// Returns `Ok` if rows with `writer_schema` can write to table with the + /// same schema as `self`. + pub fn compatible_for_write( + &self, + writer_schema: &Schema, + index_in_writer: &mut IndexInWriterSchema, + ) -> std::result::Result<(), CompatError> { + index_in_writer.0.reserve(self.num_columns()); + + let mut num_col_in_writer = 0; + for column in self.columns() { + // Find column in schema of writer. + match writer_schema.index_of(&column.name) { + Some(writer_index) => { + let writer_column = writer_schema.column(writer_index); + + // Column is found in writer + num_col_in_writer += 1; + + // Column with same name, but not compatible + column + .compatible_for_write(writer_column) + .context(IncompatWriteColumn)?; + + // Column is compatible, push index mapping + index_in_writer.0.push(Some(writer_index)); + } + None => { + // Column is not found in writer, then the column should be nullable. + ensure!( + column.is_nullable, + MissingWriteColumn { name: &column.name } + ); + + // Column is nullable, push index mapping + index_in_writer.0.push(None); + } + } + } + // All columns of this schema have been checked + + // If the writer have columns not in this schema, then we consider it + // incompatible + ensure!( + num_col_in_writer == writer_schema.num_columns(), + WriteMoreColumn { + names: writer_schema + .columns() + .iter() + .filter_map(|c| if self.column_with_name(&c.name).is_none() { + Some(c.name.clone()) + } else { + None + }) + .collect::>(), + } + ); + + Ok(()) + } + + pub fn to_record_schema(&self) -> RecordSchema { + RecordSchema { + arrow_schema: self.arrow_schema.clone(), + column_schemas: self.column_schemas.clone(), + } + } + + pub fn to_record_schema_with_key(&self) -> RecordSchemaWithKey { + RecordSchemaWithKey { + record_schema: self.to_record_schema(), + num_key_columns: self.num_key_columns, + } + } + + /// Panic if projection is invalid. 
+ pub(crate) fn project_record_schema_with_key( + &self, + projection: &[usize], + ) -> RecordSchemaWithKey { + let mut columns = Vec::with_capacity(self.num_key_columns); + // Keep all key columns in order. + for key_column in self.key_columns() { + columns.push(key_column.clone()); + } + + // Collect normal columns needed by the projection. + for p in projection { + if *p >= self.num_key_columns { + // A normal column + let normal_column = &self.columns()[*p]; + columns.push(normal_column.clone()); + } + } + + let record_schema = RecordSchema::from_column_schemas(ColumnSchemas::new(columns)); + + RecordSchemaWithKey { + record_schema, + num_key_columns: self.num_key_columns, + } + } + + /// Panic if projection is invalid. + pub(crate) fn project_record_schema(&self, projection: &[usize]) -> RecordSchema { + let mut columns = Vec::with_capacity(projection.len()); + + // Collect all columns needed by the projection. + for p in projection { + let column_schema = &self.columns()[*p]; + // Insert the index in projected schema of the column + columns.push(column_schema.clone()); + } + + RecordSchema::from_column_schemas(ColumnSchemas::new(columns)) + } + + /// Returns byte offsets in contiguous row. + #[inline] + pub fn byte_offsets(&self) -> &[usize] { + &self.column_schemas.byte_offsets + } + + /// Returns byte offset in contiguous row of given column. + /// + /// Panic if out of bound. + #[inline] + pub fn byte_offset(&self, index: usize) -> usize { + self.column_schemas.byte_offsets[index] + } + + /// Returns string buffer offset in contiguous row. + #[inline] + pub fn string_buffer_offset(&self) -> usize { + self.column_schemas.string_buffer_offset + } +} + +impl TryFrom for Schema { + type Error = Error; + + fn try_from(schema: common_pb::TableSchema) -> Result { + let mut builder = Builder::with_capacity(schema.columns.len()) + .version(schema.version) + .enable_tsid_primary_key(schema.enable_tsid_primary_key); + + for (i, column_schema_pb) in schema.columns.into_iter().enumerate() { + let column = ColumnSchema::from(column_schema_pb); + + if i < schema.num_key_columns as usize { + builder = builder.add_key_column(column)?; + } else { + builder = builder.add_normal_column(column)?; + } + } + + builder.build() + } +} + +impl From for common_pb::TableSchema { + fn from(schema: Schema) -> Self { + let mut table_schema = common_pb::TableSchema::new(); + + for column in schema.columns() { + // Convert schema of each column + let column_schema = column.to_pb(); + table_schema.columns.push(column_schema); + } + + table_schema.num_key_columns = schema.num_key_columns as u32; + table_schema.timestamp_index = schema.timestamp_index as u32; + table_schema.enable_tsid_primary_key = schema.enable_tsid_primary_key; + table_schema.version = schema.version; + + table_schema + } +} + +/// Schema builder +#[must_use] +pub struct Builder { + columns: Vec, + /// The number of primary key columns + num_key_columns: usize, + /// Timestamp column index + timestamp_index: Option, + column_names: HashSet, + column_ids: HashSet, + /// Version of the schema + version: Version, + /// Auto increment the column id if the id of the input ColumnSchema is + /// [crate::column_schema::COLUMN_ID_UNINIT]. 
+ auto_increment_column_id: bool, + max_column_id: ColumnId, + enable_tsid_primary_key: bool, +} + +impl Default for Builder { + fn default() -> Self { + Self::new() + } +} + +impl Builder { + /// Create a new builder + pub fn new() -> Self { + Self::with_capacity(0) + } + + /// Create a builder with given capacity + pub fn with_capacity(capacity: usize) -> Self { + Self { + columns: Vec::with_capacity(capacity), + num_key_columns: 0, + timestamp_index: None, + column_names: HashSet::with_capacity(capacity), + column_ids: HashSet::with_capacity(capacity), + version: DEFAULT_SCHEMA_VERSION, + auto_increment_column_id: false, + max_column_id: column_schema::COLUMN_ID_UNINIT, + enable_tsid_primary_key: false, + } + } + + /// Add a key column + pub fn add_key_column(mut self, mut column: ColumnSchema) -> Result { + self.may_alloc_column_id(&mut column); + self.validate_column(&column, true)?; + + ensure!(!column.is_nullable, NullKeyColumn { name: column.name }); + + // FIXME(xikai): it seems not reasonable to decide the timestamp column in this + // way. + let is_timestamp = DatumKind::Timestamp == column.data_type; + if is_timestamp { + ensure!( + self.timestamp_index.is_none(), + TimestampKeyExists { + timestamp_column: &self.columns[self.timestamp_index.unwrap()].name, + given_column: column.name, + } + ); + self.timestamp_index = Some(self.num_key_columns); + } + + self.insert_new_key_column(column); + + Ok(self) + } + + /// Add a normal (non key) column + pub fn add_normal_column(mut self, mut column: ColumnSchema) -> Result { + self.may_alloc_column_id(&mut column); + self.validate_column(&column, false)?; + + self.insert_new_normal_column(column); + + Ok(self) + } + + /// Set version of the schema + pub fn version(mut self, version: Version) -> Self { + self.version = version; + self + } + + /// When auto increment is true, assign the column schema an auto + /// incremented id if its id is [crate::column_schema::COLUMN_ID_UNINIT]. + /// + /// Default is false + pub fn auto_increment_column_id(mut self, auto_increment: bool) -> Self { + self.auto_increment_column_id = auto_increment; + self + } + + /// Enable tsid as primary key. + pub fn enable_tsid_primary_key(mut self, enable_tsid_primary_key: bool) -> Self { + self.enable_tsid_primary_key = enable_tsid_primary_key; + self + } + + fn may_alloc_column_id(&mut self, column: &mut ColumnSchema) { + // Assign this column an id + if self.auto_increment_column_id && column.id == column_schema::COLUMN_ID_UNINIT { + column.id = self.max_column_id + 1; + } + + self.max_column_id = cmp::max(self.max_column_id, column.id); + } + + // TODO(yingwen): Do we need to support null data type? 
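// Editor's note (not part of the original patch): with `auto_increment_column_id`
// enabled, a column whose id is still `COLUMN_ID_UNINIT` is assigned
// `max_column_id + 1`, and `max_column_id` also tracks explicitly assigned ids.
// For example, adding columns with explicit ids 2 and 5 and then two columns
// without ids gives the latter ids 6 and 7 (see `test_max_column_id` below).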
+ fn validate_column(&self, column: &ColumnSchema, is_key: bool) -> Result<()> { + ensure!( + !self.column_names.contains(&column.name), + ColumnNameExists { name: &column.name } + ); + + // Check datum kind if this is a key column + if is_key { + ensure!( + column.data_type.is_key_kind(), + KeyColumnType { + name: &column.name, + kind: column.data_type, + } + ); + } + + ensure!( + !self.column_ids.contains(&column.id), + ColumnIdExists { + name: &column.name, + id: column.id, + } + ); + + Ok(()) + } + + fn insert_new_key_column(&mut self, column: ColumnSchema) { + self.column_names.insert(column.name.clone()); + self.column_ids.insert(column.id); + + self.columns.insert(self.num_key_columns, column); + self.num_key_columns += 1; + } + + fn insert_new_normal_column(&mut self, column: ColumnSchema) { + self.column_names.insert(column.name.clone()); + self.column_ids.insert(column.id); + + self.columns.push(column); + } + + fn build_from_arrow_schema(arrow_schema: ArrowSchemaRef) -> Result { + let fields = arrow_schema.fields(); + let mut columns = Vec::with_capacity(fields.len()); + + for field in fields { + let column_schema = + ColumnSchema::try_from(field).with_context(|| InvalidArrowField { + arrow_schema: arrow_schema.clone(), + field_name: field.name(), + })?; + columns.push(column_schema); + } + + // FIXME(xikai): Now we have to tolerate the decoding failure because of the bug + // of datafusion (fixed by: https://github.com/apache/arrow-datafusion/commit/1448d9752ab3a38f02732274f91136a6a6ad3db4). + // (The bug may cause the meta data of the schema meta lost duration plan + // execution.) + let ArrowSchemaMeta { + num_key_columns, + timestamp_index, + enable_tsid_primary_key, + version, + } = Self::parse_arrow_schema_meta_or_default(arrow_schema.metadata())?; + let tsid_index = Self::find_tsid_index(enable_tsid_primary_key, &columns)?; + + let column_schemas = Arc::new(ColumnSchemas::new(columns)); + + Ok(Schema { + arrow_schema, + num_key_columns, + timestamp_index, + tsid_index, + enable_tsid_primary_key, + column_schemas, + version, + }) + } + + fn parse_arrow_schema_meta_value( + meta: &HashMap, + key: ArrowSchemaMetaKey, + ) -> Result + where + T: FromStr, + T::Err: std::error::Error + Send + Sync + 'static, + { + let raw_value = meta + .get(key.as_str()) + .context(ArrowSchemaMetaKeyNotFound { key })?; + T::from_str(raw_value.as_str()) + .map_err(|e| Box::new(e) as _) + .context(InvalidArrowSchemaMetaValue { key, raw_value }) + } + + /// Parse the necessary meta information from the arrow schema's meta data. + fn parse_arrow_schema_meta_or_default( + meta: &HashMap, + ) -> Result { + match Self::parse_arrow_schema_meta(meta) { + Ok(v) => Ok(v), + Err(Error::ArrowSchemaMetaKeyNotFound { .. }) => Ok(ArrowSchemaMeta { + num_key_columns: 0, + timestamp_index: 0, + enable_tsid_primary_key: false, + version: 0, + }), + Err(e) => Err(e), + } + } + + /// Parse the necessary meta information from the arrow schema's meta data. + fn parse_arrow_schema_meta(meta: &HashMap) -> Result { + Ok(ArrowSchemaMeta { + num_key_columns: Self::parse_arrow_schema_meta_value( + meta, + ArrowSchemaMetaKey::NumKeyColumns, + )?, + timestamp_index: Self::parse_arrow_schema_meta_value( + meta, + ArrowSchemaMetaKey::TimestampIndex, + )?, + enable_tsid_primary_key: Self::parse_arrow_schema_meta_value( + meta, + ArrowSchemaMetaKey::EnableTsidPrimaryKey, + )?, + version: Self::parse_arrow_schema_meta_value(meta, ArrowSchemaMetaKey::Version)?, + }) + } + + /// Build arrow schema meta data. 
+ /// + /// Requires: the timestamp index is not None. + fn build_arrow_schema_meta(&self) -> HashMap { + let mut meta = HashMap::with_capacity(4); + meta.insert( + ArrowSchemaMetaKey::NumKeyColumns.to_string(), + self.num_key_columns.to_string(), + ); + meta.insert( + ArrowSchemaMetaKey::TimestampIndex.to_string(), + self.timestamp_index.unwrap().to_string(), + ); + meta.insert( + ArrowSchemaMetaKey::Version.to_string(), + self.version.to_string(), + ); + meta.insert( + ArrowSchemaMetaKey::EnableTsidPrimaryKey.to_string(), + self.enable_tsid_primary_key.to_string(), + ); + + meta + } + + fn find_tsid_index( + enable_tsid_primary_key: bool, + columns: &[ColumnSchema], + ) -> Result> { + if !enable_tsid_primary_key { + return Ok(None); + } + + let idx = columns + .iter() + .enumerate() + .find_map(|(idx, col_schema)| { + if col_schema.name == TSID_COLUMN { + Some(idx) + } else { + None + } + }) + .context(InvalidTsidSchema)?; + + Ok(Some(idx)) + } + + /// Build the schema + pub fn build(self) -> Result { + let timestamp_index = self.timestamp_index.context(MissingTimestampKey)?; + // Timestamp key column is exists, so key columns should not be zero + assert!(self.num_key_columns > 0); + if self.enable_tsid_primary_key { + ensure!(self.num_key_columns == 2, InvalidTsidSchema); + } + + let tsid_index = Self::find_tsid_index(self.enable_tsid_primary_key, &self.columns)?; + + let fields = self.columns.iter().map(|c| c.to_arrow_field()).collect(); + let meta = self.build_arrow_schema_meta(); + + Ok(Schema { + arrow_schema: Arc::new(ArrowSchema::new_with_metadata(fields, meta)), + num_key_columns: self.num_key_columns, + timestamp_index, + tsid_index, + enable_tsid_primary_key: self.enable_tsid_primary_key, + column_schemas: Arc::new(ColumnSchemas::new(self.columns)), + version: self.version, + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::{ + bytes::Bytes, + datum::Datum, + row::{Row, RowWithMeta}, + time::Timestamp, + }; + + #[test] + fn test_schema() { + let schema = Builder::new() + .auto_increment_column_id(true) + .add_key_column( + column_schema::Builder::new("key1".to_string(), DatumKind::Varbinary) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .add_key_column( + column_schema::Builder::new("timestamp".to_string(), DatumKind::Timestamp) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("field1".to_string(), DatumKind::Double) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("field2".to_string(), DatumKind::Double) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .build() + .unwrap(); + + // Length related test + assert_eq!(4, schema.columns().len()); + assert_eq!(4, schema.num_columns()); + assert_eq!(2, schema.num_key_columns()); + assert_eq!(1, schema.timestamp_index()); + + // Test key columns + assert_eq!(2, schema.key_columns().len()); + assert_eq!("key1", &schema.key_columns()[0].name); + assert_eq!("timestamp", &schema.key_columns()[1].name); + + // Test normal columns + assert_eq!(2, schema.normal_columns().len()); + assert_eq!("field1", &schema.normal_columns()[0].name); + assert_eq!("field2", &schema.normal_columns()[1].name); + + // Test column_with_name() + let field1 = schema.column_with_name("field1").unwrap(); + assert_eq!(3, field1.id); + assert_eq!("field1", field1.name); + assert!(schema.column_with_name("not exists").is_none()); + + 
// Test column() + assert_eq!(field1, schema.column(2)); + + // Test arrow schema + let arrow_schema = schema.as_arrow_schema_ref(); + let key1 = arrow_schema.field(0); + assert_eq!("key1", key1.name()); + let field2 = arrow_schema.field(3); + assert_eq!("field2", field2.name()); + + // Test index_of() + assert_eq!(1, schema.index_of("timestamp").unwrap()); + assert!(schema.index_of("not exists").is_none()); + + // Test pb convert + let schema_pb = common_pb::TableSchema::from(schema.clone()); + let schema_from_pb = Schema::try_from(schema_pb).unwrap(); + assert_eq!(schema, schema_from_pb); + } + + #[test] + fn test_build_unordered() { + let schema = Builder::new() + .auto_increment_column_id(true) + .add_normal_column( + column_schema::Builder::new("field1".to_string(), DatumKind::Double) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .add_key_column( + column_schema::Builder::new("key1".to_string(), DatumKind::Timestamp) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .add_key_column( + column_schema::Builder::new("key2".to_string(), DatumKind::Varbinary) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("field2".to_string(), DatumKind::Double) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .build() + .unwrap(); + + let columns = schema.columns(); + assert_eq!(2, columns[0].id); + assert_eq!("key1", columns[0].name); + assert_eq!(3, columns[1].id); + assert_eq!("key2", columns[1].name); + assert_eq!(1, columns[2].id); + assert_eq!("field1", columns[2].name); + assert_eq!(4, columns[3].id); + assert_eq!("field2", columns[3].name); + } + + #[test] + fn test_name_exists() { + let builder = Builder::new() + .auto_increment_column_id(true) + .add_normal_column( + column_schema::Builder::new("field1".to_string(), DatumKind::Double) + .build() + .expect("should succeed build column schema"), + ) + .unwrap(); + assert!(builder + .add_normal_column( + column_schema::Builder::new("field1".to_string(), DatumKind::Double) + .build() + .expect("should succeed build column schema"), + ) + .is_err()); + } + + #[test] + fn test_id_exists() { + let builder = Builder::new() + .add_normal_column( + column_schema::Builder::new("field1".to_string(), DatumKind::Double) + .id(1) + .build() + .expect("should succeed build column schema"), + ) + .unwrap(); + assert!(builder + .add_normal_column( + column_schema::Builder::new("field2".to_string(), DatumKind::Double) + .id(1) + .build() + .expect("should succeed build column schema") + ) + .is_err()); + } + + #[test] + fn test_key_column_type() { + assert!(Builder::new() + .add_key_column( + column_schema::Builder::new("key".to_string(), DatumKind::Double) + .id(1) + .build() + .expect("should succeed build column schema") + ) + .is_err()); + } + + #[test] + fn test_timestamp_key_exists() { + let builder = Builder::new() + .auto_increment_column_id(true) + .add_key_column( + column_schema::Builder::new("key1".to_string(), DatumKind::Timestamp) + .build() + .expect("should succeed build column schema"), + ) + .unwrap(); + assert!(builder + .add_key_column( + column_schema::Builder::new("key2".to_string(), DatumKind::Timestamp) + .build() + .expect("should succeed build column schema") + ) + .is_err()); + } + + #[test] + fn test_mulitple_timestamp() { + Builder::new() + .auto_increment_column_id(true) + .add_key_column( + column_schema::Builder::new("key1".to_string(), DatumKind::Timestamp) + 
.build() + .expect("should succeed build column schema"), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("field1".to_string(), DatumKind::Timestamp) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .build() + .unwrap(); + } + + #[test] + fn test_missing_timestamp_key() { + let builder = Builder::new() + .auto_increment_column_id(true) + .add_key_column( + column_schema::Builder::new("key1".to_string(), DatumKind::Varbinary) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("field1".to_string(), DatumKind::Double) + .build() + .expect("should succeed build column schema"), + ) + .unwrap(); + assert!(builder.build().is_err()); + } + + #[test] + fn test_null_key() { + assert!(Builder::new() + .add_key_column( + column_schema::Builder::new("key1".to_string(), DatumKind::Varbinary) + .id(1) + .is_nullable(true) + .build() + .expect("should succeed build column schema") + ) + .is_err()); + } + + #[test] + fn test_max_column_id() { + let builder = Builder::new() + .add_key_column( + column_schema::Builder::new("key1".to_string(), DatumKind::Varbinary) + .id(2) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("field1".to_string(), DatumKind::Timestamp) + .id(5) + .build() + .expect("should succeed build column schema"), + ) + .unwrap(); + + let schema = builder + .auto_increment_column_id(true) + .add_key_column( + column_schema::Builder::new("key2".to_string(), DatumKind::Timestamp) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("field2".to_string(), DatumKind::Timestamp) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .build() + .unwrap(); + + let columns = schema.columns(); + // Check key1 + assert_eq!("key1", &columns[0].name); + assert_eq!(2, columns[0].id); + // Check key2 + assert_eq!("key2", &columns[1].name); + assert_eq!(6, columns[1].id); + // Check field1 + assert_eq!("field1", &columns[2].name); + assert_eq!(5, columns[2].id); + // Check field2 + assert_eq!("field2", &columns[3].name); + assert_eq!(7, columns[3].id); + } + + fn assert_row_compare(ordering: Ordering, schema: &Schema, row1: &Row, row2: &Row) { + let schema_with_key = schema.to_record_schema_with_key(); + let lhs = RowWithMeta { + row: row1, + schema: &schema_with_key, + }; + let rhs = RowWithMeta { + row: row2, + schema: &schema_with_key, + }; + assert_eq!(ordering, schema.compare_row(&lhs, &rhs)); + } + + #[test] + fn test_compare_row() { + let schema = Builder::new() + .auto_increment_column_id(true) + .add_key_column( + column_schema::Builder::new("key1".to_string(), DatumKind::Varbinary) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .add_key_column( + column_schema::Builder::new("key2".to_string(), DatumKind::Timestamp) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("field1".to_string(), DatumKind::Double) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .build() + .unwrap(); + + // Test equal + { + let row1 = Row::from_datums(vec![ + Datum::Varbinary(Bytes::from_static(b"key1")), + Datum::Timestamp(Timestamp::new(1005)), + Datum::Double(12.5), + ]); + let row2 = Row::from_datums(vec![ + Datum::Varbinary(Bytes::from_static(b"key1")), + 
Datum::Timestamp(Timestamp::new(1005)), + Datum::Double(15.5), + ]); + + assert_row_compare(Ordering::Equal, &schema, &row1, &row2); + } + + // Test first key column less + { + let row1 = Row::from_datums(vec![ + Datum::Varbinary(Bytes::from_static(b"key2")), + Datum::Timestamp(Timestamp::new(1005)), + Datum::Double(17.5), + ]); + let row2 = Row::from_datums(vec![ + Datum::Varbinary(Bytes::from_static(b"key5")), + Datum::Timestamp(Timestamp::new(1005)), + Datum::Double(17.5), + ]); + + assert_row_compare(Ordering::Less, &schema, &row1, &row2); + } + + // Test second key column less + { + let row1 = Row::from_datums(vec![ + Datum::Varbinary(Bytes::from_static(b"key2")), + Datum::Timestamp(Timestamp::new(1002)), + Datum::Double(17.5), + ]); + let row2 = Row::from_datums(vec![ + Datum::Varbinary(Bytes::from_static(b"key2")), + Datum::Timestamp(Timestamp::new(1005)), + Datum::Double(17.5), + ]); + + assert_row_compare(Ordering::Less, &schema, &row1, &row2); + } + + // Test first key column greater + { + let row1 = Row::from_datums(vec![ + Datum::Varbinary(Bytes::from_static(b"key7")), + Datum::Timestamp(Timestamp::new(1005)), + Datum::Double(17.5), + ]); + let row2 = Row::from_datums(vec![ + Datum::Varbinary(Bytes::from_static(b"key5")), + Datum::Timestamp(Timestamp::new(1005)), + Datum::Double(17.5), + ]); + + assert_row_compare(Ordering::Greater, &schema, &row1, &row2); + } + + // Test second key column greater + { + let row1 = Row::from_datums(vec![ + Datum::Varbinary(Bytes::from_static(b"key2")), + Datum::Timestamp(Timestamp::new(1007)), + Datum::Double(17.5), + ]); + let row2 = Row::from_datums(vec![ + Datum::Varbinary(Bytes::from_static(b"key2")), + Datum::Timestamp(Timestamp::new(1005)), + Datum::Double(17.5), + ]); + + assert_row_compare(Ordering::Greater, &schema, &row1, &row2); + } + } + + #[test] + fn test_build_from_arrow_schema() { + let schema = Builder::new() + .auto_increment_column_id(true) + .enable_tsid_primary_key(true) + .add_key_column( + column_schema::Builder::new(TSID_COLUMN.to_string(), DatumKind::UInt64) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .add_key_column( + column_schema::Builder::new("timestamp".to_string(), DatumKind::Timestamp) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("value".to_string(), DatumKind::Double) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .build() + .expect("should succeed to build schema"); + + let arrow_schema = schema.clone().into_arrow_schema_ref(); + let new_schema = Builder::build_from_arrow_schema(arrow_schema) + .expect("should succeed to build new schema"); + + assert_eq!(schema, new_schema); + } +} diff --git a/common_types/src/string.rs b/common_types/src/string.rs new file mode 100644 index 0000000000..be41c82702 --- /dev/null +++ b/common_types/src/string.rs @@ -0,0 +1,107 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Bytes that can safely cast to str/string. + +use std::{convert::TryFrom, fmt, ops, str}; + +use snafu::{Backtrace, ResultExt, Snafu}; + +use crate::bytes::Bytes; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Bytes are not valid utf8, err:{}.\nBacktrace:\n{}", source, backtrace))] + FromBytes { + source: std::str::Utf8Error, + backtrace: Backtrace, + }, +} + +pub type Result = std::result::Result; + +/// String using [crate::bytes::Bytes] as storage so it can be cast into `Bytes` +/// and clone like `Bytes`. 
+#[derive(Debug, Clone, PartialEq, PartialOrd)] +pub struct StringBytes(Bytes); + +impl StringBytes { + pub fn new() -> StringBytes { + StringBytes(Bytes::new()) + } + + pub const fn from_static(src: &'static str) -> StringBytes { + StringBytes(Bytes::from_static(src.as_bytes())) + } + + pub fn copy_from_str(src: &str) -> StringBytes { + StringBytes(Bytes::copy_from_slice(src.as_bytes())) + } + + /// Create a [StringBytes] from a valid utf bytes. + /// + /// # Safety + /// The caller must ensure `bytes` is valid utf string. + pub unsafe fn from_bytes_unchecked(bytes: Bytes) -> StringBytes { + StringBytes(bytes) + } + + #[inline] + pub fn as_bytes(&self) -> &[u8] { + &self.0 + } + + #[inline] + pub fn as_str(&self) -> &str { + unsafe { str::from_utf8_unchecked(self.as_bytes()) } + } +} + +impl Default for StringBytes { + fn default() -> Self { + Self::new() + } +} + +impl ops::Deref for StringBytes { + type Target = str; + + #[inline] + fn deref(&self) -> &str { + self.as_str() + } +} + +impl AsRef for StringBytes { + #[inline] + fn as_ref(&self) -> &str { + self.as_str() + } +} + +impl fmt::Display for StringBytes { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.write_str(self.as_str()) + } +} + +impl TryFrom for StringBytes { + type Error = Error; + + fn try_from(bytes: Bytes) -> Result { + str::from_utf8(&bytes).context(FromBytes)?; + + Ok(StringBytes(bytes)) + } +} + +impl From for StringBytes { + fn from(src: String) -> Self { + Self(Bytes::from(src)) + } +} + +impl From<&str> for StringBytes { + fn from(src: &str) -> Self { + Self::copy_from_str(src) + } +} diff --git a/common_types/src/tests.rs b/common_types/src/tests.rs new file mode 100644 index 0000000000..e20313ce1c --- /dev/null +++ b/common_types/src/tests.rs @@ -0,0 +1,139 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
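//! Test utilities shared by `common_types` unit tests: a fixed four-column schema
//! (`key1`, `key2`, `field1`, `field2`), row builders, and helpers that turn rows
//! into a projected `RecordBatchWithKey`.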
+ +use bytes::Bytes; + +use crate::{ + column_schema, + datum::{Datum, DatumKind}, + projected_schema::ProjectedSchema, + record_batch::{RecordBatchWithKey, RecordBatchWithKeyBuilder}, + row::{ + contiguous::{ContiguousRowReader, ContiguousRowWriter, ProjectedContiguousRow}, + Row, + }, + schema, + schema::{IndexInWriterSchema, Schema}, + string::StringBytes, + time::Timestamp, +}; + +fn base_schema_builder() -> schema::Builder { + schema::Builder::new() + .auto_increment_column_id(true) + .add_key_column( + column_schema::Builder::new("key1".to_string(), DatumKind::Varbinary) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .add_key_column( + column_schema::Builder::new("key2".to_string(), DatumKind::Timestamp) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("field1".to_string(), DatumKind::Double) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("field2".to_string(), DatumKind::String) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() +} + +/// Build a schema for testing: +/// (key1(varbinary), key2(timestamp), field1(double), field2(string)) +pub fn build_schema() -> Schema { + base_schema_builder().build().unwrap() +} + +pub fn build_projected_schema() -> ProjectedSchema { + let schema = build_schema(); + assert!(schema.num_columns() > 1); + let projection: Vec = (0..schema.num_columns() - 1).collect(); + ProjectedSchema::new(schema, Some(projection)).unwrap() +} + +pub fn build_row(key1: &[u8], key2: i64, field1: f64, field2: &str) -> Row { + let datums = vec![ + Datum::Varbinary(Bytes::copy_from_slice(key1)), + Datum::Timestamp(Timestamp::new(key2)), + Datum::Double(field1), + Datum::String(StringBytes::from(field2)), + ]; + + Row::from_datums(datums) +} + +pub fn build_row_opt(key1: &[u8], key2: i64, field1: Option, field2: Option<&str>) -> Row { + let datums = vec![ + Datum::Varbinary(Bytes::copy_from_slice(key1)), + Datum::Timestamp(Timestamp::new(key2)), + field1.map(Datum::Double).unwrap_or(Datum::Null), + field2 + .map(|v| Datum::String(StringBytes::from(v))) + .unwrap_or(Datum::Null), + ]; + + Row::from_datums(datums) +} + +pub fn build_rows() -> Vec { + vec![ + build_row(b"binary key", 1000000, 10.0, "string value"), + build_row(b"binary key1", 1000001, 11.0, "string value 1"), + build_row_opt(b"binary key2", 1000002, None, Some("string value 2")), + build_row_opt(b"binary key3", 1000003, Some(13.0), None), + build_row_opt(b"binary key4", 1000004, None, None), + ] +} + +pub fn build_record_batch_with_key_by_rows(rows: Vec) -> RecordBatchWithKey { + let schema = build_schema(); + assert!(schema.num_columns() > 1); + let projection: Vec = (0..schema.num_columns() - 1).collect(); + let projected_schema = ProjectedSchema::new(schema.clone(), Some(projection)).unwrap(); + let row_projected_schema = projected_schema.try_project_with_key(&schema).unwrap(); + + let mut builder = + RecordBatchWithKeyBuilder::with_capacity(projected_schema.to_record_schema_with_key(), 2); + let index_in_writer = IndexInWriterSchema::for_same_schema(schema.num_columns()); + + let mut buf = Vec::new(); + for row in rows { + let mut writer = ContiguousRowWriter::new(&mut buf, &schema, &index_in_writer); + + writer.write_row(&row).unwrap(); + + let source_row = ContiguousRowReader::with_schema(&buf, &schema); + let projected_row = ProjectedContiguousRow::new(source_row, &row_projected_schema); + 
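// At this point the row has round-tripped through the contiguous encoding: it was
// written into `buf`, read back with the full table schema, and projected, so only
// the projected columns (the keys plus the selected normal columns) reach the
// record batch builder below.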
builder + .append_projected_contiguous_row(&projected_row) + .unwrap(); + } + builder.build().unwrap() +} + +pub fn check_record_batch_with_key_with_rows( + record_batch_with_key: &RecordBatchWithKey, + row_num: usize, + column_num: usize, + rows: Vec, +) -> bool { + for (i, row) in rows.iter().enumerate().take(row_num) { + for j in 0..column_num { + let datum = &row[j]; + let datum2 = record_batch_with_key.column(j).datum(i); + + if *datum != datum2 { + return false; + } + } + } + true +} diff --git a/common_types/src/time.rs b/common_types/src/time.rs new file mode 100644 index 0000000000..27ff8802c0 --- /dev/null +++ b/common_types/src/time.rs @@ -0,0 +1,363 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Time types + +// TODO(yingwen): Support timezone + +use std::{ + convert::{TryFrom, TryInto}, + time::{self, Duration, SystemTime}, +}; + +use proto::common::TimeRange as TimeRangePb; +use snafu::{Backtrace, OptionExt, Snafu}; + +/// Error of time module. +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Invalid time range, start:{}, end:{}", start, end))] + InvalidTimeRange { + start: i64, + end: i64, + backtrace: Backtrace, + }, +} + +/// Unix timestamp type in millis +// Use i64 so we can store timestamp before 1970-01-01 +#[derive(Clone, Copy, Debug, Default, Eq, Ord, PartialEq, PartialOrd, Hash)] +pub struct Timestamp(i64); + +impl Timestamp { + pub const MAX: Timestamp = Timestamp(i64::MAX); + pub const MIN: Timestamp = Timestamp(i64::MIN); + pub const ZERO: Timestamp = Timestamp(0); + + pub const fn new(ts: i64) -> Self { + Self(ts) + } + + /// Return current (non-negative) unix timestamp in millis. + pub fn now() -> Self { + SystemTime::now() + .duration_since(time::UNIX_EPOCH) + .map(|duration| { + duration + .as_millis() + .try_into() + .map(Timestamp) + .unwrap_or(Timestamp::MAX) + }) + .unwrap_or(Timestamp::ZERO) + } + + /// Returns the earliest expired timestamp. + #[inline] + pub fn expire_time(ttl: Duration) -> Timestamp { + Timestamp::now().sub_duration_or_min(ttl) + } + + #[inline] + pub fn as_i64(&self) -> i64 { + self.0 + } + + /// Truncate the value of this timestamp by given duration, return that + /// value and keeps current timestamp unchanged. + /// + /// This function won't do overflow check. + #[must_use] + pub fn truncate_by(&self, duration: Duration) -> Self { + let duration_millis = duration.as_millis() as i64; + Timestamp::new(self.0 / duration_millis * duration_millis) + } + + /// Floor the timestamp by the `duration_ms` (in millisecond) and return a + /// new Timestamp instance or None if overflow occurred. + /// + /// The `duration_ms` must be positive + #[inline] + fn checked_floor_by_i64(&self, duration_ms: i64) -> Option { + assert!(duration_ms > 0); + let normalized_ts = if self.0 >= 0 { + // self / duration_ms * duration_ms + self.0 + } else { + // (self - (duration_ms - 1)) / duration_ms * duration_ms + self.0.checked_sub(duration_ms - 1)? + }; + + normalized_ts + .checked_div(duration_ms) + .and_then(|v| v.checked_mul(duration_ms)) + .map(Timestamp) + } + + /// Returns the result of this `timestamp + offset_ms`, or None if overflow + /// occurred. 
+ /// + /// The `offset_ms` is in millis resolution + pub fn checked_add_i64(&self, offset_ms: i64) -> Option { + self.0.checked_add(offset_ms).map(Timestamp) + } + + pub fn checked_add(&self, other: Self) -> Option { + self.0.checked_add(other.0).map(Timestamp) + } + + pub fn checked_sub(&self, other: Self) -> Option { + self.0.checked_sub(other.0).map(Timestamp) + } + + /// Returns the result of this `timestamp` - `duration`, or None if overflow + /// occurred. + pub fn checked_sub_duration(&self, duration: Duration) -> Option { + let duration_millis = duration.as_millis().try_into().ok()?; + self.0.checked_sub(duration_millis).map(Timestamp) + } + + /// Return true if the time is expired + pub fn is_expired(&self, expired_time: Timestamp) -> bool { + *self < expired_time + } + + /// Returns the result of this `timestamp` - `duration`, or MIN if overflow + /// occurred. + #[must_use] + pub fn sub_duration_or_min(&self, duration: Duration) -> Timestamp { + self.checked_sub_duration(duration) + .unwrap_or(Timestamp::MIN) + } +} + +impl From for i64 { + fn from(timestamp: Timestamp) -> Self { + timestamp.0 + } +} + +impl From for Timestamp { + fn from(ts: i64) -> Self { + Self::new(ts) + } +} + +impl From<&i64> for Timestamp { + fn from(ts: &i64) -> Self { + Self::new(*ts) + } +} + +/// Unix timestamp range in millis +/// +/// The start time is inclusive and the end time is exclusive: [start, end). +/// The range is empty if start equals end. +#[derive(Default, Debug, Clone, Copy, PartialEq, Eq)] +pub struct TimeRange { + /// The start timestamp (inclusive) + inclusive_start: Timestamp, + /// The end timestamp (exclusive) + exclusive_end: Timestamp, +} + +impl TimeRange { + /// Create a new time range, returns None if the start/end is invalid + pub fn new(inclusive_start: Timestamp, exclusive_end: Timestamp) -> Option { + if inclusive_start <= exclusive_end { + Some(Self { + inclusive_start, + exclusive_end, + }) + } else { + None + } + } + + /// Create a new time range, panic if the start/end is invalid. + pub fn new_unchecked(inclusive_start: Timestamp, exclusive_end: Timestamp) -> Self { + Self::new(inclusive_start, exclusive_end).unwrap() + } + + #[cfg(any(test, feature = "test"))] + pub fn new_unchecked_for_test(inclusive_start: i64, exclusive_end: i64) -> Self { + Self::new( + Timestamp::new(inclusive_start), + Timestamp::new(exclusive_end), + ) + .unwrap() + } + + /// Create a time range only including the single timestamp. + pub fn from_timestamp(t: Timestamp) -> Self { + // FIXME(xikai): now the time range can not express the `exclusive_end` as + // infinite. + let end = t.checked_add_i64(1).unwrap_or(t); + Self::new(t, end).unwrap() + } + + /// Create a new time range of [0, max) + pub fn min_to_max() -> Self { + Self { + inclusive_start: Timestamp::MIN, + exclusive_end: Timestamp::MAX, + } + } + + /// Create a empty time range. + pub fn empty() -> Self { + Self { + inclusive_start: Timestamp::ZERO, + exclusive_end: Timestamp::ZERO, + } + } + + /// The inclusive start timestamp + #[inline] + pub fn inclusive_start(&self) -> Timestamp { + self.inclusive_start + } + + /// The exclusive end timestamp + #[inline] + pub fn exclusive_end(&self) -> Timestamp { + self.exclusive_end + } + + /// Return the reference to the exclusive end timestamp. 
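// Editor's note on `bucket_of` below (not part of the original patch): the bucket is
// obtained by flooring the timestamp to a multiple of `bucket_duration` via
// `checked_floor_by_i64`. For negative timestamps the floor has to go toward negative
// infinity, which is why `checked_floor_by_i64` first subtracts `duration_ms - 1`;
// e.g. with a 25_920_000_000 ms bucket, `Timestamp(-126_316_800_000)` falls into
// [-129_600_000_000, -103_680_000_000) (covered by `test_bucket_of_negative_timestamp`).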
+ #[inline] + pub fn exclusive_end_ref(&self) -> &Timestamp { + &self.exclusive_end + } + + /// Returns true if the time range contains the given `ts` + #[inline] + pub fn contains(&self, ts: Timestamp) -> bool { + self.inclusive_start <= ts && ts < self.exclusive_end + } + + /// Returns a time bucket with fixed bucket size that the timestamp belongs + /// to. Returns None if overflow occurred, the bucket_duration is greater + /// than [i64::MAX] or not positive. + pub fn bucket_of(timestamp: Timestamp, bucket_duration: Duration) -> Option { + let bucket_duration_ms: i64 = bucket_duration.as_millis().try_into().ok()?; + if bucket_duration_ms <= 0 { + return None; + } + + let inclusive_start = timestamp.checked_floor_by_i64(bucket_duration_ms)?; + // end = start + bucket_duration + let exclusive_end = inclusive_start.checked_add_i64(bucket_duration_ms)?; + + Some(Self { + inclusive_start, + exclusive_end, + }) + } + + /// Returns true if this time range intersect with `other` + pub fn intersect_with(&self, other: TimeRange) -> bool { + !self.not_intersecting(other) + } + + /// Return true if the time range is expired (`exclusive_end_time` < + /// `expire_time`). + pub fn is_expired(&self, expire_time: Option) -> bool { + expire_time.is_some() && self.exclusive_end() <= expire_time.unwrap() + } + + #[inline] + fn not_intersecting(&self, other: TimeRange) -> bool { + other.exclusive_end <= self.inclusive_start || other.inclusive_start >= self.exclusive_end + } + + pub fn intersected_range(&self, other: TimeRange) -> Option { + TimeRange::new( + self.inclusive_start.max(other.inclusive_start), + self.exclusive_end.min(other.exclusive_end), + ) + } +} + +impl From for TimeRangePb { + fn from(src: TimeRange) -> Self { + let mut target = TimeRangePb::default(); + target.set_start(src.inclusive_start.as_i64()); + target.set_end(src.exclusive_end.as_i64()); + target + } +} + +impl TryFrom for TimeRange { + type Error = Error; + + fn try_from(src: TimeRangePb) -> Result { + Self::new(Timestamp::new(src.start), Timestamp::new(src.end)).context(InvalidTimeRange { + start: src.start, + end: src.end, + }) + } +} + +#[cfg(test)] +mod test { + use std::time::Duration; + + use crate::time::{TimeRange, Timestamp}; + + #[test] + fn test_timestamp() { + // 1637723901000: 2021-11-24 11:18:21 + let timestamp = Timestamp::new(1637723901000); + // 1d + let ttl = Duration::from_secs(24 * 3600); + assert_eq!( + timestamp.sub_duration_or_min(ttl), + Timestamp::new(1637637501000) + ); + assert_eq!(timestamp.truncate_by(ttl), Timestamp::new(1637712000000)); + assert_eq!( + timestamp.checked_floor_by_i64(2000), + Some(Timestamp::new(1637723900000)) + ); + assert_eq!( + timestamp.checked_add_i64(2000), + Some(Timestamp::new(1637723903000)) + ); + assert_eq!( + timestamp.checked_sub_duration(ttl), + Some(Timestamp::new(1637637501000)) + ); + } + + #[test] + fn test_time_range() { + // [100,200) + let time_range = TimeRange::new_unchecked_for_test(100, 200); + assert!(time_range.contains(Timestamp::new(150))); + assert!(time_range.contains(Timestamp::new(100))); + assert!(!time_range.contains(Timestamp::new(200))); + + assert!(!time_range.is_expired(Some(Timestamp::new(50)))); + assert!(time_range.is_expired(Some(Timestamp::new(200)))); + + assert_eq!( + TimeRange::bucket_of(Timestamp::new(100), Duration::from_millis(2)), + Some(TimeRange::new_unchecked_for_test(100, 102)) + ); + + let time_range2 = TimeRange::new_unchecked_for_test(200, 300); + assert!(!time_range.intersect_with(time_range2)); + let time_range3 = 
TimeRange::new_unchecked_for_test(50, 200); + assert!(time_range.intersect_with(time_range3)); + + assert!(time_range.not_intersecting(time_range2)); + assert!(!time_range.not_intersecting(time_range3)); + } + + #[test] + fn test_bucket_of_negative_timestamp() { + let ts = Timestamp::new(-126316800000); + let range = TimeRange::bucket_of(ts, Duration::from_millis(25920000000)).unwrap(); + assert!(range.contains(ts), "range:{:?}", range); + } +} diff --git a/common_util/Cargo.toml b/common_util/Cargo.toml new file mode 100644 index 0000000000..884b13236b --- /dev/null +++ b/common_util/Cargo.toml @@ -0,0 +1,44 @@ +[package] +name = "common_util" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[features] +test = ["env_logger"] + +[dependencies] +# In alphabetical order +backtrace = "0.3.9" +common_types = { path = "../common_types", features = ["test"] } +chrono = "0.4" +crossbeam-utils = "0.8" +env_logger = { version = "0.6", optional = true } +lazy_static = "1.4.0" +libc = "0.2" +log = "0.4" +logger = { path = "../components/logger"} +snafu = { version ="0.6.10", features = ["backtraces"]} +serde = {version = "1.0.81", features = ["derive"]} +serde_derive = "1.0.81" +pin-project-lite = "0.2" +prometheus = "0.12" +proto = { path = "../proto" } +time = "0.1" +tokio = { version = "1.15", features = ["full"] } +toml = "0.5" + +[dev-dependencies] +env_logger = "0.6" +gag = "1.0" +nix = "0.19" +slog = "2.7" +tempfile = "3.1.0" +tokio-test = "0.4.2" + +[dev-dependencies.slog-global] +version = "0.1" +git = "https://github.com/breezewish/slog-global.git" +rev = "0e23a5baff302a9d7bccd85f8f31e43339c2f2c1" diff --git a/common_util/src/alloc_tracker.rs b/common_util/src/alloc_tracker.rs new file mode 100644 index 0000000000..7e0979cb0f --- /dev/null +++ b/common_util/src/alloc_tracker.rs @@ -0,0 +1,159 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Alloc tracker + +use std::sync::atomic::{AtomicUsize, Ordering}; + +/// Collect memory usage from tracker, useful for extending the tracker +pub trait Collector { + /// Called when `bytes` bytes memory is allocated and tracked by the tracker + fn on_allocate(&self, bytes: usize); + + /// Called when `bytes` bytes memory is freed and tracked by the tracker + fn on_free(&self, bytes: usize); +} + +/// A tracker to track memory in used +// TODO(yingwen): Impl a thread local or local tracker that are not thread safe, +// and collect statistics into the thread safe one for better performance +pub struct Tracker { + collector: T, + bytes_allocated: AtomicUsize, +} + +impl Tracker { + pub fn new(collector: T) -> Self { + Self { + collector, + bytes_allocated: AtomicUsize::new(0), + } + } + + /// Increase consumption of this tracker by bytes + pub fn consume(&self, bytes: usize) { + self.bytes_allocated.fetch_add(bytes, Ordering::Relaxed); + self.collector.on_allocate(bytes); + } + + /// Decrease consumption of this tracker by bytes + /// + /// The caller should guarantee the released bytes wont larger than bytes + /// already consumed + pub fn release(&self, bytes: usize) { + self.bytes_allocated.fetch_sub(bytes, Ordering::Relaxed); + self.collector.on_free(bytes); + } + + /// Bytes allocated + pub fn bytes_allocated(&self) -> usize { + self.bytes_allocated.load(Ordering::Relaxed) + } +} + +impl Drop for Tracker { + fn drop(&mut self) { + let bytes = *self.bytes_allocated.get_mut(); + self.collector.on_free(bytes); + } +} + +/// The noop collector does nothing on alloc and free +struct NoopCollector; + +impl Collector for NoopCollector { + fn on_allocate(&self, _bytes: usize) {} + + fn on_free(&self, _bytes: usize) {} +} + +/// A simple tracker hides the collector api +pub struct SimpleTracker(Tracker); + +impl Default for SimpleTracker { + fn default() -> Self { + Self(Tracker::new(NoopCollector)) + } +} + +impl SimpleTracker { + /// Increase consumption of this tracker by bytes + #[inline] + pub fn consume(&self, bytes: usize) { + self.0.consume(bytes); + } + + /// Decrease consumption of this tracker by bytes + /// + /// The caller should guarantee the released bytes wont larger than bytes + /// already consumed + #[inline] + pub fn release(&self, bytes: usize) { + self.0.release(bytes); + } + + /// Bytes allocated + pub fn bytes_allocated(&self) -> usize { + self.0.bytes_allocated() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_simple_tracker() { + let tracker = SimpleTracker::default(); + tracker.consume(256); + assert_eq!(256, tracker.bytes_allocated()); + + tracker.release(100); + assert_eq!(156, tracker.bytes_allocated()); + } + + #[test] + fn test_collector() { + use std::sync::atomic::AtomicBool; + + struct MockCollector { + allocated: AtomicBool, + freed: AtomicBool, + } + + impl MockCollector { + fn new() -> Self { + Self { + allocated: AtomicBool::new(false), + freed: AtomicBool::new(false), + } + } + } + + impl Drop for MockCollector { + fn drop(&mut self) { + assert!(*self.allocated.get_mut()); + assert!(*self.freed.get_mut()); + } + } + + impl Collector for MockCollector { + fn on_allocate(&self, bytes: usize) { + assert_eq!(800, bytes); + self.allocated.store(true, Ordering::Relaxed); + } + + fn on_free(&self, bytes: usize) { + if self.freed.load(Ordering::Relaxed) { + assert_eq!(440, bytes); + } else { + assert_eq!(360, bytes); + } + self.freed.store(true, Ordering::Relaxed); + } + } + + let tracker = Tracker::new(MockCollector::new()); + 
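// Expected callback sequence: `consume(800)` triggers `on_allocate(800)`;
// `release(360)` triggers the first `on_free(360)`; dropping the tracker frees the
// remaining 440 bytes, producing the second `on_free(440)` that `MockCollector`
// asserts.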
tracker.consume(800); + tracker.release(360); + } +} diff --git a/common_util/src/codec/compact/bytes.rs b/common_util/src/codec/compact/bytes.rs new file mode 100644 index 0000000000..aeeff7739d --- /dev/null +++ b/common_util/src/codec/compact/bytes.rs @@ -0,0 +1,130 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Bytes format + +use std::convert::TryFrom; + +use common_types::bytes::{Bytes, BytesMut, MemBuf, MemBufMut}; +use snafu::{ensure, ResultExt}; + +use crate::codec::{ + compact::{ + DecodeEmptyValue, DecodeValue, DecodeVarint, EncodeValue, EncodeVarint, Error, + MemCompactDecoder, MemCompactEncoder, Result, TryIntoUsize, + }, + consts, varint, DecodeTo, Encoder, +}; + +impl Encoder<[u8]> for MemCompactEncoder { + type Error = Error; + + // EncodeCompactBytes joins bytes with its length into a byte slice. It is more + // efficient in both space and time compare to EncodeBytes. Note that the + // encoded result is not memcomparable. + fn encode(&self, buf: &mut B, value: &[u8]) -> Result<()> { + varint::encode_varint(buf, value.len() as i64).context(EncodeVarint)?; + buf.write_slice(value).context(EncodeValue)?; + Ok(()) + } + + fn estimate_encoded_size(&self, value: &[u8]) -> usize { + consts::MAX_VARINT_BYTES + value.len() + } +} + +impl Encoder for MemCompactEncoder { + type Error = Error; + + fn encode(&self, buf: &mut B, value: &Bytes) -> Result<()> { + self.encode(buf, &value[..]) + } + + fn estimate_encoded_size(&self, value: &Bytes) -> usize { + self.estimate_encoded_size(&value[..]) + } +} + +impl DecodeTo for MemCompactDecoder { + type Error = Error; + + fn decode_to(&self, buf: &mut B, value: &mut BytesMut) -> Result<()> { + let v = usize::try_from(varint::decode_varint(buf).context(DecodeVarint)?) 
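+ // The compact layout is a varint length prefix followed by the raw bytes; the
+ // decoded length is an i64, so the conversion to usize below turns negative or
+ // oversized prefixes into an error instead of panicking.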
+ .context(TryIntoUsize)?; + ensure!(buf.remaining_slice().len() >= v, DecodeEmptyValue); + value + .write_slice(&buf.remaining_slice()[..v]) + .context(DecodeValue)?; + buf.must_advance(v); + Ok(()) + } +} + +#[cfg(test)] +mod test { + use super::*; + + struct BytesTest { + data: Bytes, + estimate_encoded_size: usize, + } + + #[test] + fn test_compact_bytes_codec() { + let data = vec![ + BytesTest { + data: Bytes::from_static(b""), + estimate_encoded_size: 10, + }, + BytesTest { + data: Bytes::from_static(b"hello1"), + estimate_encoded_size: 16, + }, + BytesTest { + data: Bytes::from_static(b"hello2"), + estimate_encoded_size: 16, + }, + BytesTest { + data: Bytes::from_static(b"hello3"), + estimate_encoded_size: 16, + }, + BytesTest { + data: Bytes::from_static(&[0x00, 0x01]), + estimate_encoded_size: 12, + }, + BytesTest { + data: Bytes::from_static(&[0xff, 0xff]), + estimate_encoded_size: 12, + }, + BytesTest { + data: Bytes::from_static(&[0x01, 0x00]), + estimate_encoded_size: 12, + }, + BytesTest { + data: Bytes::from_static(b"abc"), + estimate_encoded_size: 13, + }, + BytesTest { + data: Bytes::from_static(b"hello world"), + estimate_encoded_size: 21, + }, + ]; + + let encoder = MemCompactEncoder; + let mut buf = vec![]; + for x in &data { + encoder.encode(&mut buf, &x.data).unwrap(); + assert_eq!( + x.estimate_encoded_size, + encoder.estimate_encoded_size(&x.data) + ); + } + + let decoder = MemCompactDecoder; + let mut buf = &buf[..]; + for x in &data { + let mut d = BytesMut::new(); + decoder.decode_to(&mut buf, &mut d).unwrap(); + assert_eq!(d, x.data); + } + } +} diff --git a/common_util/src/codec/compact/datum.rs b/common_util/src/codec/compact/datum.rs new file mode 100644 index 0000000000..0d80088e06 --- /dev/null +++ b/common_util/src/codec/compact/datum.rs @@ -0,0 +1,264 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Datum compact codec + +use common_types::{ + bytes::{BytesMut, MemBuf, MemBufMut}, + datum::Datum, + string::StringBytes, + time::Timestamp, +}; +use snafu::ResultExt; + +use crate::codec::{ + compact::{EncodeKey, Error, MemCompactDecoder, MemCompactEncoder, Result}, + consts, DecodeTo, Encoder, +}; + +// For float points, we use same encoding as mem comparable encoder +impl Encoder for MemCompactEncoder { + type Error = Error; + + fn encode(&self, buf: &mut B, value: &Datum) -> Result<()> { + match value { + Datum::Null => buf.write_u8(consts::NULL_FLAG).context(EncodeKey), + Datum::Timestamp(ts) => { + buf.write_u8(consts::VARINT_FLAG).context(EncodeKey)?; + self.encode(buf, &ts.as_i64()) + } + Datum::Double(v) => { + buf.write_u8(consts::FLOAT_FLAG).context(EncodeKey)?; + self.encode(buf, v) + } + Datum::Float(v) => { + buf.write_u8(consts::FLOAT_FLAG).context(EncodeKey)?; + self.encode(buf, v) + } + Datum::Varbinary(v) => { + buf.write_u8(consts::COMPACT_BYTES_FLAG) + .context(EncodeKey)?; + self.encode(buf, v) + } + // For string, just encode/decode like bytes. 
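+ // The encoded form is therefore identical to a `Datum::Varbinary` holding the
+ // same bytes; only the target datum chosen at decode time restores the type.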
+ Datum::String(v) => { + buf.write_u8(consts::COMPACT_BYTES_FLAG) + .context(EncodeKey)?; + self.encode(buf, v.as_bytes()) + } + Datum::UInt64(v) => { + buf.write_u8(consts::UVARINT_FLAG).context(EncodeKey)?; + self.encode(buf, v) + } + Datum::UInt32(v) => { + buf.write_u8(consts::UVARINT_FLAG).context(EncodeKey)?; + self.encode(buf, &(u64::from(*v))) + } + Datum::UInt16(v) => { + buf.write_u8(consts::UVARINT_FLAG).context(EncodeKey)?; + self.encode(buf, &(u64::from(*v))) + } + Datum::UInt8(v) => { + buf.write_u8(consts::UVARINT_FLAG).context(EncodeKey)?; + self.encode(buf, &(u64::from(*v))) + } + Datum::Int64(v) => { + buf.write_u8(consts::VARINT_FLAG).context(EncodeKey)?; + self.encode(buf, v) + } + Datum::Int32(v) => { + buf.write_u8(consts::VARINT_FLAG).context(EncodeKey)?; + self.encode(buf, &(i64::from(*v))) + } + Datum::Int16(v) => { + buf.write_u8(consts::VARINT_FLAG).context(EncodeKey)?; + self.encode(buf, &(i64::from(*v))) + } + Datum::Int8(v) => { + buf.write_u8(consts::VARINT_FLAG).context(EncodeKey)?; + self.encode(buf, &(i64::from(*v))) + } + Datum::Boolean(v) => { + buf.write_u8(consts::UVARINT_FLAG).context(EncodeKey)?; + self.encode(buf, &(u64::from(*v))) + } + } + } + + fn estimate_encoded_size(&self, value: &Datum) -> usize { + match value { + // Null takes 1 byte + Datum::Null => 1, + Datum::Timestamp(ts) => self.estimate_encoded_size(&ts.as_i64()), + Datum::Double(v) => self.estimate_encoded_size(v), + Datum::Float(v) => self.estimate_encoded_size(v), + Datum::Varbinary(v) => self.estimate_encoded_size(v), + Datum::String(v) => self.estimate_encoded_size(v.as_bytes()), + Datum::UInt64(v) => self.estimate_encoded_size(v), + Datum::UInt32(v) => self.estimate_encoded_size(&(u64::from(*v))), + Datum::UInt16(v) => self.estimate_encoded_size(&(u64::from(*v))), + Datum::UInt8(v) => self.estimate_encoded_size(&(u64::from(*v))), + Datum::Int64(v) => self.estimate_encoded_size(v), + Datum::Int32(v) => self.estimate_encoded_size(&(i64::from(*v))), + Datum::Int16(v) => self.estimate_encoded_size(&(i64::from(*v))), + Datum::Int8(v) => self.estimate_encoded_size(&(i64::from(*v))), + Datum::Boolean(v) => self.estimate_encoded_size(&(u64::from(*v))), + } + } +} + +macro_rules! decode_var_u64_into { + ($self: ident, $v: ident, $actual: ident, $buf: ident, $type: ty) => {{ + Self::ensure_flag(consts::UVARINT_FLAG, $actual)?; + let mut data = 0u64; + $self.decode_to($buf, &mut data)?; + *$v = data as $type; + }}; +} + +macro_rules! decode_var_u64_into_bool { + ($self: ident, $v: ident, $actual: ident, $buf: ident) => {{ + Self::ensure_flag(consts::UVARINT_FLAG, $actual)?; + let mut data = 0u64; + $self.decode_to($buf, &mut data)?; + *$v = data != 0; + }}; +} + +macro_rules! decode_var_i64_into { + ($self: ident, $v: ident, $actual: ident, $buf: ident, $type: ty) => {{ + Self::ensure_flag(consts::VARINT_FLAG, $actual)?; + let mut data = 0i64; + $self.decode_to($buf, &mut data)?; + *$v = data as $type; + }}; +} + +impl DecodeTo for MemCompactDecoder { + type Error = Error; + + /// REQUIRE: The datum type should match the type in buf + /// + /// For string datum, the utf8 check will be skipped. + fn decode_to(&self, buf: &mut B, value: &mut Datum) -> Result<()> { + let actual = match self.maybe_read_null(buf)? 
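+ // `maybe_read_null` consumes the flag byte; a NULL_FLAG means the column was
+ // encoded as null, so the target datum is overwritten with `Datum::Null` below.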
{ + Some(v) => v, + None => { + *value = Datum::Null; + return Ok(()); + } + }; + + match value { + Datum::Null => { + Self::ensure_flag(consts::NULL_FLAG, actual)?; + } + Datum::Timestamp(ts) => { + Self::ensure_flag(consts::VARINT_FLAG, actual)?; + let mut data = 0; + self.decode_to(buf, &mut data)?; + *ts = Timestamp::new(data); + } + Datum::Double(v) => { + Self::ensure_flag(consts::FLOAT_FLAG, actual)?; + self.decode_to(buf, v)?; + } + Datum::Float(v) => { + Self::ensure_flag(consts::FLOAT_FLAG, actual)?; + self.decode_to(buf, v)?; + } + Datum::Varbinary(v) => { + Self::ensure_flag(consts::COMPACT_BYTES_FLAG, actual)?; + let mut data = BytesMut::new(); + self.decode_to(buf, &mut data)?; + *v = data.freeze(); + } + Datum::String(v) => { + Self::ensure_flag(consts::COMPACT_BYTES_FLAG, actual)?; + let mut data = BytesMut::new(); + self.decode_to(buf, &mut data)?; + // For string datum, we won't validate whether the bytes is a valid utf string + // during decoding to improve decode performance. The encoder + // should already done the utf8 check. + unsafe { + *v = StringBytes::from_bytes_unchecked(data.freeze()); + } + } + Datum::UInt64(v) => { + Self::ensure_flag(consts::UVARINT_FLAG, actual)?; + self.decode_to(buf, v)?; + } + Datum::UInt32(v) => decode_var_u64_into!(self, v, actual, buf, u32), + Datum::UInt16(v) => decode_var_u64_into!(self, v, actual, buf, u16), + Datum::UInt8(v) => decode_var_u64_into!(self, v, actual, buf, u8), + Datum::Int64(v) => { + Self::ensure_flag(consts::VARINT_FLAG, actual)?; + self.decode_to(buf, v)?; + } + Datum::Int32(v) => decode_var_i64_into!(self, v, actual, buf, i32), + Datum::Int16(v) => decode_var_i64_into!(self, v, actual, buf, i16), + Datum::Int8(v) => decode_var_i64_into!(self, v, actual, buf, i8), + Datum::Boolean(v) => decode_var_u64_into_bool!(self, v, actual, buf), + } + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use common_types::bytes::Bytes; + + use super::*; + + // TODO(yingwen): Test nullable. 
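+ // The size estimates below mirror the encoder: varint/uvarint-backed datums
+ // reserve MAX_VARINT_BYTES (10), floats use their exact width (8 or 4 bytes),
+ // byte-like datums reserve 10 plus the payload length, and Null costs 1 byte.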
+ #[test] + fn test_datum_codec() { + let data = vec![ + // (datum to encode, estimate_encoded_size) + (Datum::Null, 1), + (Datum::Timestamp(Timestamp::new(12345)), 10), + (Datum::Double(10.5), 8), + (Datum::Float(1.99), 4), + (Datum::Varbinary(Bytes::from_static(b"hello world")), 21), + (Datum::String(StringBytes::from_static("hello world")), 21), + (Datum::UInt64(12345), 10), + (Datum::UInt32(1000), 10), + (Datum::UInt16(65000), 10), + (Datum::UInt8(150), 10), + (Datum::Int64(-100209), 10), + (Datum::Int32(-10020), 10), + (Datum::Int16(32500), 10), + (Datum::Int8(-120), 10), + (Datum::Boolean(true), 10), + (Datum::Boolean(false), 10), + ]; + let mut decoded = vec![ + Datum::Null, + Datum::Timestamp(Timestamp::new(0)), + Datum::Double(0.0), + Datum::Float(0.0), + Datum::Varbinary(Bytes::new()), + Datum::String(StringBytes::new()), + Datum::UInt64(0), + Datum::UInt32(0), + Datum::UInt16(0), + Datum::UInt8(0), + Datum::Int64(0), + Datum::Int32(0), + Datum::Int16(0), + Datum::Int8(0), + Datum::Boolean(false), + Datum::Boolean(false), + ]; + let encoder = MemCompactEncoder; + let decoder = MemCompactDecoder; + for (index, x) in data.iter().enumerate() { + let mut buf = vec![]; + encoder.encode(&mut buf, &x.0).unwrap(); + assert_eq!(x.1, encoder.estimate_encoded_size(&x.0)); + decoder + .decode_to(&mut buf.as_slice(), &mut decoded[index]) + .unwrap(); + assert_eq!(decoded[index], data[index].0); + } + } +} diff --git a/common_util/src/codec/compact/float.rs b/common_util/src/codec/compact/float.rs new file mode 100644 index 0000000000..867ff3282b --- /dev/null +++ b/common_util/src/codec/compact/float.rs @@ -0,0 +1,101 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::mem; + +use common_types::bytes::{MemBuf, MemBufMut}; +use snafu::ResultExt; + +use crate::codec::{ + compact::{DecodeValue, EncodeValue, Error, MemCompactDecoder, MemCompactEncoder, Result}, + DecodeTo, Encoder, +}; + +impl Encoder for MemCompactEncoder { + type Error = Error; + + fn encode(&self, buf: &mut B, value: &f64) -> Result<()> { + buf.write_f64(*value).context(EncodeValue)?; + Ok(()) + } + + fn estimate_encoded_size(&self, _value: &f64) -> usize { + mem::size_of::() + } +} + +impl DecodeTo for MemCompactDecoder { + type Error = Error; + + fn decode_to(&self, buf: &mut B, value: &mut f64) -> Result<()> { + *value = buf.read_f64().context(DecodeValue)?; + Ok(()) + } +} + +impl Encoder for MemCompactEncoder { + type Error = Error; + + fn encode(&self, buf: &mut B, value: &f32) -> Result<()> { + buf.write_f32(*value).context(EncodeValue)?; + Ok(()) + } + + fn estimate_encoded_size(&self, _value: &f32) -> usize { + mem::size_of::() + } +} + +impl DecodeTo for MemCompactDecoder { + type Error = Error; + + fn decode_to(&self, buf: &mut B, value: &mut f32) -> Result<()> { + *value = buf.read_f32().context(DecodeValue)?; + Ok(()) + } +} + +#[cfg(test)] +mod test { + use super::*; + + struct TestF64 { + data: f64, + estimate_encoded_size: usize, + } + + #[test] + fn test_compact_f64_codec() { + let data = vec![ + TestF64 { + data: 162132470.5, + estimate_encoded_size: 8, + }, + TestF64 { + data: f64::MIN, + estimate_encoded_size: 8, + }, + TestF64 { + data: f64::MAX, + estimate_encoded_size: 8, + }, + ]; + + let encoder = MemCompactEncoder; + let mut buf = vec![]; + for x in &data { + encoder.encode(&mut buf, &x.data).unwrap(); + assert_eq!( + x.estimate_encoded_size, + encoder.estimate_encoded_size(&x.data) + ); + } + + let decoder = MemCompactDecoder; + let mut buf = &buf[..]; + for x in 
&data { + let mut d = 0.0; + decoder.decode_to(&mut buf, &mut d).unwrap(); + assert!((d - x.data).abs() < f64::EPSILON); + } + } +} diff --git a/common_util/src/codec/compact/mod.rs b/common_util/src/codec/compact/mod.rs new file mode 100644 index 0000000000..1327e05929 --- /dev/null +++ b/common_util/src/codec/compact/mod.rs @@ -0,0 +1,92 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Mem compact format codec + +// Implementation reference: +// https://github.com/pingcap/tidb/blob/bd011d3c9567c506d8d4343ade03edf77fcd5b56/util/codec/codec.go +mod bytes; +mod datum; +mod float; +mod number; + +use common_types::bytes::MemBuf; +use snafu::{ensure, Backtrace, ResultExt, Snafu}; + +use crate::codec::consts; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to encode flag, err:{}", source))] + EncodeKey { source: common_types::bytes::Error }, + + #[snafu(display("Failed to encode value, err:{}", source))] + EncodeValue { source: common_types::bytes::Error }, + + #[snafu(display("Failed to encode varint, err:{}", source))] + EncodeVarint { source: crate::codec::varint::Error }, + + #[snafu(display("Failed to decode varint, err:{}", source))] + DecodeVarint { source: crate::codec::varint::Error }, + + #[snafu(display("Failed to decode key, err:{}", source))] + DecodeKey { source: common_types::bytes::Error }, + + #[snafu(display("Insufficient bytes to decode value.\nBacktrace:\n{}", backtrace))] + DecodeEmptyValue { backtrace: Backtrace }, + + #[snafu(display( + "Invalid flag, expect:{}, actual:{}.\nBacktrace:\n{}", + expect, + actual, + backtrace + ))] + InvalidKeyFlag { + expect: u8, + actual: u8, + backtrace: Backtrace, + }, + + #[snafu(display("Insufficient bytes to decode value, err:{}", source))] + DecodeValue { source: common_types::bytes::Error }, + + #[snafu(display("Try into usize error:{}.\nBacktrace:\n{}", source, backtrace))] + TryIntoUsize { + source: std::num::TryFromIntError, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to decode string, err:{}", source))] + DecodeString { source: common_types::string::Error }, + + #[snafu(display("Datum cannot be null.\nBacktrace:\n{}", backtrace))] + NullDatum { backtrace: Backtrace }, +} + +define_result!(Error); + +/// Mem compact encoder +pub struct MemCompactEncoder; + +/// Mem compact decoder +pub struct MemCompactDecoder; + +impl MemCompactDecoder { + /// Returns None if we need to return null datum, otherwise return the flag. + fn maybe_read_null(&self, buf: &mut B) -> Result> { + let actual = buf.read_u8().context(DecodeKey)?; + // If actual flag is null, need to check whether this datum is nullable. + if actual == consts::NULL_FLAG { + // The decoder need to return null datum. + return Ok(None); + } + + Ok(Some(actual)) + } + + #[inline] + fn ensure_flag(expect: u8, actual: u8) -> Result<()> { + // Actual flag is not null. + ensure!(expect == actual, InvalidKeyFlag { expect, actual }); + Ok(()) + } +} diff --git a/common_util/src/codec/compact/number.rs b/common_util/src/codec/compact/number.rs new file mode 100644 index 0000000000..56aa76504f --- /dev/null +++ b/common_util/src/codec/compact/number.rs @@ -0,0 +1,160 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Number format + +use common_types::bytes::{MemBuf, MemBufMut}; +use snafu::ResultExt; + +use crate::codec::{ + compact::{DecodeVarint, EncodeVarint, Error, MemCompactDecoder, MemCompactEncoder, Result}, + consts, varint, DecodeTo, Encoder, +}; + +impl Encoder for MemCompactEncoder { + type Error = Error; + + fn encode(&self, buf: &mut B, value: &i64) -> Result<()> { + varint::encode_varint(buf, *value).context(EncodeVarint)?; + Ok(()) + } + + fn estimate_encoded_size(&self, _value: &i64) -> usize { + consts::MAX_VARINT_BYTES + } +} + +impl DecodeTo for MemCompactDecoder { + type Error = Error; + + fn decode_to(&self, buf: &mut B, value: &mut i64) -> Result<()> { + *value = varint::decode_varint(buf).context(DecodeVarint)?; + Ok(()) + } +} + +impl Encoder for MemCompactEncoder { + type Error = Error; + + fn encode(&self, buf: &mut B, value: &u64) -> Result<()> { + varint::encode_uvarint(buf, *value).context(EncodeVarint)?; + Ok(()) + } + + fn estimate_encoded_size(&self, _value: &u64) -> usize { + consts::MAX_UVARINT_BYTES + } +} + +impl DecodeTo for MemCompactDecoder { + type Error = Error; + + fn decode_to(&self, buf: &mut B, value: &mut u64) -> Result<()> { + *value = varint::decode_uvarint(buf).context(DecodeVarint)?; + Ok(()) + } +} + +#[cfg(test)] +mod test { + use super::*; + + struct TestI64 { + data: i64, + estimate_encoded_size: usize, + } + #[test] + fn test_compact_i64_codec() { + let data = vec![ + TestI64 { + data: 1621324705, + estimate_encoded_size: 10, + }, + TestI64 { + data: 1621324705000, + estimate_encoded_size: 10, + }, + TestI64 { + data: 1521324705, + estimate_encoded_size: 10, + }, + TestI64 { + data: 1621324705123, + estimate_encoded_size: 10, + }, + TestI64 { + data: i64::MIN, + estimate_encoded_size: 10, + }, + TestI64 { + data: i64::MIN + 1, + estimate_encoded_size: 10, + }, + TestI64 { + data: 0, + estimate_encoded_size: 10, + }, + TestI64 { + data: i64::MAX, + estimate_encoded_size: 10, + }, + TestI64 { + data: (1 << 47) - 1, + estimate_encoded_size: 10, + }, + TestI64 { + data: -1 << 47, + estimate_encoded_size: 10, + }, + TestI64 { + data: (1 << 23) - 1, + estimate_encoded_size: 10, + }, + TestI64 { + data: -1 << 23, + estimate_encoded_size: 10, + }, + TestI64 { + data: (1 << 33) - 1, + estimate_encoded_size: 10, + }, + TestI64 { + data: -1 << 33, + estimate_encoded_size: 10, + }, + TestI64 { + data: (1 << 55) - 1, + estimate_encoded_size: 10, + }, + TestI64 { + data: -1 << 55, + estimate_encoded_size: 10, + }, + TestI64 { + data: 1, + estimate_encoded_size: 10, + }, + TestI64 { + data: -1, + estimate_encoded_size: 10, + }, + ]; + + let encoder = MemCompactEncoder; + let mut buf = vec![]; + for x in &data { + encoder.encode(&mut buf, &x.data).unwrap(); + assert_eq!( + x.estimate_encoded_size, + encoder.estimate_encoded_size(&x.data) + ); + } + + let decoder = MemCompactDecoder; + let mut buf = &buf[..]; + for x in &data { + let mut d = -1; + decoder.decode_to(&mut buf, &mut d).unwrap(); + assert_eq!(d, x.data); + } + } +} diff --git a/common_util/src/codec/consts.rs b/common_util/src/codec/consts.rs new file mode 100644 index 0000000000..843985eec6 --- /dev/null +++ b/common_util/src/codec/consts.rs @@ -0,0 +1,21 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Common constants used in codec + +// First byte in the encoded value which specifies the encoding type. +// TODO(yingwen): Replace flags by datum kind. (Incompatible with old format). 
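+// The compact codec uses COMPACT_BYTES/VARINT/UVARINT/FLOAT, the memcomparable
+// codec uses BYTES/INT/UINT; both share NULL_FLAG. Values 6 and 7 are unused.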
+pub const NULL_FLAG: u8 = 0; +pub const BYTES_FLAG: u8 = 1; +pub const COMPACT_BYTES_FLAG: u8 = 2; +pub const INT_FLAG: u8 = 3; +pub const UINT_FLAG: u8 = 4; +pub const FLOAT_FLAG: u8 = 5; +pub const VARINT_FLAG: u8 = 8; +pub const UVARINT_FLAG: u8 = 9; + +/// Max bytes varint can use +pub const MAX_VARINT_BYTES: usize = 10; +/// Max bytes uvarint can be use +pub const MAX_UVARINT_BYTES: usize = 10; +/// Sign mask for u64/i64 conversion +pub const SIGN_MASK: u64 = 0x8000000000000000; diff --git a/common_util/src/codec/memcomparable/bytes.rs b/common_util/src/codec/memcomparable/bytes.rs new file mode 100644 index 0000000000..878ad9c051 --- /dev/null +++ b/common_util/src/codec/memcomparable/bytes.rs @@ -0,0 +1,279 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Bytes format + +use common_types::bytes::{Bytes, BytesMut, MemBuf, MemBufMut}; +use snafu::{ensure, ResultExt}; + +use crate::codec::{ + memcomparable::{ + DecodeValueGroup, DecodeValueMarker, DecodeValuePadding, EncodeValue, Error, MemComparable, + Result, + }, + DecodeTo, Encoder, +}; + +const ENC_GROUP_SIZE: usize = 8; +const ENC_MARKER: u8 = 0xFF; +const ENC_PAD: u8 = 0x0; +const PADS: [u8; ENC_GROUP_SIZE] = [0; ENC_GROUP_SIZE]; + +impl Encoder<[u8]> for MemComparable { + type Error = Error; + + // encode Bytes guarantees the encoded value is in ascending order for + // comparison, encoding with the following rule: + // [group1][marker1]...[groupN][markerN] + // group is 8 bytes slice which is padding with 0. + // marker is `0xFF - padding 0 count` + // For example: + // + // ``` + // [] -> [0, 0, 0, 0, 0, 0, 0, 0, 247] + // [1, 2, 3] -> [1, 2, 3, 0, 0, 0, 0, 0, 250] + // [1, 2, 3, 0] -> [1, 2, 3, 0, 0, 0, 0, 0, 251] + // [1, 2, 3, 4, 5, 6, 7, 8] -> [1, 2, 3, 4, 5, 6, 7, 8, 255, 0, 0, 0, 0, 0, 0, 0, 0, 247] + // ``` + // + // Refer: https://github.com/facebook/mysql-5.6/wiki/MyRocks-record-format#memcomparable-format + fn encode(&self, buf: &mut B, value: &[u8]) -> Result<()> { + let value_len = value.len(); + for idx in (0..=value_len).step_by(ENC_GROUP_SIZE) { + let remain = value_len - idx; + let mut pad_count = 0; + if remain >= ENC_GROUP_SIZE { + buf.write_slice(&value[idx..idx + ENC_GROUP_SIZE]) + .context(EncodeValue)?; + } else { + pad_count = ENC_GROUP_SIZE - remain; + buf.write_slice(&value[idx..]).context(EncodeValue)?; + buf.write_slice(&PADS[..pad_count]).context(EncodeValue)?; + } + let marker = ENC_MARKER - pad_count as u8; + buf.write_u8(marker).context(EncodeValue)?; + } + Ok(()) + } + + // Allocate more space to avoid unnecessary slice growing. + // Assume that the byte slice size is about `(len(data) / encGroupSize + 1) * + // (encGroupSize + 1)` bytes, that is `(len(data) / 8 + 1) * 9` in our + // implement. + fn estimate_encoded_size(&self, value: &[u8]) -> usize { + (value.len() / ENC_GROUP_SIZE + 1) * (ENC_GROUP_SIZE + 1) + } +} + +impl Encoder for MemComparable { + type Error = Error; + + fn encode(&self, buf: &mut B, value: &Bytes) -> Result<()> { + self.encode(buf, &value[..]) + } + + fn estimate_encoded_size(&self, value: &Bytes) -> usize { + self.estimate_encoded_size(&value[..]) + } +} + +impl DecodeTo for MemComparable { + type Error = Error; + + // decode Bytes which is encoded by encode Bytes before, + // returns the leftover bytes and decoded value if no error. 
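+ // Each iteration reads one 9-byte chunk (group plus marker); the marker says
+ // how many trailing bytes of the group are padding, and the first group with a
+ // non-zero pad count terminates the value.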
+ fn decode_to(&self, buf: &mut B, value: &mut BytesMut) -> Result<()> { + loop { + let b = buf.remaining_slice(); + ensure!(b.len() > ENC_GROUP_SIZE, DecodeValueGroup); + + let group_bytes = &b[..ENC_GROUP_SIZE + 1]; + let group = &group_bytes[..ENC_GROUP_SIZE]; + let marker = group_bytes[ENC_GROUP_SIZE]; + let pad_count = usize::from(ENC_MARKER - marker); + ensure!( + pad_count <= ENC_GROUP_SIZE, + DecodeValueMarker { group_bytes } + ); + + let real_group_size = ENC_GROUP_SIZE - pad_count; + value + .write_slice(&group[..real_group_size]) + .context(EncodeValue)?; + + if pad_count != 0 { + // Check validity of padding bytes. + for v in &group[real_group_size..] { + ensure!(*v == ENC_PAD, DecodeValuePadding { group_bytes }); + } + buf.must_advance(ENC_GROUP_SIZE + 1); + + break; + } + buf.must_advance(ENC_GROUP_SIZE + 1); + } + Ok(()) + } +} + +#[cfg(test)] +mod test { + use core::cmp::Ordering; + + use super::*; + + struct BytesTest { + data: Bytes, + estimate_encoded_size: usize, + } + + #[test] + fn test_bytes_codec() { + let data = vec![ + BytesTest { + data: Bytes::from_static(b""), + estimate_encoded_size: 9, + }, + BytesTest { + data: Bytes::from_static(b"hello1"), + estimate_encoded_size: 9, + }, + BytesTest { + data: Bytes::from_static(b"hello2"), + estimate_encoded_size: 9, + }, + BytesTest { + data: Bytes::from_static(b"hello3"), + estimate_encoded_size: 9, + }, + BytesTest { + data: Bytes::from_static(&[0x00, 0x01]), + estimate_encoded_size: 9, + }, + BytesTest { + data: Bytes::from_static(&[0xff, 0xff]), + estimate_encoded_size: 9, + }, + BytesTest { + data: Bytes::from_static(&[0x01, 0x00]), + estimate_encoded_size: 9, + }, + BytesTest { + data: Bytes::from_static(b"abc"), + estimate_encoded_size: 9, + }, + BytesTest { + data: Bytes::from_static(b"hello world"), + estimate_encoded_size: 18, + }, + ]; + + let c = MemComparable; + let mut buf = vec![]; + for x in &data { + c.encode(&mut buf, &x.data).unwrap(); + assert_eq!(x.estimate_encoded_size, c.estimate_encoded_size(&x.data)); + } + + let mut buf = &buf[..]; + for x in &data { + let mut d = BytesMut::new(); + c.decode_to(&mut buf, &mut d).unwrap(); + assert_eq!(d, x.data); + } + } + + struct TbBytes { + arg1: Bytes, + arg2: Bytes, + ret: Ordering, + } + + #[test] + fn test_bytes_order() { + let data = vec![ + TbBytes { + arg1: Bytes::new(), + arg2: Bytes::from_static(&[0x00]), + ret: Ordering::Less, + }, + TbBytes { + arg1: Bytes::from_static(&[0x00]), + arg2: Bytes::from_static(&[0x00]), + ret: Ordering::Equal, + }, + TbBytes { + arg1: Bytes::from_static(&[0xFF]), + arg2: Bytes::from_static(&[0x00]), + ret: Ordering::Greater, + }, + TbBytes { + arg1: Bytes::from_static(&[0xFF]), + arg2: Bytes::from_static(&[0xFF, 0x00]), + ret: Ordering::Less, + }, + TbBytes { + arg1: Bytes::from_static(b"a"), + arg2: Bytes::from_static(b"b"), + ret: Ordering::Less, + }, + TbBytes { + arg1: Bytes::from_static(b"a"), + arg2: Bytes::from_static(&[0x00]), + ret: Ordering::Greater, + }, + TbBytes { + arg1: Bytes::from_static(&[0x00]), + arg2: Bytes::from_static(&[0x01]), + ret: Ordering::Less, + }, + TbBytes { + arg1: Bytes::from_static(&[0x00, 0x01]), + arg2: Bytes::from_static(&[0x00, 0x00]), + ret: Ordering::Greater, + }, + TbBytes { + arg1: Bytes::from_static(&[0x00, 0x00, 0x00]), + arg2: Bytes::from_static(&[0x00, 0x00]), + ret: Ordering::Greater, + }, + TbBytes { + arg1: Bytes::from_static(&[0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00]), + arg2: Bytes::from_static(&[0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00]), + ret: 
Ordering::Less, + }, + TbBytes { + arg1: Bytes::from_static(&[0x01, 0x02, 0x03, 0x00]), + arg2: Bytes::from_static(&[0x01, 0x02, 0x03]), + ret: Ordering::Greater, + }, + TbBytes { + arg1: Bytes::from_static(&[0x01, 0x03, 0x03, 0x04]), + arg2: Bytes::from_static(&[0x01, 0x03, 0x03, 0x05]), + ret: Ordering::Less, + }, + TbBytes { + arg1: Bytes::from_static(&[0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07]), + arg2: Bytes::from_static(&[0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08]), + ret: Ordering::Less, + }, + TbBytes { + arg1: Bytes::from_static(&[0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09]), + arg2: Bytes::from_static(&[0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08]), + ret: Ordering::Greater, + }, + TbBytes { + arg1: Bytes::from_static(&[0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x00]), + arg2: Bytes::from_static(&[0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08]), + ret: Ordering::Greater, + }, + ]; + let c = MemComparable; + for x in &data { + let mut buf1 = vec![]; + let mut buf2 = vec![]; + c.encode(&mut buf1, &x.arg1).unwrap(); + c.encode(&mut buf2, &x.arg2).unwrap(); + assert_eq!(x.ret, buf1.as_slice().cmp(buf2.as_slice())); + } + } +} diff --git a/common_util/src/codec/memcomparable/datum.rs b/common_util/src/codec/memcomparable/datum.rs new file mode 100644 index 0000000000..3af3d5f474 --- /dev/null +++ b/common_util/src/codec/memcomparable/datum.rs @@ -0,0 +1,290 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Datum comparable codec + +use common_types::{ + bytes::{BytesMut, MemBuf, MemBufMut}, + datum::{Datum, DatumKind}, + string::StringBytes, + time::Timestamp, +}; +use snafu::ResultExt; + +use crate::codec::{ + consts, + memcomparable::{EncodeKey, Error, MemComparable, Result, UnsupportedKind}, + DecodeTo, Encoder, +}; + +// TODO(yingwen): Consider collate for string. +impl Encoder for MemComparable { + type Error = Error; + + fn encode(&self, buf: &mut B, value: &Datum) -> Result<()> { + match value { + Datum::Null => buf.write_u8(consts::NULL_FLAG).context(EncodeKey), + Datum::Timestamp(ts) => { + buf.write_u8(consts::INT_FLAG).context(EncodeKey)?; + self.encode(buf, &ts.as_i64()) + } + Datum::Varbinary(v) => { + buf.write_u8(consts::BYTES_FLAG).context(EncodeKey)?; + self.encode(buf, v) + } + // For string, we just use same encoding method as bytes now. 
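+ // Because UTF-8 byte order matches code point order, the encoded keys still
+ // sort strings in their natural code point order.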
+ Datum::String(v) => { + buf.write_u8(consts::BYTES_FLAG).context(EncodeKey)?; + self.encode(buf, v.as_bytes()) + } + Datum::UInt64(v) => { + buf.write_u8(consts::UINT_FLAG).context(EncodeKey)?; + self.encode(buf, v) + } + Datum::UInt32(v) => { + buf.write_u8(consts::UINT_FLAG).context(EncodeKey)?; + self.encode(buf, &(u64::from(*v))) + } + Datum::UInt16(v) => { + buf.write_u8(consts::UINT_FLAG).context(EncodeKey)?; + self.encode(buf, &(u64::from(*v))) + } + Datum::UInt8(v) => { + buf.write_u8(consts::UINT_FLAG).context(EncodeKey)?; + self.encode(buf, &(u64::from(*v))) + } + Datum::Int64(v) => { + buf.write_u8(consts::INT_FLAG).context(EncodeKey)?; + self.encode(buf, v) + } + Datum::Int32(v) => { + buf.write_u8(consts::INT_FLAG).context(EncodeKey)?; + self.encode(buf, &(i64::from(*v))) + } + Datum::Int16(v) => { + buf.write_u8(consts::INT_FLAG).context(EncodeKey)?; + self.encode(buf, &(i64::from(*v))) + } + Datum::Int8(v) => { + buf.write_u8(consts::INT_FLAG).context(EncodeKey)?; + self.encode(buf, &(i64::from(*v))) + } + Datum::Boolean(v) => { + buf.write_u8(consts::UINT_FLAG).context(EncodeKey)?; + self.encode(buf, &(u64::from(*v))) + } + Datum::Double(_) => UnsupportedKind { + kind: DatumKind::Double, + } + .fail(), + Datum::Float(_) => UnsupportedKind { + kind: DatumKind::Float, + } + .fail(), + } + } + + fn estimate_encoded_size(&self, value: &Datum) -> usize { + match value { + // Null takes 1 byte + Datum::Null => 1, + Datum::Timestamp(ts) => self.estimate_encoded_size(&ts.as_i64()), + Datum::Varbinary(v) => self.estimate_encoded_size(v), + Datum::String(v) => self.estimate_encoded_size(v.as_bytes()), + Datum::UInt64(v) => self.estimate_encoded_size(v), + Datum::UInt32(v) => self.estimate_encoded_size(&(u64::from(*v))), + Datum::UInt16(v) => self.estimate_encoded_size(&(u64::from(*v))), + Datum::UInt8(v) => self.estimate_encoded_size(&(u64::from(*v))), + Datum::Int64(v) => self.estimate_encoded_size(v), + Datum::Int32(v) => self.estimate_encoded_size(&(i64::from(*v))), + Datum::Int16(v) => self.estimate_encoded_size(&(i64::from(*v))), + Datum::Int8(v) => self.estimate_encoded_size(&(i64::from(*v))), + Datum::Boolean(v) => self.estimate_encoded_size(&(u64::from(*v))), + // Unsupported kind, but we return 1 + Datum::Double(_) | Datum::Float(_) => 1, + } + } +} + +macro_rules! decode_u64_into { + ($self: ident, $v: ident, $buf: ident, $type: ty) => {{ + Self::ensure_flag($buf, consts::UINT_FLAG)?; + let mut data = 0u64; + $self.decode_to($buf, &mut data)?; + *$v = data as $type; + }}; +} + +macro_rules! decode_u64_into_bool { + ($self: ident, $v: ident, $buf: ident) => {{ + Self::ensure_flag($buf, consts::UINT_FLAG)?; + let mut data = 0u64; + $self.decode_to($buf, &mut data)?; + *$v = data != 0; + }}; +} + +macro_rules! decode_i64_into { + ($self: ident, $v: ident, $buf: ident, $type: ty) => {{ + Self::ensure_flag($buf, consts::INT_FLAG)?; + let mut data = 0i64; + $self.decode_to($buf, &mut data)?; + *$v = data as $type; + }}; +} + +impl DecodeTo for MemComparable { + type Error = Error; + + /// REQUIRE: The datum type should match the type in buf + /// + /// For string datum, the utf8 check will be skipped. 
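+ ///
+ /// Unlike the compact decoder there is no null handling here: a NULL_FLAG in
+ /// the buffer only matches a `Datum::Null` target; any other combination
+ /// yields an `InvalidKeyFlag` error.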
+ fn decode_to(&self, buf: &mut B, value: &mut Datum) -> Result<()> { + match value { + Datum::Null => { + Self::ensure_flag(buf, consts::NULL_FLAG)?; + } + Datum::Timestamp(ts) => { + Self::ensure_flag(buf, consts::INT_FLAG)?; + let mut data = 0; + self.decode_to(buf, &mut data)?; + *ts = Timestamp::new(data); + } + Datum::Varbinary(v) => { + Self::ensure_flag(buf, consts::BYTES_FLAG)?; + let mut data = BytesMut::new(); + self.decode_to(buf, &mut data)?; + *v = data.freeze(); + } + Datum::String(v) => { + Self::ensure_flag(buf, consts::BYTES_FLAG)?; + let mut data = BytesMut::new(); + self.decode_to(buf, &mut data)?; + // For string datum, we won't validate whether the bytes is a valid utf string + // during decoding to improve decode performance. The encoder + // should already done the utf8 check. + unsafe { + *v = StringBytes::from_bytes_unchecked(data.freeze()); + } + } + Datum::UInt64(v) => { + Self::ensure_flag(buf, consts::UINT_FLAG)?; + self.decode_to(buf, v)?; + } + Datum::UInt32(v) => decode_u64_into!(self, v, buf, u32), + Datum::UInt16(v) => decode_u64_into!(self, v, buf, u16), + Datum::UInt8(v) => decode_u64_into!(self, v, buf, u8), + Datum::Int64(v) => { + Self::ensure_flag(buf, consts::INT_FLAG)?; + self.decode_to(buf, v)?; + } + Datum::Int32(v) => decode_i64_into!(self, v, buf, i32), + Datum::Int16(v) => decode_i64_into!(self, v, buf, i16), + Datum::Int8(v) => decode_i64_into!(self, v, buf, i8), + Datum::Boolean(v) => decode_u64_into_bool!(self, v, buf), + Datum::Double(_) => { + return UnsupportedKind { + kind: DatumKind::Double, + } + .fail(); + } + Datum::Float(_) => { + return UnsupportedKind { + kind: DatumKind::Float, + } + .fail(); + } + } + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use core::cmp::Ordering; + + use common_types::bytes::Bytes; + + use super::*; + + #[test] + fn test_datum_codec() { + let data = vec![ + // (datum to encode, estimate_encoded_size) + (Datum::Null, 1), + (Datum::Timestamp(Timestamp::new(12345)), 9), + (Datum::Varbinary(Bytes::from_static(b"hello world")), 18), + (Datum::String(StringBytes::from_static("hello world")), 18), + (Datum::UInt64(100209), 9), + (Datum::UInt32(10020), 9), + (Datum::UInt16(65000), 9), + (Datum::UInt8(150), 9), + (Datum::Int64(-100209), 9), + (Datum::Int32(-10020), 9), + (Datum::Int16(32500), 9), + (Datum::Int8(-120), 9), + (Datum::Boolean(true), 9), + (Datum::Boolean(false), 9), + ]; + let mut decoded = vec![ + Datum::Null, + Datum::Timestamp(Timestamp::new(0)), + Datum::Varbinary(Bytes::new()), + Datum::String(StringBytes::new()), + Datum::UInt64(0), + Datum::UInt32(0), + Datum::UInt16(0), + Datum::UInt8(0), + Datum::Int64(0), + Datum::Int32(0), + Datum::Int16(0), + Datum::Int8(0), + Datum::Boolean(false), + Datum::Boolean(false), + ]; + let c = MemComparable; + for (index, x) in data.iter().enumerate() { + let mut buf = vec![]; + c.encode(&mut buf, &x.0).unwrap(); + assert_eq!(x.1, c.estimate_encoded_size(&x.0)); + c.decode_to(&mut buf.as_slice(), &mut decoded[index]) + .unwrap(); + assert_eq!(decoded[index], data[index].0); + } + } + + #[test] + fn test_datum_order() { + let data = vec![ + // (arg1, arg2, cmp order of arg1 and arg2) + (Datum::Null, Datum::Null, Ordering::Equal), + ( + Datum::Timestamp(Timestamp::new(12345)), + Datum::Timestamp(Timestamp::new(123456)), + Ordering::Less, + ), + ( + Datum::Varbinary(Bytes::from_static(&[ + 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + ])), + Datum::Varbinary(Bytes::from_static(&[ + 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, + ])), + Ordering::Less, + ), + ( 
+ Datum::String(StringBytes::from_static("abce123")), + Datum::String(StringBytes::from_static("abce1234")), + Ordering::Less, + ), + (Datum::UInt64(888), Datum::UInt64(889), Ordering::Less), + ]; + let c = MemComparable; + for x in &data { + let mut buf1 = vec![]; + let mut buf2 = vec![]; + c.encode(&mut buf1, &x.0).unwrap(); + c.encode(&mut buf2, &x.1).unwrap(); + assert_eq!(x.2, buf1.as_slice().cmp(buf2.as_slice())); + } + } +} diff --git a/common_util/src/codec/memcomparable/mod.rs b/common_util/src/codec/memcomparable/mod.rs new file mode 100644 index 0000000000..1321fffdab --- /dev/null +++ b/common_util/src/codec/memcomparable/mod.rs @@ -0,0 +1,98 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Mem comparable format codec + +// Implementation reference: +// https://github.com/pingcap/tidb/blob/bd011d3c9567c506d8d4343ade03edf77fcd5b56/util/codec/codec.go + +mod bytes; +mod datum; +mod number; + +use common_types::{ + bytes::{BytesMut, MemBuf}, + datum::DatumKind, +}; +use snafu::{ensure, Backtrace, ResultExt, Snafu}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to encode flag, err:{}", source))] + EncodeKey { source: common_types::bytes::Error }, + + #[snafu(display("Failed to encode value, err:{}", source))] + EncodeValue { source: common_types::bytes::Error }, + + #[snafu(display("Failed to decode key, err:{}", source))] + DecodeKey { source: common_types::bytes::Error }, + + #[snafu(display( + "Invalid flag, expect:{}, actual:{}.\nBacktrace:\n{}", + expect, + actual, + backtrace + ))] + InvalidKeyFlag { + expect: u8, + actual: u8, + backtrace: Backtrace, + }, + + #[snafu(display( + "Unsupported datum kind to compare in mem, kind :{}.\nBacktrace:\n{}", + kind, + backtrace + ))] + UnsupportedKind { + kind: DatumKind, + backtrace: Backtrace, + }, + + #[snafu(display("Insufficient bytes to decode value, err:{}", source))] + DecodeValue { source: common_types::bytes::Error }, + + #[snafu(display("Insufficient bytes to decode value group.\nBacktrace:\n{}", backtrace))] + DecodeValueGroup { backtrace: Backtrace }, + + #[snafu(display( + "Invalid marker byte, group bytes: {:?}.\nBacktrace:\n{}", + group_bytes, + backtrace + ))] + DecodeValueMarker { + group_bytes: BytesMut, + backtrace: Backtrace, + }, + + #[snafu(display( + "Invalid padding byte, group bytes: {:?}.\nBacktrace:\n{}", + group_bytes, + backtrace + ))] + DecodeValuePadding { + group_bytes: BytesMut, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to decode string, err:{}", source))] + DecodeString { source: common_types::string::Error }, +} + +define_result!(Error); + +/// Mem comparable codec +pub struct MemComparable; + +impl MemComparable { + fn ensure_flag(buf: &mut B, flag: u8) -> Result<()> { + let actual = buf.read_u8().context(DecodeKey)?; + ensure!( + flag == actual, + InvalidKeyFlag { + expect: flag, + actual + } + ); + Ok(()) + } +} diff --git a/common_util/src/codec/memcomparable/number.rs b/common_util/src/codec/memcomparable/number.rs new file mode 100644 index 0000000000..70cb36b03d --- /dev/null +++ b/common_util/src/codec/memcomparable/number.rs @@ -0,0 +1,333 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Number format + +use common_types::bytes::{MemBuf, MemBufMut}; +use snafu::ResultExt; + +use crate::codec::{ + consts, + memcomparable::{DecodeValue, EncodeValue, Error, MemComparable, Result}, + DecodeTo, Encoder, +}; + +impl Encoder for MemComparable { + type Error = Error; + + fn encode(&self, buf: &mut B, value: &i64) -> Result<()> { + buf.write_u64(encode_int_to_cmp_uint(*value)) + .context(EncodeValue)?; + Ok(()) + } + + fn estimate_encoded_size(&self, _value: &i64) -> usize { + // flag + u64 + 9 + } +} + +impl DecodeTo for MemComparable { + type Error = Error; + + fn decode_to(&self, buf: &mut B, value: &mut i64) -> Result<()> { + *value = decode_cmp_uint_to_int(buf.read_u64().context(DecodeValue)?); + Ok(()) + } +} + +// encode_int_to_cmp_uint make int v to comparable uint type +fn encode_int_to_cmp_uint(v: i64) -> u64 { + (v as u64) ^ consts::SIGN_MASK +} + +// decode_cmp_uint_to_int decodes the u that encoded by encode_int_to_cmp_uint +fn decode_cmp_uint_to_int(u: u64) -> i64 { + (u ^ consts::SIGN_MASK) as i64 +} + +impl Encoder for MemComparable { + type Error = Error; + + fn encode(&self, buf: &mut B, value: &u64) -> Result<()> { + buf.write_u64(*value).context(EncodeValue)?; + Ok(()) + } + + fn estimate_encoded_size(&self, _value: &u64) -> usize { + // flag + u64 + 9 + } +} + +impl DecodeTo for MemComparable { + type Error = Error; + + fn decode_to(&self, buf: &mut B, value: &mut u64) -> Result<()> { + *value = buf.read_u64().context(DecodeValue)?; + Ok(()) + } +} + +#[cfg(test)] +mod test { + use core::cmp::Ordering; + + use super::*; + + struct TestI64 { + data: i64, + estimate_encoded_size: usize, + } + + impl TestI64 { + fn new(data: i64) -> Self { + Self { + data, + estimate_encoded_size: 9, + } + } + } + + #[test] + fn test_i64_codec() { + let data = vec![ + TestI64::new(1621324705), + TestI64::new(1621324705000), + TestI64::new(1521324705), + TestI64::new(1621324705123), + TestI64::new(i64::MIN), + TestI64::new(i64::MIN + 1), + TestI64::new(0), + TestI64::new(i64::MAX), + TestI64::new((1 << 47) - 1), + TestI64::new(-1 << 47), + TestI64::new((1 << 23) - 1), + TestI64::new(-1 << 23), + TestI64::new((1 << 33) - 1), + TestI64::new(-1 << 33), + TestI64::new((1 << 55) - 1), + TestI64::new(-1 << 55), + TestI64::new(1), + TestI64::new(-1), + ]; + let c = MemComparable; + let mut buf = vec![]; + for x in &data { + c.encode(&mut buf, &x.data).unwrap(); + assert_eq!(x.estimate_encoded_size, c.estimate_encoded_size(&x.data)); + } + + let mut buf = &buf[..]; + for x in &data { + let mut d = -1; + c.decode_to(&mut buf, &mut d).unwrap(); + assert_eq!(d, x.data); + } + } + + struct TestU64 { + data: u64, + estimate_encoded_size: usize, + } + + impl TestU64 { + fn new(data: u64) -> Self { + Self { + data, + estimate_encoded_size: 9, + } + } + } + + #[test] + fn test_u64_codec() { + let data = vec![ + TestU64::new(0), + TestU64::new(u64::from(u8::MAX)), + TestU64::new(u64::from(u16::MAX)), + TestU64::new(u64::from(u32::MAX)), + TestU64::new(u64::MAX), + TestU64::new((1 << 24) - 1), + TestU64::new((1 << 48) - 1), + TestU64::new((1 << 56) - 1), + TestU64::new(1), + TestU64::new(i8::MAX as u64), + TestU64::new(i16::MAX as u64), + TestU64::new(i32::MAX as u64), + TestU64::new(i64::MAX as u64), + ]; + let c = MemComparable; + let mut buf = vec![]; + for x in &data { + c.encode(&mut buf, &x.data).unwrap(); + assert_eq!(x.estimate_encoded_size, c.estimate_encoded_size(&x.data)); + } + + let mut buf = &buf[..]; + for x in &data { + let mut d = 0; + c.decode_to(&mut buf, &mut d).unwrap(); + 
assert_eq!(d, x.data); + } + } + + struct TblI64 { + arg1: i64, + arg2: i64, + ret: Ordering, + } + + #[test] + fn test_i64_order() { + let data = vec![ + TblI64 { + arg1: -1, + arg2: 1, + ret: Ordering::Less, + }, + TblI64 { + arg1: i64::MAX, + arg2: i64::MIN, + ret: Ordering::Greater, + }, + TblI64 { + arg1: i64::MAX, + arg2: i32::MAX as i64, + ret: Ordering::Greater, + }, + TblI64 { + arg1: i32::MIN as i64, + arg2: i16::MAX as i64, + ret: Ordering::Less, + }, + TblI64 { + arg1: i64::MIN, + arg2: i8::MAX as i64, + ret: Ordering::Less, + }, + TblI64 { + arg1: 0, + arg2: i8::MAX as i64, + ret: Ordering::Less, + }, + TblI64 { + arg1: i8::MIN as i64, + arg2: 0, + ret: Ordering::Less, + }, + TblI64 { + arg1: i16::MIN as i64, + arg2: i16::MAX as i64, + ret: Ordering::Less, + }, + TblI64 { + arg1: 1, + arg2: -1, + ret: Ordering::Greater, + }, + TblI64 { + arg1: 1, + arg2: 0, + ret: Ordering::Greater, + }, + TblI64 { + arg1: -1, + arg2: 0, + ret: Ordering::Less, + }, + TblI64 { + arg1: 0, + arg2: 0, + ret: Ordering::Equal, + }, + TblI64 { + arg1: i16::MAX as i64, + arg2: i16::MAX as i64, + ret: Ordering::Equal, + }, + ]; + let c = MemComparable; + for x in &data { + let mut buf1 = vec![]; + let mut buf2 = vec![]; + c.encode(&mut buf1, &x.arg1).unwrap(); + c.encode(&mut buf2, &x.arg2).unwrap(); + assert_eq!(x.ret, buf1.as_slice().cmp(buf2.as_slice())); + } + } + + struct TblU64 { + arg1: u64, + arg2: u64, + ret: Ordering, + } + + #[test] + fn test_u64_order() { + let data = vec![ + TblU64 { + arg1: 0, + arg2: 0, + ret: Ordering::Equal, + }, + TblU64 { + arg1: 1, + arg2: 0, + ret: Ordering::Greater, + }, + TblU64 { + arg1: 0, + arg2: 1, + ret: Ordering::Less, + }, + TblU64 { + arg1: i8::MAX as u64, + arg2: i16::MAX as u64, + ret: Ordering::Less, + }, + TblU64 { + arg1: u32::MAX as u64, + arg2: i32::MAX as u64, + ret: Ordering::Greater, + }, + TblU64 { + arg1: u8::MAX as u64, + arg2: i8::MAX as u64, + ret: Ordering::Greater, + }, + TblU64 { + arg1: u16::MAX as u64, + arg2: i32::MAX as u64, + ret: Ordering::Less, + }, + TblU64 { + arg1: u64::MAX as u64, + arg2: i64::MAX as u64, + ret: Ordering::Greater, + }, + TblU64 { + arg1: i64::MAX as u64, + arg2: u32::MAX as u64, + ret: Ordering::Greater, + }, + TblU64 { + arg1: u64::MAX, + arg2: 0, + ret: Ordering::Greater, + }, + TblU64 { + arg1: 0, + arg2: u64::MAX, + ret: Ordering::Less, + }, + ]; + let c = MemComparable; + for x in &data { + let mut buf1 = vec![]; + let mut buf2 = vec![]; + c.encode(&mut buf1, &x.arg1).unwrap(); + c.encode(&mut buf2, &x.arg2).unwrap(); + assert_eq!(x.ret, buf1.as_slice().cmp(buf2.as_slice())); + } + } +} diff --git a/common_util/src/codec/mod.rs b/common_util/src/codec/mod.rs new file mode 100644 index 0000000000..0a9825f355 --- /dev/null +++ b/common_util/src/codec/mod.rs @@ -0,0 +1,42 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Data encoding + +// TODO(yingwen): Buf use generic type to avoid cost of vtable call per +// encode/decode + +pub mod compact; +mod consts; +pub mod memcomparable; +pub mod row; +mod varint; + +use common_types::bytes::{MemBuf, MemBufMut}; + +// encoder/decoder +/// Data encode abstraction +pub trait Encoder { + type Error; + + /// Encode value into buf + fn encode(&self, buf: &mut B, value: &T) -> Result<(), Self::Error>; + + /// Estimate the value size after encoded + fn estimate_encoded_size(&self, value: &T) -> usize; +} + +/// Data decode to target +pub trait DecodeTo { + type Error; + + /// Decode from `buf` to `value` + fn decode_to(&self, buf: &mut B, value: &mut T) -> Result<(), Self::Error>; +} + +/// Data decode abstraction +pub trait Decoder { + type Error; + + /// Decode `value` from `buf` + fn decode(&self, buf: &mut B) -> Result; +} diff --git a/common_util/src/codec/row.rs b/common_util/src/codec/row.rs new file mode 100644 index 0000000000..54c1b8ccbe --- /dev/null +++ b/common_util/src/codec/row.rs @@ -0,0 +1,234 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Row encoding utils +//! +//! Notice: The encoding method is used both in wal and memtable. Be careful for +//! data compatibility + +use std::convert::TryFrom; + +use common_types::{ + bytes::{BufMut, ByteVec, BytesMut, MemBuf, MemBufMut}, + datum::Datum, + row::{Row, RowGroup}, + schema::{IndexInWriterSchema, Schema}, +}; +use snafu::{ResultExt, Snafu}; + +use crate::codec::{ + compact::{MemCompactDecoder, MemCompactEncoder}, + DecodeTo, Decoder, Encoder, +}; + +#[derive(Debug, Snafu)] +#[snafu(visibility(pub(crate)))] +pub enum Error { + #[snafu(display("Failed to encode row datum, err:{}", source))] + EncodeRowDatum { + source: crate::codec::compact::Error, + }, + + #[snafu(display("Failed to decode row datum, err:{}", source))] + DecodeRowDatum { + source: crate::codec::compact::Error, + }, +} + +define_result!(Error); + +/// Compact row encoder for wal. +struct WalRowEncoder<'a> { + /// Schema of table + table_schema: &'a Schema, + /// Index of table column in writer + index_in_writer: &'a IndexInWriterSchema, +} + +impl<'a> Encoder for WalRowEncoder<'a> { + type Error = Error; + + fn encode(&self, buf: &mut B, value: &Row) -> Result<()> { + let encoder = MemCompactEncoder; + for index_in_table in 0..self.table_schema.num_columns() { + match self.index_in_writer.column_index_in_writer(index_in_table) { + Some(writer_index) => { + // Column in writer + encoder + .encode(buf, &value[writer_index]) + .context(EncodeRowDatum)?; + } + None => { + // Column not in writer + encoder.encode(buf, &Datum::Null).context(EncodeRowDatum)?; + } + } + } + + Ok(()) + } + + fn estimate_encoded_size(&self, value: &Row) -> usize { + let encoder = MemCompactEncoder; + let mut total_len = 0; + for index_in_table in 0..self.table_schema.num_columns() { + match self.index_in_writer.column_index_in_writer(index_in_table) { + Some(writer_index) => { + // Column in writer + total_len += encoder.estimate_encoded_size(&value[writer_index]); + } + None => { + // Column not in writer + total_len += encoder.estimate_encoded_size(&Datum::Null); + } + } + } + + total_len + } +} + +/// Compact row decoder for wal, supports projection. +#[derive(Debug)] +pub struct WalRowDecoder<'a> { + /// Schema of row to decode + schema: &'a Schema, +} + +impl<'a> WalRowDecoder<'a> { + /// Create a decoder with given `schema`, the caller should ensure the + /// schema matches the row to be decoded. 
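+ ///
+ /// Rows written by `WalRowEncoder` always contain one datum per column of the
+ /// table schema (columns missing from the writer are encoded as null), so
+ /// decoding with the same table schema lines up column by column.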
+ pub fn new(schema: &'a Schema) -> Self { + Self { schema } + } +} + +impl<'a> Decoder for WalRowDecoder<'a> { + type Error = Error; + + fn decode(&self, buf: &mut B) -> Result { + let num_columns = self.schema.num_columns(); + let mut datums = Vec::with_capacity(num_columns); + + for idx in 0..num_columns { + let column_schema = &self.schema.column(idx); + let datum_kind = &column_schema.data_type; + let decoder = MemCompactDecoder; + + // Decode each column + let mut datum = Datum::empty(datum_kind); + decoder.decode_to(buf, &mut datum).context(DecodeRowDatum)?; + + datums.push(datum); + } + + Ok(Row::from_datums(datums)) + } +} + +/// Encode the row group in the format that can write to wal. +/// +/// Arguments +/// - row_group: The rows to be encoded and wrote to. +/// - table_schema: The schema the row group need to be encoded into, the schema +/// of the row group need to be write compatible for the table schema. +/// - index_in_writer: The index mapping from table schema to column in the +/// schema of row group. +/// - encoded_rows: The Vec to store bytes of each encoded row. +pub fn encode_row_group_for_wal( + row_group: &RowGroup, + table_schema: &Schema, + index_in_writer: &IndexInWriterSchema, + encoded_rows: &mut Vec, +) -> Result<()> { + let row_encoder = WalRowEncoder { + table_schema, + index_in_writer, + }; + + // Use estimated size of first row to avoid compute all + let row_estimated_size = match row_group.get_row(0) { + Some(first_row) => row_encoder.estimate_encoded_size(first_row), + // The row group is empty + None => return Ok(()), + }; + + encoded_rows.reserve(row_group.num_rows()); + + // Each row is constructed in writer schema, we need to encode it in + // `table_schema` + for row in row_group { + let mut buf = Vec::with_capacity(row_estimated_size); + row_encoder.encode(&mut buf, row)?; + + encoded_rows.push(buf); + } + + Ok(()) +} + +/// Return the next prefix key +/// +/// Assume there are keys like: +/// +/// ```text +/// rowkey1 +/// rowkey1_column1 +/// rowkey1_column2 +/// rowKey2 +/// ``` +/// +/// If we seek 'rowkey1' Next, we will get 'rowkey1_column1'. +/// If we seek 'rowkey1' PrefixNext, we will get 'rowkey2'. 
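+///
+/// The last byte is incremented, carrying past any 0xFF bytes; if every byte
+/// is 0xFF the key cannot be incremented in place, so a 0x00 byte is appended
+/// to the original key instead.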
+/// +/// Ported from +/// +/// REQUIRE: The key should be memory comparable +// TODO(yingwen): Maybe add scratch param +// TODO(yingwen): Move to another mod +pub fn key_prefix_next(key: &[u8]) -> BytesMut { + let mut buf = BytesMut::from(key); + // isize should be enough to represent the key len + let mut idx = isize::try_from(key.len() - 1).unwrap(); + while idx >= 0 { + let i = idx as usize; + buf[i] += 1; + if buf[i] != 0 { + break; + } + + idx -= 1; + } + if idx == -1 { + buf.copy_from_slice(key); + buf.put_u8(0); + } + + buf +} +#[cfg(test)] +mod test { + use common_types::schema::IndexInWriterSchema; + + use crate::codec::{ + row::{WalRowDecoder, WalRowEncoder}, + Decoder, Encoder, + }; + + #[test] + fn test_wal_encode_decode() { + let schema = common_types::tests::build_schema(); + let rows = common_types::tests::build_rows(); + let index_in_writer = IndexInWriterSchema::for_same_schema(schema.num_columns()); + let wal_encoder = WalRowEncoder { + table_schema: &schema, + index_in_writer: &index_in_writer, + }; + let wal_decoder = WalRowDecoder::new(&schema); + for row in rows { + let mut buf = Vec::new(); + wal_encoder.encode(&mut buf, &row).unwrap(); + let row_decoded = wal_decoder.decode(&mut buf.as_slice()).unwrap(); + assert_eq!(row_decoded, row); + } + } +} diff --git a/common_util/src/codec/varint.rs b/common_util/src/codec/varint.rs new file mode 100644 index 0000000000..eb5616b692 --- /dev/null +++ b/common_util/src/codec/varint.rs @@ -0,0 +1,209 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Varint for codec whose test is covered by compact/number.rs +use common_types::bytes::{MemBuf, MemBufMut}; +use snafu::{Backtrace, ResultExt, Snafu}; + +#[derive(Debug, Snafu)] +#[snafu(visibility(pub(crate)))] +pub enum Error { + #[snafu(display("Failed to encode varint, err:{}", source))] + EncodeVarint { source: common_types::bytes::Error }, + + #[snafu(display("Insufficient bytes to decode value.\nBacktrace:\n{}", backtrace))] + DecodeEmptyValue { backtrace: Backtrace }, + + #[snafu(display("Insufficient bytes to decode value, err:{}", source))] + DecodeValue { source: common_types::bytes::Error }, + + #[snafu(display("Value larger than 64 bits (overflow).\nBacktrace:\n{}", backtrace))] + UvarintOverflow { backtrace: Backtrace }, +} + +define_result!(Error); + +// from https://golang.org/src/encoding/binary/varint.go?s=2506:2545#L68 +// PutVarint encodes an int64 into buf and returns the number of bytes written. +// If the buffer is too small, PutVarint will panic. +// +// ```go +// func PutVarint(buf []byte, x int64) int { +// ux := uint64(x) << 1 +// if x < 0 { +// ux = ^ux +// } +// return PutUvarint(buf, ux) +// } +// ``` +pub fn encode_varint(buf: &mut B, value: i64) -> Result<()> { + let mut x = (value as u64) << 1; + if value < 0 { + x = !x; + } + encode_uvarint(buf, x) +} + +// +// from https://golang.org/src/encoding/binary/varint.go?s=1611:1652#L31 +// +// ```go +// func PutUvarint(buf []byte, x uint64) int { +// i := 0 +// for x >= 0x80 { +// buf[i] = byte(x) | 0x80 +// x >>= 7 +// i++ +// } +// buf[i] = byte(x) +// return i + 1 +// } +// ``` +pub fn encode_uvarint(buf: &mut B, mut x: u64) -> Result<()> { + while x >= 0x80 { + buf.write_u8(x as u8 | 0x80).context(EncodeVarint)?; + x >>= 7; + } + buf.write_u8(x as u8).context(EncodeVarint)?; + Ok(()) +} + +// from https://golang.org/src/encoding/binary/varint.go?s=2955:2991#L84 +// Varint decodes an int64 from buf and returns that value and the +// number of bytes read (> 0). 
If an error occurred, the value is 0 +// and the number of bytes n is <= 0 with the following meaning: +// +// n == 0: buf too small +// n < 0: value larger than 64 bits (overflow) +// and -n is the number of bytes read +// +// ```go +// func Varint(buf []byte) (int64, int) { +// ux, n := Uvarint(buf) // ok to continue in presence of error +// x := int64(ux >> 1) +// if ux&1 != 0 { +// x = ^x +// } +// return x, n +// } +// ``` +pub fn decode_varint(buf: &mut B) -> Result { + let ux = decode_uvarint(buf)?; + let mut x = (ux >> 1) as i64; + if ux & 1 != 0 { + x = !x; + } + Ok(x) +} + +// from https://golang.org/src/encoding/binary/varint.go?s=2070:2108#L50 +// Uvarint decodes a uint64 from buf and returns that value and the +// number of bytes read (> 0). If an error occurred, the value is 0 +// and the number of bytes n is <= 0 meaning: +// +// n == 0: buf too small +// n < 0: value larger than 64 bits (overflow) +// and -n is the number of bytes read +// +// ```go +// func Uvarint(buf []byte) (uint64, int) { +// var x uint64 +// var s uint +// for i, b := range buf { +// if b < 0x80 { +// if i > 9 || i == 9 && b > 1 { +// return 0, -(i + 1) // overflow +// } +// return x | uint64(b)<(buf: &mut B) -> Result { + let mut x: u64 = 0; + let mut s: usize = 0; + let len = buf.remaining_slice().len(); + for i in 0..len { + let b = buf.read_u8().context(DecodeValue)?; + if b < 0x80 { + if i > 9 || i == 9 && b > 1 { + return UvarintOverflow.fail(); // overflow + } + return Ok(x | u64::from(b) << s); + } + x |= u64::from(b & 0x7f) << s; + s += 7; + } + DecodeEmptyValue.fail() +} + +#[cfg(test)] +mod tests { + use common_types::bytes::BytesMut; + + use super::*; + + #[test] + fn test_encode_decode_varint() { + let nums: Vec<(i64, usize)> = vec![ + (i64::MIN, 10), + (-1000000000000000, 8), + (-100000000000, 6), + (-1000000000, 5), + (-100000, 3), + (-65535, 3), + (-1000, 2), + (-125, 2), + (-32, 1), + (0, 1), + (64, 2), + (125, 2), + (1000, 2), + (65535, 3), + (10000, 3), + (1000000000, 5), + (100000000000, 6), + (10000000000000, 7), + (1000000000000000, 8), + (i64::MAX, 10), + ]; + + for (i, size) in nums { + let mut buf = BytesMut::with_capacity(8); + assert!(encode_varint(&mut buf, i).is_ok()); + assert_eq!(size, buf.len()); + let d = decode_varint(&mut buf); + assert!(d.is_ok()); + assert_eq!(i, d.unwrap()); + } + } + + #[test] + fn test_encode_decode_uvarint() { + let nums: Vec<(u64, usize)> = vec![ + (0, 1), + (64, 1), + (125, 1), + (1000, 2), + (65535, 3), + (10000, 2), + (1000000000, 5), + (100000000000, 6), + (10000000000000, 7), + (1000000000000000, 8), + (u64::MAX, 10), + ]; + + for (i, size) in nums { + let mut buf = BytesMut::with_capacity(8); + assert!(encode_uvarint(&mut buf, i).is_ok()); + assert_eq!(size, buf.len()); + let d = decode_uvarint(&mut buf); + assert!(d.is_ok()); + assert_eq!(i, d.unwrap()); + } + } +} diff --git a/common_util/src/config.rs b/common_util/src/config.rs new file mode 100644 index 0000000000..ac7232767f --- /dev/null +++ b/common_util/src/config.rs @@ -0,0 +1,711 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +// Copyright 2016 TiKV Project Authors. Licensed under Apache-2.0. + +//! Configure utils + +//This module is forked from tikv and remove unnessary code. 
+//https://github.com/tikv/tikv/blob/HEAD/src/util/config.rs +use std::{ + fmt::{self, Write}, + ops::{Add, AddAssign, Div, DivAssign, Mul, MulAssign, Sub, SubAssign}, + str::{self, FromStr}, + time::Duration, +}; + +use proto::analytic_common; +use serde::{ + de::{self, Unexpected, Visitor}, + Deserialize, Deserializer, Serialize, Serializer, +}; + +const UNIT: u64 = 1; + +const BINARY_DATA_MAGNITUDE: u64 = 1024; +pub const B: u64 = UNIT; +pub const KIB: u64 = UNIT * BINARY_DATA_MAGNITUDE; +pub const MIB: u64 = KIB * BINARY_DATA_MAGNITUDE; +pub const GIB: u64 = MIB * BINARY_DATA_MAGNITUDE; +pub const TIB: u64 = GIB * BINARY_DATA_MAGNITUDE; +pub const PIB: u64 = TIB * BINARY_DATA_MAGNITUDE; + +const TIME_MAGNITUDE_1: u64 = 1000; +const TIME_MAGNITUDE_2: u64 = 60; +const TIME_MAGNITUDE_3: u64 = 24; +const MS: u64 = UNIT; +const SECOND: u64 = MS * TIME_MAGNITUDE_1; +const MINUTE: u64 = SECOND * TIME_MAGNITUDE_2; +const HOUR: u64 = MINUTE * TIME_MAGNITUDE_2; +const DAY: u64 = HOUR * TIME_MAGNITUDE_3; + +/// Convert Duration to milliseconds. +/// +/// Panic if overflow. Mainly used by `ReadableDuration`. +#[inline] +fn duration_to_ms(d: Duration) -> u64 { + let nanos = u64::from(d.subsec_nanos()); + // Most of case, we can't have so large Duration, so here just panic if overflow + // now. + d.as_secs() * 1_000 + (nanos / 1_000_000) +} + +#[derive(Clone, Debug, Copy, PartialEq, PartialOrd, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum TimeUnit { + Nanoseconds, + Microseconds, + Milliseconds, + Seconds, + Minutes, + Hours, + Days, +} + +impl From for analytic_common::TimeUnit { + fn from(unit: TimeUnit) -> Self { + match unit { + TimeUnit::Nanoseconds => analytic_common::TimeUnit::NANOSECONDS, + TimeUnit::Microseconds => analytic_common::TimeUnit::MICROSECONDS, + TimeUnit::Milliseconds => analytic_common::TimeUnit::MILLISECONDS, + TimeUnit::Seconds => analytic_common::TimeUnit::SECONDS, + TimeUnit::Minutes => analytic_common::TimeUnit::MINUTES, + TimeUnit::Hours => analytic_common::TimeUnit::HOURS, + TimeUnit::Days => analytic_common::TimeUnit::DAYS, + } + } +} + +impl From for TimeUnit { + fn from(unit: analytic_common::TimeUnit) -> Self { + match unit { + analytic_common::TimeUnit::NANOSECONDS => TimeUnit::Nanoseconds, + analytic_common::TimeUnit::MICROSECONDS => TimeUnit::Microseconds, + analytic_common::TimeUnit::MILLISECONDS => TimeUnit::Milliseconds, + analytic_common::TimeUnit::SECONDS => TimeUnit::Seconds, + analytic_common::TimeUnit::MINUTES => TimeUnit::Minutes, + analytic_common::TimeUnit::HOURS => TimeUnit::Hours, + analytic_common::TimeUnit::DAYS => TimeUnit::Days, + } + } +} + +impl FromStr for TimeUnit { + type Err = String; + + fn from_str(tu_str: &str) -> Result { + let tu_str = tu_str.trim(); + if !tu_str.is_ascii() { + return Err(format!("unexpect ascii string: {}", tu_str)); + } + + match tu_str.to_lowercase().as_str() { + "nanoseconds" => Ok(TimeUnit::Nanoseconds), + "microseconds" => Ok(TimeUnit::Microseconds), + "milliseconds" => Ok(TimeUnit::Milliseconds), + "seconds" => Ok(TimeUnit::Seconds), + "minutes" => Ok(TimeUnit::Minutes), + "hours" => Ok(TimeUnit::Hours), + "days" => Ok(TimeUnit::Days), + _ => Err(format!("unexpect TimeUnit: {}", tu_str)), + } + } +} + +impl fmt::Display for TimeUnit { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let s = match self { + TimeUnit::Nanoseconds => "nanoseconds", + TimeUnit::Microseconds => "microseconds", + TimeUnit::Milliseconds => "milliseconds", + TimeUnit::Seconds => "seconds", + 
TimeUnit::Minutes => "minutes", + TimeUnit::Hours => "hours", + TimeUnit::Days => "days", + }; + write!(f, "{}", s) + } +} + +#[derive(Clone, Debug, Copy, PartialEq, PartialOrd)] +pub struct ReadableSize(pub u64); + +impl ReadableSize { + pub const fn kb(count: u64) -> ReadableSize { + ReadableSize(count * KIB) + } + + pub const fn mb(count: u64) -> ReadableSize { + ReadableSize(count * MIB) + } + + pub const fn gb(count: u64) -> ReadableSize { + ReadableSize(count * GIB) + } + + pub const fn as_mb(self) -> u64 { + self.0 / MIB + } + + pub const fn as_bytes(self) -> u64 { + self.0 + } +} + +impl Div for ReadableSize { + type Output = ReadableSize; + + fn div(self, rhs: u64) -> ReadableSize { + ReadableSize(self.0 / rhs) + } +} + +impl Div for ReadableSize { + type Output = u64; + + fn div(self, rhs: ReadableSize) -> u64 { + self.0 / rhs.0 + } +} + +impl Mul for ReadableSize { + type Output = ReadableSize; + + fn mul(self, rhs: u64) -> ReadableSize { + ReadableSize(self.0 * rhs) + } +} + +impl Serialize for ReadableSize { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + let size = self.0; + let mut buffer = String::new(); + if size == 0 { + write!(buffer, "{}KiB", size).unwrap(); + } else if size % PIB == 0 { + write!(buffer, "{}PiB", size / PIB).unwrap(); + } else if size % TIB == 0 { + write!(buffer, "{}TiB", size / TIB).unwrap(); + } else if size % GIB as u64 == 0 { + write!(buffer, "{}GiB", size / GIB).unwrap(); + } else if size % MIB as u64 == 0 { + write!(buffer, "{}MiB", size / MIB).unwrap(); + } else if size % KIB as u64 == 0 { + write!(buffer, "{}KiB", size / KIB).unwrap(); + } else { + return serializer.serialize_u64(size); + } + serializer.serialize_str(&buffer) + } +} + +impl FromStr for ReadableSize { + type Err = String; + + // This method parses value in binary unit. + fn from_str(s: &str) -> Result { + let size_str = s.trim(); + if size_str.is_empty() { + return Err(format!("{:?} is not a valid size.", s)); + } + + if !size_str.is_ascii() { + return Err(format!("ASCII string is expected, but got {:?}", s)); + } + + // size: digits and '.' 
as decimal separator + let size_len = size_str + .to_string() + .chars() + .take_while(|c| char::is_ascii_digit(c) || ['.', 'e', 'E', '-', '+'].contains(c)) + .count(); + + // unit: alphabetic characters + let (size, unit) = size_str.split_at(size_len); + + let unit = match unit.trim() { + "K" | "KB" | "KiB" => KIB, + "M" | "MB" | "MiB" => MIB, + "G" | "GB" | "GiB" => GIB, + "T" | "TB" | "TiB" => TIB, + "P" | "PB" | "PiB" => PIB, + "B" | "" => UNIT, + _ => { + return Err(format!( + "only B, KB, KiB, MB, MiB, GB, GiB, TB, TiB, PB, and PiB are supported: {:?}", + s + )); + } + }; + + match size.parse::() { + Ok(n) => Ok(ReadableSize((n * unit as f64) as u64)), + Err(_) => Err(format!("invalid size string: {:?}", s)), + } + } +} + +impl<'de> Deserialize<'de> for ReadableSize { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + struct SizeVisitor; + + impl<'de> Visitor<'de> for SizeVisitor { + type Value = ReadableSize; + + fn expecting(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { + formatter.write_str("valid size") + } + + fn visit_i64(self, size: i64) -> Result + where + E: de::Error, + { + if size >= 0 { + self.visit_u64(size as u64) + } else { + Err(E::invalid_value(Unexpected::Signed(size), &self)) + } + } + + fn visit_u64(self, size: u64) -> Result + where + E: de::Error, + { + Ok(ReadableSize(size)) + } + + fn visit_str(self, size_str: &str) -> Result + where + E: de::Error, + { + size_str.parse().map_err(E::custom) + } + } + + deserializer.deserialize_any(SizeVisitor) + } +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, Ord, PartialOrd)] +pub struct ReadableDuration(pub Duration); + +impl Add for ReadableDuration { + type Output = ReadableDuration; + + fn add(self, rhs: ReadableDuration) -> ReadableDuration { + Self(self.0 + rhs.0) + } +} + +impl AddAssign for ReadableDuration { + fn add_assign(&mut self, rhs: ReadableDuration) { + *self = *self + rhs; + } +} + +impl Sub for ReadableDuration { + type Output = ReadableDuration; + + fn sub(self, rhs: ReadableDuration) -> ReadableDuration { + Self(self.0 - rhs.0) + } +} + +impl SubAssign for ReadableDuration { + fn sub_assign(&mut self, rhs: ReadableDuration) { + *self = *self - rhs; + } +} + +impl Mul for ReadableDuration { + type Output = ReadableDuration; + + fn mul(self, rhs: u32) -> Self::Output { + Self(self.0 * rhs) + } +} + +impl MulAssign for ReadableDuration { + fn mul_assign(&mut self, rhs: u32) { + *self = *self * rhs; + } +} + +impl Div for ReadableDuration { + type Output = ReadableDuration; + + fn div(self, rhs: u32) -> ReadableDuration { + Self(self.0 / rhs) + } +} + +impl DivAssign for ReadableDuration { + fn div_assign(&mut self, rhs: u32) { + *self = *self / rhs; + } +} + +impl From for Duration { + fn from(readable: ReadableDuration) -> Duration { + readable.0 + } +} + +// yingwen: Support From. 
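+// A quick round-trip sketch for these conversions together with `Display`
+// (an illustrative comment only, not from the original code):
+//
+//   let d: ReadableDuration = Duration::from_secs(90).into();
+//   assert_eq!("1m30s", d.to_string());
+//   let back: Duration = d.into();
+//   assert_eq!(Duration::from_secs(90), back);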
+impl From for ReadableDuration { + fn from(t: Duration) -> ReadableDuration { + ReadableDuration(t) + } +} + +impl FromStr for ReadableDuration { + type Err = String; + + fn from_str(dur_str: &str) -> Result { + let dur_str = dur_str.trim(); + if !dur_str.is_ascii() { + return Err(format!("unexpect ascii string: {}", dur_str)); + } + let err_msg = "valid duration, only d, h, m, s, ms are supported.".to_owned(); + let mut left = dur_str.as_bytes(); + let mut last_unit = DAY + 1; + let mut dur = 0f64; + while let Some(idx) = left.iter().position(|c| b"dhms".contains(c)) { + let (first, second) = left.split_at(idx); + let unit = if second.starts_with(b"ms") { + left = &left[idx + 2..]; + MS + } else { + let u = match second[0] { + b'd' => DAY, + b'h' => HOUR, + b'm' => MINUTE, + b's' => SECOND, + _ => return Err(err_msg), + }; + left = &left[idx + 1..]; + u + }; + if unit >= last_unit { + return Err("d, h, m, s, ms should occur in given order.".to_owned()); + } + // do we need to check 12h360m? + let number_str = unsafe { str::from_utf8_unchecked(first) }; + dur += match number_str.trim().parse::() { + Ok(n) => n * unit as f64, + Err(_) => return Err(err_msg), + }; + last_unit = unit; + } + if !left.is_empty() { + return Err(err_msg); + } + if dur.is_sign_negative() { + return Err("duration should be positive.".to_owned()); + } + let secs = dur as u64 / SECOND as u64; + let millis = (dur as u64 % SECOND as u64) as u32 * 1_000_000; + Ok(ReadableDuration(Duration::new(secs, millis))) + } +} + +impl ReadableDuration { + pub const fn secs(secs: u64) -> ReadableDuration { + ReadableDuration(Duration::from_secs(secs)) + } + + pub const fn millis(millis: u64) -> ReadableDuration { + ReadableDuration(Duration::from_millis(millis)) + } + + pub const fn minutes(minutes: u64) -> ReadableDuration { + ReadableDuration::secs(minutes * 60) + } + + pub const fn hours(hours: u64) -> ReadableDuration { + ReadableDuration::minutes(hours * 60) + } + + pub const fn days(days: u64) -> ReadableDuration { + ReadableDuration::hours(days * 24) + } + + pub fn as_secs(&self) -> u64 { + self.0.as_secs() + } + + pub fn as_millis(&self) -> u64 { + duration_to_ms(self.0) + } + + pub fn is_zero(&self) -> bool { + self.0.as_nanos() == 0 + } +} + +impl fmt::Display for ReadableDuration { + #[inline] + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let mut dur = duration_to_ms(self.0); + let mut written = false; + if dur >= DAY { + written = true; + write!(f, "{}d", dur / DAY)?; + dur %= DAY; + } + if dur >= HOUR { + written = true; + write!(f, "{}h", dur / HOUR)?; + dur %= HOUR; + } + if dur >= MINUTE { + written = true; + write!(f, "{}m", dur / MINUTE)?; + dur %= MINUTE; + } + if dur >= SECOND { + written = true; + write!(f, "{}s", dur / SECOND)?; + dur %= SECOND; + } + if dur > 0 { + written = true; + write!(f, "{}ms", dur)?; + } + if !written { + write!(f, "0s")?; + } + Ok(()) + } +} + +impl Serialize for ReadableDuration { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + let mut buffer = String::new(); + write!(buffer, "{}", self).unwrap(); + serializer.serialize_str(&buffer) + } +} + +impl<'de> Deserialize<'de> for ReadableDuration { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + struct DurVisitor; + + impl<'de> Visitor<'de> for DurVisitor { + type Value = ReadableDuration; + + fn expecting(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { + formatter.write_str("valid duration") + } + + fn visit_str(self, dur_str: &str) -> Result + 
where + E: de::Error, + { + dur_str.parse().map_err(E::custom) + } + } + + deserializer.deserialize_str(DurVisitor) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_readable_size() { + let s = ReadableSize::kb(2); + assert_eq!(s.0, 2048); + assert_eq!(s.as_mb(), 0); + let s = ReadableSize::mb(2); + assert_eq!(s.0, 2 * 1024 * 1024); + assert_eq!(s.as_mb(), 2); + let s = ReadableSize::gb(2); + assert_eq!(s.0, 2 * 1024 * 1024 * 1024); + assert_eq!(s.as_mb(), 2048); + + assert_eq!((ReadableSize::mb(2) / 2).0, MIB); + assert_eq!((ReadableSize::mb(1) / 2).0, 512 * KIB); + assert_eq!(ReadableSize::mb(2) / ReadableSize::kb(1), 2048); + } + + #[test] + fn test_parse_readable_size() { + #[derive(Serialize, Deserialize)] + struct SizeHolder { + s: ReadableSize, + } + + let legal_cases = vec![ + (0, "0KiB"), + (2 * KIB, "2KiB"), + (4 * MIB, "4MiB"), + (5 * GIB, "5GiB"), + (7 * TIB, "7TiB"), + (11 * PIB, "11PiB"), + ]; + for (size, exp) in legal_cases { + let c = SizeHolder { + s: ReadableSize(size), + }; + let res_str = toml::to_string(&c).unwrap(); + let exp_str = format!("s = {:?}\n", exp); + assert_eq!(res_str, exp_str); + let res_size: SizeHolder = toml::from_str(&exp_str).unwrap(); + assert_eq!(res_size.s.0, size); + } + + let c = SizeHolder { + s: ReadableSize(512), + }; + let res_str = toml::to_string(&c).unwrap(); + assert_eq!(res_str, "s = 512\n"); + let res_size: SizeHolder = toml::from_str(&res_str).unwrap(); + assert_eq!(res_size.s.0, c.s.0); + + let decode_cases = vec![ + (" 0.5 PB", PIB / 2), + ("0.5 TB", TIB / 2), + ("0.5GB ", GIB / 2), + ("0.5MB", MIB / 2), + ("0.5KB", KIB / 2), + ("0.5P", PIB / 2), + ("0.5T", TIB / 2), + ("0.5G", GIB / 2), + ("0.5M", MIB / 2), + ("0.5K", KIB / 2), + ("23", 23), + ("1", 1), + ("1024B", KIB), + // units with binary prefixes + (" 0.5 PiB", PIB / 2), + ("1PiB", PIB), + ("0.5 TiB", TIB / 2), + ("2 TiB", TIB * 2), + ("0.5GiB ", GIB / 2), + ("787GiB ", GIB * 787), + ("0.5MiB", MIB / 2), + ("3MiB", MIB * 3), + ("0.5KiB", KIB / 2), + ("1 KiB", KIB), + // scientific notation + ("0.5e6 B", B * 500000), + ("0.5E6 B", B * 500000), + ("1e6B", B * 1000000), + ("8E6B", B * 8000000), + ("8e7", B * 80000000), + ("1e-1MB", MIB / 10), + ("1e+1MB", MIB * 10), + ("0e+10MB", 0), + ]; + for (src, exp) in decode_cases { + let src = format!("s = {:?}", src); + let res: SizeHolder = toml::from_str(&src).unwrap(); + assert_eq!(res.s.0, exp); + } + + let illegal_cases = vec![ + "0.5kb", "0.5kB", "0.5Kb", "0.5k", "0.5g", "b", "gb", "1b", "B", "1K24B", " 5_KB", + "4B7", "5M_", + ]; + for src in illegal_cases { + let src_str = format!("s = {:?}", src); + assert!(toml::from_str::(&src_str).is_err(), "{}", src); + } + } + + #[test] + fn test_duration_construction() { + let mut dur = ReadableDuration::secs(1); + assert_eq!(dur.0, Duration::new(1, 0)); + assert_eq!(dur.as_secs(), 1); + assert_eq!(dur.as_millis(), 1000); + dur = ReadableDuration::millis(1001); + assert_eq!(dur.0, Duration::new(1, 1_000_000)); + assert_eq!(dur.as_secs(), 1); + assert_eq!(dur.as_millis(), 1001); + dur = ReadableDuration::minutes(2); + assert_eq!(dur.0, Duration::new(2 * 60, 0)); + assert_eq!(dur.as_secs(), 120); + assert_eq!(dur.as_millis(), 120000); + dur = ReadableDuration::hours(2); + assert_eq!(dur.0, Duration::new(2 * 3600, 0)); + assert_eq!(dur.as_secs(), 7200); + assert_eq!(dur.as_millis(), 7200000); + } + + #[test] + fn test_parse_readable_duration() { + #[derive(Serialize, Deserialize)] + struct DurHolder { + d: ReadableDuration, + } + + let legal_cases = vec![ + (0, 0, 
"0s"), + (0, 1, "1ms"), + (2, 0, "2s"), + (24 * 3600, 0, "1d"), + (2 * 24 * 3600, 10, "2d10ms"), + (4 * 60, 0, "4m"), + (5 * 3600, 0, "5h"), + (3600 + 2 * 60, 0, "1h2m"), + (5 * 24 * 3600 + 3600 + 2 * 60, 0, "5d1h2m"), + (3600 + 2, 5, "1h2s5ms"), + (3 * 24 * 3600 + 7 * 3600 + 2, 5, "3d7h2s5ms"), + ]; + for (secs, ms, exp) in legal_cases { + let d = DurHolder { + d: ReadableDuration(Duration::new(secs, ms * 1_000_000)), + }; + let res_str = toml::to_string(&d).unwrap(); + let exp_str = format!("d = {:?}\n", exp); + assert_eq!(res_str, exp_str); + let res_dur: DurHolder = toml::from_str(&exp_str).unwrap(); + assert_eq!(res_dur.d.0, d.d.0); + } + + let decode_cases = vec![(" 0.5 h2m ", 3600 / 2 + 2 * 60, 0)]; + for (src, secs, ms) in decode_cases { + let src = format!("d = {:?}", src); + let res: DurHolder = toml::from_str(&src).unwrap(); + assert_eq!(res.d.0, Duration::new(secs, ms * 1_000_000)); + } + + let illegal_cases = vec!["1H", "1M", "1S", "1MS", "1h1h", "h"]; + for src in illegal_cases { + let src_str = format!("d = {:?}", src); + assert!(toml::from_str::(&src_str).is_err(), "{}", src); + } + assert!(toml::from_str::("d = 23").is_err()); + } + + #[test] + fn test_parse_timeunit() { + let s = "milliseconds"; + assert_eq!(TimeUnit::Milliseconds, s.parse::().unwrap()); + let s = "seconds"; + assert_eq!(TimeUnit::Seconds, s.parse::().unwrap()); + let s = "minutes"; + assert_eq!(TimeUnit::Minutes, s.parse::().unwrap()); + let s = "hours"; + assert_eq!(TimeUnit::Hours, s.parse::().unwrap()); + let s = "days"; + assert_eq!(TimeUnit::Days, s.parse::().unwrap()); + let s = "microseconds"; + assert_eq!(TimeUnit::Microseconds, s.parse::().unwrap()); + } +} diff --git a/common_util/src/lib.rs b/common_util/src/lib.rs new file mode 100644 index 0000000000..f7c2c11e31 --- /dev/null +++ b/common_util/src/lib.rs @@ -0,0 +1,31 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Common utils shared by the whole project + +// We need to define mod with macro_use before other mod so that other +// mods in this crate can use the macros +#[macro_use] +pub mod macros; + +// TODO(yingwen): Move some mod into components as a crate +pub mod alloc_tracker; +pub mod codec; +pub mod config; +pub mod metric; +pub mod panic; +pub mod runtime; +pub mod time; +pub mod toml; + +#[cfg(any(test, feature = "test"))] +pub mod tests { + use std::sync::Once; + + static INIT_LOG: Once = Once::new(); + + pub fn init_log_for_test() { + INIT_LOG.call_once(|| { + env_logger::init(); + }); + } +} diff --git a/common_util/src/macros.rs b/common_util/src/macros.rs new file mode 100644 index 0000000000..5ac5b6f1c8 --- /dev/null +++ b/common_util/src/macros.rs @@ -0,0 +1,25 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Contains all needed macros + +/// Define result for given Error type +#[macro_export] +macro_rules! define_result { + ($t:ty) => { + pub type Result = std::result::Result; + }; +} + +#[cfg(test)] +mod tests { + #[test] + fn test_define_result() { + define_result!(i32); + + fn return_i32_error() -> Result<()> { + Err(18) + } + + assert_eq!(Err(18), return_i32_error()); + } +} diff --git a/common_util/src/metric.rs b/common_util/src/metric.rs new file mode 100644 index 0000000000..3219a3c757 --- /dev/null +++ b/common_util/src/metric.rs @@ -0,0 +1,267 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +/// Copied from https://github.com/sunng87/metriki/blob/master/metriki-core/src/metrics/meter.rs +/// But supports 1 hour and 2 hour rate. +use std::sync::atomic::{AtomicU64, Ordering}; +use std::time::{Duration, Instant, SystemTime}; + +use crossbeam_utils::atomic::AtomicCell; +#[cfg(feature = "ser")] +use serde::ser::SerializeMap; +#[cfg(feature = "ser")] +use serde::{Serialize, Serializer}; + +use crate::time; + +/// Meters are used to calculate rate of an event. +#[derive(Debug)] +pub struct Meter { + moving_avarages: ExponentiallyWeightedMovingAverages, + count: AtomicU64, + start_time: SystemTime, +} + +impl Default for Meter { + fn default() -> Self { + Self::new() + } +} + +impl Meter { + pub fn new() -> Meter { + Meter { + moving_avarages: ExponentiallyWeightedMovingAverages::new(), + count: AtomicU64::from(0), + start_time: SystemTime::now(), + } + } + + pub fn mark(&self) { + self.mark_n(1) + } + + pub fn mark_n(&self, n: u64) { + self.count.fetch_add(n, Ordering::Relaxed); + self.moving_avarages.tick_if_needed(); + self.moving_avarages.update(n); + } + + pub fn h1_rate(&self) -> f64 { + self.moving_avarages.tick_if_needed(); + self.moving_avarages.h1_rate() + } + + pub fn h2_rate(&self) -> f64 { + self.moving_avarages.tick_if_needed(); + self.moving_avarages.h2_rate() + } + + pub fn m15_rate(&self) -> f64 { + self.moving_avarages.tick_if_needed(); + self.moving_avarages.m15_rate() + } + + pub fn count(&self) -> u64 { + self.count.load(Ordering::Relaxed) + } + + pub fn mean_rate(&self) -> f64 { + let count = self.count(); + if count > 0 { + if let Ok(elapsed) = SystemTime::now() + .duration_since(self.start_time) + .map(|d| d.as_secs() as f64) + { + count as f64 / elapsed + } else { + 0f64 + } + } else { + 0f64 + } + } +} + +#[derive(Debug)] +struct ExponentiallyWeightedMovingAverage { + alpha: f64, + interval_nanos: u64, + + uncounted: AtomicCell, + rate: AtomicCell>, +} + +impl ExponentiallyWeightedMovingAverage { + fn new(alpha: f64, interval_secs: u64) -> ExponentiallyWeightedMovingAverage { + ExponentiallyWeightedMovingAverage { + alpha, + interval_nanos: time::secs_to_nanos(interval_secs), + + uncounted: AtomicCell::new(0), + rate: AtomicCell::new(None), + } + } + + fn update(&self, n: u64) { + self.uncounted.fetch_add(n); + } + + fn tick(&self) { + let count = self.uncounted.swap(0); + let instant_rate = count as f64 / self.interval_nanos as f64; + + if let Some(prev_rate) = self.rate.load() { + let new_rate = prev_rate + (self.alpha * (instant_rate - prev_rate)); + self.rate.store(Some(new_rate)); + } else { + self.rate.store(Some(instant_rate)); + } + } + + fn get_rate(&self) -> f64 { + if let Some(rate) = self.rate.load() { + rate * time::secs_to_nanos(1) as f64 + } else { + 0f64 + } + } +} + +#[derive(Debug)] +struct ExponentiallyWeightedMovingAverages { + h1: ExponentiallyWeightedMovingAverage, + h2: ExponentiallyWeightedMovingAverage, + m15: ExponentiallyWeightedMovingAverage, + + last_tick: AtomicCell, +} + +#[inline] +fn alpha(interval_secs: u64, minutes: u64) -> f64 { + 1.0 - (-(interval_secs as f64) / 60.0 / minutes as f64).exp() +} + +const DEFAULT_INTERVAL_SECS: u64 = 5; +const DEFAULT_INTERVAL_MILLIS: u64 = DEFAULT_INTERVAL_SECS * 1000; + +impl ExponentiallyWeightedMovingAverages { + fn new() -> ExponentiallyWeightedMovingAverages { + ExponentiallyWeightedMovingAverages { + h1: ExponentiallyWeightedMovingAverage::new( + alpha(DEFAULT_INTERVAL_SECS, 60), + DEFAULT_INTERVAL_SECS, + ), + + h2: ExponentiallyWeightedMovingAverage::new( + 
alpha(DEFAULT_INTERVAL_SECS, 120), + DEFAULT_INTERVAL_SECS, + ), + + m15: ExponentiallyWeightedMovingAverage::new( + alpha(DEFAULT_INTERVAL_SECS, 15), + DEFAULT_INTERVAL_SECS, + ), + + last_tick: AtomicCell::new(Instant::now()), + } + } + + fn update(&self, n: u64) { + self.h1.update(n); + self.h2.update(n); + self.m15.update(n); + } + + fn tick_if_needed(&self) { + let previous_tick = self.last_tick.load(); + let current_tick = Instant::now(); + + let tick_age = (current_tick - previous_tick).as_millis() as u64; + + if tick_age > DEFAULT_INTERVAL_MILLIS { + let latest_tick = + current_tick - Duration::from_millis(tick_age % DEFAULT_INTERVAL_MILLIS); + if self + .last_tick + .compare_exchange(previous_tick, latest_tick) + .is_ok() + { + let required_ticks = tick_age / DEFAULT_INTERVAL_MILLIS; + for _ in 0..required_ticks { + self.h1.tick(); + self.h2.tick(); + self.m15.tick(); + } + } + } + } + + fn h1_rate(&self) -> f64 { + self.h1.get_rate() + } + + fn h2_rate(&self) -> f64 { + self.h2.get_rate() + } + + fn m15_rate(&self) -> f64 { + self.m15.get_rate() + } +} + +#[cfg(feature = "ser")] +impl Serialize for Meter { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + let mut map = serializer.serialize_map(Some(4))?; + + map.serialize_entry("count", &self.count())?; + map.serialize_entry("h1_rate", &self.h1_rate())?; + map.serialize_entry("h2_rate", &self.h2_rate())?; + map.serialize_entry("m15_rate", &self.m15_rate())?; + + map.end() + } +} + +#[cfg(test)] +mod tests { + use std::{thread, time}; + + use super::*; + + macro_rules! assert_float_eq { + ($left:expr, $right:expr) => {{ + match (&$left, &$right) { + (left_val, right_val) => { + let diff = (left_val - right_val).abs(); + + if diff > f64::EPSILON { + panic!( + "assertion failed: `(left == right)`\n left: `{:?}`,\n right: `{:?}`", + &*left_val, &*right_val + ) + } + } + } + }}; + } + + #[test] + fn test_meter() { + let m = Meter::new(); + + for _ in 0..10 { + m.mark(); + } + + thread::sleep(time::Duration::from_millis(DEFAULT_INTERVAL_MILLIS + 10)); + + assert_eq!(10, m.count()); + assert_float_eq!(2.0, m.m15_rate()); + assert_float_eq!(2.0, m.h1_rate()); + assert_float_eq!(2.0, m.h2_rate()); + } +} diff --git a/common_util/src/panic.rs b/common_util/src/panic.rs new file mode 100644 index 0000000000..5b0a9f5713 --- /dev/null +++ b/common_util/src/panic.rs @@ -0,0 +1,159 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::thread; + +use log::error; + +/// fork from https://github.com/tikv/tikv/blob/83d173a2c0058246631f0e71de74238ccff670fd/components/tikv_util/src/lib.rs#L429 +/// Exit the whole process when panic. +pub fn set_panic_hook(panic_abort: bool) { + use std::{panic, process}; + + // HACK! New a backtrace ahead for caching necessary elf sections of this + // tikv-server, in case it can not open more files during panicking + // which leads to no stack info (0x5648bdfe4ff2 - ). + // + // Crate backtrace caches debug info in a static variable `STATE`, + // and the `STATE` lives forever once it has been created. + // See more: https://github.com/alexcrichton/backtrace-rs/blob/\ + // 597ad44b131132f17ed76bf94ac489274dd16c7f/\ + // src/symbolize/libbacktrace.rs#L126-L159 + // Caching is slow, spawn it in another thread to speed up. 
+ thread::Builder::new() + .name("backtrace-loader".to_owned()) + .spawn(::backtrace::Backtrace::new) + .unwrap(); + + panic::set_hook(Box::new(move |info: &panic::PanicInfo<'_>| { + let msg = match info.payload().downcast_ref::<&'static str>() { + Some(s) => *s, + None => match info.payload().downcast_ref::() { + Some(s) => &s[..], + None => "Box", + }, + }; + + let thread = thread::current(); + let name = thread.name().unwrap_or(""); + let loc = info + .location() + .map(|l| format!("{}:{}", l.file(), l.line())); + let bt = backtrace::Backtrace::new(); + error!( + "thread '{}' panicked '{}' at {:?}\n{:?}", + name, + msg, + loc.unwrap_or_else(|| "".to_owned()), + bt + ); + + // There might be remaining logs in the async logger. + // To collect remaining logs and also collect future logs, replace the old one + // with a terminal logger. + // When the old global async logger is replaced, the old async guard will be + // taken and dropped. In the drop() the async guard, it waits for the + // finish of the remaining logs in the async logger. + if let Some(level) = ::log::max_level().to_level() { + let drainer = logger::term_drainer(); + let _ = logger::init_log( + drainer, + logger::convert_log_level_to_slog_level(level), + false, // Use sync logger to avoid an unnecessary log thread. + 0, + false, // It is initialized already. + ); + } + + if panic_abort { + process::abort(); + } else { + unsafe { + // Calling process::exit would trigger global static to destroy, like C++ + // static variables of RocksDB, which may cause other threads encounter + // pure virtual method call. So calling libc::_exit() instead to skip the + // cleanup process. + libc::_exit(1); + } + } + })) +} + +#[cfg(test)] +mod tests { + use std::{io::Read, time::Duration}; + + use nix::{ + sys::wait::{wait, WaitStatus}, + unistd::{fork, ForkResult}, + }; + use slog::{self, Drain, Level, OwnedKVList, Record}; + + use crate::panic::set_panic_hook; + + /// Create a child process and wait to get its exit code. + fn run_and_wait_child_process(child: impl Fn()) -> Result { + match unsafe { fork() } { + Ok(ForkResult::Parent { .. }) => match wait().unwrap() { + WaitStatus::Exited(_, status) => Ok(status), + v => Err(format!("{:?}", v)), + }, + Ok(ForkResult::Child) => { + child(); + std::process::exit(0); + } + Err(e) => Err(format!("Fork failed: {}", e)), + } + } + + #[test] + fn test_panic_hook() { + use gag::BufferRedirect; + + struct DelayDrain(D); + + impl Drain for DelayDrain + where + D: Drain, + ::Err: std::fmt::Display, + { + type Err = ::Err; + type Ok = ::Ok; + + fn log( + &self, + record: &Record<'_>, + values: &OwnedKVList, + ) -> Result { + std::thread::sleep(Duration::from_millis(100)); + self.0.log(record, values) + } + } + + let mut stdout = BufferRedirect::stdout().unwrap(); + let status = run_and_wait_child_process(|| { + set_panic_hook(false); + let drainer = logger::term_drainer(); + let _ = logger::init_log( + drainer, + Level::Debug, + true, // use async drainer + 0, + true, // init std log + ); + + let _ = std::thread::spawn(|| { + // let the global logger is held by the other thread, so the + // drop() of the async drain is not called in time. 
+ let _guard = slog_global::borrow_global(); + std::thread::sleep(Duration::from_secs(1)); + }); + panic!("test"); + }) + .unwrap(); + + assert_eq!(status, 1); + let mut panic = String::new(); + stdout.read_to_string(&mut panic).unwrap(); + assert!(!panic.is_empty()); + } +} diff --git a/common_util/src/runtime/metrics.rs b/common_util/src/runtime/metrics.rs new file mode 100644 index 0000000000..4f82494093 --- /dev/null +++ b/common_util/src/runtime/metrics.rs @@ -0,0 +1,57 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use lazy_static::lazy_static; +use prometheus::{register_int_gauge_vec, IntGauge, IntGaugeVec}; + +lazy_static! { + // Gauges: + static ref RUNTIME_THREAD_ALIVE_GAUGE: IntGaugeVec = register_int_gauge_vec!( + "runtime_thread_alive_gauge", + "alive thread number for runtime", + &["name"] + ) + .unwrap(); + static ref RUNTIME_THREAD_IDLE_GAUGE: IntGaugeVec = register_int_gauge_vec!( + "runtime_thread_idle_gauge", + "idle thread number for runtime", + &["name"] + ) + .unwrap(); +} + +/// Runtime metrics. +#[derive(Debug)] +pub struct Metrics { + // Gauges: + pub thread_alive_gauge: IntGauge, + pub thread_idle_gauge: IntGauge, +} + +impl Metrics { + pub fn new(name: &str) -> Self { + Self { + thread_alive_gauge: RUNTIME_THREAD_ALIVE_GAUGE.with_label_values(&[name]), + thread_idle_gauge: RUNTIME_THREAD_IDLE_GAUGE.with_label_values(&[name]), + } + } + + #[inline] + pub fn on_thread_start(&self) { + self.thread_alive_gauge.inc(); + } + + #[inline] + pub fn on_thread_stop(&self) { + self.thread_alive_gauge.dec(); + } + + #[inline] + pub fn on_thread_park(&self) { + self.thread_idle_gauge.inc(); + } + + #[inline] + pub fn on_thread_unpark(&self) { + self.thread_idle_gauge.dec(); + } +} diff --git a/common_util/src/runtime/mod.rs b/common_util/src/runtime/mod.rs new file mode 100644 index 0000000000..70494d6b6f --- /dev/null +++ b/common_util/src/runtime/mod.rs @@ -0,0 +1,277 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! A multi-threaded runtime that supports running Futures +use std::{ + future::Future, + pin::Pin, + sync::Arc, + task::{Context, Poll}, +}; + +use pin_project_lite::pin_project; +use snafu::{Backtrace, GenerateBacktrace, ResultExt, Snafu}; +use tokio::{ + runtime::{Builder as RuntimeBuilder, Runtime as TokioRuntime}, + task::{JoinError, JoinHandle as TokioJoinHandle}, +}; +mod metrics; +use metrics::Metrics; + +// TODO(yingwen): Use opaque error type +#[derive(Debug, Snafu)] +#[snafu(visibility(pub))] +pub enum Error { + #[snafu(display( + "Runtime Failed to build runtime, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + BuildRuntime { + source: std::io::Error, + backtrace: Backtrace, + }, + + #[snafu(display( + "Runtime Failed to join task, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + JoinTask { + source: JoinError, + backtrace: Backtrace, + }, +} + +define_result!(Error); + +/// A runtime to run future tasks +#[derive(Debug)] +pub struct Runtime { + rt: TokioRuntime, + metrics: Arc, +} + +impl Runtime { + /// Spawn a future and execute it in this thread pool + /// + /// Similar to tokio::runtime::Runtime::spawn() + pub fn spawn(&self, future: F) -> JoinHandle + where + F: Future + Send + 'static, + F::Output: Send + 'static, + { + JoinHandle { + inner: self.rt.spawn(future), + } + } + + /// Run the provided function on an executor dedicated to blocking + /// operations. 
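+    ///
+    /// A minimal usage sketch (illustrative only, assuming a successfully
+    /// built `Runtime`):
+    ///
+    /// ```ignore
+    /// let rt = Builder::default().enable_all().build().unwrap();
+    /// let handle = rt.spawn_blocking(|| 40 + 2);
+    /// assert_eq!(42, rt.block_on(handle).unwrap());
+    /// ```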
+ pub fn spawn_blocking(&self, func: F) -> JoinHandle + where + F: FnOnce() -> R + Send + 'static, + R: Send + 'static, + { + JoinHandle { + inner: self.rt.spawn_blocking(func), + } + } + + /// Run a future to complete, this is the runtime's entry point + pub fn block_on(&self, future: F) -> F::Output { + self.rt.block_on(future) + } + + /// Returns the runtime stats + pub fn stats(&self) -> RuntimeStats { + RuntimeStats { + alive_thread_num: self.metrics.thread_alive_gauge.get(), + idle_thread_num: self.metrics.thread_idle_gauge.get(), + } + } +} + +pin_project! { + #[derive(Debug)] + pub struct JoinHandle { + #[pin] + inner: TokioJoinHandle, + } +} + +impl Future for JoinHandle { + type Output = Result; + + fn poll(self: Pin<&mut Self>, ctx: &mut Context<'_>) -> Poll { + let this = self.project(); + this.inner.poll(ctx).map_err(|source| Error::JoinTask { + source, + backtrace: Backtrace::generate(), + }) + } +} + +/// Runtime statistics +pub struct RuntimeStats { + pub alive_thread_num: i64, + pub idle_thread_num: i64, +} + +pub struct Builder { + thread_name: String, + builder: RuntimeBuilder, +} + +impl Default for Builder { + fn default() -> Self { + Self { + thread_name: "cse-runtime-worker".to_string(), + builder: RuntimeBuilder::new_multi_thread(), + } + } +} + +fn with_metrics(metrics: &Arc, f: F) -> impl Fn() +where + F: Fn(&Arc) + 'static, +{ + let m = metrics.clone(); + move || { + f(&m); + } +} + +impl Builder { + /// Sets the number of worker threads the Runtime will use. + /// + /// This can be any number above 0 + pub fn worker_threads(&mut self, val: usize) -> &mut Self { + self.builder.worker_threads(val); + self + } + + /// Sets name of threads spawned by the Runtime thread pool + pub fn thread_name(&mut self, val: impl Into) -> &mut Self { + self.thread_name = val.into(); + self + } + + /// Enable all feature of the underlying runtime + pub fn enable_all(&mut self) -> &mut Self { + self.builder.enable_all(); + self + } + + pub fn build(&mut self) -> Result { + let metrics = Arc::new(Metrics::new(&self.thread_name)); + + let rt = self + .builder + .thread_name(self.thread_name.clone()) + .on_thread_start(with_metrics(&metrics, |m| { + m.on_thread_start(); + })) + .on_thread_stop(with_metrics(&metrics, |m| { + m.on_thread_stop(); + })) + .on_thread_park(with_metrics(&metrics, |m| { + m.on_thread_park(); + })) + .on_thread_unpark(with_metrics(&metrics, |m| { + m.on_thread_unpark(); + })) + .build() + .context(BuildRuntime)?; + + Ok(Runtime { rt, metrics }) + } +} + +#[cfg(test)] +mod tests { + use std::{sync::Arc, thread, time::Duration}; + + use tokio::sync::oneshot; + use tokio_test::assert_ok; + + use super::*; + + fn rt() -> Arc { + let rt = Builder::default() + .worker_threads(2) + .thread_name("test_spawn_join") + .enable_all() + .build(); + assert!(rt.is_ok()); + Arc::new(rt.unwrap()) + } + + #[test] + fn test_stats() { + let rt = Builder::default() + .worker_threads(5) + .thread_name("test_stats") + .enable_all() + .build(); + assert!(rt.is_ok()); + let rt = Arc::new(rt.unwrap()); + // wait threads created + thread::sleep(Duration::from_millis(50)); + + let s = rt.stats(); + assert_eq!(5, s.alive_thread_num); + assert_eq!(5, s.idle_thread_num); + + rt.spawn(async { + thread::sleep(Duration::from_millis(50)); + }); + + thread::sleep(Duration::from_millis(10)); + let s = rt.stats(); + assert_eq!(5, s.alive_thread_num); + assert_eq!(4, s.idle_thread_num); + } + + #[test] + fn block_on_async() { + let rt = rt(); + + let out = rt.block_on(async { + let (tx, rx) = 
oneshot::channel(); + + thread::spawn(move || { + thread::sleep(Duration::from_millis(50)); + tx.send("ZOMG").unwrap(); + }); + + assert_ok!(rx.await) + }); + + assert_eq!(out, "ZOMG"); + } + + #[test] + fn spawn_from_blocking() { + let rt = rt(); + let rt1 = rt.clone(); + let out = rt.block_on(async move { + let rt2 = rt1.clone(); + let inner = assert_ok!( + rt1.spawn_blocking(move || { rt2.spawn(async move { "hello" }) }) + .await + ); + + assert_ok!(inner.await) + }); + + assert_eq!(out, "hello") + } + + #[test] + fn test_spawn_join() { + let rt = rt(); + let handle = rt.spawn(async { 1 + 1 }); + + assert_eq!(2, rt.block_on(handle).unwrap()); + } +} diff --git a/common_util/src/time.rs b/common_util/src/time.rs new file mode 100644 index 0000000000..1a44f98402 --- /dev/null +++ b/common_util/src/time.rs @@ -0,0 +1,68 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Time utilities + +// TODO(yingwen): Move to common_types ? + +use std::{ + convert::TryInto, + time::{Duration, Instant}, +}; + +pub trait DurationExt { + /// Convert into u64. + /// + /// Returns u64::MAX if overflow + fn as_millis_u64(&self) -> u64; +} + +impl DurationExt for Duration { + #[inline] + fn as_millis_u64(&self) -> u64 { + match self.as_millis().try_into() { + Ok(v) => v, + Err(_) => u64::MAX, + } + } +} + +pub trait InstantExt { + fn saturating_elapsed(&self) -> Duration; +} + +impl InstantExt for Instant { + fn saturating_elapsed(&self) -> Duration { + Instant::now().saturating_duration_since(*self) + } +} + +#[inline] +pub fn secs_to_nanos(s: u64) -> u64 { + s * 1000000000 +} + +#[cfg(test)] +mod tests { + use std::thread; + + use super::*; + #[test] + fn test_as_mills_u64() { + let d = Duration::from_millis(100); + assert_eq!(100, d.as_millis_u64()); + + let d = Duration::from_secs(100); + assert_eq!(100000, d.as_millis_u64()); + } + + #[test] + fn test_saturating_elapsed() { + let ins = Instant::now(); + let one_hundred_mills = Duration::from_millis(100); + let error = 10; + thread::sleep(one_hundred_mills); + assert!(ins.saturating_elapsed().as_millis_u64() - 100 < error); + thread::sleep(one_hundred_mills); + assert!(ins.saturating_elapsed().as_millis_u64() - 200 < 2 * error); + } +} diff --git a/common_util/src/toml.rs b/common_util/src/toml.rs new file mode 100644 index 0000000000..58332dc4c2 --- /dev/null +++ b/common_util/src/toml.rs @@ -0,0 +1,104 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Toml config utilities. + +use std::{fs::File, io::Read}; + +use serde::de; +use snafu::{Backtrace, ResultExt, Snafu}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "Failed to open file, path:{}, err:{}.\nBacktrace:\n{}", + path, + source, + backtrace + ))] + OpenFile { + path: String, + source: std::io::Error, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to read toml, path:{}, err:{}.\nBacktrace:\n{}", + path, + source, + backtrace + ))] + ReadToml { + path: String, + source: std::io::Error, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to parse toml, path:{}, err:{}.\nBacktrace:\n{}", + path, + source, + backtrace + ))] + ParseToml { + path: String, + source: toml::de::Error, + backtrace: Backtrace, + }, +} + +define_result!(Error); + +/// Read toml file from given `path` to `toml_buf`, then parsed it to `T` and +/// return. 
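+///
+/// A small usage sketch (the config struct and file name here are
+/// hypothetical):
+///
+/// ```ignore
+/// #[derive(serde_derive::Deserialize)]
+/// struct ServerConfig {
+///     host: String,
+///     port: u16,
+/// }
+///
+/// let mut toml_buf = String::new();
+/// let config: ServerConfig = parse_toml_from_path("server.toml", &mut toml_buf).unwrap();
+/// ```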
+pub fn parse_toml_from_path<'a, T>(path: &str, toml_buf: &'a mut String) -> Result +where + T: de::Deserialize<'a>, +{ + let mut file = File::open(path).context(OpenFile { path })?; + file.read_to_string(toml_buf).context(ReadToml { path })?; + + toml::from_str(toml_buf).context(ParseToml { path }) +} + +#[cfg(test)] +mod tests { + use std::io::Write; + + use serde_derive::Deserialize; + use tempfile::tempdir; + + use super::*; + + #[test] + fn test_parse_toml_from_path() { + let dir = tempdir().unwrap(); + let file_path = dir.path().join("test.toml"); + let path = file_path.to_str().unwrap(); + + let mut f = File::create(path).expect("Failed to create test config file"); + f.write_all(b"host=\"localhost\"\nport=1081") + .expect("Failed to write test config"); + + f.sync_all().expect("Failed to sync test config"); + + #[derive(Clone, Debug, Deserialize)] + struct TestConfig { + host: String, + port: u16, + } + let mut config = TestConfig { + host: "".to_string(), + port: 0, + }; + + assert_eq!("", config.host); + assert_eq!(0, config.port); + + let mut toml_str = String::new(); + + config = parse_toml_from_path(path, &mut toml_str).unwrap(); + + assert_eq!("localhost", config.host); + assert_eq!(1081, config.port); + } +} diff --git a/components/arena/Cargo.toml b/components/arena/Cargo.toml new file mode 100644 index 0000000000..ec70993c17 --- /dev/null +++ b/components/arena/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "arena" +version = "0.1.0" +authors = ["Ruihang Xia "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +parking_lot = "0.11.1" \ No newline at end of file diff --git a/components/arena/src/arena_trait.rs b/components/arena/src/arena_trait.rs new file mode 100644 index 0000000000..a8808fa38b --- /dev/null +++ b/components/arena/src/arena_trait.rs @@ -0,0 +1,70 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::{alloc::Layout, ptr::NonNull, sync::Arc}; + +/// Memory Arena trait. +/// +/// The trait itself provides and enforces no guarantee about alignment. It's +/// implementation's responsibility to cover. +/// +/// All memory-relavent methods (`alloc()` etc.) are not "unsafe". Compare with +/// "deallocate" which is not included in this trait, allocating is more safer +/// and not likely to run into UB. However in fact, playing with raw pointer is +/// always dangerous and needs to be careful for both who implements and uses +/// this trait. +pub trait Arena { + type Stats; + + // required methods + + /// Try to allocate required memory described by layout. Return a pointer of + /// allocated space in success, while `None` if failed. + fn try_alloc(&self, layout: Layout) -> Option>; + + /// Get arena's statistics. + fn stats(&self) -> Self::Stats; + + // provided methods + + /// Allocate required memory. Panic if failed. + fn alloc(&self, layout: Layout) -> NonNull { + self.try_alloc(layout).unwrap() + } +} + +/// Basic statistics of arena. Offers [bytes_allocated] +/// and [bytes_used]. +#[derive(Debug, Clone, Copy)] +pub struct BasicStats { + pub(crate) bytes_allocated: usize, + pub(crate) bytes_used: usize, +} + +impl BasicStats { + /// Total bytes allocated from system. + #[inline] + pub fn bytes_allocated(&self) -> usize { + self.bytes_allocated + } + + /// Total bytes allocated to user. 
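+    ///
+    /// In other words, the bytes actually handed out to callers, as opposed
+    /// to `bytes_allocated`, which counts what the arena reserved from the
+    /// system.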
+ #[inline] + pub fn bytes_used(&self) -> usize { + self.bytes_used + } +} + +/// Collect memory usage from Arean +pub trait Collector { + /// Called when `bytes` bytes memory is allocated in arena. + fn on_alloc(&self, bytes: usize); + + /// Called when `bytes` bytes memory is used in arena. + fn on_used(&self, bytes: usize); + + /// Called when `allocated` bytes memory is released, and `used` bytes in + /// it. + fn on_free(&self, used: usize, allocated: usize); +} + +pub type CollectorRef = Arc; diff --git a/components/arena/src/fixed_size.rs b/components/arena/src/fixed_size.rs new file mode 100644 index 0000000000..f7305e6144 --- /dev/null +++ b/components/arena/src/fixed_size.rs @@ -0,0 +1,107 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::{ + alloc::{alloc, dealloc, Layout}, + ptr::NonNull, + sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, + }, +}; + +use crate::arena_trait::{Arena, BasicStats}; + +const DEFAULT_ALIGN: usize = 8; + +#[derive(Clone)] +pub struct FixedSizeArena { + core: Arc, +} + +impl FixedSizeArena { + pub fn with_capacity(cap: usize) -> Self { + Self { + core: Arc::new(Core::with_capacity(cap)), + } + } +} + +struct Core { + len: AtomicUsize, + cap: usize, + ptr: NonNull, +} + +impl Core { + /// # Safety + /// - alloc + /// See [std::alloc::alloc]. + /// - new_unchecked + /// `ptr` is allocated from allocator. + fn with_capacity(cap: usize) -> Self { + let layout = Layout::from_size_align(cap as usize, DEFAULT_ALIGN).unwrap(); + let ptr = unsafe { alloc(layout) }; + + Self { + len: AtomicUsize::new(0), + cap, + ptr: unsafe { NonNull::new_unchecked(ptr) }, + } + } + + /// # Safety + /// `self.ptr` is allocated from allocator + fn try_alloc(&self, layout: Layout) -> Option> { + let layout = layout.pad_to_align(); + let size = layout.size(); + + let offset = self.len.fetch_add(size, Ordering::SeqCst) as usize; + if offset + size > self.cap { + self.len.fetch_sub(size, Ordering::SeqCst); + return None; + } + + unsafe { Some(NonNull::new_unchecked(self.ptr.as_ptr().add(size))) } + } +} + +impl Drop for Core { + /// Reclaim space pointed by `data`. + fn drop(&mut self) { + unsafe { + dealloc( + self.ptr.as_ptr(), + Layout::from_size_align_unchecked(self.cap, DEFAULT_ALIGN), + ) + } + } +} + +impl Arena for FixedSizeArena { + type Stats = BasicStats; + + fn try_alloc(&self, layout: Layout) -> Option> { + self.core.try_alloc(layout) + } + + fn stats(&self) -> Self::Stats { + Self::Stats { + bytes_used: self.core.cap, + bytes_allocated: self.core.len.load(Ordering::SeqCst) as usize, + } + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn capacity_overflow() { + let arena = FixedSizeArena::with_capacity(1024); + let layout = unsafe { Layout::from_size_align_unchecked(768, DEFAULT_ALIGN) }; + let _ = arena.alloc(layout); + + assert_eq!(None, arena.try_alloc(layout)); + } +} diff --git a/components/arena/src/lib.rs b/components/arena/src/lib.rs new file mode 100644 index 0000000000..963dd47933 --- /dev/null +++ b/components/arena/src/lib.rs @@ -0,0 +1,11 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! `Arena` Trait and implementations. 
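+//!
+//! A minimal usage sketch (block and allocation sizes are illustrative only):
+//!
+//! ```ignore
+//! use std::alloc::Layout;
+//!
+//! use arena::{Arena, MonoIncArena};
+//!
+//! let arena = MonoIncArena::new(4 * 1024);
+//! let layout = Layout::from_size_align(64, 8).unwrap();
+//! let _ptr = arena.alloc(layout);
+//! assert_eq!(64, arena.stats().bytes_used());
+//! assert_eq!(4 * 1024, arena.stats().bytes_allocated());
+//! ```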
+ +mod arena_trait; +mod fixed_size; +mod mono_inc; + +pub use arena_trait::{Arena, BasicStats, Collector, CollectorRef}; +pub use fixed_size::FixedSizeArena; +pub use mono_inc::{MonoIncArena, NoopCollector}; diff --git a/components/arena/src/mono_inc.rs b/components/arena/src/mono_inc.rs new file mode 100644 index 0000000000..0adc7253de --- /dev/null +++ b/components/arena/src/mono_inc.rs @@ -0,0 +1,347 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::{ + alloc::{alloc, dealloc, Layout}, + ptr::NonNull, + sync::Arc, +}; + +use parking_lot::Mutex; + +use crate::arena_trait::{Arena, BasicStats, Collector, CollectorRef}; + +/// The noop collector does nothing on alloc and free +pub struct NoopCollector; + +impl Collector for NoopCollector { + fn on_alloc(&self, _bytes: usize) {} + + fn on_used(&self, _bytes: usize) {} + + fn on_free(&self, _used: usize, _allocated: usize) {} +} + +const DEFAULT_ALIGN: usize = 8; + +/// A thread-safe arena. All allocated memory is aligned to 8. Organizes its +/// allocated memory as blocks. +#[derive(Clone)] +pub struct MonoIncArena { + core: Arc>, +} + +impl MonoIncArena { + pub fn new(regular_block_size: usize) -> Self { + Self { + core: Arc::new(Mutex::new(ArenaCore::new( + regular_block_size, + Arc::new(NoopCollector {}), + ))), + } + } + + pub fn with_collector(regular_block_size: usize, collector: CollectorRef) -> Self { + Self { + core: Arc::new(Mutex::new(ArenaCore::new(regular_block_size, collector))), + } + } +} + +impl Arena for MonoIncArena { + type Stats = BasicStats; + + fn try_alloc(&self, layout: Layout) -> Option> { + Some(self.core.lock().alloc(layout)) + } + + fn stats(&self) -> Self::Stats { + self.core.lock().stats + } + + fn alloc(&self, layout: Layout) -> NonNull { + self.core.lock().alloc(layout) + } +} + +struct ArenaCore { + collector: CollectorRef, + regular_layout: Layout, + regular_blocks: Vec, + special_blocks: Vec, + stats: BasicStats, +} + +impl ArenaCore { + /// # Safety + /// Required property is tested in debug assertions. + fn new(regular_block_size: usize, collector: CollectorRef) -> Self { + debug_assert_ne!(DEFAULT_ALIGN, 0); + debug_assert_eq!(DEFAULT_ALIGN & (DEFAULT_ALIGN - 1), 0); + // TODO(yingwen): Avoid panic. + let regular_layout = Layout::from_size_align(regular_block_size, DEFAULT_ALIGN).unwrap(); + let regular_blocks = vec![Block::new(regular_layout)]; + let special_blocks = vec![]; + let bytes = regular_layout.size(); + collector.on_alloc(bytes); + + Self { + collector, + regular_layout, + regular_blocks, + special_blocks, + stats: BasicStats { + bytes_allocated: bytes, + bytes_used: 0, + }, + } + } + + /// Input layout will be aligned. 
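+    ///
+    /// Requests larger than the regular block size get a dedicated "special"
+    /// block of their own; smaller requests are bump-allocated from the
+    /// current regular block, and a new regular block is appended once the
+    /// current one is full.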
+ fn alloc(&mut self, layout: Layout) -> NonNull { + let layout = layout + .align_to(self.regular_layout.align()) + .unwrap() + .pad_to_align(); + let bytes = layout.size(); + // TODO(Ruihang): determine threshold + if layout.size() > self.regular_layout.size() { + self.stats.bytes_used += bytes; + self.collector.on_used(bytes); + Self::add_new_block( + layout, + &mut self.special_blocks, + &mut self.stats, + &self.collector, + ); + let block = self.special_blocks.last().unwrap(); + return block.data; + } + + self.stats.bytes_used += bytes; + self.collector.on_used(bytes); + if let Some(ptr) = self.try_alloc(layout) { + ptr + } else { + Self::add_new_block( + self.regular_layout, + &mut self.regular_blocks, + &mut self.stats, + &self.collector, + ); + self.try_alloc(layout).unwrap() + } + } + + /// # Safety + /// `regular_blocks` vector is guaranteed to contains at least one element. + fn try_alloc(&mut self, layout: Layout) -> Option> { + self.regular_blocks.last_mut().unwrap().alloc(layout) + } + + fn add_new_block( + layout: Layout, + container: &mut Vec, + stats: &mut BasicStats, + collector: &CollectorRef, + ) { + let new_block = Block::new(layout); + container.push(new_block); + // Update allocated stats once a new block has been allocated from the system. + stats.bytes_allocated += layout.size(); + collector.on_alloc(layout.size()); + } +} + +impl Drop for ArenaCore { + fn drop(&mut self) { + self.collector + .on_free(self.stats.bytes_used, self.stats.bytes_allocated); + } +} + +struct Block { + data: NonNull, + len: usize, + layout: Layout, +} + +impl Block { + /// Create a new block. Return the pointer of this new block. + /// + /// # Safety + /// See [std::alloc::alloc]. The allocated memory will be deallocated in + /// drop(). + fn new(layout: Layout) -> Block { + let data = unsafe { alloc(layout) }; + + Self { + data: NonNull::new(data).unwrap(), + len: 0, + layout, + } + } + + /// # Safety + /// ## ptr:add() + /// The added offset is checked before. + /// ## NonNull::new_unchecked() + /// `ptr` is added from a NonNull. + fn alloc(&mut self, layout: Layout) -> Option> { + let size = layout.size(); + + if self.len + size <= self.layout.size() { + let ptr = unsafe { self.data.as_ptr().add(self.len) }; + self.len += size; + unsafe { Some(NonNull::new_unchecked(ptr)) } + } else { + None + } + } +} + +impl Drop for Block { + /// Reclaim space pointed by `data`. + fn drop(&mut self) { + unsafe { dealloc(self.data.as_ptr(), self.layout) } + } +} + +unsafe impl Send for Block {} +unsafe impl Sync for Block {} + +#[cfg(test)] +mod test { + use std::{ + mem, + sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, + }, + }; + + use super::*; + + /// # Safety: + /// Caller should check the input buf has enough space. 
+ fn consume_buf_as_u64_slice(buf: NonNull, n: usize) { + unsafe { + let mut buf = buf.as_ptr() as *mut u64; + for i in 0..n { + *buf = i as u64; + buf = buf.add(1); + } + } + } + + #[test] + fn test_stats() { + let arena = MonoIncArena::new(1024 * 1024); + + // Size is 80 + let layout_slice = Layout::new::<[u64; 10]>().align_to(8).unwrap(); + for _ in 0..20 { + arena.alloc(layout_slice); + } + + assert_eq!(1024 * 1024, arena.stats().bytes_allocated()); + assert_eq!(1600, arena.stats().bytes_used()); + } + + struct MockCollector { + allocated: AtomicUsize, + used: AtomicUsize, + } + + impl Collector for MockCollector { + fn on_alloc(&self, bytes: usize) { + self.allocated.fetch_add(bytes, Ordering::Relaxed); + } + + fn on_used(&self, bytes: usize) { + self.used.fetch_add(bytes, Ordering::Relaxed); + } + + fn on_free(&self, _used: usize, _allocated: usize) {} + } + + #[test] + fn test_collector() { + let collector = Arc::new(MockCollector { + allocated: AtomicUsize::new(0), + used: AtomicUsize::new(0), + }); + + let arena = MonoIncArena::with_collector(1024 * 1024, collector.clone()); + + // Size is 80 + let layout_slice = Layout::new::<[u64; 10]>().align_to(8).unwrap(); + for _ in 0..20 { + arena.alloc(layout_slice); + } + + assert_eq!(1024 * 1024, collector.allocated.load(Ordering::Relaxed)); + assert_eq!(1600, collector.used.load(Ordering::Relaxed)); + } + + #[test] + fn alloc_small_slice() { + let arena = MonoIncArena::new(128); + + let layout_slice = Layout::new::<[u64; 10]>().align_to(8).unwrap(); + for _ in 0..20 { + let buf = arena.alloc(layout_slice); + consume_buf_as_u64_slice(buf, 10); + } + + assert_eq!(2560, arena.stats().bytes_allocated()); + assert_eq!(1600, arena.stats().bytes_used()); + } + + #[test] + fn alloc_huge_slice() { + let arena = MonoIncArena::new(128); + + let layout_slice = Layout::new::<[u64; 20]>().align_to(8).unwrap(); + for _ in 0..20 { + let buf = arena.alloc(layout_slice); + consume_buf_as_u64_slice(buf, 20); + } + + assert_eq!(3328, arena.stats().bytes_allocated()); + assert_eq!(3200, arena.stats().bytes_used()); + } + + #[test] + fn alloc_various_slice() { + let arena = MonoIncArena::new(1024); + const SIZES: [usize; 12] = [10, 200, 30, 1024, 512, 77, 89, 1, 3, 29, 16, 480]; + let total_used: usize = SIZES.iter().map(|v| v * 8).sum(); + + for size in &SIZES { + let layout_slice = Layout::from_size_align(mem::size_of::() * *size, 8).unwrap(); + let buf = arena.alloc(layout_slice); + consume_buf_as_u64_slice(buf, *size); + } + + assert_eq!(20800, arena.stats().bytes_allocated()); + assert_eq!(total_used, arena.stats().bytes_used()); + } + + #[test] + fn unaligned_alloc_request() { + let arena = MonoIncArena::new(1024); + + let regular_req_layout = Layout::from_size_align(mem::size_of::(), 2).unwrap(); + for _ in 0..10 { + let buf = arena.alloc(regular_req_layout).as_ptr() as usize; + assert_eq!(0, buf % DEFAULT_ALIGN); + } + + // 2003 is a prime number and 2004 % 8 != 0 + let special_req_layout = Layout::from_size_align(2003, 2).unwrap(); + for _ in 0..10 { + let buf = arena.alloc(special_req_layout).as_ptr() as usize; + assert_eq!(0, buf % DEFAULT_ALIGN); + } + } +} diff --git a/components/bytes/Cargo.toml b/components/bytes/Cargo.toml new file mode 100644 index 0000000000..0fecefbe8e --- /dev/null +++ b/components/bytes/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "bytes" +version = "0.1.0" +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +upstream = { version = 
"1.0", package = "bytes" } +snafu = { version ="0.6.10", features = ["backtraces"]} diff --git a/components/bytes/src/lib.rs b/components/bytes/src/lib.rs new file mode 100644 index 0000000000..015aabce0c --- /dev/null +++ b/components/bytes/src/lib.rs @@ -0,0 +1,368 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Provides utilities for byte arrays +//! +//! Use Bytes instead of Vec. Currently just re-export bytes crate + +use std::{ + fmt, + io::{self, Read, Write}, +}; + +use snafu::{ensure, Backtrace, GenerateBacktrace, Snafu}; +// Should not use bytes crate outside of this mod so we can replace the actual +// implementations if needed +pub use upstream::{Buf, BufMut, Bytes, BytesMut}; + +/// Error of MemBuf/MemBufMut +/// +/// We do not use `std::io::Error` because it is too large +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to fill whole buffer.\nBacktrace:\n{}", backtrace))] + UnexpectedEof { backtrace: Backtrace }, + + #[snafu(display("Failed to write whole buffer.\nBacktrace:\n{}", backtrace))] + WouldOverflow { backtrace: Backtrace }, +} + +pub type Result = std::result::Result; + +/// Now is just an alias to `Vec`, prefer to use this alias instead of +/// `Vec` +pub type ByteVec = Vec; + +/// Read bytes from a buffer. +/// +/// Unlike `bytes::Buf`, the underlying storage is in contiguous memory +pub trait MemBuf: fmt::Debug { + /// Return the remaining byte slice + fn remaining_slice(&self) -> &[u8]; + + /// Advance the internal cursor of the buffer, panic if overflow + fn must_advance(&mut self, cnt: usize); + + /// Read bytes from self into dst. + /// + /// The cursor is advanced by the number of bytes copied. + /// + /// Returns error if self does not have enough remaining bytes to fill dst. 
+ fn read_to_slice(&mut self, dst: &mut [u8]) -> Result<()>; + + /// Gets an unsigned 8 bit integer from self and advance current position + /// + /// Returns error if the capacity is not enough + fn read_u8(&mut self) -> Result { + let mut buf = [0; 1]; + self.read_to_slice(&mut buf)?; + Ok(buf[0]) + } + + /// Gets an unsighed 32 bit integer from self in big-endian byte order and + /// advance current position + /// + /// Returns error if the capacity is not enough + fn read_u32(&mut self) -> Result { + let mut buf = [0; 4]; + self.read_to_slice(&mut buf)?; + Ok(u32::from_be_bytes(buf)) + } + + /// Gets an unsighed 64 bit integer from self in big-endian byte order and + /// advance current position + /// + /// Returns error if the capacity is not enough + fn read_u64(&mut self) -> Result { + let mut buf = [0; 8]; + self.read_to_slice(&mut buf)?; + Ok(u64::from_be_bytes(buf)) + } + + fn read_f64(&mut self) -> Result { + let mut buf = [0; 8]; + self.read_to_slice(&mut buf)?; + Ok(f64::from_be_bytes(buf)) + } + + fn read_f32(&mut self) -> Result { + let mut buf = [0; 4]; + self.read_to_slice(&mut buf)?; + Ok(f32::from_be_bytes(buf)) + } +} + +/// Write bytes to a buffer +/// +/// Unlike `bytes::BufMut`, write operations may fail +pub trait MemBufMut: fmt::Debug { + /// Write bytes into self from src, advance the buffer position + /// + /// Returns error if the capacity is not enough + fn write_slice(&mut self, src: &[u8]) -> Result<()>; + + /// Write an unsigned 8 bit integer to self, advance the buffer position + /// + /// Returns error if the capacity is not enough + fn write_u8(&mut self, n: u8) -> Result<()> { + let src = [n]; + self.write_slice(&src) + } + + /// Writes an unsigned 32 bit integer to self in the big-endian byte order, + /// advance the buffer position + /// + /// Returns error if the capacity is not enough + fn write_u32(&mut self, n: u32) -> Result<()> { + self.write_slice(&n.to_be_bytes()) + } + + /// Writes an unsigned 64 bit integer to self in the big-endian byte order, + /// advance the buffer position + /// + /// Returns error if the capacity is not enough + fn write_u64(&mut self, n: u64) -> Result<()> { + self.write_slice(&n.to_be_bytes()) + } + + /// Writes an float 64 to self in the big-endian byte order, + /// advance the buffer position + /// + /// Returns error if the capacity is not enough + fn write_f64(&mut self, n: f64) -> Result<()> { + self.write_slice(&n.to_be_bytes()) + } + + /// Writes an float 32 to self in the big-endian byte order, + /// advance the buffer position + /// + /// Returns error if the capacity is not enough + fn write_f32(&mut self, n: f32) -> Result<()> { + self.write_slice(&n.to_be_bytes()) + } +} + +macro_rules! 
impl_mem_buf { + () => { + #[inline] + fn remaining_slice(&self) -> &[u8] { + &self + } + + #[inline] + fn must_advance(&mut self, cnt: usize) { + self.advance(cnt); + } + + #[inline] + fn read_to_slice(&mut self, dst: &mut [u8]) -> Result<()> { + ensure!(self.remaining() >= dst.len(), UnexpectedEof); + self.copy_to_slice(dst); + Ok(()) + } + }; +} + +impl MemBuf for Bytes { + impl_mem_buf!(); +} + +impl MemBuf for BytesMut { + impl_mem_buf!(); +} + +impl MemBufMut for BytesMut { + fn write_slice(&mut self, src: &[u8]) -> Result<()> { + ensure!(self.remaining_mut() >= src.len(), WouldOverflow); + self.put_slice(src); + Ok(()) + } +} + +impl MemBuf for &[u8] { + #[inline] + fn remaining_slice(&self) -> &[u8] { + self + } + + #[inline] + fn must_advance(&mut self, cnt: usize) { + *self = &self[cnt..]; + } + + #[inline] + fn read_to_slice(&mut self, dst: &mut [u8]) -> Result<()> { + // slice::read_exact() only throws UnexpectedEof error, see + // + // https://doc.rust-lang.org/src/std/io/impls.rs.html#264-281 + self.read_exact(dst).map_err(|_| Error::UnexpectedEof { + backtrace: Backtrace::generate(), + }) + } +} + +impl MemBufMut for &mut [u8] { + fn write_slice(&mut self, src: &[u8]) -> Result<()> { + // slice::write_all() actually wont fail, see + // + // https://doc.rust-lang.org/src/std/io/impls.rs.html#344-350 + self.write_all(src).map_err(|_| Error::WouldOverflow { + backtrace: Backtrace::generate(), + }) + } +} + +impl MemBufMut for Vec { + fn write_slice(&mut self, src: &[u8]) -> Result<()> { + self.extend_from_slice(src); + Ok(()) + } +} + +/// A `MemBufMut` adapter which implements [std::io::Write] for the inner value +#[derive(Debug)] +pub struct Writer<'a, B> { + buf: &'a mut B, +} + +impl<'a, B: MemBufMut> Writer<'a, B> { + /// Create a new Writer from a mut ref to buf + pub fn new(buf: &'a mut B) -> Self { + Self { buf } + } +} + +impl<'a, B: MemBufMut> Write for Writer<'a, B> { + fn write(&mut self, src: &[u8]) -> io::Result { + self.buf.write_slice(src).map_err(|e| match &e { + Error::UnexpectedEof { .. } => io::Error::new(io::ErrorKind::UnexpectedEof, e), + Error::WouldOverflow { .. } => io::Error::new(io::ErrorKind::WriteZero, e), + })?; + Ok(src.len()) + } + + fn flush(&mut self) -> io::Result<()> { + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_bytes_mut_mem_buf() { + let hello = b"hello"; + let mut buffer = BytesMut::new(); + buffer.write_u8(8).unwrap(); + buffer.write_u64(u64::MAX - 5).unwrap(); + buffer.write_slice(hello).unwrap(); + + assert_eq!(&buffer, buffer.remaining_slice()); + assert_eq!(8, buffer.read_u8().unwrap()); + assert_eq!(u64::MAX - 5, buffer.read_u64().unwrap()); + let mut dst = [0; 5]; + buffer.read_to_slice(&mut dst).unwrap(); + assert_eq!(hello, &dst); + + assert!(buffer.remaining_slice().is_empty()); + } + + #[test] + fn test_bytes_mut_empty() { + let mut buffer = BytesMut::new(); + assert!(buffer.remaining_slice().is_empty()); + assert!(matches!(buffer.read_u8(), Err(Error::UnexpectedEof { .. }))); + assert!(matches!( + buffer.read_u64(), + Err(Error::UnexpectedEof { .. 
}) + )); + } + + #[test] + fn test_bytes_mem_buf() { + let mut buffer = Bytes::from_static(b"hello world"); + assert_eq!(b"hello world", buffer.remaining_slice()); + + let mut dst = [0; 5]; + buffer.read_to_slice(&mut dst).unwrap(); + assert_eq!(b"hello", &dst); + + assert_eq!(b" world", buffer.remaining_slice()); + buffer.must_advance(1); + assert_eq!(b"world", buffer.remaining_slice()); + + let mut dst = [0; 50]; + assert!(matches!( + buffer.read_to_slice(&mut dst), + Err(Error::UnexpectedEof { .. }) + )); + } + + #[test] + fn test_slice_mem_buf() { + let hello = b"hello world"; + let mut buf = &hello[..]; + + assert_eq!(hello, buf.remaining_slice()); + let mut dst = [0; 6]; + buf.read_to_slice(&mut dst).unwrap(); + assert_eq!(b"hello ", &dst); + assert_eq!(b"world", buf.remaining_slice()); + + buf.must_advance(1); + assert_eq!(b"orld", buf.remaining_slice()); + } + + #[test] + fn test_slice_mem_buf_mut() { + let mut dst = [b'x'; 11]; + { + let mut buf = &mut dst[..]; + + buf.write_slice(b"abcde").unwrap(); + assert_eq!(b"abcdexxxxxx", &dst); + } + + { + let mut buf = &mut dst[..]; + + buf.write_slice(b"hello").unwrap(); + buf.write_slice(b" world").unwrap(); + assert_eq!(b"hello world", &dst); + } + + let mut dst = [0; 3]; + let mut buf = &mut dst[..]; + assert!(matches!( + buf.write_slice(b"a long long long slice"), + Err(Error::WouldOverflow { .. }) + )); + } + + #[test] + fn test_vec_mem_buf_mut() { + let mut buf = Vec::new(); + buf.write_slice(b"hello").unwrap(); + assert_eq!(b"hello", &buf[..]); + } + + #[test] + fn test_writer_write() { + let mut buf = Vec::new(); + let mut writer = Writer::new(&mut buf); + writer.write_all(b"he").unwrap(); + writer.write_all(b"llo").unwrap(); + assert_eq!(b"hello", &buf[..]); + } + + #[test] + fn test_writer_overflow() { + let mut dst = [0; 3]; + let mut buf = &mut dst[..]; + let mut writer = Writer::new(&mut buf); + assert_eq!( + io::ErrorKind::WriteZero, + writer.write_all(b"0123456789").err().unwrap().kind() + ); + } +} diff --git a/components/logger/Cargo.toml b/components/logger/Cargo.toml new file mode 100644 index 0000000000..9fdc938340 --- /dev/null +++ b/components/logger/Cargo.toml @@ -0,0 +1,21 @@ +[package] +name = "logger" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +chrono = "0.4" +grpcio = { path = "../../grpcio" } +log = "0.4" +slog = "2.7" +slog-async = "2.6" +slog-term = "2.8" +slog_derive = "0.2" + +[dependencies.slog-global] +version = "0.1" +git = "https://github.com/breeswish/slog-global.git" +rev = "0e23a5baff302a9d7bccd85f8f31e43339c2f2c1" diff --git a/components/logger/src/lib.rs b/components/logger/src/lib.rs new file mode 100644 index 0000000000..f0317ab586 --- /dev/null +++ b/components/logger/src/lib.rs @@ -0,0 +1,422 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
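Before moving on to the logger crate, here is a minimal sketch of how the byte-buffer traits introduced above in components/bytes (`MemBuf`, `MemBufMut`, and the `Writer` adapter) are meant to be used. It is illustrative only, not part of the patch, and assumes the workspace crate is available under the name `bytes`.

```rust
use std::io::Write;

use bytes::{ByteVec, MemBuf, MemBufMut, Writer};

fn main() -> bytes::Result<()> {
    // Encode a small header into a growable Vec<u8> via MemBufMut.
    let mut encoded = ByteVec::new();
    encoded.write_u8(1)?;
    encoded.write_u32(42)?;
    encoded.write_slice(b"payload")?;

    // The Writer adapter exposes std::io::Write on top of any MemBufMut.
    {
        let mut writer = Writer::new(&mut encoded);
        writer.write_all(b"!").expect("writing to a Vec never overflows");
    }

    // Decode again through MemBuf, which is implemented for &[u8].
    let mut buf = &encoded[..];
    assert_eq!(1, buf.read_u8()?);
    assert_eq!(42, buf.read_u32()?);
    let mut payload = [0u8; 8];
    buf.read_to_slice(&mut payload)?;
    assert_eq!(b"payload!", &payload);

    Ok(())
}
```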
+ +use std::{ + fmt, + fs::{File, OpenOptions}, + io, + str::FromStr, + sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, Mutex, + }, +}; + +use log::{info, SetLoggerError}; +pub use slog::Level; +use slog::{slog_o, Drain, Key, OwnedKVList, Record, KV}; +use slog_async::{Async, OverflowStrategy}; +use slog_term::{Decorator, PlainDecorator, RecordDecorator, TermDecorator}; + +const ASYNC_CHAN_SIZE: usize = 102400; +// This format is required for xflush monitor +const TIMESTAMP_FORMAT: &str = "%Y-%m-%d %H:%M:%S%.3f"; + +// Thanks to tikv +// https://github.com/tikv/tikv/blob/eaeb39a2c85684de08c48cf4b9426b3faf4defe6/components/tikv_util/src/logger/mod.rs + +pub fn convert_slog_level_to_log_level(lv: Level) -> log::Level { + match lv { + Level::Critical | Level::Error => log::Level::Error, + Level::Warning => log::Level::Warn, + Level::Debug => log::Level::Debug, + Level::Trace => log::Level::Trace, + Level::Info => log::Level::Info, + } +} + +pub fn convert_log_level_to_slog_level(lv: log::Level) -> Level { + match lv { + log::Level::Error => Level::Error, + log::Level::Warn => Level::Warning, + log::Level::Debug => Level::Debug, + log::Level::Trace => Level::Trace, + log::Level::Info => Level::Info, + } +} + +// The `to_string()` function of `slog::Level` produces values like `erro` and +// `trce` instead of the full words. This produces the full word. +fn get_string_by_level(lv: Level) -> &'static str { + match lv { + Level::Critical => "critical", + Level::Error => "error", + Level::Warning => "warn", + Level::Debug => "debug", + Level::Trace => "trace", + Level::Info => "info", + } +} + +pub fn term_drainer() -> CeresFormat { + let decorator = TermDecorator::new().stdout().build(); + CeresFormat::new(decorator) +} + +pub fn file_drainer(path: &Option) -> Option>> { + match path { + Some(path) => { + let file = OpenOptions::new() + .create(true) + .append(true) + .open(path) + .unwrap(); + let decorator = PlainDecorator::new(file); + Some(CeresFormat::new(decorator)) + } + None => None, + } +} + +// dispacher +pub struct LogDispatcher { + normal: N, +} + +impl LogDispatcher { + pub fn new(normal: N) -> Self { + Self { normal } + } +} + +impl Drain for LogDispatcher +where + N: Drain, +{ + type Err = io::Error; + type Ok = (); + + fn log(&self, record: &Record, values: &OwnedKVList) -> Result { + self.normal.log(record, values) + } +} + +pub fn init_log( + drain: D, + level: Level, + use_async: bool, + async_log_channel_len: i32, + init_stdlog: bool, +) -> Result +where + D: Drain + Send + 'static, + ::Err: std::fmt::Display, +{ + let runtime_level = RuntimeLevel::new(level); + // TODO(yingwen): Consider print the error instead of just ignoring it? + let root_logger = if use_async { + let drain = if async_log_channel_len <= 0 { + Async::new(drain.ignore_res()) + .chan_size(ASYNC_CHAN_SIZE) + .overflow_strategy(OverflowStrategy::Block) + .build() + } else { + Async::new(drain.ignore_res()) + .chan_size(async_log_channel_len as usize) + .build() + }; + let drain = RuntimeLevelFilter::new(drain, runtime_level.clone()); + slog::Logger::root(drain.ignore_res(), slog_o!()) + } else { + let drain = RuntimeLevelFilter::new(Mutex::new(drain), runtime_level.clone()); + slog::Logger::root(drain.ignore_res(), slog_o!()) + }; + + slog_global::set_global(root_logger); + if init_stdlog { + slog_global::redirect_std_log(Some(level))?; + grpcio::redirect_log(); + } + + Ok(runtime_level) +} + +// e.g. 
+// ```text +// 2020-01-20 13:00:14.998 INFO [src/engine/rocksdb/rocks_kv.rs:394] RocksKV::open_with_op start, name:autogen +// ``` +pub struct CeresFormat +where + D: Decorator, +{ + decorator: D, +} + +impl CeresFormat +where + D: Decorator, +{ + fn new(decorator: D) -> Self { + Self { decorator } + } +} + +impl Drain for CeresFormat +where + D: Decorator, +{ + type Err = io::Error; + type Ok = (); + + fn log(&self, record: &Record, values: &OwnedKVList) -> Result { + self.decorator.with_record(record, values, |decorator| { + write_log_header(decorator, record)?; + write_log_msg(decorator, record)?; + write_log_fields(decorator, record, values)?; + + decorator.start_whitespace()?; + writeln!(decorator)?; + + decorator.flush()?; + + Ok(()) + }) + } +} + +#[derive(Clone)] +pub struct RuntimeLevel { + level: Arc, + default_level: Level, +} + +impl RuntimeLevel { + fn new(default_level: Level) -> Self { + Self { + level: Arc::new(AtomicUsize::new(default_level.as_usize())), + default_level, + } + } + + #[inline] + pub fn current_level(&self) -> Level { + Level::from_usize(self.level.load(Ordering::Relaxed)).unwrap_or(self.default_level) + } + + pub fn set_level(&self, level: Level) { + self.level.store(level.as_usize(), Ordering::Relaxed); + // Log level of std log is not changed unless we call `log::set_max_level` + log::set_max_level(convert_slog_level_to_log_level(level).to_level_filter()); + + info!( + "RuntimeLevel::set_level log level changed to {}", + get_string_by_level(level) + ); + } + + #[inline] + pub fn reset(&self) { + self.set_level(self.default_level); + } + + #[inline] + pub fn default_level(&self) -> Level { + self.default_level + } + + #[inline] + pub fn current_level_str(&self) -> &str { + get_string_by_level(self.current_level()) + } + + pub fn set_level_by_str(&self, level_str: &str) -> Result<(), String> { + Level::from_str(level_str) + .map_err(|_| format!("Invalid level {}", level_str)) + .and_then(|level| match level { + Level::Trace | Level::Debug | Level::Info => Ok(level), + _ => Err("Only allow to change log level to ".to_owned()), + }) + .map(|level| self.set_level(level)) + } +} + +struct RuntimeLevelFilter { + drain: D, + runtime_level: RuntimeLevel, +} + +impl RuntimeLevelFilter { + fn new(drain: D, runtime_level: RuntimeLevel) -> Self { + Self { + drain, + runtime_level, + } + } +} + +impl Drain for RuntimeLevelFilter +where + D: Drain, +{ + type Err = D::Err; + type Ok = Option; + + fn log(&self, record: &Record, values: &OwnedKVList) -> Result { + let current_level = self.runtime_level.current_level(); + + if record.level().is_at_least(current_level) { + Ok(Some(self.drain.log(record, values)?)) + } else { + Ok(None) + } + } +} + +fn write_log_header(decorator: &mut dyn RecordDecorator, record: &Record<'_>) -> io::Result<()> { + decorator.start_timestamp()?; + write!( + decorator, + "{}", + chrono::Local::now().format(TIMESTAMP_FORMAT) + )?; + + decorator.start_whitespace()?; + write!(decorator, " ")?; + + decorator.start_level()?; + write!(decorator, "{}", record.level().as_short_str())?; + + decorator.start_whitespace()?; + write!(decorator, " ")?; + + // Writes source file info. + decorator.start_msg()?; // There is no `start_file()` or `start_line()`. 
+ write!(decorator, "[{}:{}]", record.file(), record.line())?; + + Ok(()) +} + +fn write_log_msg(decorator: &mut dyn RecordDecorator, record: &Record<'_>) -> io::Result<()> { + decorator.start_whitespace()?; + write!(decorator, " ")?; + + decorator.start_msg()?; + write!(decorator, "{}", record.msg())?; + + Ok(()) +} + +fn write_log_fields( + decorator: &mut dyn RecordDecorator, + record: &Record<'_>, + values: &OwnedKVList, +) -> io::Result<()> { + let mut serializer = Serializer::new(decorator); + + record.kv().serialize(record, &mut serializer)?; + + values.serialize(record, &mut serializer)?; + + serializer.finish()?; + + Ok(()) +} + +struct Serializer<'a> { + decorator: &'a mut dyn RecordDecorator, +} + +impl<'a> Serializer<'a> { + fn new(decorator: &'a mut dyn RecordDecorator) -> Self { + Serializer { decorator } + } + + fn write_whitespace(&mut self) -> io::Result<()> { + self.decorator.start_whitespace()?; + write!(self.decorator, " ")?; + Ok(()) + } + + fn finish(self) -> io::Result<()> { + Ok(()) + } +} + +impl<'a> Drop for Serializer<'a> { + fn drop(&mut self) {} +} + +impl<'a> slog::Serializer for Serializer<'a> { + fn emit_none(&mut self, key: Key) -> slog::Result { + self.emit_arguments(key, &format_args!("None")) + } + + fn emit_arguments(&mut self, key: Key, val: &fmt::Arguments<'_>) -> slog::Result { + self.write_whitespace()?; + + // Write key + write!(self.decorator, "[")?; + self.decorator.start_key()?; + write!(self.decorator, "{}", key)?; + + // Write separator + self.decorator.start_separator()?; + write!(self.decorator, ":")?; + + // Write value + self.decorator.start_value()?; + write!(self.decorator, "{}", val)?; + self.decorator.reset()?; + write!(self.decorator, "]")?; + + Ok(()) + } +} + +pub fn init_test_logger() { + // level + let level = Level::Info; + + // drain + let term_drain = term_drainer(); + let drain = LogDispatcher::new(term_drain); + + // Use async and init stdlog + let _ = init_log(drain, level, false, 12400, true); +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_runtime_level() { + let runtime_level = RuntimeLevel::new(Level::Info); + + assert_eq!(runtime_level.current_level(), Level::Info); + assert_eq!(runtime_level.default_level(), Level::Info); + + runtime_level.set_level(Level::Debug); + assert_eq!(runtime_level.current_level(), Level::Debug); + assert_eq!(runtime_level.default_level(), Level::Info); + + runtime_level.reset(); + assert_eq!(runtime_level.current_level(), Level::Info); + assert_eq!(runtime_level.current_level_str(), "info"); + + runtime_level.set_level_by_str("trace").unwrap(); + assert_eq!(runtime_level.current_level(), Level::Trace); + runtime_level.set_level_by_str("debug").unwrap(); + assert_eq!(runtime_level.current_level(), Level::Debug); + runtime_level.set_level_by_str("info").unwrap(); + assert_eq!(runtime_level.current_level(), Level::Info); + + assert!(runtime_level.set_level_by_str("warn").is_err()); + assert_eq!(runtime_level.current_level(), Level::Info); + assert!(runtime_level.set_level_by_str("warning").is_err()); + assert!(runtime_level.set_level_by_str("critical").is_err()); + assert!(runtime_level.set_level_by_str("error").is_err()); + assert!(runtime_level.set_level_by_str("no such level").is_err()); + + assert_eq!(runtime_level.current_level(), Level::Info); + } +} diff --git a/components/object_store/Cargo.toml b/components/object_store/Cargo.toml new file mode 100644 index 0000000000..787f330dcd --- /dev/null +++ b/components/object_store/Cargo.toml @@ -0,0 +1,21 @@ +[package] 
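For reference, a binary would wire up the logger crate introduced above roughly as follows, mirroring its own `init_test_logger`. This snippet is a sketch rather than part of the patch, and it assumes the `log` facade crate is also a dependency of the binary.

```rust
use logger::{init_log, term_drainer, Level, LogDispatcher};

fn main() {
    // Terminal drain wrapped in the dispatcher, as init_test_logger does.
    let drain = LogDispatcher::new(term_drainer());

    // Synchronous logging with std `log` redirection enabled.
    let runtime_level =
        init_log(drain, Level::Info, false, 102400, true).expect("failed to init logger");

    log::info!("logger ready, level:{}", runtime_level.current_level_str());

    // The level can be changed at runtime, but only to trace/debug/info.
    runtime_level.set_level_by_str("debug").unwrap();
}
```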
+name = "object_store" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" + +[dependencies] # In alphabetical order +async-trait = "0.1.42" +bytes = "1.0" +common_util = { path = "../../common_util" } +futures = "0.3" +itertools = "0.10" +percent-encoding = "2.1" +snafu = { version = "0.6.10", features = ["futures", "backtraces"] } +tokio = { version = "1.0", features = ["macros", "fs"] } +# Filesystem integration +tokio-util = { version = "0.6.3", features = [ "io","compat" ] } +walkdir = "2.3.2" + +[dev-dependencies] +tempfile = "3.1.0" diff --git a/components/object_store/src/disk.rs b/components/object_store/src/disk.rs new file mode 100644 index 0000000000..14cdbb9cc0 --- /dev/null +++ b/components/object_store/src/disk.rs @@ -0,0 +1,389 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! This module contains the IOx implementation for using local disk as the +//! object store. +use std::{collections::BTreeSet, convert::TryFrom, io, path::PathBuf}; + +use async_trait::async_trait; +use futures::{ + stream, + stream::{BoxStream, StreamExt}, + AsyncRead, +}; +use snafu::{Backtrace, GenerateBacktrace, OptionExt, ResultExt, Snafu}; +use tokio::fs; +use tokio_util::compat::{Compat, FuturesAsyncReadCompatExt}; +use walkdir::WalkDir; + +use crate::{path::file::FilePath, ListResult, ObjectMeta, ObjectStore}; + +/// A specialized `Result` for filesystem object store-related errors +pub type Result = std::result::Result; + +/// A specialized `Error` for filesystem object store-related errors +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "Expected streamed data to have length {}, got {}.\nBacktrace:\n{}", + expected, + actual, + backtrace + ))] + DataDoesNotMatchLength { + expected: usize, + actual: usize, + backtrace: Backtrace, + }, + + #[snafu(display("File size for {} did not fit in a usize: {}.\nBacktrace:\n{}", path.display(), source, backtrace))] + FileSizeOverflowedUsize { + path: PathBuf, + source: std::num::TryFromIntError, + backtrace: Backtrace, + }, + + #[snafu(display("Unable to walk dir: {}.\nBacktrace:\n{}", source, backtrace))] + UnableToWalkDir { + source: walkdir::Error, + backtrace: Backtrace, + }, + + #[snafu(display("Unable to access metadata for {}: {}.\nBacktrace:\n{}", path.display(), source, backtrace))] + UnableToAccessMetadata { + path: PathBuf, + source: walkdir::Error, + backtrace: Backtrace, + }, + + #[snafu(display("Unable to copy data to file: {}.\nBacktrace:\n{}", source, backtrace))] + UnableToCopyDataToFile { + source: io::Error, + backtrace: Backtrace, + }, + + #[snafu(display("Unable to create dir {}: {}.\nBacktrace:\n{}", path.display(), source, backtrace))] + UnableToCreateDir { + source: io::Error, + path: PathBuf, + backtrace: Backtrace, + }, + + #[snafu(display("Unable to create file {}: {}.\nBacktrace:\n{}", path.display(), err, backtrace))] + UnableToCreateFile { + path: PathBuf, + err: io::Error, + backtrace: Backtrace, + }, + + #[snafu(display("Unable to delete file {}: {}.\nBacktrace:\n{}", path.display(), source, backtrace))] + UnableToDeleteFile { + source: io::Error, + path: PathBuf, + backtrace: Backtrace, + }, + + #[snafu(display("Unable to open file {}: {}.\nBacktrace:\n{}", path.display(), source, backtrace))] + UnableToOpenFile { + source: io::Error, + path: PathBuf, + backtrace: Backtrace, + }, + + #[snafu(display("Unable to read data from file {}: {}.\nBacktrace:\n{}", path.display(), source, backtrace))] + UnableToReadBytes { + source: io::Error, + path: PathBuf, + 
backtrace: Backtrace, + }, + + #[snafu(display( + "Unable to stream data from the request into memory: {}.\nBacktrace:\n{}", + source, + backtrace + ))] + UnableToStreamDataIntoMemory { + source: std::io::Error, + backtrace: Backtrace, + }, +} + +/// Local filesystem storage suitable for testing or for opting out of using a +/// cloud storage provider. +#[derive(Debug)] +pub struct File { + root: FilePath, +} + +#[async_trait] +impl ObjectStore for File { + type Error = Error; + type Path = FilePath; + type Reader = Compat; + + fn new_path(&self) -> Self::Path { + FilePath::default() + } + + async fn put( + &self, + location: &Self::Path, + bytes: R, + _length: Option, + ) -> Result<(), Self::Error> + where + R: AsyncRead + Send + Unpin, + { + let path = self.path(location); + + let mut file = match fs::File::create(&path).await { + Ok(f) => f, + Err(err) if err.kind() == std::io::ErrorKind::NotFound => { + let parent = path + .parent() + .context(UnableToCreateFile { path: &path, err })?; + fs::create_dir_all(&parent) + .await + .context(UnableToCreateDir { path: parent })?; + + match fs::File::create(&path).await { + Ok(f) => f, + Err(err) => return UnableToCreateFile { path, err }.fail(), + } + } + Err(err) => return UnableToCreateFile { path, err }.fail(), + }; + + tokio::io::copy(&mut bytes.compat(), &mut file) + .await + .context(UnableToCopyDataToFile)?; + + Ok(()) + } + + async fn get(&self, location: &Self::Path) -> Result { + let path = self.path(location); + let file = fs::File::open(&path) + .await + .context(UnableToOpenFile { path: &path })?; + Ok(file.into_std().await) + } + + async fn delete(&self, location: &Self::Path) -> Result<(), Self::Error> { + let path = self.path(location); + fs::remove_file(&path) + .await + .context(UnableToDeleteFile { path })?; + Ok(()) + } + + async fn list<'a>( + &'a self, + prefix: Option<&'a Self::Path>, + ) -> Result, Self::Error>>, Self::Error> { + let root_path = self.root.to_raw(); + let walkdir = WalkDir::new(&root_path) + // Don't include the root directory itself + .min_depth(1); + + let s = + walkdir.into_iter().filter_map(move |result_dir_entry| { + match convert_walkdir_result(result_dir_entry) { + Err(e) => Some(Err(e)), + Ok(None) => None, + Ok(entry @ Some(_)) => entry + .filter(|dir_entry| dir_entry.file_type().is_file()) + .map(|file| { + let relative_path = file.path().strip_prefix(&root_path).expect( + "Must start with root path because this came from walking the root", + ); + FilePath::raw(relative_path, false) + }) + .filter(|name| prefix.map_or(true, |p| name.prefix_matches(p))) + .map(|name| Ok(vec![name])), + } + }); + + Ok(stream::iter(s).boxed()) + } + + async fn list_with_delimiter( + &self, + prefix: &Self::Path, + ) -> Result, Self::Error> { + // Always treat prefix as relative because the list operations don't know + // anything about where on disk the root of this object store is; they + // only care about what's within this object store's directory. See + // documentation for `push_path`: it deliberately does *not* behave as + // `PathBuf::push` does: there is no way to replace the root. So even if + // `prefix` isn't relative, we treat it as such here. + let mut resolved_prefix = self.root.clone(); + resolved_prefix.push_path(prefix); + + // It is valid to specify a prefix with directories `[foo, bar]` and filename + // `baz`, in which case we want to treat it like a glob for + // `foo/bar/baz*` and there may not actually be a file or directory + // named `foo/bar/baz`. 
We want to look at all the entries in + // `foo/bar/`, so remove the file name. + let mut search_path = resolved_prefix.clone(); + search_path.unset_file_name(); + + let walkdir = WalkDir::new(&search_path.to_raw()) + .min_depth(1) + .max_depth(1); + + let mut common_prefixes = BTreeSet::new(); + let mut objects = Vec::new(); + + let root_path = self.root.to_raw(); + for entry_res in walkdir.into_iter().map(convert_walkdir_result) { + if let Some(entry) = entry_res? { + let entry_location = FilePath::raw(entry.path(), false); + + if entry_location.prefix_matches(&resolved_prefix) { + let metadata = entry + .metadata() + .context(UnableToAccessMetadata { path: entry.path() })?; + + if metadata.is_dir() { + let parts = entry_location + .parts_after_prefix(&resolved_prefix) + .expect("must have prefix because of the if prefix_matches condition"); + + let mut relative_location = prefix.to_owned(); + relative_location.push_part_as_dir(&parts[0]); + common_prefixes.insert(relative_location); + } else { + let path = entry + .path() + .strip_prefix(&root_path) + .expect("must have prefix because of the if prefix_matches condition"); + let location = FilePath::raw(path, false); + + let last_modified = metadata + .modified() + .expect("Modified file time should be supported on this platform"); + let size = usize::try_from(metadata.len()) + .context(FileSizeOverflowedUsize { path: entry.path() })?; + + objects.push(ObjectMeta { + location, + last_modified, + size, + }); + } + } + } + } + + Ok(ListResult { + next_token: None, + common_prefixes: common_prefixes.into_iter().collect(), + objects, + }) + } +} + +impl File { + /// Create new filesystem storage. + pub fn new(root: impl Into) -> Self { + Self { + root: FilePath::raw(root, true), + } + } + + /// Return full path of the given location + pub fn path(&self, location: &FilePath) -> PathBuf { + let mut path = self.root.clone(); + path.push_path(location); + path.to_raw() + } +} + +/// Convert walkdir results and converts not-found errors into `None`. 
+fn convert_walkdir_result( + res: std::result::Result, +) -> Result> { + match res { + Ok(entry) => Ok(Some(entry)), + Err(walkdir_err) => match walkdir_err.io_error() { + Some(io_err) => match io_err.kind() { + io::ErrorKind::NotFound => Ok(None), + _ => Err(Error::UnableToWalkDir { + source: walkdir_err, + backtrace: Backtrace::generate(), + }), + }, + None => Err(Error::UnableToWalkDir { + source: walkdir_err, + backtrace: Backtrace::generate(), + }), + }, + } +} + +#[cfg(test)] +mod tests { + use std::io::Read; + + use bytes::Bytes; + use tempfile::TempDir; + + use super::*; + use crate::{ + path::ObjectStorePath, + tests::{list_with_delimiter, put_get_delete_list}, + ObjectStore, + }; + + #[tokio::test] + async fn file_test() { + let root = TempDir::new().unwrap(); + let file = File::new(root.path()); + + put_get_delete_list(&file).await.unwrap(); + list_with_delimiter(&file).await.unwrap(); + } + + #[tokio::test] + async fn creates_dir_if_not_present() { + let root = TempDir::new().unwrap(); + let file = File::new(root.path()); + + let data = Bytes::from("arbitrary data"); + let mut location = file.new_path(); + location.push_all_dirs(&["nested", "file", "test_file"]); + + file.put(&location, Box::new(data.as_ref()), Some(data.len())) + .await + .unwrap(); + + let mut read_data = Vec::with_capacity(data.len()); + file.get(&location) + .await + .unwrap() + .read_to_end(&mut read_data) + .unwrap(); + assert_eq!(&*read_data, data); + } + + #[tokio::test] + async fn unknown_length() { + let root = TempDir::new().unwrap(); + let file = File::new(root.path()); + + let data = Bytes::from("arbitrary data"); + + let mut location = file.new_path(); + location.set_file_name("some_file"); + file.put(&location, Box::new(data.as_ref()), None) + .await + .unwrap(); + let mut read_data = Vec::with_capacity(data.len()); + file.get(&location) + .await + .unwrap() + .read_to_end(&mut read_data) + .unwrap(); + assert_eq!(&*read_data, data); + } +} diff --git a/components/object_store/src/lib.rs b/components/object_store/src/lib.rs new file mode 100644 index 0000000000..326a68459c --- /dev/null +++ b/components/object_store/src/lib.rs @@ -0,0 +1,329 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! # object_store +//! +//! This crate provides APIs for interacting with object storage services. It +//! currently supports PUT, GET, DELETE, and list for in-memory and +//! local file storage. +//! +//! Future compatibility will include Aliyun OSS. +//! +//! Fork from https://github.com/influxdata/influxdb_iox/tree/main/object_store + +use std::time::SystemTime; + +use async_trait::async_trait; +use futures::{stream::BoxStream, AsyncRead}; +use path::ObjectStorePath; + +pub mod disk; +pub mod path; + +/// Universal API to multiple object store services. +// TODO(xikai): ObjectStore -> FileStore +#[async_trait] +pub trait ObjectStore: std::fmt::Debug + Send + Sync + 'static { + /// The type of the locations used in interacting with this object store. + type Path: ObjectStorePath; + + /// The error returned from fallible methods + type Error: std::error::Error + Send + Sync + 'static; + + type Reader: AsyncRead + Send + Unpin; + + /// Return a new location path appropriate for this object storage + fn new_path(&self) -> Self::Path; + + /// Save the provided bytes to the specified location. 
+ async fn put( + &self, + location: &Self::Path, + bytes: R, + length: Option, + ) -> Result<(), Self::Error> + where + R: AsyncRead + Send + Unpin; + + /// Return the bytes that are stored at the specified location. + async fn get(&self, location: &Self::Path) -> Result; + + /// Delete the object at the specified location. + async fn delete(&self, location: &Self::Path) -> Result<(), Self::Error>; + + /// List all the objects with the given prefix. + async fn list<'a>( + &'a self, + prefix: Option<&'a Self::Path>, + ) -> Result, Self::Error>>, Self::Error>; + + /// List objects with the given prefix and an implementation specific + /// delimiter. Returns common prefixes (directories) in addition to object + /// metadata. + async fn list_with_delimiter( + &self, + prefix: &Self::Path, + ) -> Result, Self::Error>; +} + +/// Result of a list call that includes objects, prefixes (directories) and a +/// token for the next set of results. Individual result sets may be limited to +/// 1,00 objects based on the underlying object storage's limitations. +#[derive(Debug)] +pub struct ListResult { + /// Token passed to the API for the next page of list results. + pub next_token: Option, + /// Prefixes that are common (like directories) + pub common_prefixes: Vec
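To make the `ObjectStore` trait concrete, the following sketch drives the local-disk `File` store through it, closely following the crate's own tests in `disk.rs`. It is not part of the patch and assumes a tokio runtime plus the `tempfile` crate.

```rust
use std::io::Read;

use object_store::{disk::File, path::ObjectStorePath, ObjectStore};
use tempfile::TempDir;

#[tokio::main]
async fn main() {
    let root = TempDir::new().unwrap();
    let store = File::new(root.path());

    // Build a location relative to the store root, then upload some bytes.
    let mut location = store.new_path();
    location.push_all_dirs(&["nested", "dir"]);
    location.set_file_name("demo_file");

    let data = b"arbitrary data";
    store
        .put(&location, Box::new(&data[..]), Some(data.len()))
        .await
        .unwrap();

    // Read the object back through the same trait.
    let mut read_back = Vec::new();
    store
        .get(&location)
        .await
        .unwrap()
        .read_to_end(&mut read_back)
        .unwrap();
    assert_eq!(&read_back[..], &data[..]);
}
```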

+pub struct Frontend<P> {
+    provider: P,
+}
+
+impl<P> Frontend<P> {
+    pub fn new(provider: P) -> Self {
+        Self { provider }
+    }
+
+    /// Parse the SQL and return the statements
+    pub fn parse_sql(&self, _ctx: &mut Context, sql: &str) -> Result<Vec<Statement>> {
+        Parser::parse_sql(sql).context(InvalidSql { sql })
+    }
+
+    /// Parse the request and return the PromQL Expr
+    pub fn parse_promql(
+        &self,
+        _ctx: &mut Context,
+        mut req: PrometheusQueryRequest,
+    ) -> Result<Expr> {
+        req.take_expr().try_into().context(InvalidPromRequest)
+    }
+}
+
+impl<P: MetaProvider> Frontend<P>
{ + /// Create logical plan for the statement + pub fn statement_to_plan(&self, ctx: &mut Context, stmt: Statement) -> Result { + let planner = Planner::new(&self.provider, ctx.request_id, ctx.read_parallelism); + + planner.statement_to_plan(stmt).context(CreatePlan) + } + + pub fn promql_expr_to_plan( + &self, + ctx: &mut Context, + expr: Expr, + ) -> Result<(Plan, Arc)> { + let planner = Planner::new(&self.provider, ctx.request_id, ctx.read_parallelism); + + planner.promql_expr_to_plan(expr).context(CreatePlan) + } +} diff --git a/sql/src/lib.rs b/sql/src/lib.rs new file mode 100644 index 0000000000..fe2f41e287 --- /dev/null +++ b/sql/src/lib.rs @@ -0,0 +1,19 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! SQL frontend +//! +//! Parse sql into logical plan that can be handled by interpreters + +#[macro_use] +extern crate common_util; + +pub mod ast; +pub mod container; +pub mod frontend; +pub mod parser; +pub mod plan; +pub mod planner; +pub mod promql; +pub mod provider; +#[cfg(any(test, feature = "test"))] +pub mod tests; diff --git a/sql/src/parser.rs b/sql/src/parser.rs new file mode 100644 index 0000000000..dca4d82ba2 --- /dev/null +++ b/sql/src/parser.rs @@ -0,0 +1,814 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! SQL parser +//! +//! Some codes are copied from datafusion: + +use log::debug; +use paste::paste; +use sqlparser::{ + ast::{ColumnDef, ColumnOption, ColumnOptionDef, Ident, TableConstraint}, + dialect::{keywords::Keyword, Dialect, MySqlDialect}, + parser::{IsOptional::Mandatory, Parser as SqlParser, ParserError}, + tokenizer::{Token, Tokenizer}, +}; +use table_engine::ANALYTIC_ENGINE_TYPE; + +use crate::ast::{ + AlterAddColumn, AlterModifySetting, CreateTable, DescribeTable, DropTable, ExistsTable, + ShowCreate, ShowCreateObject, Statement, +}; + +define_result!(ParserError); + +// Use `Parser::expected` instead, if possible +macro_rules! parser_err { + ($MSG:expr) => { + Err(ParserError::ParserError($MSG.to_string())) + }; +} + +const TS_KEY: &str = "__ts_key"; +const TAG: &str = "TAG"; +const COMMENT: &str = "COMMENT"; +const UNSIGN: &str = "UNSIGN"; +const MODIFY: &str = "MODIFY"; +const SETTING: &str = "SETTING"; + +macro_rules! is_custom_column { + ($name: ident) => { + paste! { + #[inline] + pub fn [](opt: &ColumnOption) -> bool { + match opt { + ColumnOption::DialectSpecific(tokens) => { + if let [Token::Word(word)] = &tokens[..] { + return word.value == $name; + } + } + _ => return false, + } + return false; + } + + } + }; +} + +is_custom_column!(TAG); +is_custom_column!(UNSIGN); + +/// Get the comment from the [`ColumnOption`] if it is a comment option. +pub fn get_column_comment(opt: &ColumnOption) -> Option { + if let ColumnOption::DialectSpecific(tokens) = opt { + if let [Token::Word(keyword), Token::SingleQuotedString(comment)] = &tokens[..] 
{ + if keyword.value == COMMENT { + return Some(comment.clone()); + } + } + } + + None +} + +/// Returns true when is a TIMESTAMP KEY table constraint +pub fn is_timestamp_key_constraint(constrait: &TableConstraint) -> bool { + if let TableConstraint::Unique { + name: Some(Ident { + value, + quote_style: None, + }), + columns: _, + is_primary: false, + } = constrait + { + return value == TS_KEY; + } + false +} + +/// SQL Parser with ceresdb dialect support +pub struct Parser<'a> { + parser: SqlParser<'a>, +} + +impl<'a> Parser<'a> { + // Parse the specified tokens with dialect + fn new_with_dialect(sql: &str, dialect: &'a dyn Dialect) -> Result { + let mut tokenizer = Tokenizer::new(dialect, sql); + let tokens = tokenizer.tokenize()?; + + Ok(Parser { + parser: SqlParser::new(tokens, dialect), + }) + } + + /// Parse a SQL statement and produce a set of statements + pub fn parse_sql(sql: &str) -> Result> { + // Use MySqlDialect, so we can support "`" and chinese characters. + let dialect = &MySqlDialect {}; + let mut parser = Parser::new_with_dialect(sql, dialect)?; + let mut stmts = Vec::new(); + let mut expecting_statement_delimiter = false; + loop { + // ignore empty statements (between successive statement delimiters) + while parser.parser.consume_token(&Token::SemiColon) { + expecting_statement_delimiter = false; + } + + if parser.parser.peek_token() == Token::EOF { + break; + } + if expecting_statement_delimiter { + return parser.expected("end of statement", parser.parser.peek_token()); + } + + let statement = parser.parse_statement()?; + stmts.push(statement); + expecting_statement_delimiter = true; + } + + debug!("Parser parsed sql, sql:{}, stmts:{:#?}", sql, stmts); + + Ok(stmts) + } + + // Report unexpected token + fn expected(&self, expected: &str, found: Token) -> Result { + parser_err!(format!("Expected {}, found: {}", expected, found)) + } + + // Parse a new expression + fn parse_statement(&mut self) -> Result { + match self.parser.peek_token() { + Token::Word(w) => { + match w.keyword { + Keyword::CREATE => { + // Move one token forward + self.parser.next_token(); + // Use custom parse + self.parse_create() + } + Keyword::DROP => { + // Move one token forward + self.parser.next_token(); + // Use custom parse + self.parse_drop() + } + Keyword::DESCRIBE | Keyword::DESC => { + self.parser.next_token(); + self.parse_describe() + } + Keyword::ALTER => { + self.parser.next_token(); + self.parse_alter() + } + Keyword::SHOW => { + self.parser.next_token(); + self.parse_show() + } + Keyword::EXISTS => { + self.parser.next_token(); + self.parse_exists() + } + _ => { + // use the native parser + Ok(Statement::Standard(Box::new( + self.parser.parse_statement()?, + ))) + } + } + } + _ => { + // use the native parser + Ok(Statement::Standard(Box::new( + self.parser.parse_statement()?, + ))) + } + } + } + + pub fn parse_alter(&mut self) -> Result { + let nth1_token = self.parser.peek_token(); + let nth2_token = self.parser.peek_nth_token(2); + let nth3_token = self.parser.peek_nth_token(3); + if let (Token::Word(nth1_word), Token::Word(nth2_word), Token::Word(nth3_word)) = + (nth1_token, nth2_token, nth3_token) + { + // example: ALTER TABLE test_ttl modify SETTING ttl='8d' + if let (Keyword::TABLE, MODIFY, SETTING) = ( + nth1_word.keyword, + nth2_word.value.to_uppercase().as_str(), + nth3_word.value.to_uppercase().as_str(), + ) { + return self.parse_alter_modify_setting(); + } + // examples: + // ALTER TABLE test_table ADD COLUMN col_17 STRING TAG + // ALTER TABLE test_table ADD COLUMN 
(col_18 STRING TAG, col_19 UNIT64) + if let (Keyword::TABLE, Keyword::ADD, Keyword::COLUMN) = + (nth1_word.keyword, nth2_word.keyword, nth3_word.keyword) + { + return self.parse_alter_add_column(); + } + } + Ok(Statement::Standard(Box::new(self.parser.parse_alter()?))) + } + + pub fn parse_show(&mut self) -> Result { + if self + .parser + .parse_one_of_keywords(&[Keyword::CREATE]) + .is_some() + { + Ok(self.parse_show_create()?) + } else { + self.expected("create", self.parser.peek_token()) + } + } + + fn parse_show_create(&mut self) -> Result { + let obj_type = match self.parser.expect_one_of_keywords(&[Keyword::TABLE])? { + Keyword::TABLE => Ok(ShowCreateObject::Table), + keyword => Err(ParserError::ParserError(format!( + "Unable to map keyword to ShowCreateObject: {:?}", + keyword + ))), + }?; + + let obj_name = self.parser.parse_object_name()?; + + Ok(Statement::ShowCreate(ShowCreate { obj_type, obj_name })) + } + + fn parse_alter_add_column(&mut self) -> Result { + self.parser.expect_keyword(Keyword::TABLE)?; + let table_name = self.parser.parse_object_name()?; + self.parser + .expect_keywords(&[Keyword::ADD, Keyword::COLUMN])?; + let (mut columns, _) = self.parse_columns()?; + if columns.is_empty() { + let column_def = self.parse_column_def()?; + columns.push(column_def); + } + Ok(Statement::AlterAddColumn(AlterAddColumn { + table_name, + columns, + })) + } + + fn parse_alter_modify_setting(&mut self) -> Result { + self.parser.expect_keyword(Keyword::TABLE)?; + let table_name = self.parser.parse_object_name()?; + if self.consume_token(MODIFY) && self.consume_token(SETTING) { + let options = self + .parser + .parse_comma_separated(SqlParser::parse_sql_option)?; + Ok(Statement::AlterModifySetting(AlterModifySetting { + table_name, + options, + })) + } else { + unreachable!() + } + } + + pub fn parse_describe(&mut self) -> Result { + let _ = self.parser.parse_keyword(Keyword::TABLE); + let table_name = self.parser.parse_object_name()?; + Ok(Statement::Describe(DescribeTable { table_name })) + } + + // Parse a SQL CREATE statement + pub fn parse_create(&mut self) -> Result { + self.parser.expect_keyword(Keyword::TABLE)?; + let if_not_exists = + self.parser + .parse_keywords(&[Keyword::IF, Keyword::NOT, Keyword::EXISTS]); + let table_name = self.parser.parse_object_name()?; + let (columns, constraints) = self.parse_columns()?; + let engine = self.parse_table_engine()?; + let options = self.parser.parse_options(Keyword::WITH)?; + + Ok(Statement::Create(CreateTable { + if_not_exists, + name: table_name, + columns, + engine, + constraints, + options, + })) + } + + pub fn parse_drop(&mut self) -> Result { + self.parser.expect_keyword(Keyword::TABLE)?; + let if_exists = self.parser.parse_keywords(&[Keyword::IF, Keyword::EXISTS]); + let table_name = self.parser.parse_object_name()?; + let engine = self.parse_table_engine()?; + + Ok(Statement::Drop(DropTable { + name: table_name, + if_exists, + engine, + })) + } + + pub fn parse_exists(&mut self) -> Result { + let _ = self.parser.parse_keyword(Keyword::TABLE); + let table_name = self.parser.parse_object_name()?; + Ok(Statement::Exists(ExistsTable { table_name })) + } + + // Copy from sqlparser + fn parse_columns(&mut self) -> Result<(Vec, Vec)> { + let mut columns = vec![]; + let mut constraints = vec![]; + if !self.parser.consume_token(&Token::LParen) || self.parser.consume_token(&Token::RParen) { + return Ok((columns, constraints)); + } + + loop { + if let Some(constraint) = self.parse_optional_table_constraint()? 
{ + constraints.push(constraint); + } else if let Token::Word(_) = self.parser.peek_token() { + columns.push(self.parse_column_def()?); + } else { + return self.expected( + "column name or constraint definition", + self.parser.peek_token(), + ); + } + let comma = self.parser.consume_token(&Token::Comma); + if self.parser.consume_token(&Token::RParen) { + // allow a trailing comma, even though it's not in standard + break; + } else if !comma { + return self.expected( + "',' or ')' after column definition", + self.parser.peek_token(), + ); + } + } + + Ok((columns, constraints)) + } + + /// Parses the set of valid formats + fn parse_table_engine(&mut self) -> Result { + // TODO make ENGINE as a keyword + if !self.consume_token("ENGINE") { + return Ok(ANALYTIC_ENGINE_TYPE.to_string()); + } + + self.parser.expect_token(&Token::Eq)?; + + match self.parser.next_token() { + Token::Word(w) => Ok(w.value), + unexpected => self.expected("Engine is missing", unexpected), + } + } + + // Copy from sqlparser + fn parse_column_def(&mut self) -> Result { + let name = self.parser.parse_identifier()?; + let data_type = self.parser.parse_data_type()?; + let collation = if self.parser.parse_keyword(Keyword::COLLATE) { + Some(self.parser.parse_object_name()?) + } else { + None + }; + let mut options = vec![]; + loop { + if self.parser.parse_keyword(Keyword::CONSTRAINT) { + let name = Some(self.parser.parse_identifier()?); + if let Some(option) = self.parse_optional_column_option()? { + options.push(ColumnOptionDef { name, option }); + } else { + return self.expected( + "constraint details after CONSTRAINT ", + self.parser.peek_token(), + ); + } + } else if let Some(option) = self.parse_optional_column_option()? { + options.push(ColumnOptionDef { name: None, option }); + } else { + break; + }; + } + Ok(ColumnDef { + name, + data_type, + collation, + options, + }) + } + + // Copy from sqlparser by boyan + fn parse_optional_table_constraint(&mut self) -> Result> { + let name = if self.parser.parse_keyword(Keyword::CONSTRAINT) { + Some(self.parser.parse_identifier()?) + } else { + None + }; + match self.parser.next_token() { + Token::Word(w) if w.keyword == Keyword::PRIMARY => { + self.parser.expect_keyword(Keyword::KEY)?; + let columns = self.parser.parse_parenthesized_column_list(Mandatory)?; + Ok(Some(TableConstraint::Unique { + name, + columns, + is_primary: true, + })) + } + Token::Word(w) if w.keyword == Keyword::TIMESTAMP => { + self.parser.expect_keyword(Keyword::KEY)?; + let columns = self.parser.parse_parenthesized_column_list(Mandatory)?; + // TODO(boyan), TableConstraint doesn't support dialect right now + // we use unique constraint as TIMESTAMP KEY constraint. 
+ Ok(Some(TableConstraint::Unique { + name: Some(Ident { + value: TS_KEY.to_owned(), + quote_style: None, + }), + columns, + is_primary: false, + })) + } + unexpected => { + if name.is_some() { + self.expected("PRIMARY, TIMESTAMP", unexpected) + } else { + self.parser.prev_token(); + Ok(None) + } + } + } + } + + // Copy from sqlparser by boyan + fn parse_optional_column_option(&mut self) -> Result> { + if self.parser.parse_keywords(&[Keyword::NOT, Keyword::NULL]) { + Ok(Some(ColumnOption::NotNull)) + } else if self.parser.parse_keyword(Keyword::NULL) { + Ok(Some(ColumnOption::Null)) + } else if self.parser.parse_keyword(Keyword::DEFAULT) { + Ok(Some(ColumnOption::Default(self.parser.parse_expr()?))) + } else if self + .parser + .parse_keywords(&[Keyword::PRIMARY, Keyword::KEY]) + { + Ok(Some(ColumnOption::Unique { is_primary: true })) + } else if self.consume_token(TAG) { + // Support TAG for ceresdbx + Ok(Some(ColumnOption::DialectSpecific(vec![ + Token::make_keyword(TAG), + ]))) + } else if self.consume_token(UNSIGN) { + // Support unsign for ceresdbx + Ok(Some(ColumnOption::DialectSpecific(vec![ + Token::make_keyword(UNSIGN), + ]))) + } else if self.consume_token(COMMENT) { + let comment = self.parser.parse_literal_string()?; + Ok(Some(ColumnOption::DialectSpecific(vec![ + Token::make_keyword(COMMENT), + Token::SingleQuotedString(comment), + ]))) + } else { + Ok(None) + } + } + + fn consume_token(&mut self, expected: &str) -> bool { + if self.parser.peek_token().to_string().to_uppercase() == *expected.to_uppercase() { + self.parser.next_token(); + true + } else { + false + } + } +} + +#[cfg(test)] +mod tests { + use sqlparser::ast::{DataType, Ident, ObjectName, Value}; + + use super::*; + + fn expect_parse_ok(sql: &str, expected: Statement) -> Result<()> { + let statements = Parser::parse_sql(sql)?; + assert_eq!( + statements.len(), + 1, + "Expected to parse exactly one statement" + ); + assert_eq!(statements[0], expected); + Ok(()) + } + + /// Parses sql and asserts that the expected error message was found + fn expect_parse_error(sql: &str, expected_error: &str) { + match Parser::parse_sql(sql) { + Ok(statements) => { + panic!( + "Expected parse error for '{}', but was successful: {:?}", + sql, statements + ); + } + Err(e) => { + let error_message = e.to_string(); + assert!( + error_message.contains(expected_error), + "Expected error '{}' not found in actual error '{}'", + expected_error, + error_message + ); + } + } + } + + fn make_column_def(name: impl Into, data_type: DataType) -> ColumnDef { + ColumnDef { + name: Ident { + value: name.into(), + quote_style: None, + }, + data_type, + collation: None, + options: vec![], + } + } + + fn make_tag_column_def(name: impl Into, data_type: DataType) -> ColumnDef { + ColumnDef { + name: Ident { + value: name.into(), + quote_style: None, + }, + data_type, + collation: None, + options: vec![ColumnOptionDef { + name: None, + option: ColumnOption::DialectSpecific(vec![Token::make_keyword(TAG)]), + }], + } + } + + fn make_object_name(name: impl Into) -> ObjectName { + ObjectName(vec![Ident::new(name)]) + } + + #[test] + fn create_table() { + // positive case + let sql = "CREATE TABLE IF NOT EXISTS t(c1 double)"; + let expected = Statement::Create(CreateTable { + if_not_exists: true, + name: make_object_name("t"), + columns: vec![make_column_def("c1", DataType::Double)], + engine: table_engine::ANALYTIC_ENGINE_TYPE.to_string(), + constraints: vec![], + options: vec![], + }); + expect_parse_ok(sql, expected).unwrap(); + + // positive case, multiple 
columns + let sql = "CREATE TABLE mytbl(c1 timestamp, c2 double, c3 string,) ENGINE = XX"; + let expected = Statement::Create(CreateTable { + if_not_exists: false, + name: make_object_name("mytbl"), + columns: vec![ + make_column_def("c1", DataType::Timestamp), + make_column_def("c2", DataType::Double), + make_column_def("c3", DataType::String), + ], + engine: "XX".to_string(), + constraints: vec![], + options: vec![], + }); + expect_parse_ok(sql, expected).unwrap(); + + // Error cases: Invalid sql + let sql = "CREATE TABLE t(c1 timestamp) AS"; + expect_parse_error( + sql, + "sql parser error: Expected end of statement, found: AS", + ); + } + + #[test] + fn test_unsign_tag_column() { + let sql = "CREATE TABLE IF NOT EXISTS t(c1 string tag, c2 float, c3 bigint unsign)"; + let statements = Parser::parse_sql(sql).unwrap(); + assert_eq!(statements.len(), 1); + match &statements[0] { + Statement::Create(v) => { + let columns = &v.columns; + assert_eq!(3, columns.len()); + for c in columns { + if c.name.value == "c1" { + assert_eq!(1, c.options.len()); + let opt = &c.options[0]; + assert!(is_tag_column(&opt.option)); + } else if c.name.value == "c2" { + assert_eq!(0, c.options.len()); + } else if c.name.value == "c3" { + assert_eq!(1, c.options.len()); + let opt = &c.options[0]; + assert!(is_unsign_column(&opt.option)); + } else { + panic!("failed"); + } + } + } + _ => panic!("failed"), + } + } + + #[test] + fn test_comment_column() { + let sql = "CREATE TABLE IF NOT EXISTS t(c1 string, c2 float, c3 bigint comment 'id')"; + let statements = Parser::parse_sql(sql).unwrap(); + assert_eq!(statements.len(), 1); + match &statements[0] { + Statement::Create(v) => { + let columns = &v.columns; + assert_eq!(3, columns.len()); + for c in columns { + if c.name.value == "c3" { + assert_eq!(1, c.options.len()); + let opt = &c.options[0]; + let comment = get_column_comment(&opt.option).unwrap(); + assert_eq!("id", comment); + } + } + } + _ => panic!("failed"), + } + } + + #[test] + fn test_timestamp_key_constraint() { + let sql = "CREATE TABLE IF NOT EXISTS t(c1 TIMESTAMP, TIMESTAMP key(c1))"; + let statements = Parser::parse_sql(sql).unwrap(); + assert_eq!(statements.len(), 1); + match &statements[0] { + Statement::Create(v) => { + let constraints = &v.constraints; + assert_eq!(1, constraints.len()); + assert!(is_timestamp_key_constraint(&constraints[0])); + } + _ => panic!("failed"), + } + } + + #[test] + fn create_table_engine() { + let sql = "CREATE TABLE IF NOT EXISTS t(c1 double)"; + let statements = Parser::parse_sql(sql).unwrap(); + assert_eq!(statements.len(), 1); + match &statements[0] { + Statement::Create(v) => { + assert_eq!(v.engine, table_engine::ANALYTIC_ENGINE_TYPE.to_string()) + } + _ => panic!("failed"), + } + + let sql = "CREATE TABLE IF NOT EXISTS t(c1 double) ENGINE = XX"; + let statements = Parser::parse_sql(sql).unwrap(); + assert_eq!(statements.len(), 1); + match &statements[0] { + Statement::Create(v) => assert_eq!(v.engine, "XX".to_string()), + _ => panic!("failed"), + } + + let sql = "CREATE TABLE IF NOT EXISTS t(c1 double) engine = XX2"; + let statements = Parser::parse_sql(sql).unwrap(); + assert_eq!(statements.len(), 1); + match &statements[0] { + Statement::Create(v) => assert_eq!(v.engine, "XX2".to_string()), + _ => panic!("failed"), + } + } + + #[test] + fn test_alter_table_option() { + let sql = "ALTER TABLE test_ttl modify SETTING arena_block_size='1k';"; + let statements = Parser::parse_sql(sql).unwrap(); + assert_eq!(statements.len(), 1); + match &statements[0] { + 
Statement::AlterModifySetting(v) => { + assert_eq!(v.table_name.to_string(), "test_ttl".to_string()); + assert_eq!(v.options.len(), 1); + assert_eq!(v.options[0].name.value, "arena_block_size".to_string()); + assert_eq!( + v.options[0].value, + Value::SingleQuotedString("1k".to_string()) + ); + } + _ => panic!("failed"), + } + } + + #[test] + fn test_alter_table_column() { + { + let sql = "ALTER TABLE t ADD COLUMN (c1 DOUBLE, c2 STRING)"; + let expected = Statement::AlterAddColumn(AlterAddColumn { + table_name: make_object_name("t"), + columns: vec![ + make_column_def("c1", DataType::Double), + make_column_def("c2", DataType::String), + ], + }); + expect_parse_ok(sql, expected).unwrap(); + } + + { + let sql = "ALTER TABLE t ADD COLUMN c1 DOUBLE"; + let expected = Statement::AlterAddColumn(AlterAddColumn { + table_name: make_object_name("t"), + columns: vec![make_column_def("c1", DataType::Double)], + }); + expect_parse_ok(sql, expected).unwrap(); + } + } + + #[test] + fn test_alter_table_tag_column() { + { + let sql = "ALTER TABLE t ADD COLUMN (c1 DOUBLE, c2 STRING tag)"; + let expected = Statement::AlterAddColumn(AlterAddColumn { + table_name: make_object_name("t"), + columns: vec![ + make_column_def("c1", DataType::Double), + make_tag_column_def("c2", DataType::String), + ], + }); + expect_parse_ok(sql, expected).unwrap(); + } + + { + let sql = "ALTER TABLE t ADD COLUMN c1 string tag"; + let expected = Statement::AlterAddColumn(AlterAddColumn { + table_name: make_object_name("t"), + columns: vec![make_tag_column_def("c1", DataType::String)], + }); + expect_parse_ok(sql, expected).unwrap(); + } + } + + #[test] + fn test_drop_table() { + let sql = "drop table test_ttl"; + let statements = Parser::parse_sql(sql).unwrap(); + assert_eq!(statements.len(), 1); + match &statements[0] { + Statement::Drop(DropTable { + name, + if_exists, + engine, + }) => { + assert_eq!(name.to_string(), "test_ttl".to_string()); + assert!(!if_exists); + assert_eq!(*engine, ANALYTIC_ENGINE_TYPE.to_string()); + } + _ => panic!("failed"), + } + + let sql = "drop table if exists test_ttl"; + let statements = Parser::parse_sql(sql).unwrap(); + assert_eq!(statements.len(), 1); + match &statements[0] { + Statement::Drop(DropTable { + name, + if_exists, + engine, + }) => { + assert_eq!(name.to_string(), "test_ttl".to_string()); + assert!(if_exists); + assert_eq!(*engine, ANALYTIC_ENGINE_TYPE.to_string()); + } + _ => panic!("failed"), + } + } + + #[test] + fn test_exists_table() { + { + let sql = "EXISTS TABLE xxx_table"; + let expected = Statement::Exists(ExistsTable { + table_name: make_object_name("xxx_table"), + }); + expect_parse_ok(sql, expected).unwrap(); + } + + { + let sql = "EXISTS xxx_table"; + let expected = Statement::Exists(ExistsTable { + table_name: make_object_name("xxx_table"), + }); + expect_parse_ok(sql, expected).unwrap() + } + } +} diff --git a/sql/src/plan.rs b/sql/src/plan.rs new file mode 100644 index 0000000000..25c9fe9874 --- /dev/null +++ b/sql/src/plan.rs @@ -0,0 +1,158 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Logical plans such as select/insert/update/delete + +use std::{ + collections::{BTreeMap, HashMap}, + fmt, + fmt::{Debug, Formatter}, + sync::Arc, +}; + +use arrow_deps::datafusion::logical_plan::LogicalPlan as DataFusionLogicalPlan; +use common_types::{column_schema::ColumnSchema, row::RowGroup, schema::Schema}; +use common_util::define_result; +use snafu::Snafu; +use table_engine::table::TableRef; + +use crate::{ast::ShowCreateObject, container::TableContainer}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Unsupported alter table operation."))] + UnsupportedOperation, + + #[snafu(display("Unsupported column data type, err:{}.", source))] + UnsupportedDataType { source: common_types::datum::Error }, + + #[snafu(display("Unsupported column option:{}.", name))] + UnsupportedColumnOption { name: String }, + + #[snafu(display("Alter primary key is not allowed."))] + AlterPrimaryKey, +} + +define_result!(Error); + +// TODO(yingwen): Custom Debug format +/// Logical plan to be processed by interpreters +#[derive(Debug)] +pub enum Plan { + /// A SQL SELECT plan or other plans related to query + Query(QueryPlan), + // TODO(yingwen): Other sql command + Insert(InsertPlan), + /// Create table plan + Create(CreateTablePlan), + /// Drop table plan + Drop(DropTablePlan), + /// Describe table plan + Describe(DescribeTablePlan), + /// Alter table plan + AlterTable(AlterTablePlan), + /// Show create plan + ShowCreate(ShowCreatePlan), + /// Exists table + Exists(ExistsTablePlan), +} + +pub struct QueryPlan { + pub df_plan: DataFusionLogicalPlan, + // Contains the TableProviders so we can register the them to ExecutionContext later. + // Use TableProviderAdapter here so we can get the underlying TableRef and also be + // able to cast to Arc + pub tables: Arc, +} + +impl Debug for QueryPlan { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("QueryPlan") + .field("df_plan", &self.df_plan) + .finish() + } +} + +pub struct CreateTablePlan { + /// Engine + pub engine: String, + /// Create table if not exists + pub if_not_exists: bool, + /// Table name + pub table: String, + /// Table schema + pub table_schema: Schema, + /// Table options + pub options: HashMap, +} + +impl Debug for CreateTablePlan { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + f.debug_struct("CreateTablePlan") + .field("engine", &self.engine) + .field("if_not_exists", &self.if_not_exists) + .field("table", &self.table) + .field("table_schema", &self.table_schema) + .field( + "options", + &self + .options + .iter() + .map(|(k, v)| (k.clone(), v.clone())) + .collect::>(), + ) + .finish() + } +} + +#[derive(Debug)] +pub struct DropTablePlan { + /// Engine + pub engine: String, + /// If exists + pub if_exists: bool, + /// Table name + pub table: String, +} + +/// Insert logical plan +#[derive(Debug)] +pub struct InsertPlan { + /// The table to insert + pub table: TableRef, + /// RowGroup to insert + pub rows: RowGroup, +} + +#[derive(Debug)] +pub struct DescribeTablePlan { + /// The table to describe + pub table: TableRef, +} + +#[derive(Debug)] +pub enum AlterTableOperation { + /// Add a new column, the column id will be ignored. + AddColumn(Vec), + ModifySetting(HashMap), +} + +#[derive(Debug)] +pub struct AlterTablePlan { + /// The table to alter. + pub table: TableRef, + // TODO(yingwen): Maybe use smallvec. + pub operations: AlterTableOperation, +} + +#[derive(Debug)] +pub struct ShowCreatePlan { + /// The table to show. 
+ pub table: TableRef, + /// The type to show + pub obj_type: ShowCreateObject, +} + +#[derive(Debug)] +pub struct ExistsTablePlan { + pub exists: bool, +} diff --git a/sql/src/planner.rs b/sql/src/planner.rs new file mode 100644 index 0000000000..5bc467c5c5 --- /dev/null +++ b/sql/src/planner.rs @@ -0,0 +1,1277 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Planner converts a SQL AST into logical plans + +use std::{ + collections::{BTreeMap, HashMap}, + convert::TryFrom, + mem, + sync::Arc, +}; + +use arrow_deps::datafusion::{error::DataFusionError, sql::planner::SqlToRel}; +use common_types::{ + column_schema::{self, ColumnSchema}, + datum::{Datum, DatumKind}, + request_id::RequestId, + row::{RowGroup, RowGroupBuilder}, + schema::{self, Schema, TSID_COLUMN}, +}; +use log::debug; +use snafu::{ensure, Backtrace, OptionExt, ResultExt, Snafu}; +use sqlparser::ast::{ + ColumnDef, ColumnOption, Expr, ObjectName, Query, SetExpr, SqlOption, + Statement as SqlStatement, TableConstraint, Value, Values, +}; +use table_engine::table::TableRef; + +use crate::{ + ast::{ + AlterAddColumn, AlterModifySetting, CreateTable, DescribeTable, DropTable, ExistsTable, + ShowCreate, Statement, + }, + container::TableReference, + parser, + plan::{ + AlterTableOperation, AlterTablePlan, CreateTablePlan, DescribeTablePlan, DropTablePlan, + ExistsTablePlan, InsertPlan, Plan, QueryPlan, ShowCreatePlan, + }, + promql::{ColumnNames, Expr as PromExpr}, + provider::{ContextProviderAdapter, MetaProvider}, +}; + +// We do not carry backtrace in sql error because it is mainly used in server +// handler and the error is usually caused by invalid/unsupported sql, which +// should be easy to find out the reason. +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("DataFusion Failed to plan, err:{}", source))] + DataFusionPlan { source: DataFusionError }, + + // Statement is too large and complicate to carry in Error, so we + // only return error here, so the caller should attach sql to its + // error context + #[snafu(display("Unsupported SQL statement"))] + UnsupportedStatement, + + #[snafu(display("Create table name is empty"))] + CreateTableNameEmpty, + + #[snafu(display("Table must contain timestamp constraint"))] + RequireTimestamp, + + #[snafu(display( + "Table must contain only one timestamp key and it's data type must be TIMESTAMP" + ))] + InvalidTimetampKey, + + #[snafu(display("Invalid unsign type: {}.\nBacktrace:\n{}", kind, backtrace))] + InvalidUnsignType { + kind: DatumKind, + backtrace: Backtrace, + }, + + #[snafu(display("Primary key not found, column name:{}", name))] + PrimaryKeyNotFound { name: String }, + + #[snafu(display("Tag column not found, name:{}", name))] + TagColumnNotFound { name: String }, + + #[snafu(display("Timestamp column not found, name:{}", name))] + TimestampColumnNotFound { name: String }, + + #[snafu(display("{} is a reserved column name", name))] + ColumnNameReserved { name: String }, + + #[snafu(display("Invalid create table name, err:{}", source))] + InvalidCreateTableName { source: DataFusionError }, + + #[snafu(display("Failed to build schema, err:{}", source))] + BuildTableSchema { source: common_types::schema::Error }, + + #[snafu(display("Unsupported SQL data type, err:{}", source))] + UnsupportedDataType { source: common_types::datum::Error }, + + #[snafu(display("Invalid column schema, column_name:{}, err:{}", column_name, source))] + InvalidColumnSchema { + column_name: String, + source: column_schema::Error, + }, + + 
#[snafu(display("Invalid table name, err:{}", source))] + InvalidTableName { source: DataFusionError }, + + #[snafu(display("Table not found, table:{}", name))] + TableNotFound { name: String }, + + #[snafu(display("Column is not null, table:{}, column:{}", table, column))] + InsertMissingColumn { table: String, column: String }, + + #[snafu(display("Column is reserved, table:{}, column:{}", table, column))] + InsertReservedColumn { table: String, column: String }, + + #[snafu(display("Unknown insert column, name:{}", name))] + UnknownInsertColumn { name: String }, + + #[snafu(display("Insert values not enough, len:{}, index:{}", len, index))] + InsertValuesNotEnough { len: usize, index: usize }, + + #[snafu(display("Invalid insert stmt, contains duplicate columns"))] + InsertDuplicateColumns, + + #[snafu(display("Invalid insert stmt, source should be a set"))] + InsertSourceBodyNotSet, + + #[snafu(display("Invalid insert stmt, source expr is not value"))] + InsertExprNotValue, + + #[snafu(display("Insert Failed to convert value, err:{}", source))] + InsertConvertValue { source: common_types::datum::Error }, + + #[snafu(display("Failed to build row, err:{}", source))] + BuildRow { source: common_types::row::Error }, + + #[snafu(display("MetaProvider Failed to find table, err:{}", source))] + MetaProviderFindTable { source: crate::provider::Error }, + + #[snafu(display("Failed to find meta during planning, err:{}", source))] + FindMeta { source: crate::provider::Error }, + + #[snafu(display("Invalid alter table operation, err:{}", source))] + InvalidAlterTableOperation { source: crate::plan::Error }, + + #[snafu(display("Unsupported sql option, value:{}", value))] + UnsupportedOption { value: String }, + + #[snafu(display("Failed to build plan from promql, error:{}", source))] + BuildPromPlanError { source: crate::promql::Error }, +} + +define_result!(Error); + +/// Planner produces logical plans from SQL AST +// TODO(yingwen): Rewrite Planner instead of using datafusion's planner +pub struct Planner<'a, P: MetaProvider> { + provider: &'a P, + request_id: RequestId, + read_parallelism: usize, +} + +impl<'a, P: MetaProvider> Planner<'a, P> { + /// Create a new logical planner + pub fn new(provider: &'a P, request_id: RequestId, read_parallelism: usize) -> Self { + Self { + provider, + request_id, + read_parallelism, + } + } + + /// Create a logical plan from Statement + /// + /// Takes the ownership of statement because some statements like INSERT + /// statements contains lots of data + pub fn statement_to_plan(&self, statement: Statement) -> Result { + let adapter = + ContextProviderAdapter::new(self.provider, self.request_id, self.read_parallelism); + // SqlToRel needs to hold the reference to adapter, thus we can't both holds the + // adapter and the SqlToRel in Planner, which is a self-referential + // case. We wrap a PlannerDelegate to workaround this and avoid the usage of + // pin. 
+ let planner = PlannerDelegate::new(adapter); + + match statement { + Statement::Standard(s) => planner.sql_statement_to_plan(*s), + Statement::Create(s) => planner.create_table_to_plan(s), + Statement::Drop(s) => planner.drop_table_to_plan(s), + Statement::Describe(s) => planner.describe_table_to_plan(s), + Statement::AlterModifySetting(s) => planner.alter_modify_setting_to_plan(s), + Statement::AlterAddColumn(s) => planner.alter_add_column_to_plan(s), + Statement::ShowCreate(s) => planner.show_create_to_plan(s), + Statement::Exists(s) => planner.exists_table_to_plan(s), + } + } + + pub fn promql_expr_to_plan(&self, expr: PromExpr) -> Result<(Plan, Arc)> { + let adapter = + ContextProviderAdapter::new(self.provider, self.request_id, self.read_parallelism); + // SqlToRel needs to hold the reference to adapter, thus we can't both holds the + // adapter and the SqlToRel in Planner, which is a self-referential + // case. We wrap a PlannerDelegate to workaround this and avoid the usage of + // pin. + let planner = PlannerDelegate::new(adapter); + + expr.to_plan(planner.meta_provider, self.read_parallelism) + .context(BuildPromPlanError) + } +} + +/// A planner wraps the datafusion's logical planner, and delegate sql like +/// select/explain to datafusion's planner. +struct PlannerDelegate<'a, P: MetaProvider> { + meta_provider: ContextProviderAdapter<'a, P>, +} + +impl<'a, P: MetaProvider> PlannerDelegate<'a, P> { + fn new(meta_provider: ContextProviderAdapter<'a, P>) -> Self { + Self { meta_provider } + } + + fn sql_statement_to_plan(self, sql_stmt: SqlStatement) -> Result { + match sql_stmt { + // Query statement use datafusion planner + SqlStatement::Explain { .. } | SqlStatement::Query(_) => { + self.sql_statement_to_datafusion_plan(sql_stmt) + } + SqlStatement::Insert { .. } => self.insert_to_plan(sql_stmt), + _ => UnsupportedStatement.fail(), + } + } + + fn sql_statement_to_datafusion_plan(self, sql_stmt: SqlStatement) -> Result { + let df_planner = SqlToRel::new(&self.meta_provider); + + let df_plan = df_planner + .sql_statement_to_plan(&sql_stmt) + .context(DataFusionPlan)?; + + debug!("Sql statement to datafusion plan, df_plan:\n{:#?}", df_plan); + + // Get all tables needed in the plan + let tables = self.meta_provider.try_into_container().context(FindMeta)?; + + Ok(Plan::Query(QueryPlan { + df_plan, + tables: Arc::new(tables), + })) + } + + fn create_table_to_plan(&self, stmt: CreateTable) -> Result { + ensure!(!stmt.name.0.is_empty(), CreateTableNameEmpty); + + debug!("Create table to plan, stmt:{:?}", stmt); + + // TODO(yingwen): Maybe support create table on other schema? + let table_ref = TableReference::try_from(&stmt.name).context(InvalidCreateTableName)?; + + // Now we only takes the table name and ignore the schema and catalog name + let table = table_ref.table().to_string(); + + let mut schema_builder = + schema::Builder::with_capacity(stmt.columns.len()).auto_increment_column_id(true); + let mut name_column_map = BTreeMap::new(); + + // Build all column schemas. + for col in &stmt.columns { + name_column_map.insert(col.name.value.as_str(), parse_column(col)?); + } + + // Tsid column is a reserved column. 
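+ // For example, a statement like `CREATE TABLE t (tsid uint64, ...)` (illustrative) is
+ // rejected here, because the engine reserves the name for the tsid column it builds
+ // itself when no primary key constraint is declared (see the branch below).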
+ ensure!( + !name_column_map.contains_key(TSID_COLUMN), + ColumnNameReserved { + name: TSID_COLUMN.to_string(), + } + ); + + // Find timestamp key and primary key contraint + let mut primary_key_constraint_idx = None; + let mut timestamp_name = None; + for (idx, constraint) in stmt.constraints.iter().enumerate() { + if let TableConstraint::Unique { + columns, + is_primary, + .. + } = constraint + { + if *is_primary { + primary_key_constraint_idx = Some(idx); + } else if parser::is_timestamp_key_constraint(constraint) { + // Only one timestamp key constraint + ensure!(timestamp_name.is_none(), InvalidTimetampKey); + // Only one column in constraint + ensure!(columns.len() == 1, InvalidTimetampKey); + + let name = &columns[0].value; + let timestamp_column = name_column_map + .get(name as &str) + .context(TimestampColumnNotFound { name })?; + // Ensure type is timestamp + ensure!( + timestamp_column.data_type == DatumKind::Timestamp, + InvalidTimetampKey + ); + + timestamp_name = Some(name.clone()); + } + } + } + + // Timestamp column must be provided. + let timestamp_name = timestamp_name.context(RequireTimestamp)?; + + // Build primary key, the builder will check timestamp column is in primary key. + if let Some(idx) = primary_key_constraint_idx { + // If primary key is already provided, use that primary key. + if let TableConstraint::Unique { columns, .. } = &stmt.constraints[idx] { + for col in columns { + let key_column = name_column_map.remove(&*col.value).with_context(|| { + PrimaryKeyNotFound { + name: col.value.clone(), + } + })?; + // The schema builder will checks there is only one timestamp column in primary + // key. + schema_builder = schema_builder + .add_key_column(key_column) + .context(BuildTableSchema)?; + } + } + } else { + // If primary key is not set, Use (timestamp, tsid) as primary key. + let timestamp_column = name_column_map.remove(timestamp_name.as_str()).context( + TimestampColumnNotFound { + name: ×tamp_name, + }, + )?; + let column_schema = + column_schema::Builder::new(TSID_COLUMN.to_string(), DatumKind::UInt64) + .is_nullable(false) + .build() + .context(InvalidColumnSchema { + column_name: TSID_COLUMN, + })?; + schema_builder = schema_builder + .enable_tsid_primary_key(true) + .add_key_column(timestamp_column) + .context(BuildTableSchema)? + .add_key_column(column_schema) + .context(BuildTableSchema)?; + } + + // The key columns have been consumed. 
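+ // Whatever remains in `name_column_map` is a non-key column; since the map is a
+ // BTreeMap keyed by column name, the remaining columns are appended in name order.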
+ for col in name_column_map.into_values() { + schema_builder = schema_builder + .add_normal_column(col) + .context(BuildTableSchema)?; + } + + let table_schema = schema_builder.build().context(BuildTableSchema)?; + + let options = parse_options(stmt.options)?; + + let plan = CreateTablePlan { + engine: stmt.engine, + if_not_exists: stmt.if_not_exists, + table, + table_schema, + options, + }; + + debug!("Create table to plan, plan:{:?}", plan); + + Ok(Plan::Create(plan)) + } + + fn drop_table_to_plan(&self, stmt: DropTable) -> Result { + let table = if stmt.if_exists { + stmt.name.to_string() + } else { + self.find_table(stmt.name)?.name().to_string() + }; + + Ok(Plan::Drop(DropTablePlan { + engine: stmt.engine, + if_exists: stmt.if_exists, + table, + })) + } + + fn describe_table_to_plan(&self, stmt: DescribeTable) -> Result { + let table = self.find_table(stmt.table_name)?; + + Ok(Plan::Describe(DescribeTablePlan { table })) + } + + // REQUIRE: SqlStatement must be INSERT stmt + fn insert_to_plan(&self, sql_stmt: SqlStatement) -> Result { + match sql_stmt { + SqlStatement::Insert { + table_name, + columns, + source, + .. + } => { + let table = self.find_table(table_name)?; + + let schema = table.schema(); + // Column name and its index in insert stmt: {column name} => index + let column_names_idx: HashMap<_, _> = columns + .iter() + .enumerate() + .map(|(idx, ident)| (&ident.value, idx)) + .collect(); + ensure!( + column_names_idx.len() == columns.len(), + InsertDuplicateColumns + ); + + validate_insert_stmt(table.name(), &schema, &column_names_idx)?; + + // Index in insert values stmt of each column in table schema + let mut column_index_in_insert = Vec::with_capacity(schema.num_columns()); + + // Check all not null columns are provided in stmt, also init + // `column_index_in_insert` + for (idx, column) in schema.columns().iter().enumerate() { + if let Some(tsid_idx) = schema.index_of_tsid() { + if idx == tsid_idx { + // This is a tsid column. 
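+ // The tsid column cannot be listed in the INSERT statement itself (it is rejected by
+ // `validate_insert_stmt`), so mark it as auto-generated and let the interpreter fill
+ // in the real value later.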
+ column_index_in_insert.push(InsertMode::Auto); + continue; + } + } + match column_names_idx.get(&column.name) { + Some(idx_in_insert) => { + // This column in schema is also in insert stmt + column_index_in_insert.push(InsertMode::Direct(*idx_in_insert)); + } + None => { + // This column in schema is not in insert stmt + if column.is_nullable { + column_index_in_insert.push(InsertMode::Null); + } else { + // Column is not null and input does not contains that column + return InsertMissingColumn { + table: table.name(), + column: &column.name, + } + .fail(); + } + } + } + } + + let rows = build_row_group(schema, source, column_index_in_insert)?; + + Ok(Plan::Insert(InsertPlan { table, rows })) + } + // We already known this stmt is a INSERT stmt + _ => unreachable!(), + } + } + + fn alter_modify_setting_to_plan(&self, stmt: AlterModifySetting) -> Result { + let table = self.find_table(stmt.table_name)?; + let plan = AlterTablePlan { + table, + operations: AlterTableOperation::ModifySetting(parse_options(stmt.options)?), + }; + Ok(Plan::AlterTable(plan)) + } + + fn alter_add_column_to_plan(&self, stmt: AlterAddColumn) -> Result { + let table = self.find_table(stmt.table_name)?; + let plan = AlterTablePlan { + table, + operations: AlterTableOperation::AddColumn(parse_columns(stmt.columns)?), + }; + Ok(Plan::AlterTable(plan)) + } + + fn exists_table_to_plan(&self, stmt: ExistsTable) -> Result { + let table = self.find_table(stmt.table_name); + match table { + Ok(_) => Ok(Plan::Exists(ExistsTablePlan { exists: true })), + Err(_) => Ok(Plan::Exists(ExistsTablePlan { exists: false })), + } + } + + fn show_create_to_plan(&self, show_create: ShowCreate) -> Result { + let table = self.find_table(show_create.obj_name)?; + let plan = ShowCreatePlan { + table, + obj_type: show_create.obj_type, + }; + Ok(Plan::ShowCreate(plan)) + } + + fn find_table(&self, table_name: ObjectName) -> Result { + let table_ref = TableReference::try_from(&table_name).context(InvalidTableName)?; + + self.meta_provider + .table(table_ref) + .context(MetaProviderFindTable)? + .with_context(|| TableNotFound { + name: table_name.to_string(), + }) + } +} + +#[derive(Debug)] +enum InsertMode { + // Insert the value in expr with given index directly. + Direct(usize), + // No value provided, insert a null. + Null, + // Auto generated column, just temporary fill by default value, the real value will + // be filled by interpreter. 
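+ // (currently only the tsid column is mapped to this mode, see `insert_to_plan`)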
+ Auto, +} + +/// Build RowGroup +fn build_row_group( + schema: Schema, + source: Box, + column_index_in_insert: Vec, +) -> Result { + // Build row group by schema + match source.body { + SetExpr::Values(Values(values)) => { + let mut row_group_builder = + RowGroupBuilder::with_capacity(schema.clone(), values.len()); + for mut exprs in values { + // Try to build row + let mut row_builder = row_group_builder.row_builder(); + + // For each column in schema, append datum into row builder + for (index_opt, column_schema) in + column_index_in_insert.iter().zip(schema.columns()) + { + match index_opt { + InsertMode::Direct(index) => { + let exprs_len = exprs.len(); + let expr = exprs.get_mut(*index).context(InsertValuesNotEnough { + len: exprs_len, + index: *index, + })?; + + match expr { + Expr::Value(value) => { + let datum = Datum::try_from_sql_value( + &column_schema.data_type, + mem::replace(value, Value::Null), + ) + .context(InsertConvertValue)?; + row_builder = + row_builder.append_datum(datum).context(BuildRow)?; + } + _ => { + InsertExprNotValue.fail()?; + } + } + } + InsertMode::Null => { + // This is a null column + row_builder = + row_builder.append_datum(Datum::Null).context(BuildRow)?; + } + InsertMode::Auto => { + // This is an auto generated column, fill by default value. + let kind = &column_schema.data_type; + row_builder = row_builder + .append_datum(Datum::empty(kind)) + .context(BuildRow)?; + } + } + } + + // Finish this row and append into row group + row_builder.finish().context(BuildRow)?; + } + + // Build the whole row group + Ok(row_group_builder.build()) + } + _ => InsertSourceBodyNotSet.fail(), + } +} + +#[inline] +fn is_tsid_column(name: &str) -> bool { + name == TSID_COLUMN +} + +fn validate_insert_stmt( + table_name: &str, + schema: &Schema, + column_name_idx: &HashMap<&String, usize>, +) -> Result<()> { + for name in column_name_idx.keys() { + if is_tsid_column(name.as_str()) { + return Err(Error::InsertReservedColumn { + table: table_name.to_string(), + column: name.to_string(), + }); + } + schema.column_with_name(name).context(UnknownInsertColumn { + name: name.to_string(), + })?; + } + + Ok(()) +} + +fn parse_options(options: Vec) -> Result> { + let mut parsed_options = HashMap::with_capacity(options.len()); + + for option in options { + let key = option.name.value; + if let Some(value) = parse_for_option(option.value)? { + parsed_options.insert(key, value); + }; + } + + Ok(parsed_options) +} + +/// Parse value for sql option. +pub fn parse_for_option(value: Value) -> Result> { + let value_opt = match value { + Value::Number(n, _long) => Some(n), + Value::SingleQuotedString(v) | Value::DoubleQuotedString(v) => Some(v), + Value::NationalStringLiteral(v) | Value::HexStringLiteral(v) => { + return UnsupportedOption { value: v }.fail(); + } + Value::Boolean(v) => Some(v.to_string()), + Value::Interval { value, .. } => { + return UnsupportedOption { value }.fail(); + } + // Ignore this option if value is null. + Value::Null => None, + }; + + Ok(value_opt) +} + +fn parse_columns(cols: Vec) -> Result> { + let mut parsed_columns = Vec::with_capacity(cols.len()); + + // Build all column schemas. + for col in &cols { + parsed_columns.push(parse_column(col)?); + } + + Ok(parsed_columns) +} + +fn parse_column(col: &ColumnDef) -> Result { + let mut data_type = DatumKind::try_from(&col.data_type).context(UnsupportedDataType)?; + + // Process column options + let mut is_nullable = true; // A column is nullable by default. 
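+ // Illustrative example: for a column defined as `c1 string TAG NOT NULL`, the loop
+ // below sets is_tag = true and is_nullable = false; a COMMENT option, if present, is
+ // copied into `comment`.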
+ let mut is_tag = false; + let mut is_unsign = false; + let mut comment = String::new(); + for option_def in &col.options { + if matches!(option_def.option, ColumnOption::NotNull) { + is_nullable = false; + } else if parser::is_tag_column(&option_def.option) { + is_tag = true; + } else if parser::is_unsign_column(&option_def.option) { + is_unsign = true; + } else if let Some(v) = parser::get_column_comment(&option_def.option) { + comment = v; + } + } + + if is_unsign { + data_type = data_type + .unsign_kind() + .context(InvalidUnsignType { kind: data_type })?; + } + + let builder = column_schema::Builder::new(col.name.value.clone(), data_type) + .is_nullable(is_nullable) + .is_tag(is_tag) + .comment(comment); + + builder.build().context(InvalidColumnSchema { + column_name: &col.name.value, + }) +} + +#[cfg(test)] +mod tests { + use sqlparser::ast::Value; + + use super::*; + use crate::{ + parser::Parser, + planner::{parse_for_option, Planner}, + tests::MockMetaProvider, + }; + + fn quick_test(sql: &str, expected: &str) -> Result<()> { + let mock = MockMetaProvider::default(); + let planner = build_planner(&mock); + let mut statements = Parser::parse_sql(sql).unwrap(); + assert_eq!(statements.len(), 1); + let plan = planner.statement_to_plan(statements.remove(0))?; + assert_eq!(format!("{:#?}", plan), expected); + Ok(()) + } + + fn build_planner(provider: &MockMetaProvider) -> Planner { + Planner::new(provider, RequestId::next_id(), 1) + } + + #[test] + pub fn test_parse_for_option() { + let test_string = "aa".to_string(); + // input is_err expected + let test_cases = vec![ + ( + Value::Number("1000".to_string(), false), + false, + Some("1000".to_string()), + ), + ( + Value::SingleQuotedString(test_string.clone()), + false, + Some(test_string.clone()), + ), + ( + Value::DoubleQuotedString(test_string.clone()), + false, + Some(test_string.clone()), + ), + ( + Value::NationalStringLiteral(test_string.clone()), + true, + None, + ), + (Value::HexStringLiteral(test_string.clone()), true, None), + (Value::Boolean(true), false, Some("true".to_string())), + ( + Value::Interval { + value: test_string, + leading_field: None, + leading_precision: None, + last_field: None, + fractional_seconds_precision: None, + }, + true, + None, + ), + (Value::Null, false, None), + ]; + + for (input, is_err, expected) in test_cases { + let ret = parse_for_option(input); + assert_eq!(ret.is_err(), is_err); + if !is_err { + assert_eq!(ret.unwrap(), expected); + } + } + } + + #[test] + fn test_create_statement_to_plan() { + let sql = "CREATE TABLE IF NOT EXISTS t(c1 string tag not null,ts timestamp not null, c3 string, timestamp key(ts),primary key(c1, ts)) \ + ENGINE=Analytic WITH (ttl='70d',update_mode='overwrite',arena_block_size='1KB')"; + quick_test( + sql, + r#"Create( + CreateTablePlan { + engine: "Analytic", + if_not_exists: true, + table: "t", + table_schema: Schema { + num_key_columns: 2, + timestamp_index: 1, + tsid_index: None, + enable_tsid_primary_key: false, + column_schemas: ColumnSchemas { + columns: [ + ColumnSchema { + id: 1, + name: "c1", + data_type: String, + is_nullable: false, + is_tag: true, + comment: "", + }, + ColumnSchema { + id: 2, + name: "ts", + data_type: Timestamp, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 3, + name: "c3", + data_type: String, + is_nullable: true, + is_tag: false, + comment: "", + }, + ], + }, + version: 1, + }, + options: { + "arena_block_size": "1KB", + "ttl": "70d", + "update_mode": "overwrite", + }, + }, +)"#, + ) + 
.unwrap(); + } + + #[test] + fn test_query_statement_to_plan() { + let sql = "select * from test_tablex;"; + assert!(quick_test(sql, "").is_err()); + + let sql = "select * from test_table;"; + quick_test(sql, "Query( + QueryPlan { + df_plan: Projection: #test_table.key1, #test_table.key2, #test_table.field1, #test_table.field2 + TableScan: test_table projection=None, + }, +)").unwrap(); + } + + #[test] + fn test_insert_statement_to_plan() { + let sql = "INSERT INTO test_tablex(key1, key2, field1,field2) VALUES('tagk', 1638428434000,100, 'hello3');"; + assert!(quick_test(sql, "").is_err()); + + let sql = "INSERT INTO test_table(key1, key2, field1,field2) VALUES('tagk', 1638428434000,100, 'hello3');"; + quick_test( + sql, + r#"Insert( + InsertPlan { + table: MemoryTable { + name: "test_table", + id: TableId(100, 0, 100), + schema: Schema { + num_key_columns: 2, + timestamp_index: 1, + tsid_index: None, + enable_tsid_primary_key: false, + column_schemas: ColumnSchemas { + columns: [ + ColumnSchema { + id: 1, + name: "key1", + data_type: Varbinary, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 2, + name: "key2", + data_type: Timestamp, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 3, + name: "field1", + data_type: Double, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 4, + name: "field2", + data_type: String, + is_nullable: false, + is_tag: false, + comment: "", + }, + ], + }, + version: 1, + }, + }, + rows: RowGroup { + schema: Schema { + num_key_columns: 2, + timestamp_index: 1, + tsid_index: None, + enable_tsid_primary_key: false, + column_schemas: ColumnSchemas { + columns: [ + ColumnSchema { + id: 1, + name: "key1", + data_type: Varbinary, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 2, + name: "key2", + data_type: Timestamp, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 3, + name: "field1", + data_type: Double, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 4, + name: "field2", + data_type: String, + is_nullable: false, + is_tag: false, + comment: "", + }, + ], + }, + version: 1, + }, + rows: [ + Row { + cols: [ + Varbinary( + b"tagk", + ), + Timestamp( + Timestamp( + 1638428434000, + ), + ), + Double( + 100.0, + ), + String( + StringBytes( + b"hello3", + ), + ), + ], + }, + ], + min_timestamp: Timestamp( + 1638428434000, + ), + max_timestamp: Timestamp( + 1638428434000, + ), + }, + }, +)"#, + ) + .unwrap(); + } + + #[test] + fn test_drop_statement_to_plan() { + let sql = "drop table test_table;"; + quick_test( + sql, + r#"Drop( + DropTablePlan { + engine: "Analytic", + if_exists: false, + table: "test_table", + }, +)"#, + ) + .unwrap(); + + let sql = "drop table test_tablex;"; + assert!(quick_test(sql, "",).is_err()); + + let sql = "drop table if exists test_tablex;"; + quick_test( + sql, + r#"Drop( + DropTablePlan { + engine: "Analytic", + if_exists: true, + table: "test_tablex", + }, +)"#, + ) + .unwrap(); + } + + #[test] + fn test_desc_statement_to_plan() { + let sql = "desc test_tablex;"; + assert!(quick_test(sql, "").is_err()); + + let sql = "desc test_table;"; + quick_test( + sql, + r#"Describe( + DescribeTablePlan { + table: MemoryTable { + name: "test_table", + id: TableId(100, 0, 100), + schema: Schema { + num_key_columns: 2, + timestamp_index: 1, + tsid_index: None, + enable_tsid_primary_key: false, + column_schemas: ColumnSchemas { + columns: [ + ColumnSchema { + 
id: 1, + name: "key1", + data_type: Varbinary, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 2, + name: "key2", + data_type: Timestamp, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 3, + name: "field1", + data_type: Double, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 4, + name: "field2", + data_type: String, + is_nullable: false, + is_tag: false, + comment: "", + }, + ], + }, + version: 1, + }, + }, + }, +)"#, + ) + .unwrap(); + } + + #[test] + fn test_alter_column_statement_to_plan() { + let sql = "ALTER TABLE test_tablex ADD column add_col string;"; + assert!(quick_test(sql, "").is_err()); + + let sql = "ALTER TABLE test_table ADD column add_col string;"; + quick_test( + sql, + r#"AlterTable( + AlterTablePlan { + table: MemoryTable { + name: "test_table", + id: TableId(100, 0, 100), + schema: Schema { + num_key_columns: 2, + timestamp_index: 1, + tsid_index: None, + enable_tsid_primary_key: false, + column_schemas: ColumnSchemas { + columns: [ + ColumnSchema { + id: 1, + name: "key1", + data_type: Varbinary, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 2, + name: "key2", + data_type: Timestamp, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 3, + name: "field1", + data_type: Double, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 4, + name: "field2", + data_type: String, + is_nullable: false, + is_tag: false, + comment: "", + }, + ], + }, + version: 1, + }, + }, + operations: AddColumn( + [ + ColumnSchema { + id: 0, + name: "add_col", + data_type: String, + is_nullable: true, + is_tag: false, + comment: "", + }, + ], + ), + }, +)"#, + ) + .unwrap(); + } + + #[test] + fn test_alter_option_statement_to_plan() { + let sql = "ALTER TABLE test_tablex modify SETTING ttl='9d';"; + assert!(quick_test(sql, "").is_err()); + + let sql = "ALTER TABLE test_table modify SETTING ttl='9d';"; + quick_test( + sql, + r#"AlterTable( + AlterTablePlan { + table: MemoryTable { + name: "test_table", + id: TableId(100, 0, 100), + schema: Schema { + num_key_columns: 2, + timestamp_index: 1, + tsid_index: None, + enable_tsid_primary_key: false, + column_schemas: ColumnSchemas { + columns: [ + ColumnSchema { + id: 1, + name: "key1", + data_type: Varbinary, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 2, + name: "key2", + data_type: Timestamp, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 3, + name: "field1", + data_type: Double, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 4, + name: "field2", + data_type: String, + is_nullable: false, + is_tag: false, + comment: "", + }, + ], + }, + version: 1, + }, + }, + operations: ModifySetting( + { + "ttl": "9d", + }, + ), + }, +)"#, + ) + .unwrap(); + } + + #[test] + fn test_show_create_statement_to_plan() { + let sql = "show create table test_tablex;"; + assert!(quick_test(sql, "").is_err()); + + let sql = "show create table test_table;"; + quick_test( + sql, + r#"ShowCreate( + ShowCreatePlan { + table: MemoryTable { + name: "test_table", + id: TableId(100, 0, 100), + schema: Schema { + num_key_columns: 2, + timestamp_index: 1, + tsid_index: None, + enable_tsid_primary_key: false, + column_schemas: ColumnSchemas { + columns: [ + ColumnSchema { + id: 1, + name: "key1", + data_type: Varbinary, + is_nullable: false, + is_tag: false, + comment: "", + }, + 
ColumnSchema { + id: 2, + name: "key2", + data_type: Timestamp, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 3, + name: "field1", + data_type: Double, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 4, + name: "field2", + data_type: String, + is_nullable: false, + is_tag: false, + comment: "", + }, + ], + }, + version: 1, + }, + }, + obj_type: Table, + }, +)"#, + ) + .unwrap(); + } +} diff --git a/sql/src/promql.rs b/sql/src/promql.rs new file mode 100644 index 0000000000..2113681eea --- /dev/null +++ b/sql/src/promql.rs @@ -0,0 +1,10 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +mod convert; +mod datafusion_util; +mod pushdown; +mod udf; + +pub use convert::{Error, Expr}; +pub use datafusion_util::{ColumnNames, PromAlignNode}; +pub use pushdown::{AlignParameter, Func}; diff --git a/sql/src/promql/convert.rs b/sql/src/promql/convert.rs new file mode 100644 index 0000000000..005f2ebeb1 --- /dev/null +++ b/sql/src/promql/convert.rs @@ -0,0 +1,673 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::{ + convert::{TryFrom, TryInto}, + sync::Arc, +}; + +use arrow_deps::datafusion::{ + error::DataFusionError, + logical_plan::{ + avg, col, combine_filters, count, lit, max, min, plan::Extension, sum, + Expr as DataFusionExpr, LogicalPlan, LogicalPlanBuilder, + }, + sql::planner::ContextProvider, +}; +use ceresdbproto::prometheus::{ + Expr as ExprPb, Filter as FilterPb, FilterType as FilterPbType, Operand as OperandPb, + Selector as PbSelector, SubExpr as PbSubExpr, SubExpr_OperatorType, +}; +use common_types::{ + schema::{Schema, TSID_COLUMN}, + time::{TimeRange, Timestamp}, +}; +use snafu::{ensure, Backtrace, OptionExt, ResultExt, Snafu}; + +use crate::{ + plan::{Plan, QueryPlan}, + promql::{ + datafusion_util::{default_sort_exprs, timerange_to_expr}, + pushdown::{AlignParameter, Func}, + udf::{create_unique_id, regex_match_expr}, + ColumnNames, PromAlignNode, + }, + provider::{ContextProviderAdapter, MetaProvider}, +}; + +const INIT_LEVEL: usize = 1; +const DEFAULT_LOOKBACK: i64 = 300_000; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Invalid expr, expected: {}, actual:{:?}", expected, actual))] + UnexpectedExpr { expected: String, actual: String }, + + #[snafu(display("Expr pushdown not implemented. 
expr:{:?}", expr))] + NotImplemented { expr: String }, + + #[snafu(display("MetaProvider {}, err:{}", msg, source))] + MetaProviderError { + msg: String, + source: crate::provider::Error, + }, + + #[snafu(display("Table not found, table:{}", name))] + TableNotFound { name: String }, + + #[snafu(display("Failed to build schema, err:{}", source))] + BuildTableSchema { source: common_types::schema::Error }, + + #[snafu(display("Failed to build plan, source:{}", source,))] + BuildPlanError { source: DataFusionError }, + + #[snafu(display("Invalid expr, msg:{}\nBacktrace:\n{}", msg, backtrace))] + InvalidExpr { msg: String, backtrace: Backtrace }, + + #[snafu(display("Failed to pushdown, source:{}", source))] + PushdownError { + source: crate::promql::pushdown::Error, + }, +} + +define_result!(Error); + +impl From for Error { + fn from(df_err: DataFusionError) -> Self { + Error::BuildPlanError { source: df_err } + } +} + +#[derive(Debug, Clone)] +pub enum Expr { + SimpleExpr(Operand), + RecursiveExpr(SubExpr), +} + +impl TryFrom for Expr { + type Error = Error; + + fn try_from(mut pb_operand: OperandPb) -> Result { + let op = if pb_operand.has_selector() { + let PbSelector { + measurement: table, + start, + end, + align_start, + align_end, + filters, + range, + field, + offset, + step, + .. + } = pb_operand.take_selector(); + let filters = Into::>::into(filters) + .into_iter() + .map(Filter::from) + .collect::>(); + Operand::Selector(Selector { + table, + filters, + field, + query_range: TimeRange::new_unchecked( + Timestamp::new(start), + Timestamp::new(end + 1), + ), /* [start, end] */ + align_range: TimeRange::new_unchecked( + Timestamp::new(align_start), + Timestamp::new(align_end + 1), + ), /* [align_start, align_end] */ + step, + range, + offset, + }) + } else if pb_operand.has_float_val() { + Operand::Float(pb_operand.get_float_val()) + } else if pb_operand.has_string_val() { + Operand::String(pb_operand.take_string_val()) + } else { + return InvalidExpr { + msg: format!("unknown operand:{:?}", pb_operand), + } + .fail(); + }; + + Ok(Expr::SimpleExpr(op)) + } +} + +impl TryFrom for Expr { + type Error = Error; + + fn try_from(mut expr: ExprPb) -> Result { + if expr.has_operand() { + let operand = expr.take_operand(); + return operand.try_into(); + } else if expr.has_sub_expr() { + let sub_expr = expr.take_sub_expr(); + return sub_expr.try_into(); + } + + InvalidExpr { + msg: format!("unknown expr:{:?}", expr), + } + .fail() + } +} + +impl Expr { + pub fn get_selector(&self) -> &Selector { + match self { + Expr::SimpleExpr(se) => match se { + Operand::Selector(sel) => sel, + _ => unreachable!(), + }, + Expr::RecursiveExpr(re) => re.get_selector(), + } + } + + pub fn is_selector(&self) -> bool { + matches!(self, Expr::SimpleExpr(e) if matches!(e, Operand::Selector(_))) + } + + /// For now, only filters and timestamp are pushdown, we translate it + /// into plan like: + /// Aggregate: (when needed) + /// PromAlign: + /// Sort: (tsid, timestamp) asc + /// Project: + /// Filter: + /// TableScan + pub fn to_plan( + self, + meta_provider: ContextProviderAdapter<'_, P>, + read_parallelism: usize, + ) -> Result<(Plan, Arc)> { + let (logic_plan, column_name, _) = + self.build_plan_iter(&meta_provider, INIT_LEVEL, read_parallelism)?; + let tables = Arc::new( + meta_provider + .try_into_container() + .context(MetaProviderError { + msg: "Failed to find meta", + })?, + ); + Ok(( + Plan::Query(QueryPlan { + df_plan: logic_plan, + tables, + }), + column_name, + )) + } + + fn build_plan_iter( + self, + 
meta_provider: &ContextProviderAdapter<'_, P>, + level: usize, + read_parallelism: usize, + ) -> Result<(LogicalPlan, Arc, String)> { + match self { + Expr::SimpleExpr(simple_expr) => match simple_expr { + Operand::Selector(selector) => { + let (sub_plan, column_name, table_name) = + selector.clone().into_scan_plan(meta_provider)?; + if level == INIT_LEVEL { + // when only selector is pushdown, align is done in Prometheus itself + // since maybe there are subquery inside one query which require complex + // align logic. + return Ok((sub_plan, column_name, table_name)); + } + // insert PromAlignNode into plan with Func::Instant + let Selector { + align_range, + step, + offset, + .. + } = selector; + let align_param = AlignParameter { + align_range, + step: step.into(), + offset: offset.into(), + lookback_delta: DEFAULT_LOOKBACK.into(), + }; + let align_plan = LogicalPlan::Extension(Extension { + node: Arc::new(PromAlignNode { + input: sub_plan, + func: Func::Instant, + table_name: table_name.clone(), + align_param, + column_name: column_name.clone(), + read_parallelism, + }), + }); + Ok((align_plan, column_name, table_name)) + } + Operand::Float(_) | Operand::String(_) => InvalidExpr { + msg: "scalar value not allowed in plan node", + } + .fail(), + }, + // New plan like: + // PromAlign: + // SubPlan + Expr::RecursiveExpr(recursive_expr) => match recursive_expr { + SubExpr::Func(FuncExpr { op, operands }) => { + assert!(!operands.is_empty()); + let func = Func::try_from(op.as_str()).context(PushdownError {})?; + let first_arg = &operands[0]; + if first_arg.is_selector() { + let selector = first_arg.get_selector(); + let (sub_plan, column_name, table_name) = + selector.clone().into_scan_plan(meta_provider)?; + let Selector { + align_range, + step, + range, + offset, + .. + } = selector; + let align_param = AlignParameter { + align_range: *align_range, + step: step.into(), + offset: offset.into(), + lookback_delta: range.into(), + }; + let align_plan = LogicalPlan::Extension(Extension { + node: Arc::new(PromAlignNode { + input: sub_plan, + table_name: table_name.clone(), + func, + align_param, + column_name: column_name.clone(), + read_parallelism, + }), + }); + return Ok((align_plan, column_name, table_name)); + } + InvalidExpr { + msg: "first arg of func must be selector", + } + .fail() + } + + // New plan like: + // Sort: + // Projection + // Aggregate + // SubPlan + SubExpr::Aggr(AggrExpr { + op, + operands, + group_by, + without, + }) => { + assert!(!operands.is_empty()); + let next_level = level + 1; + // aggregators don't have args, only need to deal with sub_node now. 
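+ // Illustrative shape of the result: for a query like `sum by (host) (some_metric)`,
+ // `op` is "sum" and `group_by` is ["host"], so the plan below aggregates grouped by
+ // (timestamp, host), projects a mocked tsid derived from the group-by tags, and then
+ // sorts by (tsid, timestamp) as PromAlignNode expects.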
+ let sub_node = operands.into_iter().next().unwrap(); + let (sub_plan, column_name, table_name) = + sub_node.build_plan_iter(meta_provider, next_level, read_parallelism)?; + // filter out nonexistent tags + let group_by = group_by + .into_iter() + .filter(|by| column_name.tag_keys.contains(by)) + .collect::>(); + let groupby_columns = if without { + column_name + .tag_keys + .iter() + .filter_map(|tag_key| { + if group_by.contains(tag_key) { + None + } else { + Some(tag_key.as_str()) + } + }) + .collect::>() + } else { + group_by.iter().map(|s| (s.as_str())).collect::>() + }; + let aggr_expr = + Self::aggr_op_expr(&op, &column_name.field, column_name.field.clone())?; + let tag_exprs = groupby_columns.iter().map(|v| col(v)).collect::>(); + let udf_args = tag_exprs.clone(); + let mut groupby_expr = vec![col(&column_name.timestamp)]; + groupby_expr.extend(udf_args); + let unique_id_expr = + // TSID is lost after aggregate, but PromAlignNode need a unique id, so + // mock UUID as tsid based on groupby keys + DataFusionExpr::Alias( + Box::new(DataFusionExpr::ScalarUDF { + fun: Arc::new(create_unique_id(tag_exprs.len())), + args: tag_exprs.clone(), + }), + TSID_COLUMN.to_string(), + ); + let mut projection = tag_exprs.clone(); + projection.extend(vec![ + col(&column_name.timestamp), + col(&column_name.field), + unique_id_expr.clone(), + ]); + let sort_exprs = if tag_exprs.is_empty() { + vec![col(&column_name.timestamp).sort(true, true)] + } else { + vec![ + unique_id_expr.sort(true, true), + col(&column_name.timestamp).sort(true, true), + ] + }; + let builder = LogicalPlanBuilder::from(sub_plan); + let plan = builder + .aggregate(groupby_expr, vec![aggr_expr])? + .project(projection)? + .sort(sort_exprs)? + .build()?; + + Ok((plan, column_name, table_name)) + } + SubExpr::Binary(_) => InvalidExpr { + msg: "Binary Expr not supported", + } + .fail(), + }, + } + } + + fn aggr_op_expr(aggr_op: &str, field: &str, alias: String) -> Result { + let expr = match aggr_op { + "sum" => sum(col(field)), + "max" => max(col(field)), + "min" => min(col(field)), + "count" => count(col(field)), + "avg" => avg(col(field)), + _ => { + return InvalidExpr { + msg: format!("aggr {} not supported now", aggr_op), + } + .fail() + } + }; + + Ok(DataFusionExpr::Alias(Box::new(expr), alias)) + } +} + +#[derive(Debug, Clone)] +pub enum Operand { + String(String), + Float(f64), + Selector(Selector), +} + +#[derive(Debug, Clone)] +pub enum SubExpr { + Aggr(AggrExpr), + Func(FuncExpr), + Binary(BinaryExpr), +} + +impl TryFrom for Expr { + type Error = Error; + + fn try_from(mut pb_sub_expr: PbSubExpr) -> Result { + let op_type = pb_sub_expr.get_op_type(); + + let operator = pb_sub_expr.take_operator(); + let operands = pb_sub_expr + .take_operands() + .into_iter() + .map(Expr::try_from) + .collect::>>()?; + let sub_expr = match op_type { + SubExpr_OperatorType::AGGR => SubExpr::Aggr(AggrExpr { + op: operator, + operands, + group_by: pb_sub_expr.take_group().into_vec(), + without: pb_sub_expr.get_without(), + }), + SubExpr_OperatorType::FUNC => SubExpr::Func(FuncExpr { + op: operator, + operands, + }), + SubExpr_OperatorType::BINARY => { + return NotImplemented { + expr: format!("{:?}", pb_sub_expr), + } + .fail() + } + }; + + Ok(Expr::RecursiveExpr(sub_expr)) + } +} + +impl SubExpr { + pub fn get_selector(&self) -> &Selector { + match self { + SubExpr::Aggr(AggrExpr { operands, .. }) => operands[0].get_selector(), + SubExpr::Func(FuncExpr { operands, .. 
}) => operands[0].get_selector(), + SubExpr::Binary(BinaryExpr { operands, .. }) => operands[0].get_selector(), + } + } + + pub fn is_range_fn(&self) -> bool { + match self { + Self::Func(FuncExpr { operands, .. }) => match &operands[0] { + Expr::SimpleExpr(Operand::Selector(sel)) => sel.range > 0, + _ => false, + }, + _ => false, + } + } +} + +#[derive(Debug, Clone)] +pub struct AggrExpr { + op: String, + operands: Vec, + group_by: Vec, + without: bool, +} + +#[derive(Debug, Clone)] +pub struct FuncExpr { + op: String, + operands: Vec, +} + +#[derive(Debug, Clone)] +pub struct BinaryExpr { + _op: String, + operands: Vec, + _return_bool: bool, +} + +#[derive(Debug, Clone)] +pub enum FilterType { + LiteralOr, + NotLiteralOr, + Regexp, + NotRegexpMatch, +} + +impl From for FilterType { + fn from(pb_type: FilterPbType) -> Self { + match pb_type { + FilterPbType::LITERAL_OR => FilterType::LiteralOr, + FilterPbType::NOT_LITERAL_OR => FilterType::NotLiteralOr, + FilterPbType::REGEXP => FilterType::Regexp, + FilterPbType::NOT_REGEXP_MATCH => FilterType::NotRegexpMatch, + } + } +} + +#[derive(Debug, Clone)] +pub struct FilterOperator { + typ: FilterType, + params: Vec, +} + +#[derive(Debug, Clone)] +pub struct Filter { + tag_key: String, + operators: Vec, +} + +impl From for DataFusionExpr { + fn from(mut f: Filter) -> DataFusionExpr { + let tag_key = col(&f.tag_key); + // TODO(chenxiang): only compare first op now + let mut first_op = f.operators.remove(0); + match first_op.typ { + // regepx filter only have one param + FilterType::Regexp => regex_match_expr(tag_key, first_op.params.remove(0), true), + FilterType::NotRegexpMatch => { + regex_match_expr(tag_key, first_op.params.remove(0), false) + } + FilterType::LiteralOr => tag_key.in_list( + first_op + .params + .iter() + .map(|v| lit(v.as_str())) + .collect::>(), + false, + ), + FilterType::NotLiteralOr => tag_key.in_list( + first_op + .params + .iter() + .map(|v| lit(v.as_str())) + .collect::>(), + true, + ), + } + } +} + +impl From for Filter { + fn from(mut pb_filter: FilterPb) -> Self { + Self { + tag_key: pb_filter.take_tag_key(), + operators: Into::>::into(pb_filter.take_operators()) + .into_iter() + .map(|mut f| FilterOperator { + typ: f.get_filter_type().into(), + params: f.take_params().into(), + }) + .collect::>(), + } + } +} + +#[derive(Debug, Clone)] +pub struct Selector { + // query params + pub query_range: TimeRange, + pub table: String, + pub filters: Vec, + pub field: String, + + // align params + pub align_range: TimeRange, + pub step: i64, + pub range: i64, + pub offset: i64, +} + +impl Selector { + fn into_scan_plan( + self, + meta_provider: &ContextProviderAdapter<'_, P>, + ) -> Result<(LogicalPlan, Arc, String)> { + let Selector { + query_range, + field, + filters, + table, + .. + } = self; + let table_ref = meta_provider + .table(table.as_str().into()) + .context(MetaProviderError { + msg: "failed to find table".to_string(), + })? 
+ .context(TableNotFound { name: &table })?; + + let table_provider = meta_provider + .get_table_provider(table_ref.name().into()) + .context(TableNotFound { name: &table })?; + let schema = Schema::try_from(table_provider.schema()).context(BuildTableSchema)?; + let timestamp_column_name = schema.timestamp_name().to_string(); + let (projection, tag_keys) = Self::build_projection_tag_keys(&schema, &field)?; + let mut filter_exprs = filters + .iter() + .filter_map(|f| { + // drop non_exist filter + if tag_keys.contains(&f.tag_key) { + Some(DataFusionExpr::from(f.clone())) + } else { + None + } + }) + .collect::>(); + filter_exprs.push(timerange_to_expr(query_range, ×tamp_column_name)); + + let builder = LogicalPlanBuilder::scan(table.clone(), table_provider, None)? + .filter(combine_filters(&filter_exprs).expect("at least one filter(timestamp)"))? + .project(projection)? + .sort(default_sort_exprs(×tamp_column_name))?; + let column_name = Arc::new(ColumnNames { + timestamp: timestamp_column_name, + tag_keys, + field, + }); + let scan_plan = builder.build().context(BuildPlanError)?; + Ok((scan_plan, column_name, table)) + } + + fn build_projection_tag_keys( + schema: &Schema, + field: &str, + ) -> Result<(Vec, Vec)> { + if let Some(f) = schema.column_with_name(field) { + ensure!( + f.data_type.is_f64_castable(), + InvalidExpr { + msg: "field type must be f64-compatibile type", + } + ); + } else { + return InvalidExpr { + msg: format!("field:{} not found", field), + } + .fail(); + }; + let mut tag_keys = Vec::new(); + let mut projection = schema + .columns() + .iter() + .filter_map(|column| { + if column.is_tag { + tag_keys.push(column.name.clone()); + Some(col(&column.name)) + } else { + None + } + }) + .collect::>(); + + let timestamp_expr = col(&schema.column(schema.timestamp_index()).name); + let tsid_expr = schema + .tsid_column() + .map(|c| col(&c.name)) + .context(InvalidExpr { + msg: format!("{} not found", TSID_COLUMN), + })?; + let field_expr = col(field); + projection.extend(vec![timestamp_expr, tsid_expr, field_expr]); + + Ok((projection, tag_keys)) + } +} diff --git a/sql/src/promql/datafusion_util.rs b/sql/src/promql/datafusion_util.rs new file mode 100644 index 0000000000..4e5003e963 --- /dev/null +++ b/sql/src/promql/datafusion_util.rs @@ -0,0 +1,105 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::{any::Any, fmt, sync::Arc}; + +use arrow_deps::datafusion::logical_plan::{ + col, lit, DFSchemaRef, Expr as DataFusionExpr, Expr, LogicalPlan, UserDefinedLogicalNode, +}; +use common_types::{schema::TSID_COLUMN, time::TimeRange}; + +use crate::promql::pushdown::{AlignParameter, Func}; + +/// ColumnNames represents meaning of columns in one table. 
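+///
+/// For example (illustrative), a table with timestamp column `ts`, tag columns `host`
+/// and `region`, and a queried field `value` maps to
+/// `{ timestamp: "ts", tag_keys: ["host", "region"], field: "value" }`.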
+#[derive(Debug)] +pub struct ColumnNames { + pub timestamp: String, + pub tag_keys: Vec, + pub field: String, +} + +/// Translate to `column_name BETWEEN start AND end` expr +pub fn timerange_to_expr(query_range: TimeRange, column_name: &str) -> DataFusionExpr { + DataFusionExpr::Between { + expr: Box::new(col(column_name)), + negated: false, + low: Box::new(lit(query_range.inclusive_start().as_i64())), + high: Box::new(lit(query_range.exclusive_end().as_i64() + 1)), + } +} + +pub fn default_sort_exprs(timestamp_column: &str) -> Vec { + vec![ + col(TSID_COLUMN).sort(true, true), + col(timestamp_column).sort(true, true), + ] +} + +pub struct PromAlignNode { + pub input: LogicalPlan, + pub column_name: Arc, + pub table_name: String, + pub func: Func, + pub align_param: AlignParameter, + pub read_parallelism: usize, +} + +impl fmt::Debug for PromAlignNode { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + self.fmt_for_explain(f) + } +} + +impl UserDefinedLogicalNode for PromAlignNode { + fn as_any(&self) -> &dyn Any { + self + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![&self.input] + } + + fn schema(&self) -> &DFSchemaRef { + self.input.schema() + } + + fn expressions(&self) -> Vec { + let qualified_name = |n| col(&format!("{}.{}", self.table_name, n)); + + let mut exprs = self + .column_name + .tag_keys + .iter() + .map(qualified_name) + .collect::>(); + + exprs.extend(vec![ + qualified_name(&self.column_name.timestamp), + qualified_name(&self.column_name.field), + ]); + + exprs + } + + fn fmt_for_explain(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "PromAlign: align_param={:?}, column_name={:?}, read_parallelism={}", + self.align_param, self.column_name, self.read_parallelism + ) + } + + fn from_template( + &self, + _exprs: &[Expr], + inputs: &[LogicalPlan], + ) -> std::sync::Arc { + Arc::new(PromAlignNode { + input: inputs[0].clone(), + func: self.func, + table_name: self.table_name.clone(), + column_name: self.column_name.clone(), + align_param: self.align_param, + read_parallelism: self.read_parallelism, + }) + } +} diff --git a/sql/src/promql/pushdown.rs b/sql/src/promql/pushdown.rs new file mode 100644 index 0000000000..f9c0a279d9 --- /dev/null +++ b/sql/src/promql/pushdown.rs @@ -0,0 +1,50 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::convert::TryFrom; + +use common_types::time::{TimeRange, Timestamp}; +use snafu::Snafu; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Func {} is not supported yet", func))] + NotSupportedFunc { func: String }, +} + +define_result!(Error); + +#[derive(Debug, Clone, Copy)] +pub enum Func { + Instant, // used to simulate instant query + Rate, + Irate, + Delta, + Idelta, + Increase, +} + +impl TryFrom<&str> for Func { + type Error = Error; + + fn try_from(op: &str) -> Result { + let t = match op { + "rate" => Func::Rate, + "delta" => Func::Delta, + "irate" => Func::Irate, + "idelta" => Func::Idelta, + "increase" => Func::Increase, + func => return NotSupportedFunc { func }.fail(), + }; + + Ok(t) + } +} + +#[derive(Debug, Clone, Copy)] +pub struct AlignParameter { + pub align_range: TimeRange, + pub step: Timestamp, + pub offset: Timestamp, + /// 0 for no look back + pub lookback_delta: Timestamp, +} diff --git a/sql/src/promql/udf.rs b/sql/src/promql/udf.rs new file mode 100644 index 0000000000..8928f6f790 --- /dev/null +++ b/sql/src/promql/udf.rs @@ -0,0 +1,300 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +// Copy from IOx +// https://github.com/influxdata/influxdb_iox/blob/d0f588d3b800894fe0ebd06b6f9a184ca6a603d7/predicate/src/regex.rs + +use std::sync::Arc; + +use arrow_deps::{ + arrow::{ + array::{ArrayRef, BooleanArray, StringArray, UInt64Array}, + datatypes::DataType, + }, + datafusion::{ + error::{DataFusionError, Result as DataFusionResult}, + logical_plan::{create_udf, Expr}, + physical_plan::{ + functions::{make_scalar_function, Volatility}, + udf::ScalarUDF, + }, + }, +}; +use common_types::hash::hash64; +use common_util::codec::{compact::MemCompactEncoder, Encoder}; + +/// The name of the regex_match UDF given to DataFusion. +pub const REGEX_MATCH_UDF_NAME: &str = "RegexMatch"; +pub const REGEX_NOT_MATCH_UDF_NAME: &str = "RegexNotMatch"; + +/// Given a column containing string values and a single regex pattern, +/// `regex_match_expr` determines which values satisfy the pattern and which do +/// not. +/// +/// If `matches` is true then this expression will filter values that do not +/// satisfy the regex (equivalent to `col ~= /pattern/`). If `matches` is +/// `false` then the expression will filter values that *do* match the regex, +/// which is equivalent to `col !~ /pattern/`. +/// +/// This UDF is designed to support the regex operator that can be pushed down +/// via the InfluxRPC API. +pub fn regex_match_expr(input: Expr, pattern: String, matches: bool) -> Expr { + // N.B., this function does not utilise the Arrow regexp compute kernel because + // in order to act as a filter it needs to return a boolean array of comparison + // results, not an array of strings as the regex compute kernel does. + let func = move |args: &[ArrayRef]| { + assert_eq!(args.len(), 1); // only works over a single column at a time. + + let input_arr = &args[0].as_any().downcast_ref::().unwrap(); + + let pattern = regex::Regex::new(&pattern).map_err(|e| { + DataFusionError::Internal(format!("error compiling regex pattern: {}", e)) + })?; + + let results = input_arr + .iter() + .map(|row| { + // in arrow, any value can be null. + // Here we decide to make our UDF to return null when either base or exponent is + // null. + row.map(|v| pattern.is_match(v) == matches) + }) + .collect::(); + + Ok(Arc::new(results) as ArrayRef) + }; + + // make_scalar_function is a helper to support accepting scalar values as + // well as arrays. 
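+ // The expression returned below evaluates to a boolean column, so callers can plug it
+ // directly into a DataFusion filter (the tests at the end of this file use it via
+ // `df.filter(...)`).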
+ let func = make_scalar_function(func); + + let udf_name = if matches { + REGEX_MATCH_UDF_NAME + } else { + REGEX_NOT_MATCH_UDF_NAME + }; + + let udf = create_udf( + udf_name, + vec![DataType::Utf8], + Arc::new(DataType::Boolean), + Volatility::Stable, + func, + ); + + udf.call(vec![input]) +} + +pub fn create_unique_id(input_len: usize) -> ScalarUDF { + let func = move |args: &[ArrayRef]| { + if args.is_empty() { + let builder = UUIDBuilder::new(); + let tsid: UInt64Array = [Some(builder.finish())].iter().collect(); + return Ok(Arc::new(tsid) as ArrayRef); + } + let array_len = args[0].len(); + let inputs = args + .iter() + .map(|a| { + a.as_any() + .downcast_ref::() + .ok_or_else(|| DataFusionError::Execution("tag column not string".to_string())) + }) + .collect::>>()?; + + let mut builders = Vec::new(); + builders.resize_with(array_len, UUIDBuilder::new); + for array in &inputs { + array + .iter() + .zip(builders.iter_mut()) + .for_each(|(v, builder)| { + builder.write(v); + }); + } + let results: UInt64Array = builders.into_iter().map(|b| Some(b.finish())).collect(); + Ok(Arc::new(results) as ArrayRef) + }; + + create_udf( + "create_unique_id", + vec![DataType::Utf8; input_len], + Arc::new(DataType::UInt64), + Volatility::Stable, + make_scalar_function(func), + ) +} + +struct UUIDBuilder { + encoder: MemCompactEncoder, + buf: Vec, +} + +impl UUIDBuilder { + fn new() -> Self { + Self { + encoder: MemCompactEncoder, + buf: Vec::new(), + } + } + + fn write(&mut self, value: Option<&str>) { + let value = value.unwrap_or(""); + self.encoder + .encode(&mut self.buf, value.as_bytes()) + .unwrap(); // write mem is safe + } + + fn finish(self) -> u64 { + hash64(&self.buf) + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use arrow_deps::{ + arrow::{ + array::{StringArray, UInt64Array}, + record_batch::RecordBatch, + util::pretty::pretty_format_batches, + }, + datafusion::{ + datasource::MemTable, + error::DataFusionError, + logical_plan::{col, Expr as DataFusionExpr}, + prelude::ExecutionContext, + }, + }; + use common_types::schema::{ArrowSchema, ArrowSchemaRef, DataType, Field}; + + #[tokio::test] + async fn regex_match_expr() { + let cases = vec![ + ( + ".*", // match everything except NULL values + true, // keep the values matched + vec![ + "+---------------+--------+", + "| words | length |", + "+---------------+--------+", + "| air | 3 |", + "| aphex twin | 10 |", + "| bruce | 5 |", + "| Blood Orange | 12 |", + "| cocteau twins | 13 |", + "+---------------+--------+", + ], + ), + ( + ".*", // match everything except NULL values + false, // filter away all the values matched + vec!["++", "++"], + ), + ( + "", // an empty pattern also matches everything except NULL + true, + vec![ + "+---------------+--------+", + "| words | length |", + "+---------------+--------+", + "| air | 3 |", + "| aphex twin | 10 |", + "| bruce | 5 |", + "| Blood Orange | 12 |", + "| cocteau twins | 13 |", + "+---------------+--------+", + ], + ), + ( + ".+O.*", // match just words containing "O". 
+ true, + vec![ + "+--------------+--------+", + "| words | length |", + "+--------------+--------+", + "| Blood Orange | 12 |", + "+--------------+--------+", + ], + ), + ( + "^(a|b).*", // match everything beginning with "a" or "b" + false, // negate expression and filter away anything that matches + vec![ + "+---------------+--------+", + "| words | length |", + "+---------------+--------+", + "| Blood Orange | 12 |", + "| cocteau twins | 13 |", + "+---------------+--------+", + ], + ), + ]; + + let schema = Arc::new(ArrowSchema::new(vec![ + Field::new("words", DataType::Utf8, true), + Field::new("length", DataType::UInt64, false), + ])); + + // define data for table + let words = vec![ + Some("air"), + Some("aphex twin"), + Some("bruce"), + Some("Blood Orange"), + None, + None, + Some("cocteau twins"), + ]; + let rb = RecordBatch::try_new( + Arc::clone(&schema), + vec![ + Arc::new(StringArray::from(words.clone())), + Arc::new( + words + .iter() + .map(|word| word.map(|word| word.len() as u64)) + .collect::(), + ), + ], + ) + .unwrap(); + let rb = vec![vec![rb]]; + for (pattern, matches, expected) in cases.into_iter() { + let regex_expr = super::regex_match_expr(col("words"), pattern.to_string(), matches); + let actual = run_plan(schema.clone(), rb.clone(), regex_expr) + .await + .unwrap(); + + assert_eq!( + expected, actual, + "\n\nEXPECTED:\n{:#?}\nACTUAL:\n{:#?}\n", + expected, actual + ); + } + } + + // Run a plan against the following input table as "t" + async fn run_plan( + schema: ArrowSchemaRef, + rb: Vec>, + op: DataFusionExpr, + ) -> Result, DataFusionError> { + let provider = MemTable::try_new(Arc::clone(&schema), rb).unwrap(); + let mut ctx = ExecutionContext::new(); + ctx.register_table("t", Arc::new(provider)).unwrap(); + + let df = ctx.table("t").unwrap(); + let df = df.filter(op).unwrap(); + + // execute the query + let record_batches = df.collect().await?; + + Ok(pretty_format_batches(&record_batches) + .unwrap() + .to_string() + .split('\n') + .map(|s| s.to_owned()) + .collect()) + } +} diff --git a/sql/src/provider.rs b/sql/src/provider.rs new file mode 100644 index 0000000000..fee689c411 --- /dev/null +++ b/sql/src/provider.rs @@ -0,0 +1,345 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Adapter to providers in datafusion + +use std::{any::Any, cell::RefCell, collections::HashMap, sync::Arc}; + +use arrow_deps::datafusion::{ + catalog::{catalog::CatalogProvider, schema::SchemaProvider}, + datasource::TableProvider, + physical_plan::{udaf::AggregateUDF, udf::ScalarUDF}, + sql::planner::ContextProvider, +}; +use catalog::manager::Manager; +use common_types::request_id::RequestId; +use snafu::{ResultExt, Snafu}; +use table_engine::{provider::TableProviderAdapter, table::TableRef}; +use udf::{registry::FunctionRegistry, scalar::ScalarUdf, udaf::AggregateUdf}; + +use crate::container::{TableContainer, TableReference}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to find catalog, name:{}, err:{}", name, source))] + FindCatalog { + name: String, + source: catalog::manager::Error, + }, + + #[snafu(display("Failed to find schema, name:{}, err:{}", name, source))] + FindSchema { + name: String, + source: catalog::Error, + }, + + #[snafu(display("Failed to find table, name:{}, err:{}", name, source))] + FindTable { + name: String, + source: catalog::schema::Error, + }, + + #[snafu(display("Failed to find udf, err:{}", source))] + FindUdf { source: udf::registry::Error }, +} + +define_result!(Error); + +/// MetaProvider provides meta info needed by Frontend +pub trait MetaProvider { + /// Default catalog name + fn default_catalog_name(&self) -> &str; + + /// Default schema name + fn default_schema_name(&self) -> &str; + + /// Get table meta by table reference + /// + /// Note that this function may block current thread. We can't make this + /// function async as the underlying (aka. datafusion) planner needs a + /// sync provider. + fn table(&self, name: TableReference) -> Result>; + + /// Get udf by name. + fn scalar_udf(&self, name: &str) -> Result>; + + /// Get udaf by name. + fn aggregate_udf(&self, name: &str) -> Result>; +} + +/// We use an adapter instead of using [catalog::Manager] directly, because +/// - MetaProvider provides blocking method, but catalog::Manager may provide +/// async method +/// - Other meta data like default catalog and schema are needed +// TODO(yingwen): Maybe support schema searching instead of using a fixed +// default schema +pub struct CatalogMetaProvider<'a, M> { + pub manager: &'a M, + pub default_catalog: &'a str, + pub default_schema: &'a str, + pub function_registry: &'a (dyn FunctionRegistry + Send + Sync), +} + +impl<'a, M: Manager> MetaProvider for CatalogMetaProvider<'a, M> { + fn default_catalog_name(&self) -> &str { + self.default_catalog + } + + fn default_schema_name(&self) -> &str { + self.default_schema + } + + fn table(&self, name: TableReference) -> Result> { + let resolved = name.resolve(self.default_catalog, self.default_schema); + + let catalog = match self + .manager + .catalog_by_name(resolved.catalog) + .context(FindCatalog { + name: resolved.catalog, + })? { + Some(c) => c, + None => return Ok(None), + }; + + let schema = match catalog + .schema_by_name(resolved.schema) + .context(FindSchema { + name: resolved.schema, + })? 
{ + Some(s) => s, + None => return Ok(None), + }; + + schema.table_by_name(resolved.table).context(FindTable { + name: resolved.table, + }) + } + + fn scalar_udf(&self, name: &str) -> Result> { + self.function_registry.find_udf(name).context(FindUdf) + } + + fn aggregate_udf(&self, name: &str) -> Result> { + self.function_registry.find_udaf(name).context(FindUdf) + } +} + +/// An adapter to ContextProvider, not thread safe +pub struct ContextProviderAdapter<'a, P> { + /// Local cache for TableProvider to avoid create multiple adapter for the + /// same table, also save all the table needed during planning + table_cache: RefCell, + /// Store the first error MetaProvider returns + err: RefCell>, + meta_provider: &'a P, + request_id: RequestId, + /// Read parallelism for each table. + read_parallelism: usize, +} + +impl<'a, P: MetaProvider> ContextProviderAdapter<'a, P> { + /// Create a adapter from meta provider + pub fn new(meta_provider: &'a P, request_id: RequestId, read_parallelism: usize) -> Self { + let default_catalog = meta_provider.default_catalog_name().to_string(); + let default_schema = meta_provider.default_schema_name().to_string(); + + Self { + table_cache: RefCell::new(TableContainer::new(default_catalog, default_schema)), + err: RefCell::new(None), + meta_provider, + request_id, + read_parallelism, + } + } + + /// Consumes the adapter, returning the tables used during planning if no + /// error occurs, otherwise returning the error + pub fn try_into_container(self) -> Result { + if let Some(e) = self.err.into_inner() { + return Err(e); + } + + Ok(self.table_cache.into_inner()) + } + + /// Save error if there is no existing error. + /// + /// The datafusion's ContextProvider can't return error, so here we save the + /// error in the adapter and return None, also let datafusion + /// return a provider not found error and abort the planning + /// procedure. + fn maybe_set_err(&self, err: Error) { + if self.err.borrow().is_none() { + *self.err.borrow_mut() = Some(err); + } + } +} + +impl<'a, P: MetaProvider> MetaProvider for ContextProviderAdapter<'a, P> { + fn default_catalog_name(&self) -> &str { + self.meta_provider.default_catalog_name() + } + + fn default_schema_name(&self) -> &str { + self.meta_provider.default_schema_name() + } + + fn table(&self, name: TableReference) -> Result> { + self.meta_provider.table(name) + } + + fn scalar_udf(&self, name: &str) -> Result> { + self.meta_provider.scalar_udf(name) + } + + fn aggregate_udf(&self, name: &str) -> Result> { + self.meta_provider.aggregate_udf(name) + } +} + +impl<'a, P: MetaProvider> ContextProvider for ContextProviderAdapter<'a, P> { + fn get_table_provider(&self, name: TableReference) -> Option> { + // Find in local cache + if let Some(p) = self.table_cache.borrow().get(name) { + return Some(p); + } + + // Find in meta provider + match self.meta_provider.table(name) { + Ok(Some(table)) => { + let table_adapter = Arc::new(TableProviderAdapter::new( + table, + self.request_id, + self.read_parallelism, + )); + // Put into cache + self.table_cache + .borrow_mut() + .insert(name, table_adapter.clone()); + + Some(table_adapter) + } + Ok(None) => None, + Err(e) => { + self.maybe_set_err(e); + None + } + } + } + + // ScalarUDF is not supported now + fn get_function_meta(&self, name: &str) -> Option> { + // We don't cache udf used by the query because now we will register all udf to + // datafusion's context. 
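+        // Delegate the lookup to the wrapped MetaProvider and convert the result to
+        // datafusion's ScalarUDF. On error we record it via maybe_set_err and return
+        // None, letting datafusion abort the planning with its own error.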
+ match self.meta_provider.scalar_udf(name) { + Ok(Some(udf)) => Some(udf.to_datafusion_udf()), + Ok(None) => None, + Err(e) => { + self.maybe_set_err(e); + None + } + } + } + + // AggregateUDF is not supported now + fn get_aggregate_meta(&self, name: &str) -> Option> { + match self.meta_provider.aggregate_udf(name) { + Ok(Some(udaf)) => Some(udaf.to_datafusion_udaf()), + Ok(None) => None, + Err(e) => { + self.maybe_set_err(e); + None + } + } + } +} + +struct SchemaProviderAdapter { + catalog: String, + schema: String, + tables: Arc, +} + +impl SchemaProvider for SchemaProviderAdapter { + fn as_any(&self) -> &dyn Any { + self + } + + fn table_names(&self) -> Vec { + let mut names = Vec::new(); + let _ = self.tables.visit::<_, ()>(|name, table| { + if name.catalog == self.catalog && name.schema == self.schema { + names.push(table.as_table_ref().name().to_string()); + } + Ok(()) + }); + names + } + + fn table(&self, name: &str) -> Option> { + let name_ref = TableReference::Full { + catalog: &self.catalog, + schema: &self.schema, + table: name, + }; + self.tables + .get(name_ref) + .map(|v| v as Arc) + } + + fn table_exist(&self, name: &str) -> bool { + self.table(name).is_some() + } +} + +#[derive(Default)] +pub struct CatalogProviderAdapter { + schemas: HashMap>, +} + +impl CatalogProviderAdapter { + pub fn new_adapters(tables: Arc) -> HashMap { + let mut catalog_adapters = HashMap::with_capacity(tables.num_catalogs()); + let _ = tables.visit::<_, ()>(|name, _| { + // Get or create catalog + let catalog = match catalog_adapters.get_mut(name.catalog) { + Some(v) => v, + None => catalog_adapters + .entry(name.catalog.to_string()) + .or_insert_with(CatalogProviderAdapter::default), + }; + // Get or create schema + if catalog.schemas.get(name.schema).is_none() { + catalog.schemas.insert( + name.schema.to_string(), + Arc::new(SchemaProviderAdapter { + catalog: name.catalog.to_string(), + schema: name.schema.to_string(), + tables: tables.clone(), + }), + ); + } + + Ok(()) + }); + + catalog_adapters + } +} + +impl CatalogProvider for CatalogProviderAdapter { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema_names(&self) -> Vec { + self.schemas.keys().cloned().collect() + } + + fn schema(&self, name: &str) -> Option> { + self.schemas + .get(name) + .cloned() + .map(|v| v as Arc) + } +} diff --git a/sql/src/tests.rs b/sql/src/tests.rs new file mode 100644 index 0000000000..bd49bded4b --- /dev/null +++ b/sql/src/tests.rs @@ -0,0 +1,69 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
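+
+//! Test utilities for the sql crate: a mock [crate::provider::MetaProvider]
+//! backed by in-memory tables.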
+ +use std::sync::Arc; + +use arrow_deps::datafusion::catalog::TableReference; +use catalog::consts::{DEFAULT_CATALOG, DEFAULT_SCHEMA}; +use common_types::tests::build_schema; +use table_engine::{ + memory::MemoryTable, + table::{Table, TableId, TableRef}, + ANALYTIC_ENGINE_TYPE, +}; +use udf::{scalar::ScalarUdf, udaf::AggregateUdf}; + +use crate::provider::MetaProvider; + +pub struct MockMetaProvider { + tables: Vec>, +} + +impl Default for MockMetaProvider { + fn default() -> Self { + Self { + tables: vec![ + Arc::new(MemoryTable::new( + "test_table".to_string(), + TableId::from(100), + build_schema(), + ANALYTIC_ENGINE_TYPE.to_string(), + )), + Arc::new(MemoryTable::new( + "test_table2".to_string(), + TableId::from(101), + build_schema(), + ANALYTIC_ENGINE_TYPE.to_string(), + )), + ], + } + } +} + +impl MetaProvider for MockMetaProvider { + fn default_catalog_name(&self) -> &str { + DEFAULT_CATALOG + } + + fn default_schema_name(&self) -> &str { + DEFAULT_SCHEMA + } + + fn table(&self, name: TableReference) -> crate::provider::Result> { + let resolved = name.resolve(self.default_catalog_name(), self.default_schema_name()); + for table in &self.tables { + if resolved.table == table.name() { + return Ok(Some(table.clone())); + } + } + + Ok(None) + } + + fn scalar_udf(&self, _name: &str) -> crate::provider::Result> { + todo!() + } + + fn aggregate_udf(&self, _name: &str) -> crate::provider::Result> { + todo!() + } +} diff --git a/src/bin/ceresdb-server.rs b/src/bin/ceresdb-server.rs new file mode 100644 index 0000000000..627e9ab296 --- /dev/null +++ b/src/bin/ceresdb-server.rs @@ -0,0 +1,83 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! The main entry point to start the server + +// TODO(yingwen): ceresdb-server is a legacy name, maybe use a new name + +use std::env; + +use ceresdbx::setup; +use clap::{App, Arg}; +use common_util::{panic, toml}; +use log::info; +use server::config::Config; + +/// The ip address of current node. +const NODE_ADDR: &str = "CSE_CERES_META_NODE_ADDR"; +const META_PEERS: &str = "META_PEERS"; +const CLUSTER_NAME: &str = "CLUSTER_NAME"; +/// Enable communication with meta node. +const ENABLE_META: &str = "ENABLE_META"; + +fn fetch_version() -> String { + let build_version = env!("VERGEN_BUILD_SEMVER"); + let git_branch = env!("VERGEN_GIT_BRANCH"); + let git_commit_id = env!("VERGEN_GIT_SHA_SHORT"); + let build_time = env!("VERGEN_BUILD_TIMESTAMP"); + + format!( + "\nCeresDB Version: {}\nGit branch: {}\nGit commit: {}\nBuild: {}", + build_version, git_branch, git_commit_id, build_time + ) +} + +fn main() { + let version = fetch_version(); + let matches = App::new("CeresDB Server") + .version(version.as_str()) + .arg( + Arg::with_name("config") + .short("c") + .long("config") + .required(false) + .takes_value(true) + .help("Set configuration file, eg: \"/path/server.toml\""), + ) + .get_matches(); + + let mut config = match matches.value_of("config") { + Some(path) => { + let mut toml_buf = String::new(); + toml::parse_toml_from_path(path, &mut toml_buf).expect("Failed to parse config.") + } + None => Config::default(), + }; + + // Combine configs from env. 
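+    // Overrides applied when the corresponding environment variables are set
+    // (constants defined at the top of this file):
+    //   ENABLE_META              -> config.meta_client.enable_meta
+    //   CSE_CERES_META_NODE_ADDR -> config.meta_client.node
+    //   META_PEERS               -> config.meta_client.meta_addr
+    //   CLUSTER_NAME             -> config.meta_client.cluster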
+ if let Ok(enable_meta) = env::var(ENABLE_META) { + if let Ok(enable_meta) = enable_meta.parse::() { + config.meta_client.enable_meta = enable_meta; + } + } + if let Ok(node_addr) = env::var(NODE_ADDR) { + config.meta_client.node = node_addr; + } + if let Ok(meta_addr) = env::var(META_PEERS) { + config.meta_client.meta_addr = meta_addr; + } + if let Ok(cluster) = env::var(CLUSTER_NAME) { + config.meta_client.cluster = cluster; + } + + // Setup log. + let _runtime_level = setup::setup_log(&config); + // Setup tracing. + let _writer_guard = setup::setup_tracing(&config); + + panic::set_panic_hook(false); + + // Log version. + info!("version:{}", version); + + setup::run_server(config); +} diff --git a/src/docs/config.toml b/src/docs/config.toml new file mode 100644 index 0000000000..5a2ede377c --- /dev/null +++ b/src/docs/config.toml @@ -0,0 +1,27 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +bind_addr = "0.0.0.0" +http_port = 5000 +grpc_port = 8831 +log_level = "debug" + +[analytic] +data_path = "/tmp/ceresdbx/" + +[analytic.table_opts] +arena_block_size = 128 + +[[meta_client.cluster_view.shards]] +shard_id = 0 +[[meta_client.cluster_view.shards.nodes]] +addr = '127.0.0.1' +port = 38082 + +[[meta_client.cluster_view.shards]] +shard_id = 1 +[[meta_client.cluster_view.shards.nodes]] +addr = '127.0.0.1' +port = 48082 +[[meta_client.cluster_view.shards.nodes]] +addr = '127.0.0.1' +port = 58082 \ No newline at end of file diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000000..22fed20ac2 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,6 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! ceresdbx + +pub mod setup; +mod signal_handler; diff --git a/src/setup.rs b/src/setup.rs new file mode 100644 index 0000000000..6c2d8263d4 --- /dev/null +++ b/src/setup.rs @@ -0,0 +1,127 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Setup server + +use std::sync::Arc; + +use analytic_engine::{self, setup}; +use catalog_impls::{table_based::TableBasedManager, CatalogManagerImpl}; +use common_util::runtime; +use log::info; +use logger::RuntimeLevel; +use query_engine::executor::ExecutorImpl; +use server::{ + config::{Config, RuntimeConfig}, + server::Builder, + table_engine::{MemoryTableEngine, TableEngineProxy}, +}; +use table_engine::engine::EngineRuntimes; +use tracing_util::{ + self, + tracing_appender::{non_blocking::WorkerGuard, rolling::Rotation}, +}; +use udf::registry::FunctionRegistryImpl; + +use crate::signal_handler; + +/// Setup log with given `config`, returns the runtime log level switch. +pub fn setup_log(config: &Config) -> RuntimeLevel { + server::logger::init_log(config).expect("Failed to init log.") +} + +/// Setup tracing with given `config`, returns the writer guard. 
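+///
+/// Note: dropping the returned guard shuts down the non-blocking tracing writer,
+/// so the caller keeps it alive for the whole process (see `_writer_guard` in
+/// `ceresdb-server.rs`).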
+pub fn setup_tracing(config: &Config) -> WorkerGuard { + tracing_util::init_tracing_with_file( + &config.tracing_log_name, + &config.tracing_log_dir, + &config.tracing_level, + Rotation::NEVER, + ) +} + +fn build_runtime(name: &str, threads_num: usize) -> runtime::Runtime { + runtime::Builder::default() + .worker_threads(threads_num) + .thread_name(name) + .enable_all() + .build() + .unwrap_or_else(|e| { + //TODO(yingwen) replace panic with fatal + panic!("Failed to create runtime, err:{}", e); + }) +} + +fn build_engine_runtimes(config: &RuntimeConfig) -> EngineRuntimes { + EngineRuntimes { + read_runtime: Arc::new(build_runtime("cse-read", config.read_thread_num)), + write_runtime: Arc::new(build_runtime("cse-write", config.write_thread_num)), + bg_runtime: Arc::new(build_runtime("cse-bg", config.background_thread_num)), + } +} + +/// Run a server, returns when the server is shutdown by user +pub fn run_server(config: Config) { + let runtimes = Arc::new(build_engine_runtimes(&config.runtime)); + let engine_runtimes = runtimes.clone(); + + info!("Server starts up, config:{:#?}", config); + + runtimes.bg_runtime.block_on(async { + // Build all table engine + // Create memory engine + let memory = MemoryTableEngine; + // Create analytic engine + let analytic_config = config.analytic.clone(); + let analytic = setup::open_analytic_table_engine(analytic_config, engine_runtimes) + .await + .unwrap_or_else(|e| { + panic!("Failed to setup analytic engine, err:{}", e); + }); + + // Create table engine proxy + let engine_proxy = Arc::new(TableEngineProxy { + memory, + analytic: analytic.clone(), + }); + + // Create catalog manager, use analytic table as backend + let catalog_manager = CatalogManagerImpl::new( + TableBasedManager::new(&analytic, engine_proxy.clone()) + .await + .unwrap_or_else(|e| { + panic!("Failed to create catalog manager, err:{}", e); + }), + ); + + // Init function registry. + let mut function_registry = FunctionRegistryImpl::new(); + function_registry.load_functions().unwrap_or_else(|e| { + panic!("Failed to create function registry, err:{}", e); + }); + let function_registry = Arc::new(function_registry); + + // Create query executor + let query_executor = ExecutorImpl::new(); + + // Build and start server + let mut server = Builder::new(config) + .runtimes(runtimes.clone()) + .catalog_manager(catalog_manager) + .query_executor(query_executor) + .table_engine(engine_proxy) + .function_registry(function_registry) + .build() + .unwrap_or_else(|e| { + panic!("Failed to create server, err:{}", e); + }); + server.start().await.unwrap_or_else(|e| { + panic!("Failed to start server,, err:{}", e); + }); + + // Wait for signal + signal_handler::wait_for_signal(); + + // Stop server + server.stop(); + }); +} diff --git a/src/signal_handler.rs b/src/signal_handler.rs new file mode 100644 index 0000000000..39ad1733f4 --- /dev/null +++ b/src/signal_handler.rs @@ -0,0 +1,31 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Signal handler +//! +//! 
Only works on unix like environments + +pub use self::details::wait_for_signal; + +#[cfg(unix)] +mod details { + use log::info; + use signal_hook::{consts::TERM_SIGNALS, iterator::Signals}; + + pub fn wait_for_signal() { + let mut sigs = Signals::new(TERM_SIGNALS).unwrap_or_else(|e| { + // TODO(yingwen): Log here + panic!("Failed to register signal handlers, err:{}", e); + }); + for signal in &mut sigs { + if TERM_SIGNALS.contains(&signal) { + info!("Received signal {}, stopping server...", signal); + break; + } + } + } +} + +#[cfg(not(unix))] +mod details { + pub fn wait_for_signal() {} +} diff --git a/system_catalog/Cargo.toml b/system_catalog/Cargo.toml new file mode 100644 index 0000000000..c6d4ff7b7a --- /dev/null +++ b/system_catalog/Cargo.toml @@ -0,0 +1,22 @@ +[package] +name = "system_catalog" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +# In alphabetical order +arrow_deps = { path = "../arrow_deps" } +async-trait = "0.1.41" +catalog = { path = "../catalog" } +common_types = { path = "../common_types" } +common_util = { path = "../common_util" } +futures = "0.3" +log = "0.4" +proto = { path = "../proto" } +protobuf = "2.20" +snafu = { version = "0.6.10", features = ["backtraces"] } +table_engine = { path = "../table_engine" } +tokio = { version = "1.0", features = ["sync"] } diff --git a/system_catalog/src/lib.rs b/system_catalog/src/lib.rs new file mode 100644 index 0000000000..a0e1855a70 --- /dev/null +++ b/system_catalog/src/lib.rs @@ -0,0 +1,168 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! System catalog implementations + +use std::{ + collections::HashMap, + fmt::Debug, + pin::Pin, + sync::Arc, + task::{Context, Poll}, +}; + +use async_trait::async_trait; +use common_types::{ + record_batch::RecordBatch, + row::Row, + schema::{RecordSchema, Schema}, + time::Timestamp, +}; +use futures::Stream; +use table_engine::{ + stream, + stream::{PartitionedStreams, RecordBatchStream, SendableRecordBatchStream}, + table::{ + AlterSchemaRequest, FlushRequest, GetRequest, ReadRequest, Table, TableId, TableStats, + WriteRequest, + }, +}; + +pub mod sys_catalog_table; +pub mod tables; + +/// Timestamp of entry +pub const ENTRY_TIMESTAMP: Timestamp = Timestamp::new(0); + +/// The minimal thing that a system table needs to implement +#[async_trait] +pub trait SystemTable: Send + Sync + Debug { + /// System table name + fn name(&self) -> &str; + + /// System table name + fn id(&self) -> TableId; + + /// Produce the schema from this system table + fn schema(&self) -> Schema; + + /// Get the contents of the system table as a single RecordBatch + async fn read( + &self, + request: ReadRequest, + ) -> table_engine::table::Result; +} + +#[derive(Debug)] +pub struct SystemTableAdapter { + inner: Arc, +} + +impl SystemTableAdapter { + pub fn new(inner: impl SystemTable + 'static) -> Self { + Self { + inner: Arc::new(inner), + } + } +} + +#[async_trait] +impl Table for SystemTableAdapter { + fn name(&self) -> &str { + self.inner.name() + } + + fn id(&self) -> TableId { + self.inner.id() + } + + fn schema(&self) -> Schema { + self.inner.schema() + } + + fn options(&self) -> HashMap { + HashMap::new() + } + + fn engine_type(&self) -> &str { + "system" + } + + fn stats(&self) -> TableStats { + TableStats::default() + } + + async fn write(&self, _request: WriteRequest) -> table_engine::table::Result { + Ok(0) + } + + async fn 
read( + &self, + request: ReadRequest, + ) -> table_engine::table::Result { + self.inner.read(request).await + } + + async fn get(&self, _request: GetRequest) -> table_engine::table::Result> { + Ok(None) + } + + async fn partitioned_read( + &self, + request: ReadRequest, + ) -> table_engine::table::Result { + let read_parallelism = request.opts.read_parallelism; + let stream = self.inner.read(request).await?; + let mut streams = Vec::with_capacity(read_parallelism); + streams.push(stream); + for _ in 0..read_parallelism - 1 { + streams.push(Box::pin(OneRecordBatchStream { + schema: self.schema().clone().to_record_schema(), + record_batch: None, + })); + } + Ok(PartitionedStreams { streams }) + } + + async fn alter_schema( + &self, + _request: AlterSchemaRequest, + ) -> table_engine::table::Result { + Ok(0) + } + + async fn alter_options( + &self, + _options: HashMap, + ) -> table_engine::table::Result { + Ok(0) + } + + async fn flush(&self, _request: FlushRequest) -> table_engine::table::Result<()> { + Ok(()) + } + + async fn compact(&self) -> table_engine::table::Result<()> { + Ok(()) + } +} + +pub struct OneRecordBatchStream { + schema: RecordSchema, + record_batch: Option, +} +impl Stream for OneRecordBatchStream { + type Item = stream::Result; + + fn poll_next(mut self: Pin<&mut Self>, _cx: &mut Context<'_>) -> Poll> { + if self.record_batch.is_none() { + Poll::Ready(None) + } else { + Poll::Ready(Some(Ok(self.record_batch.take().unwrap()))) + } + } +} +impl RecordBatchStream for OneRecordBatchStream { + fn schema(&self) -> &RecordSchema { + &self.schema + } +} diff --git a/system_catalog/src/sys_catalog_table.rs b/system_catalog/src/sys_catalog_table.rs new file mode 100644 index 0000000000..e1a4a004be --- /dev/null +++ b/system_catalog/src/sys_catalog_table.rs @@ -0,0 +1,1017 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Table to store system catalog + +use std::{collections::HashMap, convert::TryFrom, mem}; + +use async_trait::async_trait; +use catalog::consts; +use common_types::{ + bytes::{Bytes, BytesMut, MemBuf, MemBufMut}, + column_schema, + datum::{Datum, DatumKind}, + projected_schema::ProjectedSchema, + record_batch::RecordBatch, + request_id::RequestId, + row::{Row, RowGroup, RowGroupBuilder}, + schema::{self, Schema}, + time::Timestamp, +}; +use common_util::{ + codec::{memcomparable::MemComparable, Encoder}, + define_result, +}; +use futures::TryStreamExt; +use log::{debug, info, warn}; +use proto::sys_catalog::{CatalogEntry, SchemaEntry, TableEntry}; +use protobuf::Message; +use snafu::{ensure, Backtrace, OptionExt, ResultExt, Snafu}; +use table_engine::{ + self, + engine::{ + CreateTableRequest, DropTableRequest, OpenTableRequest, TableEngine, TableRequestType, + TableState, + }, + predicate::PredicateBuilder, + table::{ + GetRequest, ReadOptions, ReadOrder, ReadRequest, SchemaId, TableId, TableInfo, TableRef, + TableSeq, WriteRequest, + }, +}; +use tokio::sync::Mutex; + +use crate::ENTRY_TIMESTAMP; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to build schema for sys_catalog, err:{}", source))] + BuildSchema { source: common_types::schema::Error }, + + #[snafu(display( + "Failed to get column index for sys_catalog, name:{}.\nBacktrace:\n{}", + name, + backtrace + ))] + GetColumnIndex { name: String, backtrace: Backtrace }, + + #[snafu(display("Failed to build table for sys_catalog, err:{}", source))] + BuildTable { source: table_engine::engine::Error }, + + #[snafu(display("Failed to open table for sys_catalog, err:{}", source))] + OpenTable { source: table_engine::engine::Error }, + + #[snafu(display("Failed to convert into RowGroup, err:{}", source))] + IntoRowGroup { source: common_types::row::Error }, + + #[snafu(display("Failed to persist catalog to table, err:{}", source))] + PersistCatalog { source: table_engine::table::Error }, + + #[snafu(display("Failed to persist schema to table, err:{}", source))] + PersistSchema { source: table_engine::table::Error }, + + #[snafu(display("Failed to persist tables to table, err:{}", source))] + PersistTables { source: table_engine::table::Error }, + + #[snafu(display("Failed to read table, err:{}", source))] + ReadTable { source: table_engine::table::Error }, + + #[snafu(display("Failed to read stream, err:{}", source))] + ReadStream { source: table_engine::stream::Error }, + + #[snafu(display( + "Visitor catalog not found, catalog:{}.\nBacktrace:\n{}", + catalog, + backtrace + ))] + #[snafu(visibility(pub))] + VisitorCatalogNotFound { + catalog: String, + backtrace: Backtrace, + }, + + #[snafu(display( + "Visitor schema not found, catalog:{}, schema:{}.\nBacktrace:\n{}", + catalog, + schema, + backtrace + ))] + #[snafu(visibility(pub))] + VisitorSchemaNotFound { + catalog: String, + schema: String, + backtrace: Backtrace, + }, + + #[snafu(display("Visitor Failed to open table, err:{}", source))] + #[snafu(visibility(pub))] + VisitorOpenTable { source: table_engine::engine::Error }, + + #[snafu(display("Failed to encode entry key header, err:{}", source))] + EncodeKeyHeader { source: common_types::bytes::Error }, + + #[snafu(display("Failed to encode entry body, err:{}", source))] + EncodeKeyBody { + source: common_util::codec::memcomparable::Error, + }, + + #[snafu(display("Failed to encode table key type, err:{}", source))] + EncodeTableKeyType { source: common_types::bytes::Error }, + + #[snafu(display("Failed to read 
entry key header, err:{}", source))] + ReadKeyHeader { source: common_types::bytes::Error }, + + #[snafu(display("Failed to read table key header, err:{}", source))] + ReadTableKeyHeader { source: common_types::bytes::Error }, + + #[snafu(display( + "Invalid entry key header, value:{}.\nBacktrace:\n{}", + value, + backtrace + ))] + InvalidKeyHeader { value: u8, backtrace: Backtrace }, + + #[snafu(display("Invalid table key type, value:{}.\nBacktrace:\n{}", value, backtrace))] + InvalidTableKeyType { value: u8, backtrace: Backtrace }, + + #[snafu(display( + "Failed to encode protobuf for entry, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + EncodeEntryPb { + source: protobuf::error::ProtobufError, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to build row for entry, err:{}", source))] + BuildRow { source: common_types::row::Error }, + + #[snafu(display( + "Failed to decode protobuf for entry, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + DecodeEntryPb { + source: protobuf::error::ProtobufError, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to decode table entry, err:{}", source))] + DecodeTableEntry { + source: table_engine::table::TryFromTableEntryError, + }, + + #[snafu(display( + "Failed to decode schema for table alter entry, table:{}, err:{}", + table, + source + ))] + DecodeSchema { + table: String, + source: common_types::schema::Error, + }, + + #[snafu(display("Table key type not found in key.\nBacktrace:\n{}", backtrace))] + EmptyTableKeyType { backtrace: Backtrace }, + + #[snafu(display( + "The row in the sys_catalog_table is invalid, row:{:?}.\nBacktrace:\n{}", + row, + backtrace + ))] + InvalidTableRow { row: Row, backtrace: Backtrace }, + + #[snafu(display( + "The fetched table is mismatched, expect:{}, given:{}.\nBacktrace:\n{}", + expect_table, + given_table, + backtrace + ))] + TableKeyMismatch { + expect_table: String, + given_table: String, + backtrace: Backtrace, + }, + + #[snafu(display("The table is not found, table:{}.\nBacktrace:\n{}", table, backtrace))] + TableNotFound { table: String, backtrace: Backtrace }, + + #[snafu(display("Fail to get the table info, table:{}, err:{}.", table, source))] + GetTableInfo { + table: String, + source: table_engine::table::Error, + }, + + #[snafu(display("Invalid table state transition, table:{}, err:{}.", table, source))] + InvalidTableStateTransition { + table: String, + source: table_engine::engine::Error, + }, + + #[snafu(display("Invalid schema id, id:{}", id))] + InvalidSchemaId { id: u32 }, +} + +define_result!(Error); + +/// Table name of the sys catalog +pub const TABLE_NAME: &str = "sys_catalog"; +/// Schema id of the sys catalog schema (`system/public`). +pub const SCHEMA_ID: SchemaId = SchemaId::from_u16(1); +/// Table sequence of the sys catalog table, always set to 1 +pub const TABLE_SEQ: TableSeq = TableSeq::from_u32(1); +/// Table id of the `sys_catalog` table. +pub const TABLE_ID: TableId = TableId::new(SCHEMA_ID, TABLE_SEQ); +/// Name of key column (field) +pub const KEY_COLUMN_NAME: &str = "key"; +/// Name of timestamp column (field) +pub const TIMESTAMP_COLUMN_NAME: &str = "timestamp"; +/// Name of value column (field) +pub const VALUE_COLUMN_NAME: &str = "value"; +/// Default enable ttl is false +pub const DEFAULT_ENABLE_TTL: &str = "false"; + +// TODO(yingwen): Add a type column once support int8 type and maybe split key +// into multiple columns. 
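+// Key layout (see [EntryKeyEncoder] below): a one byte [KeyType] header followed by
+// the memcomparable encoded key parts, e.g. a table entry key is encoded as
+// [KeyType::TableEntry][catalog][schema][table].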
+/// SysCatalogTable is a special table to keep tracks of the system infomations +/// +/// Similar to kudu's SysCatalogTable +/// - see +/// - schema: (key, timestamp) -> metadata +/// +/// The timestamp is used to support metadata ttl in the future, now it can set +/// to 0. +#[derive(Debug)] +pub struct SysCatalogTable { + // TODO(yingwen): Table id + /// Underlying Table to actually store data + table: TableRef, + /// Index of the key column + key_column_index: usize, + /// Index of the value column + value_column_index: usize, + /// Protects table create/alter/drop + // TODO(xikai): A better way is to use a specific struct with the lock that takes + // responsibilities to update table. + update_table_lock: Mutex<()>, +} + +impl SysCatalogTable { + /// Create a new [SysCatalogTable] + pub async fn new(table_engine: &T) -> Result { + let table_schema = new_sys_catalog_schema().context(BuildSchema)?; + let key_column_index = table_schema + .index_of(KEY_COLUMN_NAME) + .context(GetColumnIndex { + name: KEY_COLUMN_NAME, + })?; + let value_column_index = + table_schema + .index_of(VALUE_COLUMN_NAME) + .context(GetColumnIndex { + name: VALUE_COLUMN_NAME, + })?; + + let open_request = OpenTableRequest { + catalog_name: consts::SYSTEM_CATALOG.to_string(), + schema_name: consts::SYSTEM_CATALOG_SCHEMA.to_string(), + table_name: TABLE_NAME.to_string(), + engine: table_engine.engine_type().to_string(), + }; + + let table_opt = table_engine + .open_table(open_request) + .await + .context(OpenTable)?; + match table_opt { + Some(table) => { + info!("Sys catalog table open existing table"); + + // The sys_catalog table is already created + return Ok(Self { + table, + key_column_index, + value_column_index, + update_table_lock: Mutex::new(()), + }); + } + None => { + info!("Sys catalog table is not exists, try to create a new table"); + } + } + + let mut options = HashMap::new(); + options.insert( + table_engine::OPTION_KEY_ENABLE_TTL.to_string(), + DEFAULT_ENABLE_TTL.to_string(), + ); + let create_request = CreateTableRequest { + catalog_name: consts::SYSTEM_CATALOG.to_string(), + schema_name: consts::SYSTEM_CATALOG_SCHEMA.to_string(), + table_id: TABLE_ID, + table_name: TABLE_NAME.to_string(), + table_schema, + partition_info: None, + engine: table_engine.engine_type().to_string(), + options, + state: TableState::Stable, + }; + + let table = table_engine + .create_table(create_request) + .await + .context(BuildTable)?; + + Ok(Self { + table, + key_column_index, + value_column_index, + update_table_lock: Mutex::new(()), + }) + } + + /// Returns the table id of the sys catalog table. + #[inline] + pub fn table_id(&self) -> TableId { + TABLE_ID + } + + /// Add and store the catalog info + pub async fn create_catalog(&self, request: CreateCatalogRequest) -> Result<()> { + info!("Add catalog to sys_catalog table, request:{:?}", request); + + let row_group = request.into_row_group(self.table.schema())?; + + let write_req = WriteRequest { row_group }; + self.table.write(write_req).await.context(PersistCatalog)?; + + Ok(()) + } + + /// Add and store the schema info + pub async fn create_schema(&self, request: CreateSchemaRequest) -> Result<()> { + info!("Add schema to sys_catalog table, request:{:?}", request); + + let row_group = request.into_row_group(self.table.schema())?; + + let write_req = WriteRequest { row_group }; + self.table.write(write_req).await.context(PersistSchema)?; + + Ok(()) + } + + /// Create table in the catalog. 
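+    ///
+    /// The write is performed with `update_table_lock` held so that concurrent
+    /// create/alter/drop operations on the catalog table do not interleave.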
+ pub async fn create_table(&self, table_info: TableInfo) -> Result<()> { + info!( + "Create table to sys_catalog table, table_info:{:?}", + table_info + ); + + let _lock = self.update_table_lock.lock().await; + self.write_table_info(table_info, TableRequestType::Create) + .await?; + + Ok(()) + } + + /// Prepare to drop the table. + pub async fn prepare_drop_table(&self, request: DropTableRequest) -> Result<()> { + info!( + "Prepare to drop table to sys_catalog table, request:{:?}", + request + ); + + let table_key = TableKey { + catalog: &request.catalog_name, + schema: &request.schema_name, + table: &request.table_name, + }; + + // update the dropped flag the lock held. + { + let _lock = self.update_table_lock.lock().await; + if let Some(mut table_info) = self.get_table_info(table_key).await? { + table_info.state.try_transit(TableState::Dropping).context( + InvalidTableStateTransition { + table: &request.table_name, + }, + )?; + + self.write_table_info(table_info, TableRequestType::Drop) + .await?; + } else { + warn!("Prepare to drop a dropped table, request:{:?}", request); + } + } + + Ok(()) + } + + /// Drop the table. + /// + /// Note that [prepare_drop_table] should be called before this method. + pub async fn drop_table(&self, request: DropTableRequest) -> Result<()> { + info!("Drop table to sys_catalog table, request:{:?}", request); + + let table_key = TableKey { + catalog: &request.catalog_name, + schema: &request.schema_name, + table: &request.table_name, + }; + + // update the table state with the lock held. + { + if let Some(mut table_info) = self.get_table_info(table_key).await? { + table_info.state.try_transit(TableState::Dropped).context( + InvalidTableStateTransition { + table: &request.table_name, + }, + )?; + + self.write_table_info(table_info, TableRequestType::Drop) + .await?; + } else { + warn!("Drop a dropped table, request:{:?}", request); + } + } + + Ok(()) + } + + /// Returns the inner table of the sys catalog. + #[inline] + pub fn inner_table(&self) -> TableRef { + self.table.clone() + } + + /// Write the table info to the sys_catalog table without lock. + async fn write_table_info(&self, table_info: TableInfo, typ: TableRequestType) -> Result<()> { + info!( + "Write table info to sys_catalog table, table_info:{:?}", + table_info + ); + + let table_writer = TableWriter { + catalog_table: self.table.clone(), + table_to_write: table_info, + typ, + }; + + table_writer.write().await?; + + Ok(()) + } + + async fn get_table_info<'a>(&'a self, table_key: TableKey<'a>) -> Result> { + let projected_schema = ProjectedSchema::no_projection(self.table.schema()); + let primary_key = TableWriter::build_table_primary_key(table_key.clone())?; + let get_req = GetRequest { + request_id: RequestId::next_id(), + projected_schema, + primary_key, + }; + + match self.table.get(get_req).await.context(GetTableInfo { + table: table_key.table, + })? 
{ + Some(row) => { + let table_info = self.decode_table_info(row)?; + let decoded_table_key = TableKey { + catalog: &table_info.catalog_name, + schema: &table_info.schema_name, + table: &table_info.table_name, + }; + + ensure!( + table_key == decoded_table_key, + TableKeyMismatch { + expect_table: table_key.table, + given_table: decoded_table_key.table, + } + ); + + Ok(Some(table_info)) + } + None => Ok(None), + } + } + + fn decode_table_info(&self, row: Row) -> Result { + ensure!( + row.num_columns() > self.key_column_index, + InvalidTableRow { row } + ); + + ensure!( + row.num_columns() > self.value_column_index, + InvalidTableRow { row } + ); + + // Key and value column is always varbinary. + let key = &row[self.key_column_index] + .as_varbinary() + .with_context(|| InvalidTableRow { row: row.clone() })?; + let value = &row[self.value_column_index] + .as_varbinary() + .with_context(|| InvalidTableRow { row: row.clone() })?; + + match decode_one_request(key, value)? { + DecodedRequest::TableEntry(request) => Ok(request), + _ => InvalidTableRow { row }.fail(), + } + } + + /// Visit all data in the sys catalog table + // TODO(yingwen): Expose read options + pub async fn visit(&self, opts: ReadOptions, visitor: &mut dyn Visitor) -> Result<()> { + let read_request = ReadRequest { + request_id: RequestId::next_id(), + opts, + // The schema of sys catalog table is never changed + projected_schema: ProjectedSchema::no_projection(self.table.schema()), + predicate: PredicateBuilder::default().build(), + order: ReadOrder::None, + }; + let mut batch_stream = self.table.read(read_request).await.context(ReadTable)?; + + info!("batch_stream schema is:{:?}", batch_stream.schema()); + // TODO(yingwen): Check stream schema and table schema? + while let Some(batch) = batch_stream.try_next().await.context(ReadStream)? { + // Visit all requests in the record batch + info!("real batch_stream schema is:{:?}", batch.schema()); + self.visit_record_batch(batch, visitor).await?; + } + + Ok(()) + } + + /// Visit the record batch + async fn visit_record_batch( + &self, + batch: RecordBatch, + visitor: &mut dyn Visitor, + ) -> Result<()> { + let key_column = batch.column(self.key_column_index); + let value_column = batch.column(self.value_column_index); + + info!( + "Sys catalog table visit record batch, column_num:{}, row_num:{}", + batch.num_columns(), + batch.num_rows() + ); + + let num_rows = batch.num_rows(); + for i in 0..num_rows { + // Key and value column is not nullable + let key = key_column.datum(i); + let value = value_column.datum(i); + + debug!( + "Sys catalog table visit row, i:{}, key:{:?}, value:{:?}", + i, key, value + ); + + // Key and value column is always varbinary. 
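+            // decode_one_request reads the one byte KeyType header from the key and
+            // parses the protobuf entry stored in the value into a DecodedRequest.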
+ let request = + decode_one_request(key.as_varbinary().unwrap(), value.as_varbinary().unwrap())?; + + Self::call_visitor(request, visitor).await?; + } + + Ok(()) + } + + /// Invoke visitor + async fn call_visitor(request: DecodedRequest, visitor: &mut dyn Visitor) -> Result<()> { + match request { + DecodedRequest::CreateCatalog(req) => visitor.visit_catalog(req), + DecodedRequest::CreateSchema(req) => visitor.visit_schema(req), + DecodedRequest::TableEntry(req) => visitor.visit_tables(req).await, + } + } +} + +/// Visitor for sys catalog requests +// TODO(yingwen): Define an Error for visitor +#[async_trait] +pub trait Visitor { + // TODO(yingwen): Use enum another type if need more operation (delete/update) + fn visit_catalog(&mut self, request: CreateCatalogRequest) -> Result<()>; + + fn visit_schema(&mut self, request: CreateSchemaRequest) -> Result<()>; + + async fn visit_tables(&mut self, table_info: TableInfo) -> Result<()>; +} + +/// Build a new table schema for sys catalog +fn new_sys_catalog_schema() -> schema::Result { + // NOTICE: Both key and value must be non-nullable, the visit function takes + // this assumption + schema::Builder::with_capacity(3) + .auto_increment_column_id(true) + // key + .add_key_column( + column_schema::Builder::new(KEY_COLUMN_NAME.to_string(), DatumKind::Varbinary) + .is_nullable(false) + .is_tag(false) + .build() + .expect("Should succeed to build column schema of catalog"), + )? + // timestamp + .add_key_column( + column_schema::Builder::new(TIMESTAMP_COLUMN_NAME.to_string(), DatumKind::Timestamp) + .is_nullable(false) + .is_tag(false) + .build() + .expect("Should succeed to build column schema of catalog"), + )? + // value + .add_normal_column( + column_schema::Builder::new(VALUE_COLUMN_NAME.to_string(), DatumKind::Varbinary) + .is_nullable(false) + .is_tag(false) + .build() + .expect("Should succeed to build column schema of catalog"), + )? + .build() +} + +/// Request type, used as key header +/// +/// 0 is reserved +#[derive(Debug, Clone, Copy)] +enum KeyType { + CreateCatalog = 1, + CreateSchema = 2, + TableEntry = 3, +} + +impl KeyType { + fn to_u8(self) -> u8 { + self as u8 + } + + fn decode_from_bytes(mut buf: &[u8]) -> Result { + let v = buf.read_u8().context(ReadKeyHeader)?; + + match v { + v if v == Self::CreateCatalog as u8 => Ok(Self::CreateCatalog), + v if v == Self::CreateSchema as u8 => Ok(Self::CreateSchema), + v if v == Self::TableEntry as u8 => Ok(Self::TableEntry), + value => InvalidKeyHeader { value }.fail(), + } + } +} + +/// Catalog entry key +/// +/// Use catalog name as key +struct CatalogKey<'a>(&'a str); + +/// Schema entry key +/// +/// Use (catalog, schema) as key +struct SchemaKey<'a>(&'a str, &'a str); + +// TODO(yingwen): Maybe use same key for create/alter table. 
+/// Table entry key +/// +/// Use (catalog, schema, table_id) as key +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct TableKey<'a> { + catalog: &'a str, + schema: &'a str, + table: &'a str, +} + +/// Encoder for entry key +struct EntryKeyEncoder; + +impl<'a> Encoder> for EntryKeyEncoder { + type Error = Error; + + fn encode(&self, buf: &mut B, value: &CatalogKey) -> Result<()> { + buf.write_u8(KeyType::CreateCatalog.to_u8()) + .context(EncodeKeyHeader)?; + let encoder = MemComparable; + encoder + .encode(buf, value.0.as_bytes()) + .context(EncodeKeyBody) + } + + fn estimate_encoded_size(&self, value: &CatalogKey) -> usize { + let encoder = MemComparable; + mem::size_of::() + encoder.estimate_encoded_size(value.0.as_bytes()) + } +} + +impl<'a> Encoder> for EntryKeyEncoder { + type Error = Error; + + fn encode(&self, buf: &mut B, value: &SchemaKey) -> Result<()> { + buf.write_u8(KeyType::CreateSchema.to_u8()) + .context(EncodeKeyHeader)?; + let encoder = MemComparable; + encoder + .encode(buf, value.0.as_bytes()) + .context(EncodeKeyBody)?; + encoder + .encode(buf, value.1.as_bytes()) + .context(EncodeKeyBody) + } + + fn estimate_encoded_size(&self, value: &SchemaKey) -> usize { + let encoder = MemComparable; + mem::size_of::() + + encoder.estimate_encoded_size(value.0.as_bytes()) + + encoder.estimate_encoded_size(value.1.as_bytes()) + } +} + +impl<'a> Encoder> for EntryKeyEncoder { + type Error = Error; + + fn encode(&self, buf: &mut B, value: &TableKey) -> Result<()> { + buf.write_u8(KeyType::TableEntry.to_u8()) + .context(EncodeKeyHeader)?; + let encoder = MemComparable; + encoder + .encode(buf, value.catalog.as_bytes()) + .context(EncodeKeyBody)?; + encoder + .encode(buf, value.schema.as_bytes()) + .context(EncodeKeyBody)?; + encoder + .encode(buf, value.table.as_bytes()) + .context(EncodeKeyBody)?; + Ok(()) + } + + fn estimate_encoded_size(&self, value: &TableKey) -> usize { + let encoder = MemComparable; + mem::size_of::() + + encoder.estimate_encoded_size(value.catalog.as_bytes()) + + encoder.estimate_encoded_size(value.schema.as_bytes()) + + encoder.estimate_encoded_size(value.table.as_bytes()) + } +} + +/// Information of the catalog to add +#[derive(Debug)] +pub struct CreateCatalogRequest { + /// Catalog name + pub catalog_name: String, +} + +impl CreateCatalogRequest { + /// Convert into [common_types::row::RowGroup] + fn into_row_group(self, schema: Schema) -> Result { + let key = self.to_key()?; + let value = self.into_value()?; + let mut builder = RowGroupBuilder::new(schema); + builder + .row_builder() + // key + .append_datum(Datum::Varbinary(key)) + .context(BuildRow)? + // timestamp + .append_datum(Datum::Timestamp(ENTRY_TIMESTAMP)) + .context(BuildRow)? + // value + .append_datum(Datum::Varbinary(value)) + .context(BuildRow)? 
+ .finish() + .context(BuildRow)?; + + Ok(builder.build()) + } + + fn to_key(&self) -> Result { + let encoder = EntryKeyEncoder; + let key = CatalogKey(&self.catalog_name); + let mut buf = BytesMut::with_capacity(encoder.estimate_encoded_size(&key)); + encoder.encode(&mut buf, &key)?; + Ok(buf.into()) + } + + fn into_value(self) -> Result { + let entry = self.into_pb(); + + let buf = entry.write_to_bytes().context(EncodeEntryPb)?; + Ok(buf.into()) + } + + fn into_pb(self) -> CatalogEntry { + let mut entry = CatalogEntry::new(); + entry.set_catalog_name(self.catalog_name); + entry.set_created_time(Timestamp::now().as_i64()); + + entry + } +} + +impl From for CreateCatalogRequest { + fn from(entry: CatalogEntry) -> Self { + Self { + catalog_name: entry.catalog_name, + } + } +} + +/// Information of the schema to add. +#[derive(Debug)] +pub struct CreateSchemaRequest { + pub catalog_name: String, + pub schema_name: String, + pub schema_id: SchemaId, +} + +impl CreateSchemaRequest { + /// Convert into [common_types::row::RowGroup] + fn into_row_group(self, schema: Schema) -> Result { + let key = self.to_key()?; + let value = self.into_value()?; + let mut builder = RowGroupBuilder::new(schema); + builder + .row_builder() + // key + .append_datum(Datum::Varbinary(key)) + .context(BuildRow)? + // timestamp + .append_datum(Datum::Timestamp(ENTRY_TIMESTAMP)) + .context(BuildRow)? + // value + .append_datum(Datum::Varbinary(value)) + .context(BuildRow)? + .finish() + .context(BuildRow)?; + + Ok(builder.build()) + } + + fn to_key(&self) -> Result { + let encoder = EntryKeyEncoder; + let key = SchemaKey(&self.catalog_name, &self.schema_name); + let mut buf = BytesMut::with_capacity(encoder.estimate_encoded_size(&key)); + encoder.encode(&mut buf, &key)?; + Ok(buf.into()) + } + + fn into_value(self) -> Result { + let entry = self.into_pb(); + + let buf = entry.write_to_bytes().context(EncodeEntryPb)?; + Ok(buf.into()) + } + + fn into_pb(self) -> SchemaEntry { + let mut entry = SchemaEntry::new(); + entry.set_catalog_name(self.catalog_name); + entry.set_schema_name(self.schema_name); + entry.set_schema_id(self.schema_id.as_u32()); + entry.set_created_time(Timestamp::now().as_i64()); + + entry + } +} + +impl TryFrom for CreateSchemaRequest { + type Error = Error; + + fn try_from(entry: SchemaEntry) -> Result { + let schema_id = SchemaId::new(entry.schema_id).context(InvalidSchemaId { + id: entry.schema_id, + })?; + + Ok(Self { + catalog_name: entry.catalog_name, + schema_name: entry.schema_name, + schema_id, + }) + } +} + +/// Information of the alter operations to the table. +#[derive(Clone, Debug)] +pub struct AlterTableRequest { + pub catalog_name: String, + pub schema_name: String, + pub table_name: String, + /// Schema after alteration. + pub schema: Schema, +} + +/// Writer for writing the table information into the catalog table. +pub struct TableWriter { + catalog_table: TableRef, + table_to_write: TableInfo, + typ: TableRequestType, +} + +impl TableWriter { + async fn write(&self) -> Result<()> { + let row_group = self.convert_table_info_to_row_group()?; + let write_req = WriteRequest { row_group }; + self.catalog_table + .write(write_req) + .await + .context(PersistTables)?; + + Ok(()) + } + + /// Convert the table to write into [common_types::row::RowGroup]. 
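+    ///
+    /// The row follows the sys_catalog schema (key, timestamp, value): the key
+    /// encodes (catalog, schema, table), the timestamp is the fixed ENTRY_TIMESTAMP
+    /// and the value is the protobuf encoded table entry.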
+ fn convert_table_info_to_row_group(&self) -> Result { + let mut builder = RowGroupBuilder::new(self.catalog_table.schema()); + let key = Self::build_create_table_key(&self.table_to_write)?; + let value = Self::build_create_table_value(self.table_to_write.clone(), self.typ)?; + + debug!( + "TableWriter build key value, key:{:?}, value:{:?}", + key, value + ); + + Self::build_row(&mut builder, key, value)?; + + Ok(builder.build()) + } + + fn build_row(builder: &mut RowGroupBuilder, key: Bytes, value: Bytes) -> Result<()> { + builder + .row_builder() + // key + .append_datum(Datum::Varbinary(key)) + .context(BuildRow)? + // timestamp + .append_datum(Datum::Timestamp(ENTRY_TIMESTAMP)) + .context(BuildRow)? + // value + .append_datum(Datum::Varbinary(value)) + .context(BuildRow)? + .finish() + .context(BuildRow)?; + Ok(()) + } + + fn build_create_table_key(table_info: &TableInfo) -> Result { + let key = TableKey { + catalog: &table_info.catalog_name, + schema: &table_info.schema_name, + table: &table_info.table_name, + }; + Self::encode_table_key(key) + } + + fn encode_table_key(key: TableKey) -> Result { + let encoder = EntryKeyEncoder; + let mut buf = BytesMut::with_capacity(encoder.estimate_encoded_size(&key)); + encoder.encode(&mut buf, &key)?; + Ok(buf.into()) + } + + fn build_create_table_value(table_info: TableInfo, typ: TableRequestType) -> Result { + let entry = table_info.into_pb(typ); + + let buf = entry.write_to_bytes().context(EncodeEntryPb)?; + Ok(buf.into()) + } + + fn build_table_primary_key(table_key: TableKey) -> Result> { + let encoded_key = Self::encode_table_key(table_key)?; + + Ok(vec![ + Datum::Varbinary(encoded_key), + Datum::Timestamp(ENTRY_TIMESTAMP), + ]) + } +} + +/// Decoded sys catalog request +#[derive(Debug)] +enum DecodedRequest { + CreateCatalog(CreateCatalogRequest), + CreateSchema(CreateSchemaRequest), + TableEntry(TableInfo), +} + +/// Decode request from key/value +fn decode_one_request(key: &[u8], value: &[u8]) -> Result { + let key_type = KeyType::decode_from_bytes(key)?; + let req = match key_type { + KeyType::CreateCatalog => { + let entry = CatalogEntry::parse_from_bytes(value).context(DecodeEntryPb)?; + DecodedRequest::CreateCatalog(CreateCatalogRequest::from(entry)) + } + KeyType::CreateSchema => { + let entry = SchemaEntry::parse_from_bytes(value).context(DecodeEntryPb)?; + DecodedRequest::CreateSchema(CreateSchemaRequest::try_from(entry)?) + } + KeyType::TableEntry => { + let entry = TableEntry::parse_from_bytes(value).context(DecodeEntryPb)?; + let table_info = TableInfo::try_from(entry).context(DecodeTableEntry)?; + DecodedRequest::TableEntry(table_info) + } + }; + + Ok(req) +} diff --git a/system_catalog/src/tables.rs b/system_catalog/src/tables.rs new file mode 100644 index 0000000000..67edfeaa35 --- /dev/null +++ b/system_catalog/src/tables.rs @@ -0,0 +1,179 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +/// implementation of system table: Tables +/// For example `SELECT * FROM system.public.tables` +use std::fmt::{Debug, Formatter}; + +use async_trait::async_trait; +use catalog::{manager::Manager, schema::SchemaRef, CatalogRef}; +use common_types::{ + column_schema, + datum::{Datum, DatumKind}, + record_batch::RecordBatchWithKeyBuilder, + row::Row, + schema, + schema::Schema, +}; +use snafu::ResultExt; +use table_engine::{ + stream::SendableRecordBatchStream, + table::{ReadRequest, SchemaId, TableId, TableRef, TableSeq}, +}; + +use crate::{OneRecordBatchStream, SystemTable, ENTRY_TIMESTAMP}; + +/// Table name of the sys tables +const TABLE_NAME: &str = "tables"; +/// Schema id of the sys catalog schema (`system/public`). +pub const SCHEMA_ID: SchemaId = SchemaId::from_u16(1); +/// Table sequence of the sys tables +pub const TABLE_SEQ: TableSeq = TableSeq::from_u32(2); +/// Table id of the `sys_catalog` table. +pub const TABLE_ID: TableId = TableId::new(SCHEMA_ID, TABLE_SEQ); + +/// Build a new table schema for tables +fn tables_schema() -> Schema { + schema::Builder::with_capacity(6) + .auto_increment_column_id(true) + .add_key_column( + column_schema::Builder::new("timestamp".to_string(), DatumKind::Timestamp) + .is_nullable(false) + .is_tag(false) + .build() + .unwrap(), + ) + .unwrap() + .add_key_column( + column_schema::Builder::new("catalog".to_string(), DatumKind::String) + .is_nullable(false) + .is_tag(false) + .build() + .unwrap(), + ) + .unwrap() + .add_key_column( + column_schema::Builder::new("schema".to_string(), DatumKind::String) + .is_nullable(false) + .is_tag(false) + .build() + .unwrap(), + ) + .unwrap() + .add_key_column( + column_schema::Builder::new("table_name".to_string(), DatumKind::String) + .is_nullable(false) + .is_tag(false) + .build() + .unwrap(), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("table_id".to_string(), DatumKind::UInt64) + .is_nullable(false) + .is_tag(false) + .build() + .unwrap(), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("engine".to_string(), DatumKind::String) + .is_nullable(false) + .is_tag(false) + .build() + .unwrap(), + ) + .unwrap() + .build() + .unwrap() +} + +pub struct Tables { + schema: Schema, + catalog_manager: M, +} + +impl Debug for Tables { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("SysTables") + .field("schema", &self.schema) + .finish() + } +} + +impl Tables { + pub fn new(catalog_manager: M) -> Self { + Self { + schema: tables_schema(), + catalog_manager, + } + } + + fn from_table(&self, catalog: CatalogRef, schema: SchemaRef, table: TableRef) -> Row { + let mut datums = Vec::with_capacity(self.schema.num_columns()); + datums.push(Datum::Timestamp(ENTRY_TIMESTAMP)); + datums.push(Datum::from(catalog.name())); + datums.push(Datum::from(schema.name())); + datums.push(Datum::from(table.name())); + datums.push(Datum::from(table.id().as_u64())); + datums.push(Datum::from(table.engine_type())); + Row::from_datums(datums) + } +} + +#[async_trait] +impl SystemTable for Tables { + fn name(&self) -> &str { + TABLE_NAME + } + + fn id(&self) -> TableId { + TABLE_ID + } + + fn schema(&self) -> Schema { + self.schema.clone() + } + + async fn read( + &self, + request: ReadRequest, + ) -> table_engine::table::Result { + let catalogs = self + .catalog_manager + .all_catalogs() + .map_err(|e| Box::new(e) as _) + .context(table_engine::table::Scan { table: self.name() })?; + let mut builder = + 
RecordBatchWithKeyBuilder::new(self.schema.clone().to_record_schema_with_key()); + + let projector = request + .projected_schema + .try_project_with_key(&self.schema) + .expect("Should succeed to try_project_key of sys_tables"); + for catalog in &catalogs { + for schema in &catalog + .all_schemas() + .map_err(|e| Box::new(e) as _) + .context(table_engine::table::Scan { table: self.name() })? + { + for table in &schema + .all_tables() + .map_err(|e| Box::new(e) as _) + .context(table_engine::table::Scan { table: self.name() })? + { + let row = self.from_table(catalog.clone(), schema.clone(), table.clone()); + let projected_row = projector.project_row(&row, Vec::new()); + builder + .append_row(projected_row) + .map_err(|e| Box::new(e) as _) + .context(table_engine::table::Scan { table: self.name() })?; + } + } + } + let record_batch = builder.build().unwrap().into_record_batch(); + Ok(Box::pin(OneRecordBatchStream { + schema: self.schema.clone().to_record_schema(), + record_batch: Some(record_batch), + })) + } +} diff --git a/table_engine/Cargo.toml b/table_engine/Cargo.toml new file mode 100644 index 0000000000..b617b9f7cc --- /dev/null +++ b/table_engine/Cargo.toml @@ -0,0 +1,23 @@ +[package] +name = "table_engine" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +# In alphabetical order +arrow_deps = { path = "../arrow_deps" } +async-trait = "0.1.41" +common_types = { path = "../common_types" } +common_util = { path = "../common_util" } +futures = "0.3" +log = "0.4" +proto = { path = "../proto" } +protobuf = "2.20" +serde = "1.0" +serde_derive = "1.0" +smallvec = "1.6" +snafu = { version ="0.6.10", features = ["backtraces"]} +tokio = { version = "1.0", features = ["sync"] } diff --git a/table_engine/src/engine.rs b/table_engine/src/engine.rs new file mode 100644 index 0000000000..b2aaeaaf6c --- /dev/null +++ b/table_engine/src/engine.rs @@ -0,0 +1,261 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Table factory trait + +use std::{collections::HashMap, sync::Arc}; + +use async_trait::async_trait; +use common_types::{schema::Schema, time::Timestamp}; +use common_util::runtime::Runtime; +use proto::sys_catalog::{TableEntry, TableState as TableStatePb}; +use snafu::{ensure, Backtrace, Snafu}; + +use crate::{ + partition::PartitionInfo, + table::{TableId, TableInfo, TableRef}, +}; + +#[derive(Debug, Snafu)] +#[snafu(visibility(pub))] +pub enum Error { + #[snafu(display("Invalid table path, path:{}.\nBacktrace:\n{}", path, backtrace))] + InvalidTablePath { path: String, backtrace: Backtrace }, + + #[snafu(display("Table already exists, table:{}.\nBacktrace:\n{}", table, backtrace))] + TableExists { table: String, backtrace: Backtrace }, + + #[snafu(display("Invalid arguments, err:{}", source))] + InvalidArguments { + table: String, + source: Box, + }, + + #[snafu(display("Failed to write meta data, err:{}", source))] + WriteMeta { + source: Box, + }, + + #[snafu(display("Unexpected error, err:{}", source))] + Unexpected { + source: Box, + }, + + #[snafu(display( + "Unknown engine type, type:{}.\nBacktrace:\n{}", + engine_type, + backtrace + ))] + UnknownEngineType { + engine_type: String, + backtrace: Backtrace, + }, + + #[snafu(display( + "Invalid table state transition, from:{:?}, to:{:?}.\nBacktrace:\n{}", + from, + to, + backtrace + ))] + InvalidTableStateTransition { + from: TableState, + to: TableState, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to close the table engine, err:{}", source))] + Close { + source: Box, + }, +} + +define_result!(Error); + +/// The state of table. +/// +/// Transition rule is defined in the validate function. +#[derive(Clone, Copy, Debug)] +pub enum TableState { + Stable = 0, + Dropping = 1, + Dropped = 2, +} + +impl TableState { + pub fn validate(&self, to: TableState) -> bool { + match self { + TableState::Stable => matches!(to, TableState::Stable | TableState::Dropping), + TableState::Dropping => matches!(to, TableState::Dropped), + TableState::Dropped => false, + } + } + + /// Try to transit from the self state to the `to` state. + /// + /// Returns error if it is a invalid transition. + pub fn try_transit(&mut self, to: TableState) -> Result<()> { + ensure!( + self.validate(to), + InvalidTableStateTransition { from: *self, to } + ); + *self = to; + + Ok(()) + } +} + +impl From for TableStatePb { + fn from(state: TableState) -> TableStatePb { + match state { + TableState::Stable => TableStatePb::STABLE, + TableState::Dropping => TableStatePb::DROPPING, + TableState::Dropped => TableStatePb::DROPPED, + } + } +} + +impl From for TableState { + fn from(state: TableStatePb) -> TableState { + match state { + TableStatePb::STABLE => TableState::Stable, + TableStatePb::DROPPING => TableState::Dropping, + TableStatePb::DROPPED => TableState::Dropped, + } + } +} + +#[derive(Copy, Clone)] +pub enum TableRequestType { + Create, + Drop, +} + +/// Create table request +// TODO(yingwen): Add option for create_if_not_exists? +#[derive(Debug, Clone)] +pub struct CreateTableRequest { + /// Catalog name + pub catalog_name: String, + /// Schema name + pub schema_name: String, + /// Table id + pub table_id: TableId, + // TODO(yingwen): catalog and schema, or add a table path struct? 
+ /// Table name + pub table_name: String, + /// Table schema + pub table_schema: Schema, + /// Partition info if this is a partitioned table + // TODO(yingwen): TableEngine should not have knowledge of partitioning + pub partition_info: Option, + /// Table engine type + pub engine: String, + /// Table options used by each engine + pub options: HashMap, + /// Tells state of the table + pub state: TableState, +} + +impl CreateTableRequest { + // TODO(chunshao.rcs): refactor + pub fn into_pb(self, typ: TableRequestType) -> TableEntry { + let mut table_entry: TableEntry = self.into(); + match typ { + TableRequestType::Create => table_entry.set_created_time(Timestamp::now().as_i64()), + TableRequestType::Drop => table_entry.set_modified_time(Timestamp::now().as_i64()), + } + table_entry + } +} + +impl From for TableEntry { + fn from(req: CreateTableRequest) -> Self { + let mut entry = TableEntry::new(); + entry.set_catalog_name(req.catalog_name); + entry.set_schema_name(req.schema_name); + entry.set_table_id(req.table_id.as_u64()); + entry.set_table_name(req.table_name); + entry.set_engine(req.engine); + entry.set_state(TableStatePb::from(req.state)); + + entry + } +} + +impl From for TableInfo { + fn from(req: CreateTableRequest) -> Self { + Self { + catalog_name: req.catalog_name, + schema_name: req.schema_name, + table_id: req.table_id, + table_name: req.table_name, + engine: req.engine, + state: req.state, + } + } +} + +/// Drop table request +#[derive(Debug, Clone)] +pub struct DropTableRequest { + /// Catalog name + pub catalog_name: String, + /// Schema name + pub schema_name: String, + /// Table name + pub table_name: String, + /// Table engine type + pub engine: String, +} + +#[derive(Debug, Clone)] +pub struct OpenTableRequest { + /// Catalog name + pub catalog_name: String, + /// Schema name + pub schema_name: String, + /// Table name + pub table_name: String, + /// Table engine type + pub engine: String, +} + +impl From for OpenTableRequest { + fn from(table_info: TableInfo) -> Self { + Self { + catalog_name: table_info.catalog_name, + schema_name: table_info.schema_name, + table_name: table_info.table_name, + engine: table_info.engine, + } + } +} + +/// Table engine +// TODO(yingwen): drop table support to release resource owned by the table +#[async_trait] +pub trait TableEngine { + /// Returns the name of engine. + fn engine_type(&self) -> &str; + + /// Close the engine gracefully. + async fn close(&self) -> Result<()>; + + /// Create table + async fn create_table(&self, request: CreateTableRequest) -> Result; + + /// Drop table + async fn drop_table(&self, request: DropTableRequest) -> Result; + + /// Open table, return None if table not exists + async fn open_table(&self, request: OpenTableRequest) -> Result>; +} + +/// A reference counted pointer to table engine +pub type TableEngineRef = Arc; + +#[derive(Clone, Debug)] +pub struct EngineRuntimes { + pub read_runtime: Arc, + pub write_runtime: Arc, + pub bg_runtime: Arc, +} diff --git a/table_engine/src/lib.rs b/table_engine/src/lib.rs new file mode 100644 index 0000000000..ac60c1e8dc --- /dev/null +++ b/table_engine/src/lib.rs @@ -0,0 +1,20 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Table engine facade, provides read/write interfaces of table + +#[macro_use] +extern crate common_util; + +pub mod engine; +pub mod memory; +pub mod partition; +pub mod predicate; +pub mod provider; +pub mod stream; +pub mod table; + +/// Enable ttl key +pub const OPTION_KEY_ENABLE_TTL: &str = "enable_ttl"; + +pub const MEMORY_ENGINE_TYPE: &str = "Memory"; +pub const ANALYTIC_ENGINE_TYPE: &str = "Analytic"; diff --git a/table_engine/src/memory.rs b/table_engine/src/memory.rs new file mode 100644 index 0000000000..d26448fddf --- /dev/null +++ b/table_engine/src/memory.rs @@ -0,0 +1,252 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! In-memory table implementations + +use std::{ + collections::HashMap, + fmt, + pin::Pin, + sync::{Arc, RwLock}, + task::{Context, Poll}, +}; + +use async_trait::async_trait; +use common_types::{ + column::{ColumnBlock, ColumnBlockBuilder}, + datum::{Datum, DatumKind}, + record_batch::RecordBatch, + row::{Row, RowGroup}, + schema::{RecordSchema, Schema}, +}; +use futures::stream::Stream; +use snafu::{OptionExt, ResultExt}; + +use crate::{ + stream::{ + self, ErrNoSource, ErrWithSource, PartitionedStreams, RecordBatchStream, + SendableRecordBatchStream, + }, + table::{ + AlterSchemaRequest, FlushRequest, GetRequest, ReadRequest, Result, Table, TableId, + TableStats, UnsupportedMethod, WriteRequest, + }, +}; + +type RowGroupVec = Vec; + +/// In-memory table +/// +/// Mainly for test, DO NOT use it in production. All data inserted are buffered +/// in memory, does not support schema change. +pub struct MemoryTable { + /// Table name + name: String, + /// Table id + id: TableId, + /// Table schema + schema: Schema, + /// Rows + row_groups: Arc>, + /// Engine type + engine_type: String, +} + +impl MemoryTable { + pub fn new(name: String, id: TableId, schema: Schema, engine_type: String) -> Self { + Self { + name, + id, + schema, + row_groups: Arc::new(RwLock::new(Vec::new())), + engine_type, + } + } +} + +impl fmt::Debug for MemoryTable { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("MemoryTable") + .field("name", &self.name) + .field("id", &self.id) + .field("schema", &self.schema) + // row_groups is ignored + .finish() + } +} + +#[async_trait] +impl Table for MemoryTable { + fn name(&self) -> &str { + &self.name + } + + fn id(&self) -> TableId { + self.id + } + + fn options(&self) -> HashMap { + HashMap::new() + } + + fn schema(&self) -> Schema { + self.schema.clone() + } + + fn engine_type(&self) -> &str { + &self.engine_type + } + + fn stats(&self) -> TableStats { + TableStats::default() + } + + async fn write(&self, request: WriteRequest) -> Result { + // TODO(yingwen) Maybe check schema? + let mut row_groups = self.row_groups.write().unwrap(); + let n = request.row_group.num_rows(); + row_groups.push(request.row_group); + + Ok(n) + } + + // batch_size is ignored now + async fn read(&self, request: ReadRequest) -> Result { + let scan = MemoryScan { + schema: request.projected_schema.to_record_schema(), + row_groups: self.row_groups.clone(), + index: 0, + }; + + Ok(Box::pin(scan)) + } + + async fn get(&self, _request: GetRequest) -> Result> { + // Alter schema is not supported now. 
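+        // Point get is not supported by the in-memory table.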
+ UnsupportedMethod { + table: &self.name, + method: "get", + } + .fail() + } + + async fn partitioned_read(&self, request: ReadRequest) -> Result { + let stream = self.read(request).await?; + + Ok(PartitionedStreams::one_stream(stream)) + } + + // TODO: Alter schema is not supported now + async fn alter_schema(&self, _request: AlterSchemaRequest) -> Result { + Ok(1) + } + + // TODO: Alter modify setting is not supported now + async fn alter_options(&self, _options: HashMap) -> Result { + Ok(1) + } + + async fn flush(&self, _request: FlushRequest) -> Result<()> { + // Flush is not supported now. + UnsupportedMethod { + table: self.name(), + method: "flush", + } + .fail() + } + + async fn compact(&self) -> Result<()> { + // Compact is not supported now. + UnsupportedMethod { + table: self.name(), + method: "compact", + } + .fail() + } +} + +#[derive(Debug)] +struct MemoryScan { + // The schema of projected column indexed by ReadRequest::projection + schema: RecordSchema, + row_groups: Arc>, + index: usize, +} + +impl Stream for MemoryScan { + type Item = stream::Result; + + fn poll_next(mut self: Pin<&mut Self>, _ctx: &mut Context<'_>) -> Poll> { + // TODO(yingwen): Batch row groups + let record_batch = { + let row_groups = self.row_groups.read().unwrap(); + if self.index >= row_groups.len() { + return Poll::Ready(None); + } + + let rows = &row_groups[self.index]; + // Because the row group inserted may have different column order, so we cannot + // reuse the projection index, and must find projection index for each row + // group, which is inefficient + row_group_to_record_batch(rows, &self.schema) + }; + + self.index += 1; + Poll::Ready(Some(record_batch)) + } +} + +impl RecordBatchStream for MemoryScan { + fn schema(&self) -> &RecordSchema { + &self.schema + } +} + +// REQUIRE: The schema is the projected schema +fn row_group_to_record_batch( + rows: &RowGroup, + record_schema: &RecordSchema, +) -> stream::Result { + if rows.is_empty() { + return Ok(RecordBatch::new_empty(record_schema.clone())); + } + + let num_cols = record_schema.num_columns(); + let mut column_blocks = Vec::with_capacity(num_cols); + // For each column, create an array for that column + for column in record_schema.columns().iter() { + let rows_schema = rows.schema(); + let col_index = rows_schema + .index_of(&column.name) + .with_context(|| ErrNoSource { + msg: format!( + "Failed to convert RowGroup to RecordBatch, column not found, column:{}", + &column.name + ), + })?; + let cols = rows.iter_column(col_index); + let column_block = build_column_block(&column.data_type, cols)?; + column_blocks.push(column_block); + } + + RecordBatch::new(record_schema.clone(), column_blocks) + .map_err(|e| Box::new(e) as _) + .context(ErrWithSource { + msg: "Failed to create RecordBatch", + }) +} + +fn build_column_block<'a, I: Iterator>( + data_type: &DatumKind, + iter: I, +) -> stream::Result { + let mut builder = ColumnBlockBuilder::with_capacity(data_type, iter.size_hint().0); + for datum in iter { + builder + .append(datum.clone()) + .map_err(|e| Box::new(e) as _) + .context(ErrWithSource { + msg: "Append datum", + })?; + } + Ok(builder.build()) +} diff --git a/table_engine/src/partition/expression.rs b/table_engine/src/partition/expression.rs new file mode 100644 index 0000000000..ae89d3a099 --- /dev/null +++ b/table_engine/src/partition/expression.rs @@ -0,0 +1,71 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Partition expression + +use std::ops::Deref; + +use common_types::datum::Datum; +use common_util::define_result; +use snafu::{Backtrace, OptionExt, Snafu}; + +use crate::partition::PartitionInfo; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("No datums for eval.\nBacktrace:\n{}", backtrace))] + EmptyDatums { backtrace: Backtrace }, +} + +define_result!(Error); + +/// Partition expression +#[derive(Debug)] +pub enum Expression { + ColumnExpr(ColumnExpr), +} + +impl Expression { + pub fn new(partition_info: &PartitionInfo) -> Self { + Self::parse_expr(partition_info.expr.to_string()) + } + + /// Extract column name in expression + pub fn extract_column_name(&self) -> impl Iterator { + match self { + Expression::ColumnExpr(col_expr) => col_expr.extract_column_name(), + } + } + + fn parse_expr(expr_str: String) -> Expression { + Expression::ColumnExpr(ColumnExpr::new(expr_str)) + } + + pub fn eval_uint>(&self, datums: &[T]) -> Result { + match self { + Expression::ColumnExpr(column_expr) => { + column_expr.eval_uint(datums.get(0).context(EmptyDatums)?) + } + } + } +} + +/// Column +#[derive(Debug)] +pub struct ColumnExpr { + column_name: String, +} + +impl ColumnExpr { + fn new(column_name: String) -> Self { + Self { column_name } + } + + fn extract_column_name(&self) -> impl Iterator { + std::iter::once(self.column_name.as_str()) + } + + // TODO: handle error + fn eval_uint(&self, datum: &Datum) -> Result { + Ok(datum.convert_to_uint64()) + } +} diff --git a/table_engine/src/partition/mod.rs b/table_engine/src/partition/mod.rs new file mode 100644 index 0000000000..e419b3ef72 --- /dev/null +++ b/table_engine/src/partition/mod.rs @@ -0,0 +1,27 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Partitioned table supports + +mod expression; +pub mod rule; + +/// Partition type of table +#[derive(Clone, Debug, PartialEq)] +pub enum PartitionType { + None = 0, + Hash = 1, +} + +/// Size type of partition num +pub type PartitionNum = u16; + +/// Info for how to partition table +#[derive(Debug, Clone)] +pub struct PartitionInfo { + /// Partition type + pub partition_type: PartitionType, + /// Partition expression + pub expr: String, + /// Partition num + pub partition_num: PartitionNum, +} diff --git a/table_engine/src/partition/rule.rs b/table_engine/src/partition/rule.rs new file mode 100644 index 0000000000..28b31401c5 --- /dev/null +++ b/table_engine/src/partition/rule.rs @@ -0,0 +1,108 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Partition rules + +use common_types::{datum::Datum, row::Row, schema::Schema}; +use common_util::define_result; +use smallvec::SmallVec; +use snafu::{ensure, Backtrace, ResultExt, Snafu}; + +use crate::partition::{expression::Expression, PartitionInfo, PartitionType}; + +const HASH_COLUMN_NUM: usize = 1; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("No column for hash partitioning.\nBacktrace:\n{}", backtrace))] + NoColumnForHash { backtrace: Backtrace }, + + #[snafu(display("Only support one hash column.\nBacktrace:\n{}", backtrace))] + TooMuchHashColumn { backtrace: Backtrace }, + + #[snafu(display("Failed to eval partition expr, err:{}", source))] + EvalExpr { + source: crate::partition::expression::Error, + }, +} + +define_result!(Error); + +/// Partition rule locate partition by input records +// TODO(yingwen): Recreate partition rule once the schema of the table is changed +#[derive(Debug)] +pub enum PartitionRule { + None, + Hash(HashPartitionRule), +} + +impl PartitionRule { + pub fn new(partition_info: &PartitionInfo, schema: &Schema) -> Result { + match partition_info.partition_type { + PartitionType::None => Ok(PartitionRule::None), + PartitionType::Hash => { + let rule = HashPartitionRule::new(partition_info, schema)?; + Ok(PartitionRule::Hash(rule)) + } + } + } + + /// Return the index of partition + pub fn locate_partition(&self, row: &Row) -> Result { + match self { + // Always return the first partition + PartitionRule::None => Ok(0), + PartitionRule::Hash(rule) => rule.eval_partition_index(row), + } + } +} + +/// Partition rule based on hash +#[derive(Debug)] +pub struct HashPartitionRule { + /// Total number of partitions + partition_num: u16, + /// Expression to evaluate a hash value + expression: Expression, + /// Offsets of columns for evaluate + // TODO(yingwen): The column index may be invalid after schema change (add/del column) + column_index: SmallVec<[usize; HASH_COLUMN_NUM]>, +} + +impl HashPartitionRule { + pub fn new(partition_info: &PartitionInfo, schema: &Schema) -> Result { + let expr = Expression::new(partition_info); + + let col_name_list = expr.extract_column_name(); + let mut column_index = SmallVec::with_capacity(col_name_list.size_hint().0); + for col_name in col_name_list { + for (i, v) in schema.columns().iter().enumerate() { + if col_name == v.name { + column_index.push(i); + break; + } + } + } + + ensure!(!column_index.is_empty(), NoColumnForHash); + ensure!(column_index.len() == 1, TooMuchHashColumn); + + Ok(Self { + partition_num: partition_info.partition_num, + expression: expr, + column_index, + }) + } + + // TODO(yingwen): Also pass schema? + pub fn eval_partition_index(&self, row: &Row) -> Result { + let mut col_vals: SmallVec<[&Datum; HASH_COLUMN_NUM]> = + SmallVec::with_capacity(self.column_index.len()); + for i in &self.column_index { + // TODO(yingwen): Check index? + col_vals.push(&row[*i]); + } + let eval_uint = self.expression.eval_uint(&col_vals).context(EvalExpr)?; + + Ok((eval_uint % self.partition_num as u64) as usize) + } +} diff --git a/table_engine/src/predicate/filter_record_batch.rs b/table_engine/src/predicate/filter_record_batch.rs new file mode 100644 index 0000000000..cafbd960da --- /dev/null +++ b/table_engine/src/predicate/filter_record_batch.rs @@ -0,0 +1,249 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
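+
+//! Filter rows of a `RecordBatchWithKey` by evaluating simple
+//! `column op literal` comparisons pushed down from a query.
+//! All filters are combined with logical AND, and a comparison that cannot be
+//! evaluated keeps the row selected.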
+ +use arrow_deps::datafusion::{ + logical_plan::{Expr, Operator}, + scalar::ScalarValue, +}; +use common_types::{datum::DatumView, record_batch::RecordBatchWithKey}; + +#[derive(Debug)] +struct ColumnFilter { + name: String, + op: Operator, + literal: ScalarValue, +} + +fn evaluate_by_operator(lhs: &T, rhs: &T, op: &Operator) -> Option { + let cmp_res = lhs.partial_cmp(rhs)?; + let v = match op { + Operator::Lt => cmp_res.is_lt(), + Operator::LtEq => cmp_res.is_le(), + Operator::Gt => cmp_res.is_gt(), + Operator::GtEq => cmp_res.is_ge(), + Operator::NotEq => cmp_res.is_ne(), + Operator::Eq => cmp_res.is_eq(), + _ => return None, + }; + Some(v) +} + +fn evaluate_datums_by_operator<'a>( + lhs: &DatumView<'a>, + rhs: &DatumView<'a>, + op: &Operator, +) -> Option { + macro_rules! impl_evaluate { + ($($Kind: ident), *) => { + match (lhs, rhs){ + (DatumView::Null, DatumView::Null) => Some(true), + $((DatumView::$Kind(v1), DatumView::$Kind(v2)) => evaluate_by_operator(v1, v2, op),)* + _ => None, + } + }; + } + + impl_evaluate!( + Timestamp, Double, Float, Varbinary, String, UInt64, UInt32, UInt16, UInt8, Int64, Int32, + Int16, Int8, Boolean + ) +} + +impl ColumnFilter { + fn filter(&self, record_batch: &RecordBatchWithKey, selected_buf: &mut [bool]) -> Option<()> { + let filter_datum_view = DatumView::from_scalar_value(&self.literal)?; + + let column_idx = record_batch.schema_with_key().index_of(&self.name)?; + let column_data = record_batch.column(column_idx); + + assert!(selected_buf.len() >= column_data.num_rows()); + for (i, selected) in selected_buf + .iter_mut() + .enumerate() + .take(column_data.num_rows()) + { + if *selected { + let datum_view = column_data.datum_view(i); + *selected = evaluate_datums_by_operator(&datum_view, &filter_datum_view, &self.op) + .unwrap_or(true); + } + } + + Some(()) + } +} + +/// Filter record batch by applying the `column_filters`. +pub struct RecordBatchFilter { + column_filters: Vec, +} + +impl RecordBatchFilter { + /// Create filter according to the `exprs` whose logical relationship is + /// `AND` between each other. Note that the created filter is not + /// equivalent to the original `exprs` and actually only a subset of the + /// exprs is chosen to create the [`RecordBatchFilter`]. + pub fn new(exprs: &[Expr]) -> Self { + let mut filters = Vec::with_capacity(exprs.len()); + for expr in exprs { + if let Expr::BinaryExpr { left, op, right } = expr { + let (column_name, literal) = match (left.as_ref(), right.as_ref()) { + (Expr::Column(col), Expr::Literal(v)) + | (Expr::Literal(v), Expr::Column(col)) => (col.name.to_string(), v.clone()), + _ => continue, + }; + + if matches!( + op, + Operator::NotEq + | Operator::Eq + | Operator::Gt + | Operator::GtEq + | Operator::Lt + | Operator::LtEq + ) { + filters.push(ColumnFilter { + name: column_name, + op: *op, + literal, + }) + } + } + } + + RecordBatchFilter { + column_filters: filters, + } + } + + /// Filter `record_batch` and save the filtering results into the + /// `selected_rows_buf`. + /// + /// Requires: `selected_rows_buf.len() == record_batch.num_rows()`. 
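+    /// Returns the number of rows that remain selected after all column
+    /// filters have been applied.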
+ pub fn filter( + &self, + record_batch: &RecordBatchWithKey, + selected_rows_buf: &mut [bool], + ) -> usize { + assert_eq!(record_batch.num_rows(), selected_rows_buf.len()); + + for selected in &mut *selected_rows_buf { + *selected = true; + } + + for column_filter in &self.column_filters { + column_filter.filter(record_batch, selected_rows_buf.as_mut()); + } + + selected_rows_buf + .iter() + .map(|selected| if *selected { 1 } else { 0 }) + .sum() + } + + #[inline] + pub fn is_empty(&self) -> bool { + self.column_filters.is_empty() + } +} + +impl From<&[Expr]> for RecordBatchFilter { + fn from(exprs: &[Expr]) -> Self { + Self::new(exprs) + } +} + +#[cfg(test)] +mod test { + use arrow_deps::datafusion::prelude::Column; + use common_types::{ + row::Row, + tests::{build_record_batch_with_key_by_rows, build_row}, + }; + + use super::*; + + fn build_record_batch(rows: Vec) -> RecordBatchWithKey { + build_record_batch_with_key_by_rows(rows) + } + + fn build_filter_expr(column_name: &str, literal: ScalarValue, op: Operator) -> Expr { + Expr::BinaryExpr { + left: Box::new(Expr::Column(Column::from_name(column_name.to_string()))), + op, + right: Box::new(Expr::Literal(literal)), + } + } + + #[test] + fn test_empty_filter() { + let rows = vec![ + build_row(b"aaaa", 1, 11.0, "AAAA"), + build_row(b"aaaa", 1, 21.0, "BBBB"), + ]; + let batch = build_record_batch(rows); + + let filter = RecordBatchFilter::new(&[]); + let mut selected_rows = vec![false; batch.num_rows()]; + let selected_num = filter.filter(&batch, &mut selected_rows); + + assert_eq!(selected_num, selected_rows.len()); + assert!(selected_rows.iter().all(|v| *v)); + } + + #[test] + fn test_all_filter() { + let rows = vec![ + build_row(b"aaaa", 1, 11.0, "AAAA"), + build_row(b"aaaa", 1, 21.0, "BBBB"), + build_row(b"aaaa", 2, 21.0, "CCCC"), + build_row(b"bbbb", 2, 31.0, "DDDD"), + build_row(b"bbbb", 2, 31.0, "DDDD"), + ]; + let batch = build_record_batch(rows); + + let expr = build_filter_expr("key2", ScalarValue::Int64(Some(2)), Operator::LtEq); + let filter = RecordBatchFilter::new(&[expr]); + let mut selected_rows = vec![false; batch.num_rows()]; + let selected_num = filter.filter(&batch, &mut selected_rows); + + assert_eq!(selected_num, selected_rows.len()); + assert!(selected_rows.iter().all(|v| *v)); + } + + #[test] + fn test_partial_filter() { + let rows = vec![ + build_row(b"aaaa", 1, 11.0, "AAAA"), + build_row(b"aaaa", 1, 21.0, "BBBB"), + build_row(b"aaaa", 2, 21.0, "CCCC"), + build_row(b"bbbb", 2, 31.0, "DDDD"), + build_row(b"bbbb", 2, 31.0, "DDDD"), + ]; + let batch = build_record_batch(rows); + + let expr1 = build_filter_expr("key2", ScalarValue::Int64(Some(2)), Operator::LtEq); + let expr2 = build_filter_expr( + "key1", + ScalarValue::Binary(Some(b"aabb".to_vec())), + Operator::GtEq, + ); + let filter = RecordBatchFilter::new(&[expr1, expr2]); + let mut selected_rows = vec![false; batch.num_rows()]; + let selected_num = filter.filter(&batch, &mut selected_rows); + let expect_selected_rows = vec![false, false, false, true, true]; + + assert_eq!(selected_num, 2); + assert_eq!(selected_rows, expect_selected_rows); + } + + #[test] + fn test_filter_empty_batch() { + let batch = build_record_batch(vec![]); + let expr1 = build_filter_expr("key2", ScalarValue::Int64(Some(2)), Operator::LtEq); + let filter = RecordBatchFilter::new(&[expr1]); + let mut selected_rows = vec![false; batch.num_rows()]; + filter.filter(&batch, &mut selected_rows); + + assert!(selected_rows.is_empty()); + } +} diff --git a/table_engine/src/predicate/mod.rs 
b/table_engine/src/predicate/mod.rs new file mode 100644 index 0000000000..2758dac513 --- /dev/null +++ b/table_engine/src/predicate/mod.rs @@ -0,0 +1,540 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Predict for query table. +//! Reference to: https://github.com/influxdata/influxdb_iox/blob/29b10413051f8c4a2193e8633aa133e45b0e505a/query/src/predicate.rs + +use std::{collections::HashSet, convert::TryInto, sync::Arc}; + +use arrow_deps::{ + arrow::{ + array::ArrayRef, + datatypes::{Schema as ArrowSchema, SchemaRef}, + }, + datafusion::{ + logical_plan::{Column, Expr, Operator}, + optimizer::utils as datafusion_util, + parquet::file::metadata::RowGroupMetaData, + physical_optimizer::pruning::{PruningPredicate, PruningStatistics}, + scalar::ScalarValue, + }, + parquet::file::statistics::Statistics as ParquetStatistics, +}; +use common_types::{ + schema::Schema, + time::{TimeRange, Timestamp}, +}; +use log::{debug, error}; +use snafu::{ResultExt, Snafu}; + +pub mod filter_record_batch; + +#[derive(Debug, Snafu)] +#[snafu(visibility = "pub")] +pub enum Error { + #[snafu(display("Failed ot do pruning, err:{}", source))] + Prune { + source: arrow_deps::datafusion::error::DataFusionError, + }, +} + +define_result!(Error); + +/// port from datafusion. +/// Extract the min/max statistics from a `ParquetStatistics` object +macro_rules! get_statistic { + ($column_statistics:expr, $func:ident, $bytes_func:ident) => {{ + if !$column_statistics.has_min_max_set() { + return None; + } + match $column_statistics { + ParquetStatistics::Boolean(s) => Some(ScalarValue::Boolean(Some(*s.$func()))), + ParquetStatistics::Int32(s) => Some(ScalarValue::Int32(Some(*s.$func()))), + ParquetStatistics::Int64(s) => Some(ScalarValue::Int64(Some(*s.$func()))), + // 96 bit ints not supported + ParquetStatistics::Int96(_) => None, + ParquetStatistics::Float(s) => Some(ScalarValue::Float32(Some(*s.$func()))), + ParquetStatistics::Double(s) => Some(ScalarValue::Float64(Some(*s.$func()))), + ParquetStatistics::ByteArray(s) => { + let s = std::str::from_utf8(s.$bytes_func()) + .map(|s| s.to_string()) + .ok(); + Some(ScalarValue::Utf8(s)) + } + // type not supported yet + ParquetStatistics::FixedLenByteArray(_) => None, + } + }}; +} + +/// port from datafusion. +// Extract the min or max value calling `func` or `bytes_func` on the +// ParquetStatistics as appropriate +macro_rules! get_min_max_values { + ($self:expr, $column:expr, $func:ident, $bytes_func:ident) => {{ + let (column_index, field) = + if let Some((v, f)) = $self.parquet_schema.column_with_name(&$column.name) { + (v, f) + } else { + // Named column was not present + return None; + }; + + let data_type = field.data_type(); + let null_scalar: ScalarValue = if let Ok(v) = data_type.try_into() { + v + } else { + // DataFusion doesn't have support for ScalarValues of the column type + return None; + }; + + let scalar_values: Vec = $self + .row_group_metadata + .iter() + .flat_map(|meta| meta.column(column_index).statistics()) + .map(|stats| get_statistic!(stats, $func, $bytes_func)) + .map(|maybe_scalar| { + // column either did't have statistics at all or didn't have min/max values + maybe_scalar.unwrap_or_else(|| null_scalar.clone()) + }) + .collect(); + + // ignore errors converting to arrays (e.g. 
different types) + ScalarValue::iter_to_array(scalar_values).ok() + }}; +} + +/// Wraps parquet statistics in a way +/// that implements [`PruningStatistics`] +struct RowGroupPruningStatistics<'a> { + row_group_metadata: &'a [RowGroupMetaData], + parquet_schema: &'a ArrowSchema, +} + +impl<'a> PruningStatistics for RowGroupPruningStatistics<'a> { + fn min_values(&self, column: &Column) -> Option { + get_min_max_values!(self, column, min, min_bytes) + } + + fn max_values(&self, column: &Column) -> Option { + get_min_max_values!(self, column, max, max_bytes) + } + + fn num_containers(&self) -> usize { + self.row_group_metadata.len() + } +} + +fn build_row_group_predicate( + predicate_builder: &PruningPredicate, + row_group_metadata: &[RowGroupMetaData], +) -> Result> { + let parquet_schema = predicate_builder.schema().as_ref(); + + let pruning_stats = RowGroupPruningStatistics { + row_group_metadata, + parquet_schema, + }; + + predicate_builder + .prune(&pruning_stats) + .map_err(|e| { + error!("Error evaluating row group predicate values {}", e); + e + }) + .context(Prune) +} + +/// Predicate helps determine whether specific row group should be read. +#[derive(Debug, Clone)] +pub struct Predicate { + /// Predicates in the query for filter out the columns that meet all the + /// exprs. + pub exprs: Vec, + /// The time range involved by the query. + pub time_range: TimeRange, +} + +pub type PredicateRef = Arc; + +impl Predicate { + pub fn empty() -> Self { + Self::new(TimeRange::min_to_max()) + } + + pub fn new(time_range: TimeRange) -> Self { + Self { + exprs: Vec::new(), + time_range, + } + } + + /// Determine whether a row group should be read according to the meta data + /// in the `row_groups`. + /// + /// The boolean value in the returned vector denotes the corresponding row + /// group in the `row_groups` whether should be read. + pub fn filter_row_groups(&self, schema: &Schema, row_groups: &[RowGroupMetaData]) -> Vec { + let mut results = vec![true; row_groups.len()]; + let arrow_schema: SchemaRef = schema.clone().into_arrow_schema_ref(); + for expr in &self.exprs { + match PruningPredicate::try_new(expr, arrow_schema.clone()) { + Ok(pruning_predicate) => { + debug!("pruning_predicate is:{:?}", pruning_predicate); + + if let Ok(values) = build_row_group_predicate(&pruning_predicate, row_groups) { + for (curr_val, result_val) in values.into_iter().zip(results.iter_mut()) { + *result_val = curr_val && *result_val + } + }; + // if fail to build, just ignore this filter so that all the + // row groups should be read for this + // filter. + } + Err(e) => { + // for any error just ignore it and that is to say, for this filter all the row + // groups should be read. + error!("fail to build pruning predicate, err:{}", e); + } + } + } + + results + } +} + +/// Builder for [Predicate] +#[derive(Debug, Clone, Default)] +#[must_use] +pub struct PredicateBuilder { + time_range: Option, + exprs: Vec, +} + +impl PredicateBuilder { + /// Adds the expressions from `filter_exprs` that can be pushed down to + /// query engine. + pub fn add_pushdown_exprs(mut self, filter_exprs: &[Expr]) -> Self { + // For each expression of the filter_exprs, recursively split it if it is is an + // AND conjunction. For example, expression (x AND y) is split into [x, + // y]. 
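+        // Sub-expressions that reference more than one column, or that are not
+        // simple `column op literal` comparisons, are dropped below and never
+        // pushed down.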
+ let mut split_exprs = vec![]; + for filter_expr in filter_exprs { + Self::split_and_expr(filter_expr, &mut split_exprs) + } + + // Only keep single_column and primitive binary expressions + let pushdown_exprs: Vec<_> = split_exprs + .into_iter() + .filter(Self::is_able_to_pushdown) + .collect(); + + self.exprs = pushdown_exprs; + + self + } + + /// Extract the time range from the `filter_exprs` and set it as + /// `TimeRange::zero_to_max()` if no timestamp predicate is found. + pub fn set_time_range(mut self, schema: &Schema, filter_exprs: &[Expr]) -> Self { + let time_range_extractor = TimeRangeExtractor { + timestamp_column_name: schema.timestamp_name(), + filters: filter_exprs, + }; + + let time_range = time_range_extractor.extract(); + debug!( + "finish extract time range from the filters, time_range:{:?}, filters:{:?}", + time_range, filter_exprs + ); + + self.time_range = Some(time_range); + + self + } + + pub fn build(self) -> PredicateRef { + Arc::new(Predicate { + exprs: self.exprs, + time_range: self.time_range.unwrap_or_else(TimeRange::min_to_max), + }) + } + + /// Determine whether the `expr` can be pushed down. + /// Returns false if any error occurs. + fn is_able_to_pushdown(expr: &Expr) -> bool { + let mut columns = HashSet::new(); + if let Err(e) = datafusion_util::expr_to_columns(expr, &mut columns) { + error!( + "Failed to extract columns from the expr, ignore this expr:{:?}, err:{}", + expr, e + ); + return false; + } + + columns.len() == 1 && Self::is_primitive_binary_expr(expr) + } + + /// Recursively split all "AND" expressions into smaller one + /// Example: "A AND B AND C" => [A, B, C] + fn split_and_expr(expr: &Expr, predicates: &mut Vec) { + match expr { + Expr::BinaryExpr { + right, + op: Operator::And, + left, + } => { + Self::split_and_expr(left, predicates); + Self::split_and_expr(right, predicates); + } + other => predicates.push(other.clone()), + } + } + + /// Return true if the given expression is in a primitive binary in the + /// form: `column op constant` and op must be a comparison one. + fn is_primitive_binary_expr(expr: &Expr) -> bool { + match expr { + Expr::BinaryExpr { left, op, right } => { + matches!( + (&**left, &**right), + (Expr::Column(_), Expr::Literal(_)) | (Expr::Literal(_), Expr::Column(_)) + ) && matches!( + op, + Operator::Eq + | Operator::NotEq + | Operator::Lt + | Operator::LtEq + | Operator::Gt + | Operator::GtEq + ) + } + _ => false, + } + } +} + +struct TimeRangeExtractor<'a> { + timestamp_column_name: &'a str, + filters: &'a [Expr], +} + +impl<'a> TimeRangeExtractor<'a> { + /// Do extraction from the `self.filters` for TimeRange. + /// + /// Returns `TimeRange::zero_to_max()` if no timestamp predicate is found. + fn extract(&self) -> TimeRange { + let mut time_range = TimeRange::min_to_max(); + for expr in self.filters { + let sub_time_range = self.extract_time_range_from_expr(expr); + let new_time_range = Self::and_time_ranges(&time_range, &sub_time_range); + + debug!( + "do and logic for time range, left:{:?}, right:{:?}, output:{:?}, expr:{:?}", + time_range, sub_time_range, new_time_range, expr + ); + time_range = new_time_range + } + + time_range + } + + /// Extract timestamp from the literal scalar expression. + fn timestamp_from_scalar_expr(expr: &Expr) -> Option { + if let Expr::Literal(ScalarValue::TimestampMillisecond(v, _)) = expr { + return v.map(Timestamp::new); + } + + None + } + + /// Compute the intersection of the two time ranges. 
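+    /// Returns an empty range if the two ranges do not overlap, e.g.
+    /// `[10, 20)` and `[15, 30)` intersect to `[15, 20)`, while `[10, 20)` and
+    /// `[25, 30)` intersect to the empty range.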
+ fn and_time_ranges(left: &TimeRange, right: &TimeRange) -> TimeRange { + let start = left.inclusive_start().max(right.inclusive_start()); + let end = left.exclusive_end().min(right.exclusive_end()); + TimeRange::new(start, end).unwrap_or_else(TimeRange::empty) + } + + /// Compute the union of the two time ranges and the union is defined as the + /// [min(left.start(), right.start()), max(left.end(), right.end())). + fn or_time_ranges(left: &TimeRange, right: &TimeRange) -> TimeRange { + let start = left.inclusive_start().min(right.inclusive_start()); + let end = left.exclusive_end().max(right.exclusive_end()); + TimeRange::new_unchecked(start, end) + } + + /// Extract the timestamp from the column expression and its corresponding + /// literal expression. Returns `None` if the expression pair is not + /// involved with timestamp column. No assumption on the order of the + /// `left` and `right`. + fn timestamp_from_column_and_value_expr(&self, left: &Expr, right: &Expr) -> Option { + let (column, val) = match (left, right) { + (Expr::Column(column), Expr::Literal(_)) => (column, right), + (Expr::Literal(_), Expr::Column(column)) => (column, left), + _ => return None, + }; + + if column.name == self.timestamp_column_name { + Self::timestamp_from_scalar_expr(val) + } else { + None + } + } + + /// Extract time range from the binary expression. + fn extract_time_range_from_binary_expr( + &self, + left: &Expr, + right: &Expr, + op: &Operator, + ) -> TimeRange { + match op { + Operator::And => { + let time_range_left = self.extract_time_range_from_expr(left); + let time_range_right = self.extract_time_range_from_expr(right); + Self::and_time_ranges(&time_range_left, &time_range_right) + } + Operator::Or => { + let time_range_left = self.extract_time_range_from_expr(left); + let time_range_right = self.extract_time_range_from_expr(right); + Self::or_time_ranges(&time_range_left, &time_range_right) + } + Operator::Eq => self + .timestamp_from_column_and_value_expr(left, right) + .map(TimeRange::from_timestamp) + .unwrap_or_else(TimeRange::min_to_max), + Operator::NotEq => TimeRange::min_to_max(), + Operator::Lt => self + .timestamp_from_column_and_value_expr(left, right) + .map(|right_t| TimeRange::new_unchecked(Timestamp::MIN, right_t)) + .unwrap_or_else(TimeRange::min_to_max), + Operator::LtEq => self + .timestamp_from_column_and_value_expr(left, right) + .map(|right_t| { + let right_t = right_t.checked_add_i64(1).unwrap_or(right_t); + TimeRange::new_unchecked(Timestamp::MIN, right_t) + }) + .unwrap_or_else(TimeRange::min_to_max), + Operator::Gt => self + .timestamp_from_column_and_value_expr(left, right) + .map(|left_t| { + let left_t = left_t.checked_add_i64(1).unwrap_or(left_t); + TimeRange::new_unchecked(left_t, Timestamp::MAX) + }) + .unwrap_or_else(TimeRange::min_to_max), + Operator::GtEq => self + .timestamp_from_column_and_value_expr(left, right) + .map(|left_t| TimeRange::new_unchecked(left_t, Timestamp::MAX)) + .unwrap_or_else(TimeRange::min_to_max), + Operator::Plus + | Operator::Minus + | Operator::Multiply + | Operator::Divide + | Operator::Modulo + | Operator::Like + | Operator::NotLike + | Operator::IsDistinctFrom + | Operator::IsNotDistinctFrom + | Operator::RegexMatch + | Operator::RegexNotMatch + | Operator::RegexIMatch + | Operator::RegexNotIMatch => TimeRange::min_to_max(), + } + } + + /// Extract time range from the between expression. 
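+    /// Both bounds of a `BETWEEN` expression are inclusive, so the high bound
+    /// is incremented by one to form the exclusive end of the range.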
+ fn time_range_from_between_expr(low: &Expr, high: &Expr, negated: bool) -> TimeRange { + if negated { + return TimeRange::min_to_max(); + } + + let low_t = Self::timestamp_from_scalar_expr(low).unwrap_or(Timestamp::MIN); + // the two operands are inclusive in the `between` expression. + let high_t = { + let t = Self::timestamp_from_scalar_expr(high).unwrap_or(Timestamp::MAX); + t.checked_add_i64(1).unwrap_or(Timestamp::MAX) + }; + TimeRange::new(low_t, high_t).unwrap_or_else(TimeRange::empty) + } + + /// Extract time range from the list expressions. + fn time_range_from_list_expr(list: &[Expr], negated: bool) -> TimeRange { + if negated { + return TimeRange::min_to_max(); + } + + if list.is_empty() { + return TimeRange::empty(); + } + + let (mut inclusive_start, mut inclusive_end) = (Timestamp::MAX, Timestamp::MIN); + for expr in list { + match Self::timestamp_from_scalar_expr(expr) { + Some(t) => { + inclusive_start = inclusive_start.min(t); + inclusive_end = inclusive_end.max(t); + } + None => return TimeRange::min_to_max(), + } + } + + TimeRange::new(inclusive_start, inclusive_end).unwrap_or_else(TimeRange::empty) + } + + /// Extract the time range recursively from the `expr`. + /// + /// Now the strategy is conservative: for the sub-expr which we are not sure + /// how to handle it, returns `TimeRange::zero_to_max()`. + fn extract_time_range_from_expr(&self, expr: &Expr) -> TimeRange { + match expr { + Expr::BinaryExpr { left, op, right } => { + self.extract_time_range_from_binary_expr(left, right, op) + } + Expr::Between { + expr, + negated, + low, + high, + } => { + if let Expr::Column(column) = expr.as_ref() { + if column.name == self.timestamp_column_name { + return Self::time_range_from_between_expr(&*low, &*high, *negated); + } + } + + TimeRange::min_to_max() + } + Expr::InList { + expr, + list, + negated, + } => { + if let Expr::Column(column) = expr.as_ref() { + if column.name == self.timestamp_column_name { + return Self::time_range_from_list_expr(list, *negated); + } + } + + TimeRange::min_to_max() + } + Expr::Not(_) + | Expr::Alias(_, _) + | Expr::ScalarVariable(_) + | Expr::Column(_) + | Expr::Literal(_) + | Expr::IsNotNull(_) + | Expr::IsNull(_) + | Expr::Negative(_) + | Expr::Case { .. } + | Expr::Cast { .. } + | Expr::TryCast { .. } + | Expr::Sort { .. } + | Expr::ScalarFunction { .. } + | Expr::ScalarUDF { .. } + | Expr::AggregateFunction { .. } + | Expr::WindowFunction { .. } + | Expr::AggregateUDF { .. } + | Expr::Wildcard { .. } + | Expr::GetIndexedField { .. } => TimeRange::min_to_max(), + } + } +} diff --git a/table_engine/src/provider.rs b/table_engine/src/provider.rs new file mode 100644 index 0000000000..92e2ed57e0 --- /dev/null +++ b/table_engine/src/provider.rs @@ -0,0 +1,275 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Datafusion `TableProvider` adapter + +use std::{any::Any, fmt, sync::Arc}; + +use arrow_deps::{ + arrow::datatypes::SchemaRef, + datafusion::{ + datasource::datasource::{TableProvider, TableProviderFilterPushDown}, + error::{DataFusionError, Result}, + execution::runtime_env::RuntimeEnv, + logical_plan::Expr, + physical_plan::{ + DisplayFormatType, ExecutionPlan, Partitioning, + SendableRecordBatchStream as DfSendableRecordBatchStream, Statistics, + }, + }, +}; +use async_trait::async_trait; +use common_types::{projected_schema::ProjectedSchema, request_id::RequestId, schema::Schema}; +use log::debug; +use tokio::sync::Mutex; + +use crate::{ + predicate::{PredicateBuilder, PredicateRef}, + stream::{SendableRecordBatchStream, ToDfStream}, + table::{self, ReadOptions, ReadOrder, ReadRequest, TableRef}, +}; + +/// An adapter to [TableProvider] with schema snapshot. +/// +/// This adapter holds a schema snapshot of the table and always returns that +/// schema to caller. +#[derive(Debug)] +pub struct TableProviderAdapter { + table: TableRef, + /// The schema of the table when this adapter is created, used as schema + /// snapshot for read to avoid the reader sees different schema during + /// query + read_schema: Schema, + request_id: RequestId, + read_parallelism: usize, +} + +impl TableProviderAdapter { + pub fn new(table: TableRef, request_id: RequestId, read_parallelism: usize) -> Self { + // Take a snapshot of the schema + let read_schema = table.schema(); + + Self { + table, + read_schema, + request_id, + read_parallelism, + } + } + + pub fn as_table_ref(&self) -> &TableRef { + &self.table + } + + pub fn scan_table( + &self, + projection: &Option>, + filters: &[Expr], + limit: Option, + read_order: ReadOrder, + ) -> Result> { + debug!( + "scan table, table:{}, request_id:{}, projection:{:?}, filters:{:?}, limit:{:?}, read_order:{:?}", + self.table.name(), + self.request_id, + projection, + filters, + limit, + read_order, + ); + + // Forbid the parallel reading if the data order is required. 
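+        // An ordered read is served by a single stream, so the parallelism is
+        // capped at 1 here.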
+ let read_parallelism = if read_order.is_in_order() { + 1 + } else { + self.read_parallelism + }; + + let predicate = self.predicate_from_filters(filters); + Ok(Arc::new(ScanTable { + projected_schema: ProjectedSchema::new(self.read_schema.clone(), projection.clone()) + .map_err(|e| { + DataFusionError::Internal(format!( + "Invalid projection, plan:{:?}, projection:{:?}, err:{:?}", + self, projection, e + )) + })?, + table: self.table.clone(), + request_id: self.request_id, + read_order, + read_parallelism, + predicate, + stream_state: Mutex::new(ScanStreamState::default()), + })) + } + + fn predicate_from_filters(&self, filters: &[Expr]) -> PredicateRef { + PredicateBuilder::default() + .add_pushdown_exprs(filters) + .set_time_range(&self.read_schema, filters) + .build() + } +} + +#[async_trait] +impl TableProvider for TableProviderAdapter { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + // We use the `read_schema` as the schema of this `TableProvider` + self.read_schema.clone().into_arrow_schema_ref() + } + + async fn scan( + &self, + projection: &Option>, + filters: &[Expr], + limit: Option, + ) -> Result> { + self.scan_table(projection, filters, limit, ReadOrder::None) + } + + fn supports_filter_pushdown(&self, _filter: &Expr) -> Result { + Ok(TableProviderFilterPushDown::Inexact) + } +} + +#[derive(Default)] +struct ScanStreamState { + inited: bool, + err: Option, + streams: Vec>, +} + +impl ScanStreamState { + fn take_stream(&mut self, index: usize) -> Result { + if let Some(e) = &self.err { + return Err(DataFusionError::Execution(format!( + "Failed to read table, partition:{}, err:{}", + index, e + ))); + } + + // TODO(yingwen): Return an empty stream if index is out of bound. + self.streams[index].take().ok_or_else(|| { + DataFusionError::Execution(format!( + "Read partition multiple times is not supported, partition:{}", + index + )) + }) + } +} + +/// Physical plan of scanning table. 
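+///
+/// The partitioned streams of the table are initialized lazily on the first
+/// `execute` call, and each partition can only be consumed once.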
+struct ScanTable { + projected_schema: ProjectedSchema, + table: TableRef, + request_id: RequestId, + read_order: ReadOrder, + read_parallelism: usize, + predicate: PredicateRef, + + stream_state: Mutex, +} + +impl ScanTable { + async fn maybe_init_stream(&self, runtime: Arc) -> Result<()> { + let mut stream_state = self.stream_state.lock().await; + if stream_state.inited { + return Ok(()); + } + + let req = ReadRequest { + request_id: self.request_id, + opts: ReadOptions { + batch_size: runtime.batch_size(), + read_parallelism: self.read_parallelism, + }, + projected_schema: self.projected_schema.clone(), + predicate: self.predicate.clone(), + order: self.read_order, + }; + + let read_res = self.table.partitioned_read(req).await; + match read_res { + Ok(partitioned_streams) => { + assert_eq!(self.read_parallelism, partitioned_streams.streams.len()); + stream_state.streams = partitioned_streams.streams.into_iter().map(Some).collect(); + } + Err(e) => { + stream_state.err = Some(e); + } + } + stream_state.inited = true; + + Ok(()) + } +} + +#[async_trait] +impl ExecutionPlan for ScanTable { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + self.projected_schema.to_projected_arrow_schema() + } + + fn output_partitioning(&self) -> Partitioning { + Partitioning::RoundRobinBatch(self.read_parallelism) + } + + fn children(&self) -> Vec> { + // this is a leaf node and has no children + vec![] + } + + fn with_new_children(&self, _: Vec>) -> Result> { + Err(DataFusionError::Internal(format!( + "Children cannot be replaced in {:?}", + self + ))) + } + + async fn execute( + &self, + partition: usize, + runtime: Arc, + ) -> Result { + self.maybe_init_stream(runtime).await?; + + let mut stream_state = self.stream_state.lock().await; + let stream = stream_state.take_stream(partition)?; + + Ok(Box::pin(ToDfStream(stream))) + } + + fn fmt_as(&self, _t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "ScanTable: table={}, parallelism={}, order={:?}, ", + self.table.name(), + self.read_parallelism, + self.read_order, + ) + } + + fn statistics(&self) -> Statistics { + // TODO(yingwen): Implement this + Statistics::default() + } +} + +impl fmt::Debug for ScanTable { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("ScanTable") + .field("projected_schema", &self.projected_schema) + .field("table", &self.table.name()) + .field("read_order", &self.read_order) + .field("read_parallelism", &self.read_parallelism) + .field("predicate", &self.predicate) + .finish() + } +} diff --git a/table_engine/src/stream.rs b/table_engine/src/stream.rs new file mode 100644 index 0000000000..fc8245d07c --- /dev/null +++ b/table_engine/src/stream.rs @@ -0,0 +1,128 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Table record stream + +use std::{ + convert::TryFrom, + pin::Pin, + task::{Context, Poll}, +}; + +use arrow_deps::{ + arrow::{ + datatypes::SchemaRef, + error::{ArrowError, Result as ArrowResult}, + record_batch::RecordBatch as ArrowRecordBatch, + }, + datafusion::physical_plan::{ + RecordBatchStream as DfRecordBatchStream, + SendableRecordBatchStream as DfSendableRecordBatchStream, + }, +}; +use common_types::{record_batch::RecordBatch, schema::RecordSchema}; +use common_util::define_result; +use futures::stream::Stream; +use snafu::{Backtrace, ResultExt, Snafu}; + +// TODO(yingwen): Classify the error. 
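+// Errors of this stream are surfaced to DataFusion as
+// `ArrowError::ExternalError` when the stream is adapted by `ToDfStream`.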
+#[derive(Debug, Snafu)] +#[snafu(visibility(pub))] +pub enum Error { + #[snafu(display("Stream error, msg:{}, err:{}", msg, source))] + ErrWithSource { + msg: String, + source: Box, + }, + + #[snafu(display("Stream error, msg:{}.\nBacktrace:\n{}", msg, backtrace))] + ErrNoSource { msg: String, backtrace: Backtrace }, +} + +define_result!(Error); + +pub trait RecordBatchStream: Stream> { + fn schema(&self) -> &RecordSchema; +} + +pub type SendableRecordBatchStream = Pin>; + +/// Record batch streams divided by time range. +pub struct PartitionedStreams { + pub streams: Vec, +} + +impl PartitionedStreams { + pub fn one_stream(stream: SendableRecordBatchStream) -> Self { + Self { + streams: vec![stream], + } + } +} + +pub struct ToDfStream(pub SendableRecordBatchStream); + +impl Stream for ToDfStream { + type Item = ArrowResult; + + fn poll_next(mut self: Pin<&mut Self>, ctx: &mut Context<'_>) -> Poll> { + match self.0.as_mut().poll_next(ctx) { + Poll::Ready(Some(Ok(record_batch))) => { + Poll::Ready(Some(Ok(record_batch.into_arrow_record_batch()))) + } + Poll::Ready(Some(Err(e))) => { + Poll::Ready(Some(Err(ArrowError::ExternalError(Box::new(e))))) + } + Poll::Ready(None) => Poll::Ready(None), + Poll::Pending => Poll::Pending, + } + } +} + +impl DfRecordBatchStream for ToDfStream { + fn schema(&self) -> SchemaRef { + self.0.schema().to_arrow_schema_ref() + } +} + +pub struct FromDfStream { + schema: RecordSchema, + df_stream: DfSendableRecordBatchStream, +} + +impl FromDfStream { + pub fn new(df_stream: DfSendableRecordBatchStream) -> Result { + let df_schema = df_stream.schema(); + let schema = RecordSchema::try_from(df_schema) + .map_err(|e| Box::new(e) as _) + .context(ErrWithSource { + msg: "Convert record schema", + })?; + + Ok(Self { schema, df_stream }) + } +} + +impl Stream for FromDfStream { + type Item = Result; + + fn poll_next(mut self: Pin<&mut Self>, ctx: &mut Context<'_>) -> Poll> { + match self.df_stream.as_mut().poll_next(ctx) { + Poll::Ready(Some(record_batch_res)) => Poll::Ready(Some( + record_batch_res + .map_err(|e| Box::new(e) as _) + .and_then(|batch| RecordBatch::try_from(batch).map_err(|e| Box::new(e) as _)) + .context(ErrWithSource { + msg: "Convert from arrow record batch", + }), + )), + Poll::Ready(None) => Poll::Ready(None), + Poll::Pending => Poll::Pending, + } + } +} + +impl RecordBatchStream for FromDfStream { + fn schema(&self) -> &RecordSchema { + &self.schema + } +} diff --git a/table_engine/src/table.rs b/table_engine/src/table.rs new file mode 100644 index 0000000000..b361756e8d --- /dev/null +++ b/table_engine/src/table.rs @@ -0,0 +1,608 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Table abstraction + +use std::{ + collections::HashMap, + convert::TryFrom, + fmt, + sync::{ + atomic::{AtomicU32, AtomicU64, Ordering}, + Arc, + }, +}; + +use async_trait::async_trait; +use common_types::{ + column_schema::ColumnSchema, + datum::Datum, + projected_schema::ProjectedSchema, + request_id::RequestId, + row::{Row, RowGroup}, + schema::{RecordSchemaWithKey, Schema, Version}, + time::Timestamp, +}; +use proto::sys_catalog::{TableEntry, TableState as TableStatePb}; +use serde_derive::Deserialize; +use snafu::{Backtrace, Snafu}; + +use crate::{ + engine::{TableRequestType, TableState}, + predicate::PredicateRef, + stream::{PartitionedStreams, SendableRecordBatchStream}, +}; + +/// Contains common error variant, implementation specific error should +/// be cast into Box +#[derive(Debug, Snafu)] +#[snafu(visibility = "pub")] +pub enum Error { + #[snafu(display( + "Unsupported table method, table:{}, method:{}.\nBacktrace:\n{}", + table, + method, + backtrace + ))] + UnsupportedMethod { + table: String, + method: String, + backtrace: Backtrace, + }, + + #[snafu(display( + "Get Invalid primary key, expected schema:{:?}, given_primary_keys:{:?}.\nBacktrace:\n{}", + schema, + primary_key_columns, + backtrace + ))] + GetInvalidPrimaryKey { + schema: RecordSchemaWithKey, + primary_key_columns: Vec, + backtrace: Backtrace, + }, + + #[snafu(display( + "Get null primary key, expected schema:{:?}, given_primary_keys:{:?}.\nBacktrace:\n{}", + schema, + primary_key_columns, + backtrace + ))] + GetNullPrimaryKey { + schema: RecordSchemaWithKey, + primary_key_columns: Vec, + backtrace: Backtrace, + }, + + #[snafu(display("Unexpected error, err:{}", source))] + Unexpected { + source: Box, + }, + + #[snafu(display("Invalid arguments, err:{}", source))] + InvalidArguments { + table: String, + source: Box, + }, + + #[snafu(display("Failed to write table, table:{}, err:{}", table, source))] + Write { + table: String, + source: Box, + }, + + #[snafu(display("Failed to scan table, table:{}, err:{}", table, source))] + Scan { + table: String, + source: Box, + }, + + #[snafu(display("Failed to get table, table:{}, err:{}", table, source))] + Get { + table: String, + source: Box, + }, + + #[snafu(display("Failed to alter schema, table:{}, err:{}", table, source))] + AlterSchema { + table: String, + source: Box, + }, + + #[snafu(display("Failed to alter options, table:{}, err:{}", table, source))] + AlterOptions { + table: String, + source: Box, + }, + + #[snafu(display("Failed to flush table, table:{}, err:{}", table, source))] + Flush { + table: String, + source: Box, + }, + + #[snafu(display("Failed to compact table, table:{}, err:{}", table, source))] + Compact { + table: String, + source: Box, + }, +} + +define_result!(Error); + +/// Default partition num to scan in parallelism. +pub const DEFAULT_READ_PARALLELISM: usize = 8; + +/// Schema id (24 bits) +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct SchemaId(u32); + +impl SchemaId { + /// Bits of schema id. + const BITS: u32 = 24; + /// 24 bits mask (0xffffff) + const MASK: u32 = (1 << Self::BITS) - 1; + /// Max schema id. + pub const MAX: SchemaId = SchemaId(Self::MASK); + /// Min schema id. + pub const MIN: SchemaId = SchemaId(0); + + /// Create a new schema id from u32, return None if `id` is invalid. + pub fn new(id: u32) -> Option { + // Only need to check max as min is 0. + if id <= SchemaId::MAX.0 { + Some(Self(id)) + } else { + None + } + } + + // It is safe to convert u16 into schema id. 
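+    // (u16::MAX is 65535, which is always smaller than the 24-bit MASK.)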
+ pub const fn from_u16(id: u16) -> Self { + Self(id as u32) + } + + /// Convert the schema id into u32. + #[inline] + pub fn as_u32(&self) -> u32 { + self.0 + } +} + +impl PartialEq for SchemaId { + fn eq(&self, other: &u32) -> bool { + self.0 == *other + } +} + +impl From for SchemaId { + fn from(id: u16) -> SchemaId { + SchemaId::from_u16(id) + } +} + +/// Sequence of a table under a schema (40 bits). +#[derive(Debug, Clone, Copy)] +pub struct TableSeq(u64); + +impl TableSeq { + /// Bits of schema id. + const BITS: u64 = 40; + /// 40 bits mask (0xffffffffff). + const MASK: u64 = (1 << Self::BITS) - 1; + /// Max sequence of table in a schema. + pub const MAX: TableSeq = TableSeq(Self::MASK); + /// Min sequence of table in a schema. + pub const MIN: TableSeq = TableSeq(0); + + /// Create a new table sequence from u64, return None if `seq` is invalid. + pub const fn new(seq: u64) -> Option { + // Only need to check max as min is 0. + if seq <= TableSeq::MAX.0 { + Some(Self(seq)) + } else { + None + } + } + + // It is safe to convert u32 into table seq. + pub const fn from_u32(id: u32) -> Self { + Self(id as u64) + } + + /// Convert the table sequence into u64. + #[inline] + pub fn as_u64(&self) -> u64 { + self.0 + } +} + +impl From for TableSeq { + fn from(id: u32) -> TableSeq { + TableSeq::from_u32(id) + } +} + +/// Table Id (64 bits) +/// +/// Table id is constructed via schema id (24 bits) and a table sequence (40 +/// bits). +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Deserialize)] +pub struct TableId(u64); + +impl TableId { + /// Min table id. + pub const MIN: TableId = TableId(0); + + /// Create a new table id from `schema_id` and `table_seq`. + pub const fn new(schema_id: SchemaId, table_seq: TableSeq) -> Self { + let schema_id_data = schema_id.0 as u64; + let schema_id_part = schema_id_data << TableSeq::BITS; + let table_id_data = schema_id_part | table_seq.0; + + Self(table_id_data) + } + + /// Get the schema id part of the table id. + #[inline] + pub fn schema_id(&self) -> SchemaId { + let schema_id_part = self.0 >> TableSeq::BITS; + + SchemaId(schema_id_part as u32) + } + + /// Get the sequence part of the table id. + #[inline] + pub fn table_seq(&self) -> TableSeq { + let seq_part = self.0 & TableSeq::MASK; + + TableSeq(seq_part) + } + + /// Convert table id into u64. + #[inline] + pub fn as_u64(&self) -> u64 { + self.0 + } +} + +impl From for TableId { + fn from(id: u64) -> TableId { + TableId(id) + } +} + +impl fmt::Debug for TableId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "TableId({}, {}, {})", + self.0, + self.schema_id().as_u32(), + self.table_seq().as_u64() + ) + } +} + +impl fmt::Display for TableId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.0) + } +} + +// TODO(yingwen): Support DELETE/UPDATE... , a mutation type is needed. +#[derive(Debug)] +pub struct WriteRequest { + /// rows to write + pub row_group: RowGroup, +} + +#[derive(Debug)] +pub struct ReadOptions { + pub batch_size: usize, + /// Suggested read parallelism, the actual returned stream should equal to + /// `read_parallelism`. + pub read_parallelism: usize, +} + +impl Default for ReadOptions { + fn default() -> Self { + Self { + batch_size: 10000, + read_parallelism: DEFAULT_READ_PARALLELISM, + } + } +} + +#[derive(Debug)] +pub struct GetRequest { + /// Query request id. + pub request_id: RequestId, + /// The schema and projection for get, the output data should match this + /// schema. 
+ pub projected_schema: ProjectedSchema, + /// The primary key of the row to get. + pub primary_key: Vec, +} + +#[derive(Copy, Clone, Debug)] +pub enum ReadOrder { + /// No order requirements from the read request. + None, + Asc, + Desc, +} + +impl ReadOrder { + pub fn from_is_asc(is_asc: Option) -> Self { + match is_asc { + Some(true) => ReadOrder::Asc, + Some(false) => ReadOrder::Desc, + None => ReadOrder::None, + } + } + + #[inline] + pub fn is_out_of_order(&self) -> bool { + matches!(self, ReadOrder::None) + } + + #[inline] + pub fn is_in_order(&self) -> bool { + !self.is_out_of_order() + } + + #[inline] + pub fn is_in_desc_order(&self) -> bool { + matches!(self, ReadOrder::Desc) + } +} + +#[derive(Debug)] +pub struct ReadRequest { + /// Read request id. + pub request_id: RequestId, + /// Read options. + pub opts: ReadOptions, + /// The schema and projection for read, the output data should match this + /// schema. + pub projected_schema: ProjectedSchema, + /// Predicate of the query. + pub predicate: PredicateRef, + /// Read the rows in reverse order. + pub order: ReadOrder, +} + +#[derive(Debug)] +pub struct AlterSchemaRequest { + /// The new schema. + pub schema: Schema, + /// Previous schema version before alteration. + pub pre_schema_version: Version, +} + +#[derive(Debug)] +pub struct FlushRequest { + /// Trigger a compaction after flush, default is true. + pub compact_after_flush: bool, + /// Whether to wait flush task finishes, default is true. + pub sync: bool, +} + +impl Default for FlushRequest { + fn default() -> Self { + Self { + compact_after_flush: true, + sync: true, + } + } +} + +/// Table abstraction +/// +/// We do not let Table trait extends datafusion's TableProvider, since +/// that will tie out abstraction with datafusion. However, we still use +/// datafusion's RecordBatchStream trait. +#[async_trait] +pub trait Table: std::fmt::Debug { + /// Returns table name. + fn name(&self) -> &str; + + /// Returns the id of this table. + fn id(&self) -> TableId; + + /// Schema of this table. + fn schema(&self) -> Schema; + + /// Options of this table. + fn options(&self) -> HashMap; + + /// Engine type of this table. + fn engine_type(&self) -> &str; + + /// Get table's statistics. + fn stats(&self) -> TableStats; + + /// Write to table. + async fn write(&self, request: WriteRequest) -> Result; + + /// Read from table. + async fn read(&self, request: ReadRequest) -> Result; + + /// Get the specific row according to the primary key. + /// TODO(xikai): object-safety is not ensured by now if the default + /// implementation is provided. Actually it is better to use the read + /// method to implement the get method. + async fn get(&self, request: GetRequest) -> Result>; + + /// Read multiple partition of the table in parallel. + async fn partitioned_read(&self, request: ReadRequest) -> Result; + + /// Alter table schema to the schema specific in [AlterSchemaRequest] if + /// the `pre_schema_version` is equal to current schema version. + /// + /// Returns the affected rows (always 1). + async fn alter_schema(&self, request: AlterSchemaRequest) -> Result; + + /// Alter table options. + /// + /// Returns the affected rows (always 1). + async fn alter_options(&self, options: HashMap) -> Result; + + /// Flush this table. + async fn flush(&self, request: FlushRequest) -> Result<()>; + + /// Compact this table and wait until compaction completes. + async fn compact(&self) -> Result<()>; +} + +/// Basic statistics of table. 
+#[derive(Debug, Clone, Copy, Default)] +pub struct TableStats { + /// Total write request + pub num_write: u64, + /// Total read request + pub num_read: u64, + /// Total flush request + pub num_flush: u64, +} + +/// A reference-counted pointer to Table +pub type TableRef = Arc; + +/// Helper to generate a schema id. +pub struct SchemaIdGenerator { + last_schema_id: AtomicU32, +} + +impl SchemaIdGenerator { + pub fn last_schema_id_u32(&self) -> u32 { + self.last_schema_id.load(Ordering::Relaxed) + } + + pub fn set_last_schema_id(&self, last_schema_id: SchemaId) { + self.last_schema_id + .store(last_schema_id.as_u32(), Ordering::Relaxed); + } + + pub fn alloc_schema_id(&self) -> Option { + let last = self.last_schema_id.fetch_add(1, Ordering::Relaxed); + + SchemaId::new(last + 1) + } +} + +impl Default for SchemaIdGenerator { + fn default() -> Self { + Self { + last_schema_id: AtomicU32::new(SchemaId::MIN.as_u32()), + } + } +} + +/// Helper to generate a table sequence. +pub struct TableSeqGenerator { + last_table_seq: AtomicU64, +} + +impl TableSeqGenerator { + pub fn last_table_seq_u64(&self) -> u64 { + self.last_table_seq.load(Ordering::Relaxed) + } + + pub fn set_last_table_seq(&self, last_table_seq: TableSeq) { + self.last_table_seq + .store(last_table_seq.as_u64(), Ordering::Relaxed); + } + + pub fn alloc_table_seq(&self) -> Option { + let last = self.last_table_seq.fetch_add(1, Ordering::Relaxed); + + TableSeq::new(last + 1) + } +} + +impl Default for TableSeqGenerator { + fn default() -> Self { + Self { + last_table_seq: AtomicU64::new(TableSeq::MIN.as_u64()), + } + } +} + +/// Create table request in catalog +#[derive(Debug, Clone)] +pub struct TableInfo { + /// Catalog name + pub catalog_name: String, + /// Schema name + pub schema_name: String, + /// Table id + pub table_id: TableId, + /// Table name + pub table_name: String, + /// Table engine type + pub engine: String, + /// Tells state of the table + pub state: TableState, +} + +#[derive(Debug, Snafu)] +pub struct TryFromTableEntryError(common_types::schema::Error); + +impl TryFrom for TableInfo { + type Error = TryFromTableEntryError; + + fn try_from(entry: TableEntry) -> std::result::Result { + Ok(Self { + catalog_name: entry.catalog_name, + schema_name: entry.schema_name, + table_id: entry.table_id.into(), + table_name: entry.table_name, + engine: entry.engine, + state: TableState::from(entry.state), + }) + } +} + +impl From for TableEntry { + fn from(table_info: TableInfo) -> Self { + let mut entry = TableEntry::new(); + entry.set_catalog_name(table_info.catalog_name); + entry.set_schema_name(table_info.schema_name); + entry.set_table_id(table_info.table_id.as_u64()); + entry.set_table_name(table_info.table_name); + entry.set_engine(table_info.engine); + entry.set_state(TableStatePb::from(table_info.state)); + + entry + } +} + +impl TableInfo { + // TODO(chunshao.rcs): refactor + pub fn into_pb(self, typ: TableRequestType) -> TableEntry { + let mut table_entry: TableEntry = self.into(); + match typ { + TableRequestType::Create => table_entry.set_created_time(Timestamp::now().as_i64()), + TableRequestType::Drop => table_entry.set_modified_time(Timestamp::now().as_i64()), + } + table_entry + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_schema_id() { + assert_eq!(0, SchemaId::MIN.as_u32()); + assert_eq!(0xffffff, SchemaId::MAX.as_u32()); + } + + #[test] + fn test_table_seq() { + assert_eq!(0, TableSeq::MIN.as_u64()); + assert_eq!(0xffffffffff, TableSeq::MAX.as_u64()); + } +} diff --git 
a/udf/Cargo.toml b/udf/Cargo.toml new file mode 100644 index 0000000000..a4895e787d --- /dev/null +++ b/udf/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "udf" +version = "0.1.0" +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +arrow_deps = { path = "../arrow_deps" } +base64 = "0.13" +chrono = "0.4" +common_types = { path = "../common_types" } +common_util = { path = "../common_util" } +hyperloglog = { path = "../components/rust-hyperloglog" } +smallvec = "1.6" +snafu = { version ="0.6.10", features = ["backtraces"]} diff --git a/udf/src/aggregate.rs b/udf/src/aggregate.rs new file mode 100644 index 0000000000..45fa24b73b --- /dev/null +++ b/udf/src/aggregate.rs @@ -0,0 +1,164 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Aggregate functions. + +use std::{fmt, ops::Deref}; + +use arrow_deps::{ + arrow::array::ArrayRef as DfArrayRef, + datafusion::{ + error::{DataFusionError, Result as DfResult}, + physical_plan::Accumulator as DfAccumulator, + scalar::ScalarValue as DfScalarValue, + }, +}; +use common_util::define_result; +use snafu::Snafu; + +use crate::functions::{ScalarValue, ScalarValueRef}; + +#[derive(Debug, Snafu)] +#[snafu(visibility(pub(crate)))] +pub enum Error { + #[snafu(display("Failed to get state, err:{}", source))] + GetState { + source: Box, + }, + + #[snafu(display("Failed to merge state, err:{}", source))] + MergeState { + source: Box, + }, +} + +define_result!(Error); + +pub struct State(Vec); + +impl State { + fn into_df_scalar_values(self) -> Vec { + self.0 + } +} + +impl From for State { + fn from(value: ScalarValue) -> Self { + Self(vec![value.into_df_scalar_value()]) + } +} + +pub struct Input<'a>(&'a [DfScalarValue]); + +impl<'a> Input<'a> { + pub fn iter(&self) -> impl Iterator { + self.0.iter().map(ScalarValueRef::from) + } + + pub fn len(&self) -> usize { + self.0.len() + } + + pub fn is_empty(&self) -> bool { + self.0.is_empty() + } + + pub fn value(&self, index: usize) -> ScalarValueRef { + ScalarValueRef::from(&self.0[index]) + } +} + +pub struct StateRef<'a>(Input<'a>); + +impl<'a> Deref for StateRef<'a> { + type Target = Input<'a>; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +/// An accumulator represents a stateful object that lives throughout the +/// evaluation of multiple rows and generically accumulates values. +/// +/// An accumulator knows how to: +/// * update its state from inputs via `update` +/// * convert its internal state to a vector of scalar values +/// * update its state from multiple accumulators' states via `merge` +/// * compute the final value from its internal state via `evaluate` +pub trait Accumulator: Send + Sync + fmt::Debug { + /// Returns the state of the accumulator at the end of the accumulation. + // in the case of an average on which we track `sum` and `n`, this function + // should return a vector of two values, sum and n. + fn state(&self) -> Result; + + /// updates the accumulator's state from a vector of scalars. + fn update(&mut self, values: Input) -> Result<()>; + + /// updates the accumulator's state from a vector of scalars. + fn merge(&mut self, states: StateRef) -> Result<()>; + + /// returns its value based on its current state. 
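+ /// For the average example above, this would return `sum / n`.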
+ fn evaluate(&self) -> Result; +} + +#[derive(Debug)] +pub struct ToDfAccumulator { + accumulator: T, +} + +impl ToDfAccumulator { + pub fn new(accumulator: T) -> Self { + Self { accumulator } + } +} + +impl DfAccumulator for ToDfAccumulator { + fn state(&self) -> DfResult> { + let state = self.accumulator.state().map_err(|e| { + DataFusionError::Execution(format!("Accumulator failed to get state, err:{}", e)) + })?; + Ok(state.into_df_scalar_values()) + } + + fn update_batch(&mut self, values: &[DfArrayRef]) -> DfResult<()> { + if values.is_empty() { + return Ok(()); + }; + (0..values[0].len()).try_for_each(|index| { + let v = values + .iter() + .map(|array| DfScalarValue::try_from_array(array, index)) + .collect::>>()?; + let input = Input(&v); + + self.accumulator.update(input).map_err(|e| { + DataFusionError::Execution(format!("Accumulator failed to update, err:{}", e)) + }) + }) + } + + fn merge_batch(&mut self, states: &[DfArrayRef]) -> DfResult<()> { + if states.is_empty() { + return Ok(()); + }; + (0..states[0].len()).try_for_each(|index| { + let v = states + .iter() + .map(|array| DfScalarValue::try_from_array(array, index)) + .collect::>>()?; + let state_ref = StateRef(Input(&v)); + + self.accumulator.merge(state_ref).map_err(|e| { + DataFusionError::Execution(format!("Accumulator failed to merge, err:{}", e)) + }) + }) + } + + fn evaluate(&self) -> DfResult { + let value = self.accumulator.evaluate().map_err(|e| { + DataFusionError::Execution(format!("Accumulator failed to evaluate, err:{}", e)) + })?; + + Ok(value.into_df_scalar_value()) + } +} diff --git a/udf/src/functions.rs b/udf/src/functions.rs new file mode 100644 index 0000000000..6fcd2df4be --- /dev/null +++ b/udf/src/functions.rs @@ -0,0 +1,326 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Functions. + +use std::{ + hash::{Hash, Hasher}, + sync::Arc, +}; + +use arrow_deps::{ + arrow::datatypes::DataType, + datafusion::{ + error::DataFusionError, + physical_plan::{ + aggregates::{AccumulatorFunctionImplementation, StateTypeFunction}, + functions::{ + ReturnTypeFunction, ScalarFunctionImplementation, Signature as DfSignature, + TypeSignature as DfTypeSignature, Volatility, + }, + ColumnarValue as DfColumnarValue, + }, + scalar::ScalarValue as DfScalarValue, + }, +}; +use common_types::{column::ColumnBlock, datum::DatumKind}; +use common_util::define_result; +use smallvec::SmallVec; +use snafu::{ResultExt, Snafu}; + +use crate::aggregate::{Accumulator, ToDfAccumulator}; + +// Most functions have no more than 5 args. +const FUNC_ARG_NUM: usize = 5; + +#[derive(Debug, Snafu)] +#[snafu(visibility(pub(crate)))] +pub enum Error { + #[snafu(display("Failed to convert array to ColumnarValue, err:{}", source))] + InvalidArray { source: common_types::column::Error }, + + #[snafu(display("Invalid function arguments, err:{}", source))] + InvalidArguments { + source: Box, + }, + + #[snafu(display("Failed to execute function, err:{}", source))] + CallFunction { + source: Box, + }, +} + +define_result!(Error); + +/// A dynamically typed, nullable single value. +// TODO(yingwen): Can we use Datum? 
+#[derive(Debug)] +pub struct ScalarValue(DfScalarValue); + +impl ScalarValue { + pub(crate) fn into_df_scalar_value(self) -> DfScalarValue { + self.0 + } + + fn from_df_scalar_value(df_scalar: &DfScalarValue) -> Self { + Self(df_scalar.clone()) + } + + pub fn as_str(&self) -> Option<&str> { + match &self.0 { + DfScalarValue::Utf8(value_opt) => value_opt.as_ref().map(|v| v.as_str()), + _ => None, + } + } +} + +impl From for ScalarValue { + fn from(value: String) -> Self { + Self(DfScalarValue::Utf8(Some(value))) + } +} + +impl From for ScalarValue { + fn from(value: u64) -> Self { + Self(value.into()) + } +} + +pub struct ScalarValueRef<'a>(&'a DfScalarValue); + +impl<'a> ScalarValueRef<'a> { + pub fn as_str(&self) -> Option<&str> { + match self.0 { + DfScalarValue::Utf8(value_opt) | DfScalarValue::LargeUtf8(value_opt) => { + value_opt.as_ref().map(|v| v.as_str()) + } + _ => None, + } + } +} + +impl<'a> From<&'a DfScalarValue> for ScalarValueRef<'a> { + fn from(value: &DfScalarValue) -> ScalarValueRef { + ScalarValueRef(value) + } +} + +impl<'a> Hash for ScalarValueRef<'a> { + fn hash(&self, state: &mut H) { + self.0.hash(state) + } +} + +/// Represent a value of function result. +#[derive(Debug)] +pub enum ColumnarValue { + /// Array of values. + Array(ColumnBlock), + /// A single value. + Scalar(ScalarValue), +} + +impl ColumnarValue { + fn into_df_columnar_value(self) -> DfColumnarValue { + match self { + ColumnarValue::Array(v) => DfColumnarValue::Array(v.to_arrow_array_ref()), + ColumnarValue::Scalar(v) => DfColumnarValue::Scalar(v.into_df_scalar_value()), + } + } + + fn try_from_df_columnar_value(df_value: &DfColumnarValue) -> Result { + let columnar_value = match df_value { + DfColumnarValue::Array(array) => { + let column_block = + ColumnBlock::try_cast_arrow_array_ref(array).context(InvalidArray)?; + ColumnarValue::Array(column_block) + } + DfColumnarValue::Scalar(v) => { + ColumnarValue::Scalar(ScalarValue::from_df_scalar_value(v)) + } + }; + + Ok(columnar_value) + } +} + +/// A function's TypeSignature. +#[derive(Debug)] +pub enum TypeSignature { + /// exact number of arguments of an exact type + Exact(Vec), + /// fixed number of arguments of an arbitrary but equal type out of a list + /// of valid types + // A function of one argument of double is `Uniform(1, vec![DatumKind::Double])` + // A function of one argument of double or uint64 is `Uniform(1, vec![DatumKind::Double, + // DatumKind::UInt64])` + Uniform(usize, Vec), + /// One of a list of signatures + OneOf(Vec), +} + +impl TypeSignature { + pub(crate) fn to_datafusion_signature(&self) -> DfSignature { + DfSignature::new(self.to_datafusion_type_signature(), Volatility::Immutable) + } + + fn to_datafusion_type_signature(&self) -> DfTypeSignature { + match self { + TypeSignature::Exact(kinds) => { + let data_types = kinds.iter().map(|v| DataType::from(*v)).collect(); + DfTypeSignature::Exact(data_types) + } + TypeSignature::Uniform(num, kinds) => { + let data_types = kinds.iter().map(|v| DataType::from(*v)).collect(); + DfTypeSignature::Uniform(*num, data_types) + } + TypeSignature::OneOf(sigs) => { + let df_sigs = sigs + .iter() + .map(|v| v.to_datafusion_type_signature()) + .collect(); + DfTypeSignature::OneOf(df_sigs) + } + } + } +} + +/// A scalar function's return type. 
+#[derive(Debug)] +pub struct ReturnType { + kind: DatumKind, +} + +impl ReturnType { + pub(crate) fn to_datafusion_return_type(&self) -> ReturnTypeFunction { + let data_type = Arc::new(DataType::from(self.kind)); + Arc::new(move |_| Ok(data_type.clone())) + } +} + +pub struct ScalarFunction { + signature: TypeSignature, + return_type: ReturnType, + df_scalar_fn: ScalarFunctionImplementation, +} + +impl ScalarFunction { + pub fn make_by_fn(signature: TypeSignature, return_type: DatumKind, func: F) -> Self + where + F: Fn(&[ColumnarValue]) -> Result + Send + Sync + 'static, + { + let return_type = ReturnType { kind: return_type }; + + // Adapter to map func to Fn(&[DfColumnarValue]) -> Result + let df_adapter = move |df_args: &[DfColumnarValue]| { + // Convert df_args from DfColumnarValue to ColumnarValue. + let mut values: SmallVec<[ColumnarValue; FUNC_ARG_NUM]> = + SmallVec::with_capacity(df_args.len()); + for df_arg in df_args { + let value = ColumnarValue::try_from_df_columnar_value(df_arg).map_err(|e| { + DataFusionError::Internal(format!( + "Failed to convert datafusion columnar value, err:{}", + e + )) + })?; + values.push(value); + } + + // Execute our function. + let result_value = func(&values).map_err(|e| { + DataFusionError::Execution(format!("Failed to execute function, err:{}", e)) + })?; + + // Convert the result value to DfColumnarValue. + Ok(result_value.into_df_columnar_value()) + }; + + let df_scalar_fn = Arc::new(df_adapter); + + Self { + signature, + return_type, + df_scalar_fn, + } + } + + #[inline] + pub fn signature(&self) -> &TypeSignature { + &self.signature + } + + #[inline] + pub fn return_type(&self) -> &ReturnType { + &self.return_type + } + + #[inline] + pub(crate) fn to_datafusion_function(&self) -> ScalarFunctionImplementation { + self.df_scalar_fn.clone() + } +} + +pub struct AggregateFunction { + type_signature: TypeSignature, + return_type: ReturnType, + df_accumulator: AccumulatorFunctionImplementation, + state_type: Vec, +} + +impl AggregateFunction { + pub fn make_by_fn( + type_signature: TypeSignature, + return_type: DatumKind, + state_type: Vec, + accumulator_fn: F, + ) -> Self + where + F: Fn() -> Result + Send + Sync + 'static, + A: Accumulator + 'static, + { + // Create accumulator. + let df_adapter = move || { + let accumulator = accumulator_fn().map_err(|e| { + DataFusionError::Execution(format!("Failed to create accumulator, err:{}", e)) + })?; + let accumulator = Box::new(ToDfAccumulator::new(accumulator)); + + Ok(accumulator as _) + }; + let df_accumulator = Arc::new(df_adapter); + + // Create return type. + let return_type = ReturnType { kind: return_type }; + + Self { + type_signature, + return_type, + df_accumulator, + state_type, + } + } + + #[inline] + pub fn signature(&self) -> &TypeSignature { + &self.type_signature + } + + #[inline] + pub fn return_type(&self) -> &ReturnType { + &self.return_type + } + + #[inline] + pub(crate) fn to_datafusion_accumulator(&self) -> AccumulatorFunctionImplementation { + self.df_accumulator.clone() + } + + pub(crate) fn to_datafusion_state_type(&self) -> StateTypeFunction { + let data_types = Arc::new( + self.state_type + .iter() + .map(|kind| DataType::from(*kind)) + .collect::>(), + ); + Arc::new(move |_| Ok(data_types.clone())) + } +} diff --git a/udf/src/lib.rs b/udf/src/lib.rs new file mode 100644 index 0000000000..36d5f32fdf --- /dev/null +++ b/udf/src/lib.rs @@ -0,0 +1,10 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! UDF support. 
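For illustration only (not part of this patch), a minimal sketch of a scalar UDF built with the `ScalarFunction::make_by_fn` adapter above and wrapped via `ScalarUdf::create`; the `answer` function, its name and its constant result are hypothetical:

```rust
use common_types::datum::DatumKind;
use udf::{
    functions::{ColumnarValue, ScalarFunction, ScalarValue, TypeSignature},
    scalar::ScalarUdf,
};

/// Build a scalar UDF that takes one uint64 argument and always returns 42.
fn new_answer_udf() -> ScalarUdf {
    let signature = TypeSignature::Exact(vec![DatumKind::UInt64]);

    // The closure receives the evaluated arguments as `ColumnarValue`s and
    // must return one `ColumnarValue`; this one ignores its input.
    let func = |_args: &[ColumnarValue]| Ok(ColumnarValue::Scalar(ScalarValue::from(42u64)));

    let scalar_function = ScalarFunction::make_by_fn(signature, DatumKind::UInt64, func);
    ScalarUdf::create("answer", scalar_function)
}
```

The adapter hides all datafusion-specific conversion, so the closure only deals with the crate's own `ColumnarValue` and `ScalarValue` types.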
+ +pub mod aggregate; +pub mod functions; +pub mod registry; +pub mod scalar; +pub mod udaf; +pub mod udfs; diff --git a/udf/src/registry.rs b/udf/src/registry.rs new file mode 100644 index 0000000000..34e0af7051 --- /dev/null +++ b/udf/src/registry.rs @@ -0,0 +1,92 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Function registry. + +use std::{collections::HashMap, sync::Arc}; + +use common_util::define_result; +use snafu::{ensure, Backtrace, Snafu}; + +use crate::{scalar::ScalarUdf, udaf::AggregateUdf, udfs}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Udf already exists, name:{}.\nBacktrace:\n{}", name, backtrace))] + UdfExists { name: String, backtrace: Backtrace }, +} + +define_result!(Error); + +/// A registry knows how to build logical expressions out of user-defined +/// function' names +pub trait FunctionRegistry { + fn register_udf(&mut self, udf: ScalarUdf) -> Result<()>; + + fn register_udaf(&mut self, udaf: AggregateUdf) -> Result<()>; + + fn find_udf(&self, name: &str) -> Result>; + + fn find_udaf(&self, name: &str) -> Result>; + + fn list_udfs(&self) -> Result>; +} + +/// Default function registry. +#[derive(Debug, Default)] +pub struct FunctionRegistryImpl { + scalar_functions: HashMap, + aggregate_functions: HashMap, +} + +impl FunctionRegistryImpl { + pub fn new() -> Self { + Self::default() + } + + /// Load all provided udfs. + pub fn load_functions(&mut self) -> Result<()> { + udfs::register_all_udfs(self) + } +} + +impl FunctionRegistry for FunctionRegistryImpl { + fn register_udf(&mut self, udf: ScalarUdf) -> Result<()> { + ensure!( + !self.scalar_functions.contains_key(udf.name()), + UdfExists { name: udf.name() } + ); + + self.scalar_functions.insert(udf.name().to_string(), udf); + + Ok(()) + } + + fn register_udaf(&mut self, udaf: AggregateUdf) -> Result<()> { + ensure!( + !self.aggregate_functions.contains_key(udaf.name()), + UdfExists { name: udaf.name() } + ); + + self.aggregate_functions + .insert(udaf.name().to_string(), udaf); + + Ok(()) + } + + fn find_udf(&self, name: &str) -> Result> { + let udf = self.scalar_functions.get(name).cloned(); + Ok(udf) + } + + fn find_udaf(&self, name: &str) -> Result> { + let udaf = self.aggregate_functions.get(name).cloned(); + Ok(udaf) + } + + fn list_udfs(&self) -> Result> { + let udfs = self.scalar_functions.values().cloned().collect(); + Ok(udfs) + } +} + +pub type FunctionRegistryRef = Arc; diff --git a/udf/src/scalar.rs b/udf/src/scalar.rs new file mode 100644 index 0000000000..2ce056c3f3 --- /dev/null +++ b/udf/src/scalar.rs @@ -0,0 +1,39 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Scalar udfs. + +use std::sync::Arc; + +use arrow_deps::datafusion::physical_plan::udf::ScalarUDF; + +use crate::functions::ScalarFunction; + +/// Logical representation of a UDF. +#[derive(Debug, Clone)] +pub struct ScalarUdf { + /// DataFusion UDF. 
+ df_udf: Arc, +} + +impl ScalarUdf { + pub fn create(name: &str, func: ScalarFunction) -> Self { + let signature = func.signature().to_datafusion_signature(); + let return_type = func.return_type().to_datafusion_return_type(); + let scalar_fn = func.to_datafusion_function(); + + let df_udf = Arc::new(ScalarUDF::new(name, &signature, &return_type, &scalar_fn)); + + Self { df_udf } + } + + #[inline] + pub fn name(&self) -> &str { + &self.df_udf.name + } + + /// Convert into datafusion's udf + #[inline] + pub fn to_datafusion_udf(&self) -> Arc { + self.df_udf.clone() + } +} diff --git a/udf/src/udaf.rs b/udf/src/udaf.rs new file mode 100644 index 0000000000..06f8983460 --- /dev/null +++ b/udf/src/udaf.rs @@ -0,0 +1,45 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! UDAF support. + +use std::sync::Arc; + +use arrow_deps::datafusion::physical_plan::udaf::AggregateUDF; + +use crate::functions::AggregateFunction; + +/// Logical representation of a UDAF. +#[derive(Debug, Clone)] +pub struct AggregateUdf { + /// DataFusion UDAF. + df_udaf: Arc, +} + +impl AggregateUdf { + pub fn create(name: &str, func: AggregateFunction) -> Self { + let signature = func.signature().to_datafusion_signature(); + let return_type = func.return_type().to_datafusion_return_type(); + let accumulator = func.to_datafusion_accumulator(); + let state_type = func.to_datafusion_state_type(); + + let df_udaf = Arc::new(AggregateUDF::new( + name, + &signature, + &return_type, + &accumulator, + &state_type, + )); + + Self { df_udaf } + } + + #[inline] + pub fn name(&self) -> &str { + &self.df_udaf.name + } + + #[inline] + pub fn to_datafusion_udaf(&self) -> Arc { + self.df_udaf.clone() + } +} diff --git a/udf/src/udfs/mod.rs b/udf/src/udfs/mod.rs new file mode 100644 index 0000000000..5d64edf237 --- /dev/null +++ b/udf/src/udfs/mod.rs @@ -0,0 +1,16 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! UDFs + +use crate::registry::{FunctionRegistry, Result}; + +mod thetasketch_distinct; +mod time_bucket; + +pub fn register_all_udfs(registry: &mut dyn FunctionRegistry) -> Result<()> { + // Register all udfs + time_bucket::register_to_registry(registry)?; + thetasketch_distinct::register_to_registry(registry)?; + + Ok(()) +} diff --git a/udf/src/udfs/thetasketch_distinct.rs b/udf/src/udfs/thetasketch_distinct.rs new file mode 100644 index 0000000000..90ef3aefa5 --- /dev/null +++ b/udf/src/udfs/thetasketch_distinct.rs @@ -0,0 +1,166 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! thetasketch_distinct() udaf. 
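As a usage sketch (illustrative, not part of this patch), the registry and the UDF wrappers above fit together like this when preparing functions for the planner; the `build_registry` helper is hypothetical:

```rust
use udf::registry::{FunctionRegistry, FunctionRegistryImpl};

fn build_registry() -> udf::registry::Result<FunctionRegistryImpl> {
    let mut registry = FunctionRegistryImpl::new();
    // Register the built-in UDFs defined below (time_bucket, thetasketch_distinct).
    registry.load_functions()?;

    // Functions are looked up by name; both methods return Ok(None) for an
    // unknown name.
    if let Some(udf) = registry.find_udf("time_bucket")? {
        let _df_udf = udf.to_datafusion_udf();
    }
    if let Some(udaf) = registry.find_udaf("thetasketch_distinct")? {
        let _df_udaf = udaf.to_datafusion_udaf();
    }

    Ok(registry)
}
```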
+ +use std::{fmt, mem}; + +use common_types::datum::DatumKind; +use common_util::define_result; +use hyperloglog::HyperLogLog; +use snafu::{ensure, OptionExt, ResultExt, Snafu}; + +use crate::{ + aggregate::{self, Accumulator, GetState, Input, MergeState, State, StateRef}, + functions::{AggregateFunction, ScalarValue, TypeSignature}, + registry::{self, FunctionRegistry}, + udaf::AggregateUdf, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Invalid argument number."))] + InvalidArgNum, + + #[snafu(display("Invalid state len."))] + InvalidStateLen, + + #[snafu(display("Invalid state, state is not string."))] + StateNotString, + + #[snafu(display("Failed to decode base64 of hll, err:{}.", source))] + DecodeBase64 { source: base64::DecodeError }, + + #[snafu(display("Invalid state, failed to decode hll, err:{}.", source))] + DecodeHll { source: hyperloglog::Error }, +} + +define_result!(Error); + +const HLL_ERROR_RATE: f64 = 0.01; +// Hll seeds: +const HLL_KEY0: u64 = 0; +const HLL_KEY1: u64 = 0; + +pub fn register_to_registry(registry: &mut dyn FunctionRegistry) -> registry::Result<()> { + registry.register_udaf(new_udaf()) +} + +fn new_udaf() -> AggregateUdf { + let aggregate_function = new_function(); + + AggregateUdf::create("thetasketch_distinct", aggregate_function) +} + +pub(crate) fn new_function() -> AggregateFunction { + // Aways use the same hasher with same keys. + let hll = HyperLogLog::new_with_keys(HLL_ERROR_RATE, HLL_KEY0, HLL_KEY1); + + let accumulator_fn = move || { + let distinct = HllDistinct { + hll: HyperLogLog::new_from_template(&hll), + }; + + Ok(distinct) + }; + + let type_signature = make_type_signature(); + let state_type = make_state_type(); + + AggregateFunction::make_by_fn( + type_signature, + DatumKind::UInt64, + state_type, + accumulator_fn, + ) +} + +fn make_type_signature() -> TypeSignature { + TypeSignature::Uniform( + 1, + vec![ + DatumKind::Timestamp, + DatumKind::Double, + DatumKind::Varbinary, + DatumKind::String, + DatumKind::UInt64, + ], + ) +} + +fn make_state_type() -> Vec { + vec![DatumKind::String] +} + +/// Distinct counter based on HyperLogLog. +/// +/// The HyperLogLogs must be initialized with same hash seeds (new from same +/// template). +struct HllDistinct { + hll: HyperLogLog, +} + +// TODO(yingwen): Avoid base64 encode/decode if datafusion supports converting +// binary datatype to scalarvalue. +impl HllDistinct { + fn merge_impl(&mut self, states: StateRef) -> Result<()> { + // The states are serialize from hll. + ensure!(states.len() == 1, InvalidStateLen); + let value_ref = states.value(0); + let hll_string = value_ref.as_str().context(StateNotString)?; + let hll_bytes = base64::decode(hll_string).context(DecodeBase64)?; + let mut buf = &hll_bytes[..]; + // Try to deserialize the hll. + let hll = HyperLogLog::read_from_buf(&mut buf).context(DecodeHll)?; + + // Merge the hll, note that the two hlls must created or serialized from the + // same template hll. + self.hll.merge(&hll); + + Ok(()) + } +} + +impl fmt::Debug for HllDistinct { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("HllDistinct") + .field("len", &self.hll.len()) + .finish() + } +} + +impl Accumulator for HllDistinct { + fn state(&self) -> aggregate::Result { + // Serialize `self.hll` to bytes. 
+ let mut buf = Vec::with_capacity(mem::size_of::()); + self.hll + .write_to_buf(&mut buf) + .map_err(|e| Box::new(e) as _) + .context(GetState)?; + // HACK: DataFusion does not support creating a scalar from binary, so we need + // to use base64 to convert a binary into string. + let hll_string = base64::encode(buf); + + Ok(State::from(ScalarValue::from(hll_string))) + } + + fn update(&mut self, values: Input) -> aggregate::Result<()> { + for value_ref in values.iter() { + // Insert value into hll. + self.hll.insert(&value_ref); + } + + Ok(()) + } + + fn merge(&mut self, states: StateRef) -> aggregate::Result<()> { + self.merge_impl(states) + .map_err(|e| Box::new(e) as _) + .context(MergeState) + } + + fn evaluate(&self) -> aggregate::Result { + let count = self.hll.len() as u64; + + Ok(ScalarValue::from(count)) + } +} diff --git a/udf/src/udfs/time_bucket.rs b/udf/src/udfs/time_bucket.rs new file mode 100644 index 0000000000..40e428ec5a --- /dev/null +++ b/udf/src/udfs/time_bucket.rs @@ -0,0 +1,324 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! time_bucket UDF. + +use std::time::Duration; + +use chrono::{Datelike, FixedOffset, TimeZone}; +use common_types::{ + column::{ColumnBlock, ColumnBlockBuilder, TimestampColumn}, + datum::{Datum, DatumKind}, + time::Timestamp, +}; +use common_util::define_result; +use snafu::{ensure, OptionExt, ResultExt, Snafu}; + +use crate::{ + functions::{CallFunction, ColumnarValue, InvalidArguments, ScalarFunction, TypeSignature}, + registry::{self, FunctionRegistry}, + scalar::ScalarUdf, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Invalid period, period:{}", period))] + InvalidPeriod { period: String }, + + #[snafu(display("Invalid period number, period:{}, err:{}", period, source))] + InvalidPeriodNumber { + period: String, + source: std::num::ParseIntError, + }, + + #[snafu(display("Invalid argument number."))] + InvalidArgNum, + + #[snafu(display("Invalid arguments, require timestamp column."))] + NotTimestampColumn, + + #[snafu(display("Invalid arguments, require period."))] + NotPeriod, + + #[snafu(display("Period of week only support P1W."))] + UnsupportedWeek, + + #[snafu(display("Period of month only support P1M."))] + UnsupportedMonth, + + #[snafu(display("Period of year only support P1Y."))] + UnsupportedYear, + + #[snafu(display( + "Failed to truncate timestamp, timestamp:{}, period:{:?}", + timestamp, + period + ))] + TruncateTimestamp { timestamp: i64, period: Period }, + + #[snafu(display("Failed to build result column, err:{}", source))] + BuildColumn { source: common_types::column::Error }, +} + +define_result!(Error); + +/// Default timezone: +08:00 +const DEFAULT_TIMEZONE_OFFSET_SECS: i32 = 8 * 3600; + +pub fn register_to_registry(registry: &mut dyn FunctionRegistry) -> registry::Result<()> { + registry.register_udf(new_udf()) +} + +fn new_udf() -> ScalarUdf { + // args: + // - timestamp column. + // - period. + // - input timestamp format in PARTITION BY (unsed now). + // - input timezone (ignored now). + // - timestamp output format (ignored now). 
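+ // e.g. (illustrative): `time_bucket(timestamp, 'PT1M')` buckets rows into
+ // 1-minute windows, while `time_bucket(timestamp, 'P1D')` buckets them by day.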
+ let func = |args: &[ColumnarValue]| { + let bucket = TimeBucket::parse_args(args) + .map_err(|e| Box::new(e) as _) + .context(InvalidArguments)?; + + let result_column = bucket + .call() + .map_err(|e| Box::new(e) as _) + .context(CallFunction)?; + + Ok(ColumnarValue::Array(result_column)) + }; + + let signature = make_signature(); + let scalar_function = ScalarFunction::make_by_fn(signature, DatumKind::Timestamp, func); + + ScalarUdf::create("time_bucket", scalar_function) +} + +fn make_signature() -> TypeSignature { + let sigs = vec![ + TypeSignature::Exact(vec![DatumKind::Timestamp, DatumKind::String]), + TypeSignature::Exact(vec![ + DatumKind::Timestamp, + DatumKind::String, + DatumKind::String, + ]), + TypeSignature::Exact(vec![ + DatumKind::Timestamp, + DatumKind::String, + DatumKind::String, + DatumKind::String, + ]), + TypeSignature::Exact(vec![ + DatumKind::Timestamp, + DatumKind::String, + DatumKind::String, + DatumKind::String, + DatumKind::String, + ]), + ]; + TypeSignature::OneOf(sigs) +} + +struct TimeBucket<'a> { + column: &'a TimestampColumn, + period: Period, +} + +impl<'a> TimeBucket<'a> { + fn parse_args(args: &[ColumnarValue]) -> Result { + ensure!(args.len() >= 2, InvalidArgNum); + + let column = match &args[0] { + ColumnarValue::Array(block) => block.as_timestamp().context(NotTimestampColumn)?, + _ => return NotTimestampColumn.fail(), + }; + let period = match &args[1] { + ColumnarValue::Scalar(value) => { + let period_str = value.as_str().context(NotPeriod)?; + Period::parse(period_str)? + } + _ => return NotPeriod.fail(), + }; + + Ok(TimeBucket { column, period }) + } + + fn call(&self) -> Result { + let mut out_column_builder = + ColumnBlockBuilder::with_capacity(&DatumKind::Timestamp, self.column.num_rows()); + for ts_opt in self.column.iter() { + match ts_opt { + Some(ts) => { + let truncated = self.period.truncate(ts).context(TruncateTimestamp { + timestamp: ts, + period: self.period, + })?; + out_column_builder + .append(Datum::Timestamp(truncated)) + .context(BuildColumn)?; + } + None => { + out_column_builder + .append(Datum::Null) + .context(BuildColumn)?; + } + } + } + Ok(out_column_builder.build()) + } +} + +/// A time bucket period. +/// +/// e.g. 
+/// - PT1S +/// - PT1M +/// - PT1H +/// - P1D +/// - P1W +/// - P1M +/// - P1Y +#[derive(Debug, Clone, Copy)] +pub enum Period { + Second(u16), + Minute(u16), + Hour(u16), + Day(u16), + Week, + Month, + Year, +} + +impl Period { + fn parse(period: &str) -> Result { + ensure!(period.len() >= 3, InvalidPeriod { period }); + let is_pt = if period.starts_with("PT") { + true + } else if period.starts_with('P') { + false + } else { + return InvalidPeriod { period }.fail(); + }; + + let back = period.chars().last().context(InvalidPeriod { period })?; + let parsed = if is_pt { + let number = &period[2..period.len() - 1]; + let number = number + .parse::() + .context(InvalidPeriodNumber { period })?; + match back { + 'S' => Period::Second(number), + 'M' => Period::Minute(number), + 'H' => Period::Hour(number), + _ => return InvalidPeriod { period }.fail(), + } + } else { + let number = &period[1..period.len() - 1]; + let number = number + .parse::() + .context(InvalidPeriodNumber { period })?; + match back { + 'D' => Period::Day(number), + 'W' => { + ensure!(number == 1, UnsupportedWeek); + Period::Week + } + 'M' => { + ensure!(number == 1, UnsupportedMonth); + Period::Month + } + 'Y' => { + ensure!(number == 1, UnsupportedYear); + Period::Year + } + _ => return InvalidPeriod { period }.fail(), + } + }; + + Ok(parsed) + } + + fn truncate(&self, ts: Timestamp) -> Option { + const MINUTE_SECONDS: u64 = 60; + const HOUR_SECONDS: u64 = 60 * MINUTE_SECONDS; + + let truncated_ts = match self { + Period::Second(period) => { + let duration = Duration::from_secs(u64::from(*period)); + ts.truncate_by(duration) + } + Period::Minute(period) => { + let duration = Duration::from_secs(u64::from(*period) * MINUTE_SECONDS); + ts.truncate_by(duration) + } + Period::Hour(period) => { + let duration = Duration::from_secs(u64::from(*period) * HOUR_SECONDS); + ts.truncate_by(duration) + } + Period::Day(period) => Self::truncate_day(ts, *period)?, + Period::Week => Self::truncate_week(ts), + Period::Month => Self::truncate_month(ts), + Period::Year => Self::truncate_year(ts), + }; + + Some(truncated_ts) + } + + fn truncate_day(ts: Timestamp, period: u16) -> Option { + let offset = FixedOffset::east(DEFAULT_TIMEZONE_OFFSET_SECS); + // Convert to local time. + let datetime = offset.timestamp_millis(ts.as_i64()); + + // Truncate day + let day = datetime.day(); + let day = day - (day % u32::from(period)); + let truncated_datetime = offset + .ymd(datetime.year(), datetime.month(), day) + .and_hms(0, 0, 0); + let truncated_ts = truncated_datetime.timestamp_millis(); + + Some(Timestamp::new(truncated_ts)) + } + + fn truncate_week(ts: Timestamp) -> Timestamp { + let offset = FixedOffset::east(DEFAULT_TIMEZONE_OFFSET_SECS); + // Convert to local time. + let datetime = offset.timestamp_millis(ts.as_i64()); + + // Truncate week. + let week_offset = datetime.weekday().num_days_from_monday(); + let week_millis = 7 * 24 * 3600 * 1000; + let ts_offset = week_offset * week_millis; + // TODO(yingwen): Impl sub/divide for Timestamp + let week_millis = i64::from(week_millis); + let truncated_ts = (ts.as_i64() - i64::from(ts_offset)) / week_millis * week_millis; + + Timestamp::new(truncated_ts) + } + + fn truncate_month(ts: Timestamp) -> Timestamp { + let offset = FixedOffset::east(DEFAULT_TIMEZONE_OFFSET_SECS); + // Convert to local time. 
+ let datetime = offset.timestamp_millis(ts.as_i64()); + + // Truncate month + let truncated_datetime = offset + .ymd(datetime.year(), datetime.month(), 1) + .and_hms(0, 0, 0); + let truncated_ts = truncated_datetime.timestamp_millis(); + + Timestamp::new(truncated_ts) + } + + fn truncate_year(ts: Timestamp) -> Timestamp { + let offset = FixedOffset::east(DEFAULT_TIMEZONE_OFFSET_SECS); + // Convert to local time. + let datetime = offset.timestamp_millis(ts.as_i64()); + + // Truncate year + let truncated_datetime = offset.ymd(datetime.year(), 1, 1).and_hms(0, 0, 0); + let truncated_ts = truncated_datetime.timestamp_millis(); + + Timestamp::new(truncated_ts) + } +} diff --git a/wal/Cargo.toml b/wal/Cargo.toml new file mode 100644 index 0000000000..574cffa9e2 --- /dev/null +++ b/wal/Cargo.toml @@ -0,0 +1,24 @@ +[package] +name = "wal" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +async-trait = "0.1.41" +common_util = {path = "../common_util"} +common_types = {path = "../common_types"} +log = "0.4" +snafu = { version ="0.6.10", features = ["backtraces"] } +tokio = { version = "1.0", features = ["sync"] } + +[dev-dependencies] +tempfile = "3.1.0" +futures = { version = "0.3", features = ["async-await"] } + +[dependencies.rocksdb] +git = "https://github.com/tikv/rust-rocksdb.git" +branch = "tikv-5.2" +features = ["portable"] diff --git a/wal/src/lib.rs b/wal/src/lib.rs new file mode 100644 index 0000000000..440edb2d1e --- /dev/null +++ b/wal/src/lib.rs @@ -0,0 +1,10 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Write Ahead Log + +pub mod log_batch; +pub mod manager; +pub mod rocks_impl; + +#[cfg(test)] +mod tests; diff --git a/wal/src/log_batch.rs b/wal/src/log_batch.rs new file mode 100644 index 0000000000..7e08c6c10d --- /dev/null +++ b/wal/src/log_batch.rs @@ -0,0 +1,89 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Log entries definition. + +use std::fmt::Debug; + +use common_types::{ + bytes::{MemBuf, MemBufMut}, + SequenceNumber, +}; + +use crate::manager::RegionId; + +pub trait Payload: Send + Sync + Debug { + type Error: std::error::Error + Send + Sync + 'static; + /// Compute size of the encoded payload. + fn encode_size(&self) -> usize; + /// Append the encoded payload to the `buf`. + fn encode_to(&self, buf: &mut B) -> Result<(), Self::Error>; +} + +#[derive(Debug)] +pub struct LogEntry
<P>
{ + pub sequence: SequenceNumber, + pub payload: P, +} + +/// An entry to be written into the Wal. +/// +/// Generally, the `payload` is a lazy encoder whose constraint is +/// `Payload`. `region_id` denotes a logical region; set it to 0 if +/// unnecessary. +#[derive(Debug)] +pub struct LogWriteEntry
<P>
{ + pub payload: P, +} + +/// A batch of `LogWriteEntry`s. +#[derive(Debug)] +pub struct LogWriteBatch
<P: Payload>
{ + pub(crate) region_id: RegionId, + pub(crate) entries: Vec<LogWriteEntry<P>>, +} + +impl<P: Payload> LogWriteBatch
<P>
{ + pub fn new(region_id: RegionId) -> Self { + Self::with_capacity(region_id, 0) + } + + pub fn with_capacity(region_id: RegionId, cap: usize) -> Self { + Self { + region_id, + entries: Vec::with_capacity(cap), + } + } + + #[inline] + pub fn push(&mut self, entry: LogWriteEntry
<P>
) { + self.entries.push(entry) + } + + #[inline] + pub fn len(&self) -> usize { + self.entries.len() + } + + #[inline] + pub fn is_empty(&self) -> bool { + self.entries.is_empty() + } + + #[inline] + pub fn clear(&mut self) { + self.entries.clear() + } +} + +impl<P: Payload> Default for LogWriteBatch
<P>
{ + fn default() -> Self { + Self::new(0) + } +} + +pub trait PayloadDecoder: Send + Sync { + type Error: std::error::Error + Send + Sync + 'static; + type Target: Send + Sync; + /// Decode `Target` from the `bytes`. + fn decode(&self, buf: &mut B) -> Result; +} diff --git a/wal/src/manager.rs b/wal/src/manager.rs new file mode 100644 index 0000000000..4ea8fe97ab --- /dev/null +++ b/wal/src/manager.rs @@ -0,0 +1,237 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! WalManager abstraction + +use std::{fmt, time::Duration}; + +use async_trait::async_trait; +pub use common_types::SequenceNumber; + +use crate::log_batch::{LogEntry, LogWriteBatch, Payload, PayloadDecoder}; + +pub mod error { + use common_util::define_result; + use snafu::{Backtrace, Snafu}; + + use crate::manager::RegionId; + + // Now most error from manage implementation don't have backtrace, so we add + // backtrace here. + #[derive(Debug, Snafu)] + #[snafu(visibility(pub))] + pub enum Error { + #[snafu(display( + "Failed to open wal, path:{}, err:{}.\nBacktrace:\n{}", + wal_path, + source, + backtrace + ))] + Open { + wal_path: String, + source: Box, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to initialize wal, err:{}.\nBacktrace:\n{}", source, backtrace))] + Initialization { + source: Box, + backtrace: Backtrace, + }, + + #[snafu(display( + "Region is not found, region_id:{}.\nBacktrace:\n{}", + region_id, + backtrace + ))] + RegionNotFound { + region_id: RegionId, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to write log entries, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + Write { + source: Box, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to read log entries, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + Read { + source: Box, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to delete log entries, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + Delete { + source: Box, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to encode, err:{}.\nBacktrace:\n{}", source, backtrace))] + Encoding { + source: Box, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to decode, err:{}.\nBacktrace:\n{}", source, backtrace))] + Decoding { + source: Box, + backtrace: Backtrace, + }, + } + + define_result!(Error); +} + +use common_types::{MAX_SEQUENCE_NUMBER, MIN_SEQUENCE_NUMBER}; +pub use error::*; + +pub type RegionId = u64; +pub const MAX_REGION_ID: RegionId = u64::MAX; + +#[derive(Debug, Clone)] +pub struct WriteContext { + /// Timeout to write wal and it only takes effect when writing to a Wal on a + /// remote machine (writing to the local disk does not have timeout). + pub timeout: Duration, +} + +impl Default for WriteContext { + fn default() -> Self { + Self { + timeout: Duration::from_secs(1), + } + } +} + +/// Write abstraction for log entries in Wal. +#[async_trait] +pub trait LogWriter { + /// Write a batch of log entries to log. + /// + /// Returns the max sequence number for the batch of log entries. + async fn write( + &self, + ctx: &WriteContext, + batch: &LogWriteBatch
<P>
, + ) -> Result; +} + +#[derive(Debug, Clone)] +pub struct ReadContext { + /// Timeout to read log entries and it only takes effect when reading from a + /// Wal on a remote machine (reading from the local disk does not have + /// timeout). + pub timeout: Duration, +} + +impl Default for ReadContext { + fn default() -> Self { + Self { + timeout: Duration::from_secs(5), + } + } +} + +#[derive(Debug, Clone, Copy)] +pub enum ReadBoundary { + Max, + Min, + Included(SequenceNumber), + Excluded(SequenceNumber), +} + +impl ReadBoundary { + /// Convert the boundary to start sequence number. + /// + /// Returns `None` if the boundary is `Excluded(MAX_SEQUENCE_NUM)` + pub fn as_start_sequence_number(&self) -> Option { + match *self { + ReadBoundary::Max => Some(MAX_SEQUENCE_NUMBER), + ReadBoundary::Min => Some(MIN_SEQUENCE_NUMBER), + ReadBoundary::Included(n) => Some(n), + ReadBoundary::Excluded(n) => { + if n == MAX_SEQUENCE_NUMBER { + None + } else { + Some(n + 1) + } + } + } + } + + /// Convert the boundary to start sequence number. + /// + /// Returns `None` if the boundary is `Excluded(MIN_SEQUENCE_NUM)` + pub fn as_end_sequence_number(&self) -> Option { + match *self { + ReadBoundary::Max => Some(MAX_SEQUENCE_NUMBER), + ReadBoundary::Min => Some(MIN_SEQUENCE_NUMBER), + ReadBoundary::Included(n) => Some(n), + ReadBoundary::Excluded(n) => { + if n == MIN_SEQUENCE_NUMBER { + None + } else { + Some(n - 1) + } + } + } + } +} + +#[derive(Debug, Clone)] +pub struct ReadRequest { + /// Region id of the wal to read + pub region_id: RegionId, + // TODO(yingwen): Or just rename to ReadBound? + /// Start bound + pub start: ReadBoundary, + /// End bound + pub end: ReadBoundary, +} + +/// Iterator abstraction for log entry. +pub trait LogIterator { + fn next_log_entry( + &mut self, + decoder: &D, + ) -> Result>>; +} + +/// Read abstraction for log entries in the Wal. +pub trait LogReader { + /// Iterator over log entries. + type Iterator: LogIterator + Send; + /// Provide iterator on necessary entries according to `ReadRequest`. + fn read(&self, ctx: &ReadContext, req: &ReadRequest) -> Result; +} + +// TODO(xikai): define Error as associate type. +/// Management of multi-region Wals. +/// +/// Every region has its own increasing (and maybe hallow) sequence number +/// space. +#[async_trait] +pub trait WalManager: LogWriter + LogReader + fmt::Debug { + /// Get current sequence number. + fn sequence_num(&self, region_id: RegionId) -> Result; + + /// Mark the entries whose sequence number is in [0, `sequence_number`] to + /// be deleted in the future. + async fn mark_delete_entries_up_to( + &self, + region_id: RegionId, + sequence_num: SequenceNumber, + ) -> Result<()>; +} diff --git a/wal/src/rocks_impl/encoding.rs b/wal/src/rocks_impl/encoding.rs new file mode 100644 index 0000000000..727b5715f2 --- /dev/null +++ b/wal/src/rocks_impl/encoding.rs @@ -0,0 +1,533 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Encoding for Wal logs + +use common_types::{ + bytes::{self, BytesMut, MemBuf, MemBufMut}, + SequenceNumber, +}; +use common_util::{ + codec::{Decoder, Encoder}, + define_result, +}; +use snafu::{ensure, Backtrace, ResultExt, Snafu}; + +use crate::{ + log_batch::{Payload, PayloadDecoder}, + manager::{self, RegionId}, +}; + +const LOG_KEY_ENCODING_V0: u8 = 0; +const NEWEST_LOG_KEY_ENCODING_VERSION: u8 = LOG_KEY_ENCODING_V0; + +const LOG_VALUE_ENCODING_V0: u8 = 0; +const NEWEST_LOG_VALUE_ENCODING_VERSION: u8 = LOG_VALUE_ENCODING_V0; + +const META_KEY_ENCODING_V0: u8 = 0; +const NEWEST_META_KEY_ENCODING_VERSION: u8 = META_KEY_ENCODING_V0; + +const META_VALUE_ENCODING_V0: u8 = 0; +const NEWEST_META_VALUE_ENCODING_VERSION: u8 = META_VALUE_ENCODING_V0; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to encode log key, err:{}", source))] + EncodeLogKey { + source: bytes::Error, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to encode log value header, err:{}", source))] + EncodeLogValueHeader { source: bytes::Error }, + + #[snafu(display("Failed to encode log value payload, err:{}", source))] + EncodeLogValuePayload { + source: Box, + }, + + #[snafu(display("Failed to decode log key, err:{}", source))] + DecodeLogKey { source: bytes::Error }, + + #[snafu(display("Failed to decode log value header, err:{}", source))] + DecodeLogValueHeader { source: bytes::Error }, + + #[snafu(display("Failed to decode log value payload, err:{}", source))] + DecodeLogValuePayload { + source: Box, + }, + + #[snafu(display("Failed to encode meta key, err:{}", source))] + EncodeMetaKey { + source: bytes::Error, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to encode meta value, err:{}", source))] + EncodeMetaValue { source: bytes::Error }, + + #[snafu(display("Failed to decode meta key, err:{}", source))] + DecodeMetaKey { source: bytes::Error }, + + #[snafu(display("Failed to decode meta value, err:{}", source))] + DecodeMetaValue { source: bytes::Error }, + + #[snafu(display( + "Found invalid meta key type, expect:{:?}, given:{}.\nBacktrace:\n{}", + expect, + given, + backtrace + ))] + InvalidMetaKeyType { + expect: MetaKeyType, + given: u8, + backtrace: Backtrace, + }, + + #[snafu(display( + "Found invalid namespace, expect:{:?}, given:{}.\nBacktrace:\n{}", + expect, + given, + backtrace + ))] + InvalidNamespace { + expect: Namespace, + given: u8, + backtrace: Backtrace, + }, + + #[snafu(display( + "Found invalid version, expect:{}, given:{}.\nBacktrace:\n{}", + expect, + given, + backtrace + ))] + InvalidVersion { + expect: u8, + given: u8, + backtrace: Backtrace, + }, +} + +define_result!(Error); + +#[derive(Debug, Copy, Clone)] +pub enum Namespace { + Meta = 0, + Log = 1, +} + +#[derive(Debug, Clone)] +pub struct LogEncoding { + key_enc: LogKeyEncoder, + value_enc: LogValueEncoder, + // value decoder is created dynamically from the version, + value_enc_version: u8, +} + +impl LogEncoding { + pub fn newest() -> Self { + Self { + key_enc: LogKeyEncoder { + version: NEWEST_LOG_KEY_ENCODING_VERSION, + namespace: Namespace::Log, + }, + value_enc: LogValueEncoder { + version: NEWEST_LOG_VALUE_ENCODING_VERSION, + }, + value_enc_version: NEWEST_LOG_VALUE_ENCODING_VERSION, + } + } + + // Encode [LogKey] into `buf` and caller should knows that the keys are ordered + // by ([RegionId], [SequenceNum]) so the caller can use this method to + // generate min/max key in specific scope(global or in some region). 
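+ // For example, encoding (region_id, MIN_SEQUENCE_NUMBER) and
+ // (region_id, MAX_SEQUENCE_NUMBER) yields the lower/upper key bounds for a
+ // range scan over a single region's log entries.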
+ pub fn encode_key(&self, buf: &mut BytesMut, log_key: &LogKey) -> manager::Result<()> { + buf.clear(); + buf.reserve(self.key_enc.estimate_encoded_size(log_key)); + self.key_enc + .encode(buf, log_key) + .map_err(|e| Box::new(e) as _) + .context(manager::Encoding)?; + + Ok(()) + } + + pub fn encode_value(&self, buf: &mut BytesMut, payload: &impl Payload) -> manager::Result<()> { + buf.clear(); + buf.reserve(self.value_enc.estimate_encoded_size(payload)); + self.value_enc + .encode(buf, payload) + .map_err(|e| Box::new(e) as _) + .context(manager::Encoding) + } + + pub fn is_log_key(&self, mut buf: &[u8]) -> manager::Result { + self.key_enc + .is_valid(&mut buf) + .map_err(|e| Box::new(e) as _) + .context(manager::Decoding) + } + + pub fn decode_key(&self, mut buf: &[u8]) -> manager::Result { + self.key_enc + .decode(&mut buf) + .map_err(|e| Box::new(e) as _) + .context(manager::Decoding) + } + + pub fn decode_value( + &self, + mut buf: &[u8], + decoder: &D, + ) -> manager::Result { + let value_dec = LogValueDecoder { + version: self.value_enc_version, + payload_dec: decoder, + }; + + value_dec + .decode(&mut buf) + .map_err(|e| Box::new(e) as _) + .context(manager::Decoding) + } +} + +pub type LogKey = (RegionId, SequenceNumber); + +#[derive(Debug, Clone)] +struct LogKeyEncoder { + version: u8, + namespace: Namespace, +} + +impl LogKeyEncoder { + /// Determine whether the raw bytes is a log key. + pub fn is_valid(&self, buf: &mut B) -> Result { + let namespace = buf.read_u8().context(DecodeLogKey)?; + Ok(namespace == self.namespace as u8) + } +} + +impl Encoder for LogKeyEncoder { + type Error = Error; + + /// Key format: + /// + /// ```text + /// +---------------+----------------+-------------------+--------------------+ + /// | namespace(u8) | region_id(u64) | sequence_num(u64) | version header(u8) | + /// +---------------+----------------+-------------------+--------------------+ + /// ``` + /// + /// More information can be extended after the incremented `version header`. + fn encode(&self, buf: &mut B, log_key: &LogKey) -> Result<()> { + buf.write_u8(self.namespace as u8).context(EncodeLogKey)?; + buf.write_u64(log_key.0).context(EncodeLogKey)?; + buf.write_u64(log_key.1).context(EncodeLogKey)?; + buf.write_u8(self.version).context(EncodeLogKey)?; + + Ok(()) + } + + fn estimate_encoded_size(&self, _log_key: &LogKey) -> usize { + // Refer to key format. 
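+ // namespace(1) + region_id(8) + sequence_num(8) + version(1).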
+ 1 + 8 + 8 + 1 + } +} + +impl Decoder for LogKeyEncoder { + type Error = Error; + + fn decode(&self, buf: &mut B) -> Result { + // check namespace + let namespace = buf.read_u8().context(DecodeLogKey)?; + ensure!( + namespace == self.namespace as u8, + InvalidNamespace { + expect: self.namespace, + given: namespace + } + ); + + let log_key = ( + buf.read_u64().context(DecodeLogKey)?, + buf.read_u64().context(DecodeLogKey)?, + ); + + // check version + let version = buf.read_u8().context(DecodeLogKey)?; + ensure!( + version == self.version, + InvalidVersion { + expect: self.version, + given: version + } + ); + + Ok(log_key) + } +} + +#[derive(Debug, Clone)] +struct LogValueEncoder { + version: u8, +} + +impl Encoder for LogValueEncoder { + type Error = Error; + + /// Value format: + /// +--------------------+---------+ + /// | version_header(u8) | payload | + /// +--------------------+---------+ + fn encode(&self, buf: &mut B, payload: &T) -> Result<()> { + buf.write_u8(self.version).context(EncodeLogValueHeader)?; + + payload + .encode_to(buf) + .map_err(|e| Box::new(e) as _) + .context(EncodeLogValuePayload) + } + + fn estimate_encoded_size(&self, payload: &T) -> usize { + // Refer to value format. + 1 + payload.encode_size() + } +} + +struct LogValueDecoder<'a, D: PayloadDecoder> { + version: u8, + payload_dec: &'a D, +} + +impl<'a, D: PayloadDecoder> Decoder for LogValueDecoder<'a, D> { + type Error = Error; + + fn decode(&self, buf: &mut B) -> Result { + let version = buf.read_u8().context(DecodeLogValueHeader)?; + ensure!( + version == self.version, + InvalidVersion { + expect: self.version, + given: version + } + ); + + self.payload_dec + .decode(buf) + .map_err(|e| Box::new(e) as _) + .context(DecodeLogValuePayload) + } +} + +#[derive(Clone, Copy, Debug)] +pub enum MetaKeyType { + MaxSeq = 0, +} + +#[derive(Clone, Debug)] +pub struct MetaKeyEncoder { + version: u8, + key_type: MetaKeyType, + namespace: Namespace, +} + +#[derive(Clone, Debug)] +pub struct MetaKey { + pub region_id: RegionId, +} + +impl MetaKeyEncoder { + /// Determine whether the raw bytes is a valid meta key. + pub fn is_valid(&self, buf: &mut B) -> Result { + let namespace = buf.read_u8().context(DecodeMetaKey)?; + let key_type = buf.read_u8().context(DecodeMetaKey)?; + Ok(namespace == self.namespace as u8 && key_type == self.key_type as u8) + } +} + +impl Encoder for MetaKeyEncoder { + type Error = Error; + + /// Key format: + /// + /// ```text + /// +---------------+--------------+----------------+--------------------+ + /// | namespace(u8) | key_type(u8) | region_id(u64) | version header(u8) | + /// +---------------+--------------+----------------+--------------------+ + /// ``` + /// + /// More information can be extended after the incremented `version header`. + fn encode(&self, buf: &mut B, meta_key: &MetaKey) -> Result<()> { + buf.write_u8(self.namespace as u8).context(EncodeMetaKey)?; + buf.write_u8(self.key_type as u8).context(EncodeMetaKey)?; + buf.write_u64(meta_key.region_id).context(EncodeMetaKey)?; + buf.write_u8(self.version).context(EncodeMetaKey)?; + + Ok(()) + } + + fn estimate_encoded_size(&self, _log_key: &MetaKey) -> usize { + // Refer to key format. 
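+        // namespace(u8) + key_type(u8) + region_id(u64) + version header(u8)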
+ 1 + 1 + 8 + 1 + } +} + +impl Decoder for MetaKeyEncoder { + type Error = Error; + + fn decode(&self, buf: &mut B) -> Result { + // check namespace + let namespace = buf.read_u8().context(DecodeMetaKey)?; + ensure!( + namespace == self.namespace as u8, + InvalidNamespace { + expect: self.namespace, + given: namespace + } + ); + + let key_type = buf.read_u8().context(DecodeMetaKey)?; + ensure!( + key_type == self.key_type as u8, + InvalidMetaKeyType { + expect: self.key_type, + given: key_type, + } + ); + + let region_id = buf.read_u64().context(DecodeMetaKey)?; + + // check version + let version = buf.read_u8().context(DecodeMetaKey)?; + ensure!( + version == self.version, + InvalidVersion { + expect: self.version, + given: version + } + ); + + Ok(MetaKey { region_id }) + } +} + +#[derive(Clone, Debug)] +pub struct MaxSeqMetaValue { + pub max_seq: SequenceNumber, +} + +#[derive(Clone, Debug)] +pub struct MaxSeqMetaValueEncoder { + version: u8, +} + +impl Encoder for MaxSeqMetaValueEncoder { + type Error = Error; + + /// Value format: + /// + /// ```text + /// +--------------------+--------------+ + /// | version header(u8) | max_seq(u64) | + /// +--------------------+--------------+ + /// ``` + /// + /// More information can be extended after the incremented `version header`. + fn encode(&self, buf: &mut B, meta_value: &MaxSeqMetaValue) -> Result<()> { + buf.write_u8(self.version).context(EncodeMetaValue)?; + buf.write_u64(meta_value.max_seq).context(EncodeMetaValue)?; + + Ok(()) + } + + fn estimate_encoded_size(&self, _meta_value: &MaxSeqMetaValue) -> usize { + // Refer to value format. + 1 + 8 + } +} + +impl Decoder for MaxSeqMetaValueEncoder { + type Error = Error; + + fn decode(&self, buf: &mut B) -> Result { + // check version + let version = buf.read_u8().context(DecodeMetaValue)?; + ensure!( + version == self.version, + InvalidVersion { + expect: self.version, + given: version + } + ); + + let max_seq = buf.read_u64().context(DecodeMetaValue)?; + Ok(MaxSeqMetaValue { max_seq }) + } +} + +#[derive(Clone, Debug)] +pub struct MaxSeqMetaEncoding { + key_enc: MetaKeyEncoder, + value_enc: MaxSeqMetaValueEncoder, +} + +impl MaxSeqMetaEncoding { + pub fn newest() -> Self { + Self { + key_enc: MetaKeyEncoder { + version: NEWEST_META_KEY_ENCODING_VERSION, + key_type: MetaKeyType::MaxSeq, + namespace: Namespace::Meta, + }, + value_enc: MaxSeqMetaValueEncoder { + version: NEWEST_META_VALUE_ENCODING_VERSION, + }, + } + } + + pub fn is_max_seq_meta_key(&self, mut buf: &[u8]) -> manager::Result { + self.key_enc + .is_valid(&mut buf) + .map_err(|e| Box::new(e) as _) + .context(manager::Decoding) + } + + pub fn encode_key(&self, buf: &mut BytesMut, meta_key: &MetaKey) -> manager::Result<()> { + buf.clear(); + buf.reserve(self.key_enc.estimate_encoded_size(meta_key)); + self.key_enc + .encode(buf, meta_key) + .map_err(|e| Box::new(e) as _) + .context(manager::Encoding)?; + + Ok(()) + } + + pub fn encode_value( + &self, + buf: &mut BytesMut, + meta_value: &MaxSeqMetaValue, + ) -> manager::Result<()> { + buf.clear(); + buf.reserve(self.value_enc.estimate_encoded_size(meta_value)); + self.value_enc + .encode(buf, meta_value) + .map_err(|e| Box::new(e) as _) + .context(manager::Encoding) + } + + pub fn decode_key(&self, mut buf: &[u8]) -> manager::Result { + self.key_enc + .decode(&mut buf) + .map_err(|e| Box::new(e) as _) + .context(manager::Decoding) + } + + pub fn decode_value(&self, mut buf: &[u8]) -> manager::Result { + self.value_enc + .decode(&mut buf) + .map_err(|e| Box::new(e) as _) + 
.context(manager::Decoding) + } +} diff --git a/wal/src/rocks_impl/manager.rs b/wal/src/rocks_impl/manager.rs new file mode 100644 index 0000000000..bdf71eba0e --- /dev/null +++ b/wal/src/rocks_impl/manager.rs @@ -0,0 +1,621 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! WalManager implementation based on RocksDB + +use std::{ + collections::HashMap, + fmt, + fmt::Formatter, + path::PathBuf, + sync::{ + atomic::{AtomicU64, Ordering}, + Arc, RwLock, + }, +}; + +use async_trait::async_trait; +use common_types::{bytes::BytesMut, SequenceNumber, MAX_SEQUENCE_NUMBER, MIN_SEQUENCE_NUMBER}; +use common_util::runtime::Runtime; +use log::{debug, info, warn}; +use rocksdb::{DBIterator, DBOptions, ReadOptions, SeekKey, Writable, WriteBatch, DB}; +use snafu::ResultExt; +use tokio::sync::Mutex; + +use crate::{ + log_batch::{LogEntry, LogWriteBatch, Payload, PayloadDecoder}, + manager::{ + error::*, LogIterator, LogReader, LogWriter, ReadContext, ReadRequest, RegionId, + WalManager, WriteContext, MAX_REGION_ID, + }, + rocks_impl::encoding::{LogEncoding, LogKey, MaxSeqMetaEncoding, MaxSeqMetaValue, MetaKey}, +}; + +/// Region in the Wal. +struct Region { + /// id of the Region + id: RegionId, + /// RocksDB instance + db: Arc, + /// `next_sequence_num` is ensured to be positive + next_sequence_num: AtomicU64, + /// Encoding for log entries + log_encoding: LogEncoding, + /// Encoding for meta data of max sequence + max_seq_meta_encoding: MaxSeqMetaEncoding, + /// Runtime for write requests + runtime: Arc, + /// Ensure the delete procedure to be sequential + delete_lock: Mutex<()>, +} + +impl Region { + /// Allocate a continuous range of [SequenceNumber] and returns + /// the start [SequenceNumber] of the range [start, start+`number`). + #[inline] + fn alloc_sequence_num(&self, number: u64) -> SequenceNumber { + self.next_sequence_num.fetch_add(number, Ordering::Relaxed) + } + + #[inline] + /// Generate [LogKey] from `region_id` and `sequence_num` + fn log_key(&self, sequence_num: SequenceNumber) -> LogKey { + (self.id, sequence_num) + } + + /// Returns the current sequence number which must be positive. + fn sequence_num(&self) -> Result { + let next_seq_num = self.next_sequence_num.load(Ordering::Relaxed); + debug_assert!(next_seq_num > 0); + + Ok(next_seq_num - 1) + } + + /// Delete entries in the range `[0, sequence_num]`. + /// + /// The delete procedure is ensured to be sequential. + async fn delete_entries_up_to(&self, mut sequence_num: SequenceNumber) -> Result<()> { + debug!( + "Wal Region delete entries begin deleting, sequence_num:{:?}", + sequence_num + ); + + let _delete_guard = self.delete_lock.lock().await; + let max_seq = self.sequence_num()?; + if sequence_num > max_seq { + warn!( + "Try to delete entries up to sequence number({}) greater than current max sequence \ + number({})", + sequence_num, + max_seq + ); + sequence_num = max_seq; + } + + let wb = { + let wb = WriteBatch::default(); + + // Delete the range [0, sequence_num] + let start_log_key = (self.id, 0); + let end_log_key = if sequence_num < MAX_SEQUENCE_NUMBER { + (self.id, sequence_num + 1) + } else { + // Region id is unlikely to overflow. 
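+                // `sequence_num` already equals `MAX_SEQUENCE_NUMBER`, so use the
+                // smallest key of the next region as the exclusive end of the range.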
+ (self.id + 1, 0) + }; + let (mut start_key_buf, mut end_key_buf) = (BytesMut::new(), BytesMut::new()); + self.log_encoding + .encode_key(&mut start_key_buf, &start_log_key)?; + self.log_encoding + .encode_key(&mut end_key_buf, &end_log_key)?; + wb.delete_range(&start_key_buf, &end_key_buf) + .map_err(|e| e.into()) + .context(Delete)?; + + // Update the max sequence number. + let meta_key = MetaKey { region_id: self.id }; + let meta_value = MaxSeqMetaValue { max_seq }; + let (mut meta_key_buf, mut meta_value_buf) = (BytesMut::new(), BytesMut::new()); + self.max_seq_meta_encoding + .encode_key(&mut meta_key_buf, &meta_key)?; + self.max_seq_meta_encoding + .encode_value(&mut meta_value_buf, &meta_value)?; + wb.put(&meta_key_buf, &meta_value_buf) + .map_err(|e| e.into()) + .context(Delete)?; + + wb + }; + + let db = self.db.clone(); + self.runtime + .spawn_blocking(move || db.write(&wb).map_err(|e| e.into()).context(Delete)) + .await + .map_err(|e| Box::new(e) as _) + .context(Delete)? + } + + fn read(&self, ctx: &ReadContext, req: &ReadRequest) -> Result { + debug!("Wal region begin reading, ctx:{:?}, req:{:?}", ctx, req); + + let read_opts = ReadOptions::default(); + let iter = DBIterator::new(self.db.clone(), read_opts); + + let start_sequence = if let Some(n) = req.start.as_start_sequence_number() { + n + } else { + return Ok(RocksLogIterator::new_empty(self.log_encoding.clone(), iter)); + }; + + let end_sequence = if let Some(n) = req.end.as_end_sequence_number() { + n + } else { + return Ok(RocksLogIterator::new_empty(self.log_encoding.clone(), iter)); + }; + + let (min_log_key, max_log_key) = (self.log_key(start_sequence), self.log_key(end_sequence)); + + let log_iter = + RocksLogIterator::with_data(self.log_encoding.clone(), iter, min_log_key, max_log_key); + Ok(log_iter) + } + + async fn write(&self, ctx: &WriteContext, batch: &LogWriteBatch
) -> Result { + debug!( + "Wal region begin writing, ctx:{:?}, log_entries_num:{}", + ctx, + batch.entries.len() + ); + + let entries_num = batch.len() as u64; + let (wb, max_sequence_num) = { + let wb = WriteBatch::default(); + let mut next_sequence_num = self.alloc_sequence_num(entries_num); + let mut key_buf = BytesMut::new(); + let mut value_buf = BytesMut::new(); + + for entry in &batch.entries { + self.log_encoding + .encode_key(&mut key_buf, &(batch.region_id, next_sequence_num))?; + self.log_encoding + .encode_value(&mut value_buf, &entry.payload)?; + wb.put(&key_buf, &value_buf) + .map_err(|e| e.into()) + .context(Write)?; + + next_sequence_num += 1; + } + + (wb, next_sequence_num - 1) + }; + + let db = self.db.clone(); + self.runtime + .spawn_blocking(move || { + db.write(&wb) + .map(|_| max_sequence_num) + .map_err(|e| e.into()) + .context(Write) + }) + .await + .map_err(|e| Box::new(e) as _) + .context(Write)? + } +} + +/// [WalManager] implementation based on RocksDB. +/// A [RocksImpl] consists of multiple [Region]s and any read/write/delete +/// request is delegated to specific [Region]. +pub struct RocksImpl { + /// Wal data path + wal_path: String, + /// RocksDB instance + db: Arc, + /// Runtime for read/write log entries + runtime: Arc, + /// Encoding for log entry + log_encoding: LogEncoding, + /// Encoding for meta data of max sequence + max_seq_meta_encoding: MaxSeqMetaEncoding, + /// Regions + regions: RwLock>>, +} + +impl Drop for RocksImpl { + fn drop(&mut self) { + // Clear all regions. + { + let mut regions = self.regions.write().unwrap(); + regions.clear(); + } + + info!("RocksImpl dropped, wal_path:{}", self.wal_path); + } +} + +impl RocksImpl { + fn build_regions(&self) -> Result<()> { + let region_seqs = self.find_region_seqs_from_db()?; + + info!( + "RocksImpl build regions, wal_path:{}, region_seqs:{:?}", + self.wal_path, region_seqs + ); + + let mut regions = self.regions.write().unwrap(); + for (region_id, sequence_number) in region_seqs { + let region = Region { + id: region_id, + db: self.db.clone(), + next_sequence_num: AtomicU64::new(sequence_number + 1), + log_encoding: self.log_encoding.clone(), + max_seq_meta_encoding: self.max_seq_meta_encoding.clone(), + runtime: self.runtime.clone(), + delete_lock: Mutex::new(()), + }; + + regions.insert(region_id, Arc::new(region)); + } + + Ok(()) + } + + fn find_region_seqs_from_region_data( + &self, + region_max_seqs: &mut HashMap, + ) -> Result<()> { + let mut current_region_id = MAX_REGION_ID; + let mut end_boundary_key_buf = BytesMut::new(); + loop { + let log_key = (current_region_id, MAX_SEQUENCE_NUMBER); + self.log_encoding + .encode_key(&mut end_boundary_key_buf, &log_key)?; + let mut iter = self.db.iter(); + let seek_key = SeekKey::Key(&end_boundary_key_buf); + + let found = iter + .seek_for_prev(seek_key) + .map_err(|e| e.into()) + .context(Initialization)?; + + if !found { + debug!("RocksImpl find region pairs stop scanning, because of no entries to scan"); + break; + } + + if !self.log_encoding.is_log_key(iter.key())? 
{ + debug!("RocksImpl find region pairs stop scanning, because log keys are exhausted"); + break; + } + + let log_key = self.log_encoding.decode_key(iter.key())?; + region_max_seqs.insert(log_key.0, log_key.1); + + if log_key.0 == 0 { + debug!("RocksImpl find region pairs stop scanning, because region 0 is reached"); + break; + } + current_region_id = log_key.0 - 1; + } + + Ok(()) + } + + fn find_region_seqs_from_region_meta( + &self, + region_max_seqs: &mut HashMap, + ) -> Result<()> { + let meta_key = MetaKey { region_id: 0 }; + let mut start_boundary_key_buf = BytesMut::new(); + self.max_seq_meta_encoding + .encode_key(&mut start_boundary_key_buf, &meta_key)?; + let mut iter = self.db.iter(); + let seek_key = SeekKey::Key(&start_boundary_key_buf); + iter.seek(seek_key) + .map_err(|e| e.into()) + .context(Initialization)?; + + loop { + if !iter.valid().map_err(|e| e.into()).context(Initialization)? { + debug!("RocksImpl exhausts the iterator for meta information"); + break; + } + if !self.max_seq_meta_encoding.is_max_seq_meta_key(iter.key())? { + debug!("RocksImpl exhausts max sequence meta key"); + break; + } + + let meta_key = self.max_seq_meta_encoding.decode_key(iter.key())?; + let meta_value = self.max_seq_meta_encoding.decode_value(iter.value())?; + region_max_seqs + .entry(meta_key.region_id) + .and_modify(|v| { + *v = meta_value.max_seq.max(*v); + }) + .or_insert(meta_value.max_seq); + + iter.next().map_err(|e| e.into()).context(Initialization)?; + } + + Ok(()) + } + + /// Collect all the regions with its max sequence number from the db. + /// + /// Returns the mapping: region_id -> max_sequence_number + fn find_region_seqs_from_db(&self) -> Result> { + // build the mapping: region_id -> max_sequence_number + let mut region_max_seqs = HashMap::new(); + + // scan the region information from the data part. + self.find_region_seqs_from_region_data(&mut region_max_seqs)?; + + // scan the region information from the meta part. + self.find_region_seqs_from_region_meta(&mut region_max_seqs)?; + + Ok(region_max_seqs) + } + + /// Get the region and create it if not found. + fn get_or_create_region(&self, region_id: RegionId) -> Arc { + { + let regions = self.regions.read().unwrap(); + if let Some(region) = regions.get(®ion_id) { + return region.clone(); + } + } + + let mut regions = self.regions.write().unwrap(); + if let Some(region) = regions.get(®ion_id) { + return region.clone(); + } + + info!( + "RocksImpl create new region, wal_path:{}, region_id:{}", + self.wal_path, region_id + ); + + // create a new region + let region = Arc::new(Region { + id: region_id, + db: self.db.clone(), + // ensure `next_sequence_number` to start from 1 (larger than MIN_SEQUENCE_NUMBER) + next_sequence_num: AtomicU64::new(MIN_SEQUENCE_NUMBER + 1), + log_encoding: self.log_encoding.clone(), + max_seq_meta_encoding: self.max_seq_meta_encoding.clone(), + runtime: self.runtime.clone(), + delete_lock: Mutex::new(()), + }); + + regions.insert(region_id, region.clone()); + region + } + + /// Get the region + fn region(&self, region_id: RegionId) -> Option> { + let regions = self.regions.read().unwrap(); + regions.get(®ion_id).cloned() + } +} + +/// Builder for `RocksImpl`. +pub struct Builder { + wal_path: String, + rocksdb_config: DBOptions, + runtime: Arc, +} + +impl Builder { + pub fn with_default_rocksdb_config( + wal_path: impl Into, + runtime: Arc, + ) -> Self { + let mut rocksdb_config = DBOptions::default(); + // TODO(yingwen): Move to another function? 
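+        // Create the database under `wal_path` on first open instead of
+        // failing when it does not exist yet.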
+ rocksdb_config.create_if_missing(true); + Self::new(wal_path, runtime, rocksdb_config) + } + + pub fn new( + wal_path: impl Into, + runtime: Arc, + rocksdb_config: DBOptions, + ) -> Self { + let wal_path: PathBuf = wal_path.into(); + Self { + wal_path: wal_path.to_str().unwrap().to_owned(), + rocksdb_config, + runtime, + } + } + + pub fn build(self) -> Result { + let db = DB::open(self.rocksdb_config, &self.wal_path) + .map_err(|e| e.into()) + .context(Open { + wal_path: self.wal_path.clone(), + })?; + let rocks_impl = RocksImpl { + wal_path: self.wal_path, + db: Arc::new(db), + runtime: self.runtime, + log_encoding: LogEncoding::newest(), + max_seq_meta_encoding: MaxSeqMetaEncoding::newest(), + regions: RwLock::new(HashMap::new()), + }; + rocks_impl.build_regions()?; + + Ok(rocks_impl) + } +} + +/// Iterator over log entries based on RocksDB iterator. +pub struct RocksLogIterator { + log_encoding: LogEncoding, + /// denotes no more data to iterate and it is set to true when: + /// - initialized as no data iterator, or + /// - iterate to the end. + no_more_data: bool, + min_log_key: LogKey, + max_log_key: LogKey, + /// denote whether `iter` is seeked + seeked: bool, + /// RocksDB iterator + iter: DBIterator>, +} + +impl RocksLogIterator { + /// Create iterator maybe containing data. + fn with_data( + log_encoding: LogEncoding, + iter: DBIterator>, + min_log_key: LogKey, + max_log_key: LogKey, + ) -> Self { + Self { + log_encoding, + no_more_data: false, + min_log_key, + max_log_key, + seeked: false, + iter, + } + } + + /// Create empty iterator. + fn new_empty(log_encoding: LogEncoding, iter: DBIterator>) -> Self { + Self { + log_encoding, + no_more_data: true, + min_log_key: (0, 0), + max_log_key: (0, 0), + seeked: false, + iter, + } + } + + /// it's a valid log key if it is in the range `[self.min_log_key, + /// self.max_log_key]`. + fn is_valid_log_key(&self, curr_log_key: &LogKey) -> bool { + curr_log_key <= &self.max_log_key && curr_log_key >= &self.min_log_key + } + + /// End is reached iteration if `curr_log_key` is greater than + /// `max_log_key`. 
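+    /// (The check uses `>=`, so the entry at `max_log_key` itself is still
+    /// returned to the caller before iteration stops.)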
+ fn is_end_reached(&self, curr_log_key: &LogKey) -> bool { + curr_log_key >= &self.max_log_key + } + + /// let `iter` seek to `min_log_key` + /// no guarantee on that `self.iter` is valid + fn seek(&mut self) -> Result<()> { + self.seeked = true; + + let mut seek_key_buf = BytesMut::new(); + self.log_encoding + .encode_key(&mut seek_key_buf, &self.min_log_key)?; + let seek_key = SeekKey::Key(&seek_key_buf); + self.iter + .seek(seek_key) + .map_err(|e| e.into()) + .context(Read)?; + + Ok(()) + } +} + +impl LogIterator for RocksLogIterator { + fn next_log_entry( + &mut self, + decoder: &D, + ) -> Result>> { + if self.no_more_data { + return Ok(None); + } + + if !self.seeked { + self.seek()?; + + let valid = self.iter.valid().map_err(|e| e.into()).context(Read)?; + if !valid { + self.no_more_data = true; + return Ok(None); + } + } else { + let found = self.iter.next().map_err(|e| e.into()).context(Read)?; + if !found { + self.no_more_data = true; + return Ok(None); + } + } + + let curr_log_key = self.log_encoding.decode_key(self.iter.key())?; + self.no_more_data = self.is_end_reached(&curr_log_key); + + if self.is_valid_log_key(&curr_log_key) { + let payload = self.log_encoding.decode_value(self.iter.value(), decoder)?; + let log_entry = LogEntry { + sequence: curr_log_key.1, + payload, + }; + Ok(Some(log_entry)) + } else { + Ok(None) + } + } +} + +impl LogReader for RocksImpl { + type Iterator = RocksLogIterator; + + fn read(&self, ctx: &ReadContext, req: &ReadRequest) -> Result { + if let Some(region) = self.region(req.region_id) { + region.read(ctx, req) + } else { + let iter = DBIterator::new(self.db.clone(), ReadOptions::default()); + Ok(RocksLogIterator::new_empty(self.log_encoding.clone(), iter)) + } + } +} + +#[async_trait] +impl LogWriter for RocksImpl { + async fn write( + &self, + ctx: &WriteContext, + batch: &LogWriteBatch
, + ) -> Result { + let region = self.get_or_create_region(batch.region_id); + region.write(ctx, batch).await + } +} + +#[async_trait] +impl WalManager for RocksImpl { + fn sequence_num(&self, region_id: RegionId) -> Result { + if let Some(region) = self.region(region_id) { + return region.sequence_num(); + } + + Ok(MIN_SEQUENCE_NUMBER) + } + + async fn mark_delete_entries_up_to( + &self, + region_id: RegionId, + sequence_num: SequenceNumber, + ) -> Result<()> { + if let Some(region) = self.region(region_id) { + return region.delete_entries_up_to(sequence_num).await; + } + + Ok(()) + } +} + +impl fmt::Debug for RocksImpl { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + f.debug_struct("RocksImpl") + .field("wal_path", &self.wal_path) + .finish() + } +} diff --git a/wal/src/rocks_impl/mod.rs b/wal/src/rocks_impl/mod.rs new file mode 100644 index 0000000000..e25bca788a --- /dev/null +++ b/wal/src/rocks_impl/mod.rs @@ -0,0 +1,6 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! WalManager implementation based on RocksDB + +pub mod encoding; +pub mod manager; diff --git a/wal/src/tests/mod.rs b/wal/src/tests/mod.rs new file mode 100644 index 0000000000..c52a689521 --- /dev/null +++ b/wal/src/tests/mod.rs @@ -0,0 +1,6 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! integration tests for wal + +mod read_write; +pub mod util; diff --git a/wal/src/tests/read_write.rs b/wal/src/tests/read_write.rs new file mode 100644 index 0000000000..a38bb1282c --- /dev/null +++ b/wal/src/tests/read_write.rs @@ -0,0 +1,449 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::{ops::Deref, sync::Arc}; + +use common_types::SequenceNumber; + +use crate::{ + log_batch::LogWriteBatch, + manager::{LogReader, LogWriter, ReadBoundary, ReadRequest, RegionId, WalManager}, + tests::util::{RocksTestEnv, TestEnv, TestPayload, WalBuilder}, +}; + +fn check_write_batch_with_read_request( + env: &TestEnv, + wal: Arc, + read_req: ReadRequest, + max_seq: SequenceNumber, + write_batch: &LogWriteBatch, +) { + let iter = wal + .read(&env.read_ctx, &read_req) + .expect("should succeed to read"); + env.check_log_entries(max_seq, write_batch, iter); +} + +fn check_write_batch( + env: &TestEnv, + wal: Arc, + region_id: RegionId, + max_seq: SequenceNumber, + write_batch: &LogWriteBatch, +) { + let read_req = ReadRequest { + region_id, + start: ReadBoundary::Included(max_seq + 1 - write_batch.entries.len() as u64), + end: ReadBoundary::Included(max_seq), + }; + check_write_batch_with_read_request(env, wal, read_req, max_seq, write_batch) +} + +async fn simple_read_write_with_wal( + env: impl Deref>, + wal: Arc, + region_id: RegionId, +) { + let write_batch = env.build_log_batch(region_id, 0, 10); + let seq = wal + .write(&env.write_ctx, &write_batch) + .await + .expect("should succeed to write"); + check_write_batch(&env, wal, region_id, seq, &write_batch) +} + +async fn simple_read_write(env: &TestEnv, region_id: RegionId) { + let wal = env.build_wal(); + simple_read_write_with_wal(env, wal.clone(), region_id).await; +} + +/// Test the read with different kinds of boundaries. 
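+///
+/// Writes a single batch and reads it back with `[Min, Max]` as well as the
+/// four inclusive/exclusive combinations of the written sequence range.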
+async fn read_with_boundary(env: &TestEnv) { + let wal = env.build_wal(); + let region_id = 0; + let write_batch = env.build_log_batch(region_id, 0, 10); + let end_seq = wal + .write(&env.write_ctx, &write_batch) + .await + .expect("should succeed to write"); + let start_seq = end_seq + 1 - write_batch.entries.len() as u64; + + // [min, max] + { + let read_req = ReadRequest { + region_id, + start: ReadBoundary::Min, + end: ReadBoundary::Max, + }; + check_write_batch_with_read_request(env, wal.clone(), read_req, end_seq, &write_batch); + } + + // [0, 10] + { + let read_req = ReadRequest { + region_id, + start: ReadBoundary::Included(start_seq), + end: ReadBoundary::Included(end_seq), + }; + check_write_batch_with_read_request(env, wal.clone(), read_req, end_seq, &write_batch); + } + + // (0, 10] + { + let read_req = ReadRequest { + region_id, + start: ReadBoundary::Excluded(start_seq), + end: ReadBoundary::Included(end_seq), + }; + let write_batch = env.build_log_batch(region_id, 1, 10); + check_write_batch_with_read_request(env, wal.clone(), read_req, end_seq, &write_batch); + } + + // [0, 10) + { + let read_req = ReadRequest { + region_id, + start: ReadBoundary::Included(start_seq), + end: ReadBoundary::Excluded(end_seq), + }; + let write_batch = env.build_log_batch(region_id, 0, 9); + check_write_batch_with_read_request(env, wal.clone(), read_req, end_seq - 1, &write_batch); + } + + // (0, 10) + { + let read_req = ReadRequest { + region_id, + start: ReadBoundary::Excluded(start_seq), + end: ReadBoundary::Excluded(end_seq), + }; + let write_batch = env.build_log_batch(region_id, 1, 9); + check_write_batch_with_read_request(env, wal.clone(), read_req, end_seq - 1, &write_batch); + } +} + +/// Test read and write across multiple regions parallely. +async fn write_multiple_regions_parallelly(env: Arc>) { + let wal = env.build_wal(); + let mut handles = Vec::with_capacity(10); + for i in 0..5 { + let read_write_0 = + env.runtime + .spawn(simple_read_write_with_wal(env.clone(), wal.clone(), i)); + let read_write_1 = + env.runtime + .spawn(simple_read_write_with_wal(env.clone(), wal.clone(), i)); + handles.push(read_write_0); + handles.push(read_write_1); + } + futures::future::join_all(handles) + .await + .into_iter() + .for_each(|res| { + res.expect("should succeed to join the write"); + }); +} + +/// Test whether the written logs can be read after reopen. +async fn reopen(env: &TestEnv) { + let region_id = 0; + let (write_batch, seq) = { + let wal = env.build_wal(); + let write_batch = env.build_log_batch(region_id, 0, 10); + let seq = wal + .write(&env.write_ctx, &write_batch) + .await + .expect("should succeed to write"); + (write_batch, seq) + }; + + // reopen the wal + let wal = env.build_wal(); + let read_req = ReadRequest { + region_id, + start: ReadBoundary::Included(seq + 1 - write_batch.entries.len() as u64), + end: ReadBoundary::Included(seq), + }; + let iter = wal + .read(&env.read_ctx, &read_req) + .expect("should succeed to read"); + env.check_log_entries(seq, &write_batch, iter); +} + +/// A complex test case for read and write: +/// - Write two log batch +/// - Read the first batch and then read the second batch. +/// - Read the whole batch. +/// - Read the part of first batch and second batch. 
+async fn complex_read_write(env: &TestEnv) { + let wal = env.build_wal(); + let region_id = 0; + + // write two batches + let (start_val, mid_val, end_val) = (0, 10, 50); + let write_batch_1 = env.build_log_batch(region_id, start_val, mid_val); + let seq_1 = wal + .write(&env.write_ctx, &write_batch_1) + .await + .expect("should succeed to write"); + let write_batch_2 = env.build_log_batch(region_id, mid_val, end_val); + let seq_2 = wal + .write(&env.write_ctx, &write_batch_2) + .await + .expect("should succeed to write"); + + // read the first batch + check_write_batch(env, wal.clone(), region_id, seq_1, &write_batch_1); + // read the second batch + check_write_batch(env, wal.clone(), region_id, seq_2, &write_batch_2); + + // read the whole batch + let (seq_3, write_batch_3) = (seq_2, env.build_log_batch(region_id, start_val, end_val)); + check_write_batch(env, wal.clone(), region_id, seq_3, &write_batch_3); + + // read the part of batch1 and batch2 + let (seq_4, write_batch_4) = { + let new_start = (start_val + mid_val) / 2; + let new_end = (mid_val + end_val) / 2; + let seq = seq_2 - (end_val - new_end) as u64; + (seq, env.build_log_batch(region_id, new_start, new_end)) + }; + check_write_batch(env, wal.clone(), region_id, seq_4, &write_batch_4); +} + +/// Test whether data can be deleted. +async fn simple_write_delete(env: &TestEnv) { + let region_id = 0; + let wal = env.build_wal(); + let mut write_batch = env.build_log_batch(region_id, 0, 10); + let seq = wal + .write(&env.write_ctx, &write_batch) + .await + .expect("should succeed to write"); + check_write_batch(env, wal.clone(), region_id, seq, &write_batch); + + // delete all logs + wal.mark_delete_entries_up_to(region_id, seq) + .await + .expect("should succeed to delete"); + let read_req = ReadRequest { + region_id, + start: ReadBoundary::Min, + end: ReadBoundary::Max, + }; + let iter = wal + .read(&env.read_ctx, &read_req) + .expect("should succeed to read"); + write_batch.entries.clear(); + env.check_log_entries(seq, &write_batch, iter); +} + +/// Delete half of the written data and check the remaining half can be read. +async fn write_delete_half(env: &TestEnv) { + let region_id = 0; + let wal = env.build_wal(); + let mut write_batch = env.build_log_batch(region_id, 0, 10); + let seq = wal + .write(&env.write_ctx, &write_batch) + .await + .expect("should succeed to write"); + check_write_batch(env, wal.clone(), region_id, seq, &write_batch); + + // delete all logs + wal.mark_delete_entries_up_to(region_id, seq / 2) + .await + .expect("should succeed to delete"); + let read_req = ReadRequest { + region_id, + start: ReadBoundary::Min, + end: ReadBoundary::Max, + }; + let iter = wal + .read(&env.read_ctx, &read_req) + .expect("should succeed to read"); + write_batch.entries.drain(..write_batch.entries.len() / 2); + env.check_log_entries(seq, &write_batch, iter); +} + +/// Test delete across multiple regions. +async fn write_delete_multiple_regions(env: &TestEnv) { + let (region_id_1, region_id_2) = (1, 2); + let wal = env.build_wal(); + let mut write_batch_1 = env.build_log_batch(region_id_1, 0, 10); + let seq_1 = wal + .write(&env.write_ctx, &write_batch_1) + .await + .expect("should succeed to write"); + + let write_batch_2 = env.build_log_batch(region_id_2, 10, 20); + let seq_2 = wal + .write(&env.write_ctx, &write_batch_2) + .await + .expect("should succeed to write"); + + // delete all logs of region 1. 
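+    // Logs of region 2 written above must stay readable after this deletion.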
+ wal.mark_delete_entries_up_to(region_id_1, seq_1) + .await + .expect("should succeed to delete"); + let read_req = ReadRequest { + region_id: region_id_1, + start: ReadBoundary::Min, + end: ReadBoundary::Max, + }; + let iter = wal + .read(&env.read_ctx, &read_req) + .expect("should succeed to read"); + write_batch_1.entries.clear(); + env.check_log_entries(seq_1, &write_batch_1, iter); + + check_write_batch(env, wal.clone(), region_id_2, seq_2, &write_batch_2); +} + +/// The sequence number should increase monotonically after multiple writes. +async fn sequence_increase_monotonically_multiple_writes(env: &TestEnv) { + let region_id = 0; + let wal = env.build_wal(); + let write_batch = env.build_log_batch(region_id, 0, 10); + let seq_1 = wal + .write(&env.write_ctx, &write_batch) + .await + .expect("should succeed to write"); + let seq_2 = wal + .write(&env.write_ctx, &write_batch) + .await + .expect("should succeed to write"); + let seq_3 = wal + .write(&env.write_ctx, &write_batch) + .await + .expect("should succeed to write"); + + assert!(seq_2 > seq_1); + assert!(seq_3 > seq_2); +} + +/// The sequence number should increase monotonically after write, delete and +/// one more write. +async fn sequence_increase_monotonically_delete_write(env: &TestEnv) { + let region_id = 0; + let wal = env.build_wal(); + let write_batch = env.build_log_batch(region_id, 0, 10); + // write + let seq_1 = wal + .write(&env.write_ctx, &write_batch) + .await + .expect("should succeed to write"); + // delete + wal.mark_delete_entries_up_to(region_id, seq_1) + .await + .expect("should succeed to delete"); + // write again + let seq_2 = wal + .write(&env.write_ctx, &write_batch) + .await + .expect("should succeed to write"); + + assert!(seq_2 > seq_1); +} + +/// The sequence number should increase monotonically after write, delete, +/// reopen and write. 
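+///
+/// The restart is simulated by dropping the wal instance and rebuilding it
+/// from the same directory.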
+async fn sequence_increase_monotonically_delete_reopen_write(env: &TestEnv) { + let region_id = 0; + let wal = env.build_wal(); + let write_batch = env.build_log_batch(region_id, 0, 10); + // write + let seq_1 = wal + .write(&env.write_ctx, &write_batch) + .await + .expect("should succeed to write"); + // delete + wal.mark_delete_entries_up_to(region_id, seq_1) + .await + .expect("should succeed to delete"); + // restart + drop(wal); + let wal = env.build_wal(); + // write again + let seq_2 = wal + .write(&env.write_ctx, &write_batch) + .await + .expect("should succeed to write"); + + assert!(seq_2 > seq_1); +} + +#[test] +fn test_simple_read_write() { + let rocks_env = RocksTestEnv::new(2); + rocks_env.runtime.block_on(simple_read_write(&rocks_env, 0)); +} + +#[test] +fn test_read_with_boundary() { + let rocks_env = RocksTestEnv::new(2); + rocks_env.runtime.block_on(read_with_boundary(&rocks_env)); +} + +#[test] +fn test_write_multiple_regions() { + let rocks_env = Arc::new(RocksTestEnv::new(4)); + rocks_env + .runtime + .block_on(write_multiple_regions_parallelly(rocks_env.clone())); +} + +#[test] +fn test_reopen() { + let rocks_env = RocksTestEnv::new(2); + rocks_env.runtime.block_on(reopen(&rocks_env)); +} + +#[test] +fn test_complex_read_write() { + let rocks_env = RocksTestEnv::new(2); + rocks_env.runtime.block_on(complex_read_write(&rocks_env)); +} + +#[test] +fn test_simple_write_delete() { + let rocks_env = RocksTestEnv::new(2); + rocks_env.runtime.block_on(simple_write_delete(&rocks_env)); +} + +#[test] +fn test_write_delete_half() { + let rocks_env = RocksTestEnv::new(2); + rocks_env.runtime.block_on(write_delete_half(&rocks_env)); +} +#[test] +fn test_write_delete_multiple_regions() { + let rocks_env = RocksTestEnv::new(2); + rocks_env + .runtime + .block_on(write_delete_multiple_regions(&rocks_env)); +} + +#[test] +fn test_sequence_increase_monotonically_multiple_writes() { + let rocks_env = RocksTestEnv::new(2); + rocks_env + .runtime + .block_on(sequence_increase_monotonically_multiple_writes(&rocks_env)); +} + +#[test] +fn test_sequence_increase_monotonically_delete_write() { + let rocks_env = RocksTestEnv::new(2); + rocks_env + .runtime + .block_on(sequence_increase_monotonically_delete_write(&rocks_env)); +} + +#[test] +fn test_sequence_increase_monotonically_delete_reopen_write() { + let rocks_env = RocksTestEnv::new(2); + rocks_env + .runtime + .block_on(sequence_increase_monotonically_delete_reopen_write( + &rocks_env, + )); +} diff --git a/wal/src/tests/util.rs b/wal/src/tests/util.rs new file mode 100644 index 0000000000..cd631363f6 --- /dev/null +++ b/wal/src/tests/util.rs @@ -0,0 +1,158 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! utilities for testing wal module. 
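+//!
+//! Provides `TestEnv` (a temp-dir backed environment with its own runtime and
+//! default read/write contexts), the `WalBuilder` trait with a RocksDB-backed
+//! `RocksWalBuilder`, and a simple `TestPayload` used as the log payload.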
+ +use std::{path::Path, sync::Arc}; + +use common_types::bytes::{MemBuf, MemBufMut}; +use common_util::runtime::{self, Runtime}; +use tempfile::TempDir; + +use crate::{ + log_batch::{LogWriteBatch, LogWriteEntry, Payload, PayloadDecoder}, + manager::{LogIterator, LogReader, ReadContext, RegionId, WalManager, WriteContext}, + rocks_impl::{self, manager::RocksImpl}, +}; + +pub trait WalBuilder: Default + Send + Sync { + type Wal: WalManager + Send + Sync; + fn build(&self, data_path: &Path, runtime: Arc) -> Arc; +} +use common_types::SequenceNumber; +use snafu::Snafu; + +#[derive(Debug, Snafu)] +pub enum Error {} + +#[derive(Default)] +pub struct RocksWalBuilder; + +impl WalBuilder for RocksWalBuilder { + type Wal = RocksImpl; + + fn build(&self, data_path: &Path, runtime: Arc) -> Arc { + let wal_builder = + rocks_impl::manager::Builder::with_default_rocksdb_config(data_path, runtime); + + Arc::new( + wal_builder + .build() + .expect("should succeed to build rocksimpl wal"), + ) + } +} + +pub type RocksTestEnv = TestEnv; + +/// The environment for testing wal. +pub struct TestEnv { + pub dir: TempDir, + pub runtime: Arc, + pub write_ctx: WriteContext, + pub read_ctx: ReadContext, + /// Builder for a specific wal. + builder: B, +} + +impl TestEnv { + pub fn new(num_workers: usize) -> Self { + let runtime = runtime::Builder::default() + .worker_threads(num_workers) + .enable_all() + .build() + .unwrap(); + + Self { + dir: tempfile::tempdir().unwrap(), + runtime: Arc::new(runtime), + write_ctx: WriteContext::default(), + read_ctx: ReadContext::default(), + builder: B::default(), + } + } + + pub fn build_wal(&self) -> Arc { + self.builder.build(self.dir.path(), self.runtime.clone()) + } + + /// Build the log batch with [TestPayload].val range [start, end). + pub fn build_log_batch( + &self, + region_id: RegionId, + start: u32, + end: u32, + ) -> LogWriteBatch { + let mut write_batch = LogWriteBatch::new(region_id); + for val in start..end { + let payload = TestPayload { val }; + write_batch.entries.push(LogWriteEntry { payload }); + } + + write_batch + } + + /// Check whether the log entries from the iterator equals the + /// `write_batch`. + pub fn check_log_entries( + &self, + max_seq: SequenceNumber, + write_batch: &LogWriteBatch, + mut iter: ::Iterator, + ) { + let dec = TestPayloadDecoder; + let mut log_entries = Vec::with_capacity(write_batch.entries.len()); + loop { + let log_entry = iter + .next_log_entry(&dec) + .expect("should succeed to fetch next log entry"); + if log_entry.is_none() { + break; + } + + log_entries.push(log_entry.unwrap()); + } + + assert_eq!(write_batch.entries.len(), log_entries.len()); + for (idx, (expect_log_write_entry, log_entry)) in write_batch + .entries + .iter() + .zip(log_entries.iter()) + .rev() + .enumerate() + { + assert_eq!(max_seq - idx as u64, log_entry.sequence); + assert_eq!(expect_log_write_entry.payload, log_entry.payload); + } + } +} + +/// The payload for Wal log entry for testing. 
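+///
+/// Encodes to exactly four bytes holding the single `u32` value `val`.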
+#[derive(Clone, Debug, Eq, PartialEq)] +pub struct TestPayload { + val: u32, +} + +impl Payload for TestPayload { + type Error = Error; + + fn encode_size(&self) -> usize { + 4 + } + + fn encode_to(&self, buf: &mut B) -> Result<(), Self::Error> { + buf.write_u32(self.val).expect("must write"); + Ok(()) + } +} + +pub struct TestPayloadDecoder; + +impl PayloadDecoder for TestPayloadDecoder { + type Error = Error; + type Target = TestPayload; + + fn decode(&self, buf: &mut B) -> Result { + let val = buf.read_u32().expect("should succeed to read u32"); + Ok(TestPayload { val }) + } +}

, + /// Object metadata for the listing + pub objects: Vec>, +} + +/// The metadata that describes an object. +#[derive(Debug)] +pub struct ObjectMeta { + /// The full path to the object + pub location: P, + /// The last modified time + pub last_modified: SystemTime, + /// The size in bytes of the object + pub size: usize, +} + +#[cfg(test)] +mod tests { + use std::io::Read; + + use bytes::Bytes; + use futures::{stream, StreamExt, TryStreamExt}; + + use super::*; + use crate::path::{file::FilePath, parsed::DirsAndFileName}; + + type Error = Box; + type Result = std::result::Result; + + async fn flatten_list_stream< + P: path::ObjectStorePath, + E: std::error::Error + Send + Sync + 'static, + R: AsyncRead + Unpin, + >( + storage: &impl ObjectStore, + prefix: Option<&P>, + ) -> Result> { + storage + .list(prefix) + .await? + .map_ok(|v| stream::iter(v).map(Ok)) + .try_flatten() + .try_collect() + .await + } + + pub(crate) async fn put_get_delete_list< + P: path::ObjectStorePath, + E: std::error::Error + Send + Sync + 'static, + R: AsyncRead + Unpin, + >( + storage: &impl ObjectStore, + ) -> Result<()> { + delete_fixtures(storage).await; + + let content_list = flatten_list_stream(storage, None).await?; + assert!( + content_list.is_empty(), + "Expected list to be empty; found: {:?}", + content_list + ); + + let data = Bytes::from("arbitrary data"); + let mut location = storage.new_path(); + location.push_dir("test_dir"); + location.set_file_name("test_file.json"); + + storage + .put(&location, data.as_ref(), Some(data.len())) + .await?; + + // List everything + let content_list = flatten_list_stream(storage, None).await?; + assert_eq!(content_list, &[location.clone()]); + + // List everything starting with a prefix that should return results + let mut prefix = storage.new_path(); + prefix.push_dir("test_dir"); + let content_list = flatten_list_stream(storage, Some(&prefix)).await?; + assert_eq!(content_list, &[location.clone()]); + + // List everything starting with a prefix that shouldn't return results + let mut prefix = storage.new_path(); + prefix.push_dir("something"); + let content_list = flatten_list_stream(storage, Some(&prefix)).await?; + assert!(content_list.is_empty()); + + let mut read_data = Vec::with_capacity(data.len()); + + storage.get(&location).await?.read_to_end(&mut read_data)?; + assert_eq!(&*read_data, data); + + storage.delete(&location).await?; + + let content_list = flatten_list_stream(storage, None).await?; + assert!(content_list.is_empty()); + + Ok(()) + } + + pub(crate) async fn list_with_delimiter< + P: path::ObjectStorePath, + E: std::error::Error + Send + Sync + 'static, + R: AsyncRead + Unpin, + >( + storage: &impl ObjectStore, + ) -> Result<()> { + delete_fixtures(storage).await; + + // ==================== check: store is empty ==================== + let content_list = flatten_list_stream(storage, None).await?; + assert!(content_list.is_empty()); + + // ==================== do: create files ==================== + let data = Bytes::from("arbitrary data"); + + let files: Vec<_> = [ + "test_file", + "mydb/wb/000/000/000.segment", + "mydb/wb/000/000/001.segment", + "mydb/wb/000/000/002.segment", + "mydb/wb/001/001/000.segment", + "mydb/wb/foo.json", + "mydb/data/whatevs", + ] + .iter() + .map(|&s| str_to_path(storage, s)) + .collect(); + + for f in &files { + storage + .put(f, data.as_ref(), Some(data.len())) + .await + .unwrap(); + } + + // ==================== check: prefix-list `mydb/wb` (directory) + // ==================== + let mut prefix = 
storage.new_path(); + prefix.push_all_dirs(&["mydb", "wb"]); + + let mut expected_000 = prefix.clone(); + expected_000.push_dir("000"); + let mut expected_001 = prefix.clone(); + expected_001.push_dir("001"); + let mut expected_location = prefix.clone(); + expected_location.set_file_name("foo.json"); + + let result = storage.list_with_delimiter(&prefix).await.unwrap(); + + assert_eq!(result.common_prefixes, vec![expected_000, expected_001]); + assert_eq!(result.objects.len(), 1); + + let object = &result.objects[0]; + + assert_eq!(object.location, expected_location); + assert_eq!(object.size, data.len()); + + // ==================== check: prefix-list `mydb/wb/000/000/001` (partial + // filename) ==================== + let mut prefix = storage.new_path(); + prefix.push_all_dirs(&["mydb", "wb", "000", "000"]); + prefix.set_file_name("001"); + + let mut expected_location = storage.new_path(); + expected_location.push_all_dirs(&["mydb", "wb", "000", "000"]); + expected_location.set_file_name("001.segment"); + + let result = storage.list_with_delimiter(&prefix).await.unwrap(); + assert!(result.common_prefixes.is_empty()); + assert_eq!(result.objects.len(), 1); + + let object = &result.objects[0]; + + assert_eq!(object.location, expected_location); + + // ==================== check: prefix-list `not_there` (non-existing prefix) + // ==================== + let mut prefix = storage.new_path(); + prefix.push_all_dirs(&["not_there"]); + + let result = storage.list_with_delimiter(&prefix).await.unwrap(); + assert!(result.common_prefixes.is_empty()); + assert!(result.objects.is_empty()); + + // ==================== do: remove all files ==================== + for f in &files { + storage.delete(f).await.unwrap(); + } + + // ==================== check: store is empty ==================== + let content_list = flatten_list_stream(storage, None).await?; + assert!(content_list.is_empty()); + + Ok(()) + } + + /// Parse a str as a `CloudPath` into a `DirAndFileName`, even though the + /// associated storage might not be cloud storage, to reuse the cloud + /// path parsing logic. Then convert into the correct type of path for + /// the given storage. 
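+    ///
+    /// For example, `"mydb/wb/foo.json"` is split into the directories
+    /// `["mydb", "wb"]` plus the file name `foo.json`, and then rebuilt as a
+    /// path of the target storage.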
+ fn str_to_path< + P: path::ObjectStorePath, + E: std::error::Error + Send + Sync, + R: AsyncRead + Unpin, + >( + storage: &impl ObjectStore, + val: &str, + ) -> P { + let cloud_path = FilePath::raw(val, false); + let parsed: DirsAndFileName = cloud_path.into(); + + let mut new_path = storage.new_path(); + for part in parsed.directories { + new_path.push_dir(part.to_string()); + } + + if let Some(file_name) = parsed.file_name { + new_path.set_file_name(file_name.to_string()); + } + new_path + } + + async fn delete_fixtures< + P: path::ObjectStorePath, + E: std::error::Error + Send + Sync, + R: AsyncRead + Unpin, + >( + storage: &impl ObjectStore, + ) { + let files: Vec<_> = [ + "test_file", + "mydb/wb/000/000/000.segment", + "mydb/wb/000/000/001.segment", + "mydb/wb/000/000/002.segment", + "mydb/wb/001/001/000.segment", + "mydb/wb/foo.json", + "mydb/data/whatevs", + ] + .iter() + .map(|&s| str_to_path(storage, s)) + .collect(); + + for f in &files { + // don't care if it errors, should fail elsewhere + let _ = storage.delete(f).await; + } + } + + // Tests TODO: + // GET nonexisting location (in_memory/file) + // DELETE nonexisting location + // PUT overwriting +} diff --git a/components/object_store/src/path/file.rs b/components/object_store/src/path/file.rs new file mode 100644 index 0000000000..acdae35f69 --- /dev/null +++ b/components/object_store/src/path/file.rs @@ -0,0 +1,518 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::{ + mem, + path::{is_separator, PathBuf}, +}; + +use crate::path::{parsed::DirsAndFileName, parts::PathPart, ObjectStorePath}; + +/// An object storage location suitable for passing to disk based object +/// storage. +#[derive(Debug, Clone, Default, PartialEq, Eq, Ord, PartialOrd)] +pub struct FilePath { + inner: FilePathRepresentation, +} + +impl ObjectStorePath for FilePath { + fn set_file_name(&mut self, part: impl Into) { + self.inner = mem::take(&mut self.inner).set_file_name(part); + } + + fn push_dir(&mut self, part: impl Into) { + self.inner = mem::take(&mut self.inner).push_dir(part); + } + + fn push_all_dirs<'a>(&mut self, parts: impl AsRef<[&'a str]>) { + self.inner = mem::take(&mut self.inner).push_all_dirs(parts); + } + + fn display(&self) -> String { + self.to_raw().display().to_string() + } +} + +impl FilePath { + /// Creates a file storage location from a `PathBuf` without parsing or + /// allocating unless other methods are called on this instance that + /// need it. + /// + /// The "nature" of path (i.e. if it is a directory or file) will be + /// guessed. So paths ending with a separator (e.g. `/foo/bar/` on + /// Linux) are treated as a directory. However for all other paths (like + /// `/foo/bar` on Linux) it is not clear if a directory or file is meant + /// w/o inspecting the underlying store. To workaround that there is the + /// `assume_directory` flag which will treat ambiguous paths as directories. + /// If set to `false`, these cases will be treated as files. + pub fn raw(path: impl Into, assume_directory: bool) -> Self { + let path = path.into(); + Self { + inner: FilePathRepresentation::Raw(path, assume_directory), + } + } + + /// Creates a filesystem `PathBuf` location by using the standard library's + /// `PathBuf` building implementation appropriate for the current + /// platform. 
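+    ///
+    /// A `Raw` path is returned unchanged; a `Parsed` path is rebuilt by
+    /// joining its directory parts and, when present, its file name.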
+ pub fn to_raw(&self) -> PathBuf { + use FilePathRepresentation::*; + + match &self.inner { + Raw(path, _) => path.to_owned(), + Parsed(dirs_and_file_name) => { + let mut path: PathBuf = dirs_and_file_name + .directories + .iter() + .map(PathPart::encoded) + .collect(); + if let Some(file_name) = &dirs_and_file_name.file_name { + path.push(file_name.encoded()); + } + path + } + } + } + + /// Add the parts of `path` to the end of this path. Notably does + /// *not* behave as `PathBuf::push` does: there is no way to replace the + /// root. If `self` has a file name, that will be removed, then the + /// directories of `path` will be appended, then any file name of `path` + /// will be assigned to `self`. + pub fn push_path(&mut self, path: &Self) { + self.inner = mem::take(&mut self.inner).push_path(path) + } + + /// Add a `PathPart` to the end of the path's directories. + pub fn push_part_as_dir(&mut self, part: &PathPart) { + self.inner = mem::take(&mut self.inner).push_part_as_dir(part); + } + + /// Whether the prefix is the start of this path or not. + pub fn prefix_matches(&self, prefix: &Self) -> bool { + self.inner.prefix_matches(&prefix.inner) + } + + /// Returns all directory and file name `PathParts` in `self` after the + /// specified `prefix`. Ignores any `file_name` part of `prefix`. + /// Returns `None` if `self` dosen't start with `prefix`. + pub fn parts_after_prefix(&self, prefix: &Self) -> Option> { + self.inner.parts_after_prefix(&prefix.inner) + } + + /// Remove this path's file name, if there is one. + pub fn unset_file_name(&mut self) { + self.inner = mem::take(&mut self.inner).unset_file_name(); + } +} + +impl From for DirsAndFileName { + fn from(file_path: FilePath) -> Self { + file_path.inner.into() + } +} + +impl From for FilePath { + fn from(dirs_and_file_name: DirsAndFileName) -> Self { + Self { + inner: FilePathRepresentation::Parsed(dirs_and_file_name), + } + } +} + +#[derive(Debug, Clone, Eq)] +enum FilePathRepresentation { + // raw: native path representation and also remember if we always assume it is a directory + // assume_directory: bool + Raw(PathBuf, bool), + Parsed(DirsAndFileName), +} + +impl Default for FilePathRepresentation { + fn default() -> Self { + Self::Parsed(DirsAndFileName::default()) + } +} + +impl PartialEq for FilePathRepresentation { + fn eq(&self, other: &Self) -> bool { + matches!(self.cmp(other), std::cmp::Ordering::Equal) + } +} +impl PartialOrd for FilePathRepresentation { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for FilePathRepresentation { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + use FilePathRepresentation::*; + match (self, other) { + (Parsed(self_parts), Parsed(other_parts)) => self_parts.cmp(other_parts), + (Parsed(self_parts), _) => { + let other_parts: DirsAndFileName = other.to_owned().into(); + self_parts.cmp(&other_parts) + } + (_, Parsed(other_parts)) => { + let self_parts: DirsAndFileName = self.to_owned().into(); + self_parts.cmp(other_parts) + } + _ => { + let self_parts: DirsAndFileName = self.to_owned().into(); + let other_parts: DirsAndFileName = other.to_owned().into(); + self_parts.cmp(&other_parts) + } + } + } +} + +impl FilePathRepresentation { + fn push_dir(self, part: impl Into) -> Self { + let mut dirs_and_file_name: DirsAndFileName = self.into(); + + dirs_and_file_name.push_dir(part); + Self::Parsed(dirs_and_file_name) + } + + fn push_all_dirs<'a>(self, parts: impl AsRef<[&'a str]>) -> Self { + let mut dirs_and_file_name: DirsAndFileName = 
self.into(); + + dirs_and_file_name.push_all_dirs(parts); + Self::Parsed(dirs_and_file_name) + } + + fn set_file_name(self, part: impl Into) -> Self { + let mut dirs_and_file_name: DirsAndFileName = self.into(); + + dirs_and_file_name.set_file_name(part); + Self::Parsed(dirs_and_file_name) + } + + fn unset_file_name(self) -> Self { + let mut dirs_and_file_name: DirsAndFileName = self.into(); + + dirs_and_file_name.unset_file_name(); + Self::Parsed(dirs_and_file_name) + } + + /// Add the parts of `path` to the end of this path. Notably does + /// *not* behave as `PathBuf::push` does: there is no way to replace the + /// root. If `self` has a file name, that will be removed, then the + /// directories of `path` will be appended, then any file name of `path` + /// will be assigned to `self`. + fn push_path(self, path: &FilePath) -> Self { + let DirsAndFileName { + directories: path_dirs, + file_name: path_file_name, + } = path.inner.to_owned().into(); + let mut dirs_and_file_name: DirsAndFileName = self.into(); + + dirs_and_file_name.directories.extend(path_dirs); + dirs_and_file_name.file_name = path_file_name; + + Self::Parsed(dirs_and_file_name) + } + + /// Add a `PathPart` to the end of the path's directories. + fn push_part_as_dir(self, part: &PathPart) -> Self { + let mut dirs_and_file_name: DirsAndFileName = self.into(); + + dirs_and_file_name.push_part_as_dir(part); + + Self::Parsed(dirs_and_file_name) + } + + fn prefix_matches(&self, prefix: &Self) -> bool { + use FilePathRepresentation::*; + match (self, prefix) { + (Parsed(self_parts), Parsed(prefix_parts)) => self_parts.prefix_matches(prefix_parts), + (Parsed(self_parts), _) => { + let prefix_parts: DirsAndFileName = prefix.to_owned().into(); + self_parts.prefix_matches(&prefix_parts) + } + (_, Parsed(prefix_parts)) => { + let self_parts: DirsAndFileName = self.to_owned().into(); + self_parts.prefix_matches(prefix_parts) + } + _ => { + let self_parts: DirsAndFileName = self.to_owned().into(); + let prefix_parts: DirsAndFileName = prefix.to_owned().into(); + self_parts.prefix_matches(&prefix_parts) + } + } + } + + /// Returns all directory and file name `PathParts` in `self` after the + /// specified `prefix`. Ignores any `file_name` part of `prefix`. + /// Returns `None` if `self` dosen't start with `prefix`. 
+ fn parts_after_prefix(&self, prefix: &Self) -> Option> { + use FilePathRepresentation::*; + match (self, prefix) { + (Parsed(self_parts), Parsed(prefix_parts)) => { + self_parts.parts_after_prefix(prefix_parts) + } + (Parsed(self_parts), _) => { + let prefix_parts: DirsAndFileName = prefix.to_owned().into(); + self_parts.parts_after_prefix(&prefix_parts) + } + (_, Parsed(prefix_parts)) => { + let self_parts: DirsAndFileName = self.to_owned().into(); + self_parts.parts_after_prefix(prefix_parts) + } + _ => { + let self_parts: DirsAndFileName = self.to_owned().into(); + let prefix_parts: DirsAndFileName = prefix.to_owned().into(); + self_parts.parts_after_prefix(&prefix_parts) + } + } + } +} + +impl From for DirsAndFileName { + fn from(file_path_rep: FilePathRepresentation) -> Self { + use FilePathRepresentation::*; + + match file_path_rep { + Raw(path, assume_directory) => { + let mut parts: Vec = path + .iter() + .flat_map(|s| s.to_os_string().into_string().map(PathPart)) + .collect(); + + if !assume_directory && !parts.is_empty() && !is_directory(&path) { + let file_name = Some(parts.pop().expect("cannot be empty")); + Self { + directories: parts, + file_name, + } + } else { + Self { + directories: parts, + file_name: None, + } + } + } + Parsed(dirs_and_file_name) => dirs_and_file_name, + } + } +} + +/// Checks if the path is for sure a directory (i.e. ends with a separator). +fn is_directory(path: &std::path::Path) -> bool { + if let Some(s) = path.to_str() { + if let Some(c) = s.chars().last() { + return is_separator(c); + } + } + false +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::parsed_path; + + #[test] + fn path_buf_to_dirs_and_file_name_conversion() { + // Last section ending in `.json` is a file name + let path_buf: PathBuf = "/one/two/blah.json".into(); + let file_path = FilePath::raw(path_buf, false); + let parts: DirsAndFileName = file_path.into(); + let mut expected_parts = parsed_path!(["/", "one", "two"], "blah.json"); + expected_parts.directories[0] = PathPart("/".to_string()); // not escaped + assert_eq!(parts, expected_parts); + + // Last section ending in `.segment` is a file name + let path_buf: PathBuf = "/one/two/blah.segment".into(); + let file_path = FilePath::raw(path_buf, false); + let parts: DirsAndFileName = file_path.into(); + let mut expected_parts = parsed_path!(["/", "one", "two"], "blah.segment"); + expected_parts.directories[0] = PathPart("/".to_string()); // not escaped + assert_eq!(parts, expected_parts); + + // Last section ending in `.parquet` is a file name + let path_buf: PathBuf = "/one/two/blah.parquet".into(); + let file_path = FilePath::raw(path_buf, false); + let parts: DirsAndFileName = file_path.into(); + let mut expected_parts = parsed_path!(["/", "one", "two"], "blah.parquet"); + expected_parts.directories[0] = PathPart("/".to_string()); // not escaped + assert_eq!(parts, expected_parts); + + // Last section ending in `.txt` is NOT a file name; we don't recognize that + // extension + let path_buf: PathBuf = "/one/two/blah.txt".into(); + let file_path = FilePath::raw(path_buf, false); + let parts: DirsAndFileName = file_path.into(); + let mut expected_parts = parsed_path!(["/", "one", "two"], "blah.txt"); + expected_parts.directories[0] = PathPart("/".to_string()); // not escaped + assert_eq!(parts, expected_parts); + + // Last section containing a `.` isn't a file name + let path_buf: PathBuf = "/one/two/blah.blah".into(); + let file_path = FilePath::raw(path_buf, false); + let parts: DirsAndFileName = file_path.into(); + 
let mut expected_parts = parsed_path!(["/", "one", "two"], "blah.blah"); + expected_parts.directories[0] = PathPart("/".to_string()); // not escaped + assert_eq!(parts, expected_parts); + + // Last section starting with a `.` isn't a file name (macos temp dirs do this) + let path_buf: PathBuf = "/one/two/.blah".into(); + let file_path = FilePath::raw(path_buf, false); + let parts: DirsAndFileName = file_path.into(); + let mut expected_parts = parsed_path!(["/", "one", "two"], ".blah"); + expected_parts.directories[0] = PathPart("/".to_string()); // not escaped + assert_eq!(parts, expected_parts); + + let path_buf: PathBuf = "/a/b/d".into(); + let file_path = FilePath::raw(path_buf, false); + let parts: DirsAndFileName = file_path.into(); + let mut expected_parts = parsed_path!(["/", "a", "b"], "d"); + expected_parts.directories[0] = PathPart("/".to_string()); // not escaped + assert_eq!(parts, expected_parts); + + let path_buf: PathBuf = "/a/b/c".into(); + let file_path = FilePath::raw(path_buf, true); + let parts: DirsAndFileName = file_path.into(); + let mut expected_parts = parsed_path!(["/", "a", "b", "c"]); + expected_parts.directories[0] = PathPart("/".to_string()); // not escaped + assert_eq!(parts, expected_parts); + } + + #[test] + fn conversions() { + // dir and file name + let path_buf: PathBuf = "foo/bar/blah.json".into(); + let file_path = FilePath::raw(path_buf, false); + let parts: DirsAndFileName = file_path.into(); + + let expected_parts = parsed_path!(["foo", "bar"], "blah.json"); + assert_eq!(parts, expected_parts); + + // dir, no file name + let path_buf: PathBuf = "foo/bar/".into(); + let file_path = FilePath::raw(path_buf, false); + let parts: DirsAndFileName = file_path.into(); + + let expected_parts = parsed_path!(["foo", "bar"]); + assert_eq!(parts, expected_parts); + + // same but w/o the final marker + let path_buf: PathBuf = "foo/bar".into(); + let file_path = FilePath::raw(path_buf, false); + let parts: DirsAndFileName = file_path.into(); + + let expected_parts = parsed_path!(["foo"], "bar"); + assert_eq!(parts, expected_parts); + + // same but w/o the final marker, but forced to be a directory + let path_buf: PathBuf = "foo/bar".into(); + let file_path = FilePath::raw(path_buf, true); + let parts: DirsAndFileName = file_path.into(); + + let expected_parts = parsed_path!(["foo", "bar"]); + assert_eq!(parts, expected_parts); + + // no dir, file name + let path_buf: PathBuf = "blah.json".into(); + let file_path = FilePath::raw(path_buf, false); + let parts: DirsAndFileName = file_path.into(); + + let expected_parts = parsed_path!([], "blah.json"); + assert_eq!(parts, expected_parts); + + // empty + let path_buf: PathBuf = "".into(); + let file_path = FilePath::raw(path_buf, false); + let parts: DirsAndFileName = file_path.into(); + + let expected_parts = parsed_path!(); + assert_eq!(parts, expected_parts); + + // weird file name + let path_buf: PathBuf = "blah.x".into(); + let file_path = FilePath::raw(path_buf, false); + let parts: DirsAndFileName = file_path.into(); + + let expected_parts = parsed_path!("blah.x"); + assert_eq!(parts, expected_parts); + } + + #[test] + fn equality() { + let path_buf: PathBuf = "foo/bar/blah.json".into(); + let file_path = FilePath::raw(path_buf, false); + let parts: DirsAndFileName = file_path.clone().into(); + let parsed: FilePath = parts.into(); + + assert_eq!(file_path, parsed); + } + + #[test] + fn ordering() { + let a_path_buf: PathBuf = "foo/bar/a.json".into(); + let a_file_path = FilePath::raw(&a_path_buf, false); + let 
a_parts: DirsAndFileName = a_file_path.into(); + let a_parsed: FilePath = a_parts.into(); + + let b_path_buf: PathBuf = "foo/bar/b.json".into(); + let b_file_path = FilePath::raw(&b_path_buf, false); + + assert!(a_path_buf < b_path_buf); + assert!( + a_parsed < b_file_path, + "a was not less than b: a = {:#?}\nb = {:#?}", + a_parsed, + b_file_path + ); + } + + #[test] + fn path_display() { + let a_path_buf: PathBuf = "foo/bar/a.json".into(); + let expected_display = a_path_buf.display().to_string(); + let a_file_path = FilePath::raw(&a_path_buf, false); + + assert_eq!(a_file_path.display(), expected_display); + + let a_parts: DirsAndFileName = a_file_path.into(); + let a_parsed: FilePath = a_parts.into(); + + assert_eq!(a_parsed.display(), expected_display); + } + + #[test] + fn test_file_path_represent_ord() { + let file1 = FilePathRepresentation::Raw(PathBuf::from("/aa/bb"), false); + let file1_bak = FilePathRepresentation::Raw(PathBuf::from("/aa/bb"), false); + let file2 = FilePathRepresentation::Raw(PathBuf::from("/zz/aa/bb"), false); + + assert!(file1 == file1_bak); + assert!(file1 < file2) + } + + #[test] + fn test_file_path_parts_after_prefix() { + let file = FilePath::raw("/a/b/c", false); + let file2 = FilePath::raw("/a/b", true); + let ret = file.parts_after_prefix(&file2); + assert_eq!(ret, Some(vec![PathPart("c".to_string())])); + + let file = FilePath::raw("/a/b/c", false); + let file2 = FilePath::raw("/a/b", false); + let ret = file.parts_after_prefix(&file2); + assert_eq!( + ret, + Some(vec![PathPart("b".to_string()), PathPart("c".to_string())]) + ); + + let file = FilePath::raw("/a/b/d", false); + let file2 = FilePath::raw("/a/b/c/dd", true); + let ret = file.parts_after_prefix(&file2); + assert_eq!(ret, None); + + let file = FilePath::raw("/a/b/d", true); + let file2 = FilePath::raw("/a/b/c", true); + let ret = file.parts_after_prefix(&file2); + assert_eq!(ret, None); + } +} diff --git a/components/object_store/src/path/mod.rs b/components/object_store/src/path/mod.rs new file mode 100644 index 0000000000..e5922d6df8 --- /dev/null +++ b/components/object_store/src/path/mod.rs @@ -0,0 +1,35 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! This module contains code for abstracting object locations that work +//! across different backing implementations and platforms. + +pub mod file; +pub mod parsed; +pub mod parts; + +/// The delimiter to separate object namespaces, creating a directory structure. +pub const DELIMITER: &str = "/"; + +/// Universal interface for handling paths and locations for objects and +/// directories in the object store. +/// +/// +/// Deliberately does not implement `Display` or `ToString`! +pub trait ObjectStorePath: + std::fmt::Debug + Clone + PartialEq + Eq + Send + Sync + 'static +{ + /// Set the file name of this path + fn set_file_name(&mut self, part: impl Into); + + /// Add a part to the end of the path's directories, encoding any restricted + /// characters. + fn push_dir(&mut self, part: impl Into); + + /// Push a bunch of parts as directories in one go. + fn push_all_dirs<'a>(&mut self, parts: impl AsRef<[&'a str]>); + + /// Like `std::path::Path::display, converts an `ObjectStorePath` to a + /// `String` suitable for printing; not suitable for sending to + /// APIs. 
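+ /// For example, a path with directories `foo` and `bar` and file name
+ /// `blah.json` displays as `foo/bar/blah.json`.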
+ fn display(&self) -> String; +} diff --git a/components/object_store/src/path/parsed.rs b/components/object_store/src/path/parsed.rs new file mode 100644 index 0000000000..0c9781a9b6 --- /dev/null +++ b/components/object_store/src/path/parsed.rs @@ -0,0 +1,389 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use itertools::Itertools; + +use crate::path::{parts::PathPart, ObjectStorePath, DELIMITER}; + +/// A path stored as a collection of 0 or more directories and 0 or 1 file name +#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Default, Hash)] +pub struct DirsAndFileName { + /// Directory hierarchy. + pub directories: Vec, + + /// Filename, if set. + pub file_name: Option, +} + +impl ObjectStorePath for DirsAndFileName { + fn set_file_name(&mut self, part: impl Into) { + let part = part.into(); + self.file_name = Some((&*part).into()); + } + + fn push_dir(&mut self, part: impl Into) { + let part = part.into(); + self.directories.push((&*part).into()); + } + + fn push_all_dirs<'a>(&mut self, parts: impl AsRef<[&'a str]>) { + self.directories + .extend(parts.as_ref().iter().map(|&v| v.into())); + } + + fn display(&self) -> String { + let mut s = self + .directories + .iter() + .map(PathPart::encoded) + .join(DELIMITER); + + if !s.is_empty() { + s.push_str(DELIMITER); + } + if let Some(file_name) = &self.file_name { + s.push_str(file_name.encoded()); + } + s + } +} + +impl DirsAndFileName { + pub(crate) fn prefix_matches(&self, prefix: &Self) -> bool { + let diff = itertools::diff_with( + self.directories.iter(), + prefix.directories.iter(), + |a, b| a == b, + ); + + use itertools::Diff; + match diff { + None => match (self.file_name.as_ref(), prefix.file_name.as_ref()) { + (Some(self_file), Some(prefix_file)) => { + self_file.encoded().starts_with(prefix_file.encoded()) + } + (Some(_self_file), None) => true, + (None, Some(_prefix_file)) => false, + (None, None) => true, + }, + Some(Diff::Shorter(_, mut remaining_self)) => { + let next_dir = remaining_self + .next() + .expect("must have at least one mismatch to be in this case"); + match prefix.file_name.as_ref() { + Some(prefix_file) => next_dir.encoded().starts_with(prefix_file.encoded()), + None => true, + } + } + Some(Diff::FirstMismatch(_, mut remaining_self, mut remaining_prefix)) => { + let first_prefix = remaining_prefix + .next() + .expect("must have at least one mismatch to be in this case"); + + // There must not be any other remaining parts in the prefix + remaining_prefix.next().is_none() + // and the next item in self must start with the last item in the prefix + && remaining_self + .next() + .expect("must be at least one value") + .encoded() + .starts_with(first_prefix.encoded()) + } + _ => false, + } + } + + /// Returns all directory and file name `PathParts` in `self` after the + /// specified `prefix`. Ignores any `file_name` part of `prefix`. + /// Returns `None` if `self` dosen't start with `prefix`. 
+ pub(crate) fn parts_after_prefix(&self, prefix: &Self) -> Option> { + if self.directories.len() < prefix.directories.len() { + return None; + } + + let mut dirs_iter = self.directories.iter(); + let mut prefix_dirs_iter = prefix.directories.iter(); + + let mut parts = vec![]; + + for dir in &mut dirs_iter { + let pre = prefix_dirs_iter.next(); + + match pre { + None => { + parts.push(dir.to_owned()); + break; + } + Some(p) if p == dir => continue, + Some(_) => return None, + } + } + + parts.extend(dirs_iter.cloned()); + + if let Some(file_name) = &self.file_name { + parts.push(file_name.to_owned()); + } + + Some(parts) + } + + /// Add a `PathPart` to the end of the path's directories. + pub(crate) fn push_part_as_dir(&mut self, part: &PathPart) { + self.directories.push(part.to_owned()); + } + + /// Remove the file name, if any. + pub(crate) fn unset_file_name(&mut self) { + self.file_name = None; + } +} + +/// Short-cut macro to create [`DirsAndFileName`] instances. +/// +/// # Example +/// ``` +/// use object_store::parsed_path; +/// +/// // empty path +/// parsed_path!(); +/// +/// // filename only +/// parsed_path!("test.txt"); +/// +/// // directories only +/// parsed_path!(["path", "to"]); +/// +/// // filename + directories +/// parsed_path!(["path", "to"], "test.txt"); +/// ``` +#[macro_export] +macro_rules! parsed_path { + ([$($dir:expr),*], $file:expr) => { + $crate::path::parsed::DirsAndFileName { + directories: vec![$($crate::path::parts::PathPart::from($dir)),*], + file_name: Some($crate::path::parts::PathPart::from($file)), + } + }; + ([$($dir:expr),*]) => { + $crate::path::parsed::DirsAndFileName { + directories: vec![$($crate::path::parts::PathPart::from($dir)),*], + file_name: None, + } + }; + ($file:expr) => { + parsed_path!([], $file) + }; + () => { + parsed_path!([]) + }; +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parts_after_prefix_behavior() { + let mut existing_path = DirsAndFileName::default(); + existing_path.push_all_dirs(&["apple", "bear", "cow", "dog"]); + existing_path.file_name = Some("egg.json".into()); + + // Prefix with one directory + let mut prefix = DirsAndFileName::default(); + prefix.push_dir("apple"); + let expected_parts: Vec = vec!["bear", "cow", "dog", "egg.json"] + .into_iter() + .map(Into::into) + .collect(); + let parts = existing_path.parts_after_prefix(&prefix).unwrap(); + assert_eq!(parts, expected_parts); + + // Prefix with two directories + let mut prefix = DirsAndFileName::default(); + prefix.push_all_dirs(&["apple", "bear"]); + let expected_parts: Vec = vec!["cow", "dog", "egg.json"] + .into_iter() + .map(Into::into) + .collect(); + let parts = existing_path.parts_after_prefix(&prefix).unwrap(); + assert_eq!(parts, expected_parts); + + // Not a prefix + let mut prefix = DirsAndFileName::default(); + prefix.push_dir("cow"); + assert!(existing_path.parts_after_prefix(&prefix).is_none()); + + // Prefix with a partial directory + let mut prefix = DirsAndFileName::default(); + prefix.push_dir("ap"); + assert!(existing_path.parts_after_prefix(&prefix).is_none()); + + // Prefix matches but there aren't any parts after it + let mut existing_path = DirsAndFileName::default(); + existing_path.push_all_dirs(&["apple", "bear", "cow", "dog"]); + let prefix = existing_path.clone(); + let parts = existing_path.parts_after_prefix(&prefix).unwrap(); + assert!(parts.is_empty()); + } + + #[test] + fn prefix_matches() { + let mut haystack = DirsAndFileName::default(); + haystack.push_all_dirs(&["foo/bar", "baz%2Ftest", "something"]); 
+ + // self starts with self + assert!( + haystack.prefix_matches(&haystack), + "{:?} should have started with {:?}", + haystack, + haystack + ); + + // a longer prefix doesn't match + let mut needle = haystack.clone(); + needle.push_dir("longer now"); + assert!( + !haystack.prefix_matches(&needle), + "{:?} shouldn't have started with {:?}", + haystack, + needle + ); + + // one dir prefix matches + let mut needle = DirsAndFileName::default(); + needle.push_dir("foo/bar"); + assert!( + haystack.prefix_matches(&needle), + "{:?} should have started with {:?}", + haystack, + needle + ); + + // two dir prefix matches + needle.push_dir("baz%2Ftest"); + assert!( + haystack.prefix_matches(&needle), + "{:?} should have started with {:?}", + haystack, + needle + ); + + // partial dir prefix matches + let mut needle = DirsAndFileName::default(); + needle.push_dir("f"); + assert!( + haystack.prefix_matches(&needle), + "{:?} should have started with {:?}", + haystack, + needle + ); + + // one dir and one partial dir matches + let mut needle = DirsAndFileName::default(); + needle.push_all_dirs(&["foo/bar", "baz"]); + assert!( + haystack.prefix_matches(&needle), + "{:?} should have started with {:?}", + haystack, + needle + ); + } + + #[test] + fn prefix_matches_with_file_name() { + let mut haystack = DirsAndFileName::default(); + haystack.push_all_dirs(&["foo/bar", "baz%2Ftest", "something"]); + + let mut needle = haystack.clone(); + + // All directories match and file name is a prefix + haystack.set_file_name("foo.segment"); + needle.set_file_name("foo"); + + assert!( + haystack.prefix_matches(&needle), + "{:?} should have started with {:?}", + haystack, + needle + ); + + // All directories match but file name is not a prefix + needle.set_file_name("e"); + + assert!( + !haystack.prefix_matches(&needle), + "{:?} should not have started with {:?}", + haystack, + needle + ); + + // Not all directories match; file name is a prefix of the next directory; this + // matches + let mut needle = DirsAndFileName::default(); + needle.push_all_dirs(&["foo/bar", "baz%2Ftest"]); + needle.set_file_name("s"); + + assert!( + haystack.prefix_matches(&needle), + "{:?} should have started with {:?}", + haystack, + needle + ); + + // Not all directories match; file name is NOT a prefix of the next directory; + // no match + needle.set_file_name("p"); + + assert!( + !haystack.prefix_matches(&needle), + "{:?} should not have started with {:?}", + haystack, + needle + ); + } + + #[test] + fn test_macro() { + let actual = parsed_path!(["foo", "bar"], "baz"); + let expected = DirsAndFileName { + directories: vec![PathPart::from("foo"), PathPart::from("bar")], + file_name: Some(PathPart::from("baz")), + }; + assert_eq!(actual, expected); + + let actual = parsed_path!([], "foo"); + let expected = DirsAndFileName { + directories: vec![], + file_name: Some(PathPart::from("foo")), + }; + assert_eq!(actual, expected); + + let actual = parsed_path!("foo"); + let expected = DirsAndFileName { + directories: vec![], + file_name: Some(PathPart::from("foo")), + }; + assert_eq!(actual, expected); + + let actual = parsed_path!(["foo", "bar"]); + let expected = DirsAndFileName { + directories: vec![PathPart::from("foo"), PathPart::from("bar")], + file_name: None, + }; + assert_eq!(actual, expected); + + let actual = parsed_path!([]); + let expected = DirsAndFileName { + directories: vec![], + file_name: None, + }; + assert_eq!(actual, expected); + + let actual = parsed_path!(); + let expected = DirsAndFileName { + directories: vec![], + 
file_name: None, + }; + assert_eq!(actual, expected); + } +} diff --git a/components/object_store/src/path/parts.rs b/components/object_store/src/path/parts.rs new file mode 100644 index 0000000000..b9e69becfb --- /dev/null +++ b/components/object_store/src/path/parts.rs @@ -0,0 +1,142 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use percent_encoding::{percent_decode_str, percent_encode, AsciiSet, CONTROLS}; + +use super::DELIMITER; + +// percent_encode's API needs this as a byte +const DELIMITER_BYTE: u8 = DELIMITER.as_bytes()[0]; + +// special encoding of the empty string part. +// Using '%' is the safest character since it will always be used in the +// output of percent_encode no matter how we evolve the INVALID AsciiSet over +// time. +const EMPTY: &str = "%"; + +/// The PathPart type exists to validate the directory/file names that form part +/// of a path. +/// +/// A PathPart instance is guaranteed to be non-empty and to contain no `/` +/// characters as it can only be constructed by going through the `from` impl. +#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Default, Hash)] +pub struct PathPart(pub(super) String); + +/// Characters we want to encode. +const INVALID: &AsciiSet = &CONTROLS + // The delimiter we are reserving for internal hierarchy + .add(DELIMITER_BYTE) + // Characters AWS recommends avoiding for object keys + // https://docs.aws.amazon.com/AmazonS3/latest/dev/UsingMetadata.html + .add(b'\\') + .add(b'{') + // TODO: Non-printable ASCII characters (128–255 decimal characters) + .add(b'^') + .add(b'}') + .add(b'%') + .add(b'`') + .add(b']') + .add(b'"') // " <-- my editor is confused about double quotes within single quotes + .add(b'>') + .add(b'[') + .add(b'~') + .add(b'<') + .add(b'#') + .add(b'|') + // Characters Google Cloud Storage recommends avoiding for object names + // https://cloud.google.com/storage/docs/naming-objects + .add(b'\r') + .add(b'\n') + .add(b'*') + .add(b'?'); + +impl From<&str> for PathPart { + fn from(v: &str) -> Self { + match v { + // We don't want to encode `.` generally, but we do want to disallow parts of paths + // to be equal to `.` or `..` to prevent file system traversal shenanigans. + "." => Self(String::from("%2E")), + ".." => Self(String::from("%2E%2E")), + + // Every string except the empty string will be percent encoded. + // The empty string will be transformed into a sentinel value EMPTY + // which can safely be a prefix of an encoded value since it will be + // fully matched at decode time (see impl Display for PathPart). + "" => Self(String::from(EMPTY)), + other => Self(percent_encode(other.as_bytes(), INVALID).to_string()), + } + } +} + +impl std::fmt::Display for PathPart { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match &self.0[..] { + EMPTY => "".fmt(f), + _ => percent_decode_str(&self.0) + .decode_utf8() + .expect("Valid UTF-8 that came from String") + .fmt(f), + } + } +} + +impl PathPart { + /// Encode as string. 
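+ /// For example, a part built from `foo/bar` is stored and returned as
+ /// `foo%2Fbar`.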
+ pub fn encoded(&self) -> &str { + &self.0 + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn path_part_delimiter_gets_encoded() { + let part: PathPart = "foo/bar".into(); + assert_eq!(part, PathPart(String::from("foo%2Fbar"))); + } + + #[test] + fn path_part_gets_decoded_for_display() { + let part: PathPart = "foo/bar".into(); + assert_eq!(part.to_string(), "foo/bar"); + } + + #[test] + fn path_part_given_already_encoded_string() { + let part: PathPart = "foo%2Fbar".into(); + assert_eq!(part, PathPart(String::from("foo%252Fbar"))); + assert_eq!(part.to_string(), "foo%2Fbar"); + } + + #[test] + fn path_part_cant_be_one_dot() { + let part: PathPart = ".".into(); + assert_eq!(part, PathPart(String::from("%2E"))); + assert_eq!(part.to_string(), "."); + } + + #[test] + fn path_part_cant_be_two_dots() { + let part: PathPart = "..".into(); + assert_eq!(part, PathPart(String::from("%2E%2E"))); + assert_eq!(part.to_string(), ".."); + } + + #[test] + fn path_part_cant_be_empty() { + let part: PathPart = "".into(); + assert_eq!(part, PathPart(String::from(EMPTY))); + assert_eq!(part.to_string(), ""); + } + + #[test] + fn empty_is_safely_encoded() { + let part: PathPart = EMPTY.into(); + assert_eq!( + part, + PathPart(percent_encode(EMPTY.as_bytes(), INVALID).to_string()) + ); + assert_eq!(part.to_string(), EMPTY); + } +} diff --git a/components/parquet/Cargo.toml b/components/parquet/Cargo.toml new file mode 100644 index 0000000000..c33523280e --- /dev/null +++ b/components/parquet/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "parquet" +version = "0.1.0" +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +arrow_deps = { path = "../../arrow_deps" } +lru = "0.7.0" +parquet-format = "4.0.0" +thrift = "0.13" \ No newline at end of file diff --git a/components/parquet/src/cache.rs b/components/parquet/src/cache.rs new file mode 100644 index 0000000000..393d49b63e --- /dev/null +++ b/components/parquet/src/cache.rs @@ -0,0 +1,67 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
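+
+//! A minimal usage sketch (the capacity of 1000 and the file-name key mirror the
+//! tests in `serialized_reader.rs` and are illustrative): the LRU caches below are
+//! shared behind `Arc` and handed to `CachableSerializedFileReader`, which consults
+//! the meta cache when parsing the footer and the data cache when loading column
+//! chunks.
+//!
+//! ```rust,ignore
+//! use std::sync::Arc;
+//!
+//! use parquet::{
+//!     cache::{LruDataCache, LruMetaCache},
+//!     CachableSerializedFileReader, DataCacheRef, MetaCacheRef,
+//! };
+//!
+//! let meta_cache: Option<MetaCacheRef> = Some(Arc::new(LruMetaCache::new(1000)));
+//! let data_cache: Option<DataCacheRef> = Some(Arc::new(LruDataCache::new(1000)));
+//! let file = std::fs::File::open("example.parquet").unwrap();
+//! // The name doubles as the cache key prefix for this file's column chunks.
+//! let reader = CachableSerializedFileReader::new(
+//!     "example.parquet".to_string(),
+//!     file,
+//!     meta_cache,
+//!     data_cache,
+//! ).unwrap();
+//! ```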
+ +use std::{ + fmt::Debug, + sync::{Arc, RwLock}, +}; + +use arrow_deps::parquet::file::metadata::ParquetMetaData; +use lru::LruCache; + +pub trait MetaCache: Debug { + fn get(&self, key: &str) -> Option>; + + fn put(&self, key: String, value: Arc); +} + +pub trait DataCache: Debug { + fn get(&self, key: &str) -> Option>>; + + fn put(&self, key: String, value: Arc>); +} + +#[derive(Debug)] +pub struct LruMetaCache { + cache: RwLock>>, +} + +impl LruMetaCache { + pub fn new(cap: usize) -> Self { + Self { + cache: RwLock::new(LruCache::new(cap)), + } + } +} + +impl MetaCache for LruMetaCache { + fn get(&self, key: &str) -> Option> { + self.cache.write().unwrap().get(key).cloned() + } + + fn put(&self, key: String, value: Arc) { + self.cache.write().unwrap().put(key, value); + } +} + +#[derive(Debug)] +pub struct LruDataCache { + cache: RwLock>>>, +} + +impl LruDataCache { + pub fn new(cap: usize) -> Self { + Self { + cache: RwLock::new(LruCache::new(cap)), + } + } +} + +impl DataCache for LruDataCache { + fn get(&self, key: &str) -> Option>> { + self.cache.write().unwrap().get(key).cloned() + } + + fn put(&self, key: String, value: Arc>) { + self.cache.write().unwrap().put(key, value); + } +} diff --git a/components/parquet/src/lib.rs b/components/parquet/src/lib.rs new file mode 100644 index 0000000000..b2b8d28c46 --- /dev/null +++ b/components/parquet/src/lib.rs @@ -0,0 +1,17 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +pub mod cache; +pub mod reverse_reader; +mod serialized_reader; +#[cfg(test)] +pub mod tests; + +// use cache::Cache; +use std::sync::Arc; + +pub use serialized_reader::CachableSerializedFileReader; + +use crate::cache::{DataCache, MetaCache}; + +pub type MetaCacheRef = Arc; +pub type DataCacheRef = Arc; diff --git a/components/parquet/src/reverse_reader.rs b/components/parquet/src/reverse_reader.rs new file mode 100644 index 0000000000..ca201c3bea --- /dev/null +++ b/components/parquet/src/reverse_reader.rs @@ -0,0 +1,231 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::{collections::VecDeque, sync::Arc}; + +use arrow_deps::{ + arrow::{ + datatypes::SchemaRef, + error::Result as ArrowResult, + record_batch::{RecordBatch, RecordBatchReader}, + }, + parquet::{ + arrow::{ + self, arrow_reader::ParquetRecordBatchReader, ArrowReader, ParquetFileArrowReader, + }, + errors::Result, + file::{ + metadata::{FileMetaData, ParquetMetaData}, + reader::{FileReader, RowGroupReader}, + }, + record::reader::RowIter, + schema::types::Type as SchemaType, + }, +}; + +/// The reverse reader for [FileReader]. +/// +/// The details of implementation is: +/// - Split the original [FileReader] into [RowGroup]s. +/// - Reverse all the [RowGroup]s into `reversed_readers` so the order of +/// [RowGroup] is already reversed. +/// - Reverse all the [RecordBatch]es of the [RowGroup] into the +/// `current_reversed_batches`. +/// - Pop one [RecordBatch] from the `current_reversed_batches`and reverse its +/// data and send it to caller. +pub struct ReversedFileReader { + schema: SchemaRef, + /// The readers are arranged in reversed order and built from the + /// [RowGroup]. + reversed_readers: Vec, + /// Buffer all the record batches of one reader and every record batch is + /// reversed. 
+ current_reversed_batches: VecDeque>, + next_reader_idx: usize, +} + +impl ReversedFileReader { + fn fetch_next_batches_if_necessary(&mut self) { + if !self.current_reversed_batches.is_empty() { + // current reader is not exhausted and no need to fetch data. + return; + } + + if self.next_reader_idx >= self.reversed_readers.len() { + // all the readers have been exhausted. + return; + } + + let reader = &mut self.reversed_readers[self.next_reader_idx]; + for batch in reader { + // reverse the order of the data of every record batch. + let reversed_batch = match batch { + Ok(v) => arrow_deps::util::reverse_record_batch(&v), + Err(e) => Err(e), + }; + // reverse the order of the record batches. + self.current_reversed_batches.push_front(reversed_batch); + } + + self.next_reader_idx += 1; + } +} + +impl Iterator for ReversedFileReader { + type Item = ArrowResult; + + fn next(&mut self) -> Option { + self.fetch_next_batches_if_necessary(); + self.current_reversed_batches.pop_front() + } +} + +impl RecordBatchReader for ReversedFileReader { + fn schema(&self) -> SchemaRef { + self.schema.clone() + } +} + +/// Reader for one [RowGroup] of the [FileReader]. +struct SingleRowGroupFileReader { + file_reader: Arc, + /// The index of row group in `file_reader` to read. + row_group_idx: usize, + /// The meta data for the reader of the one row group. + meta_data: ParquetMetaData, +} + +impl SingleRowGroupFileReader { + fn new(file_reader: Arc, row_group_idx: usize) -> Self { + let meta_data = { + let orig_meta_data = file_reader.metadata(); + let orig_file_meta_data = orig_meta_data.file_metadata(); + let row_group_meta_data = orig_meta_data.row_group(row_group_idx); + let file_meta_data = FileMetaData::new( + orig_file_meta_data.version(), + // provide the row group's row number because of the reader only contains one row + // group. + row_group_meta_data.num_rows(), + orig_file_meta_data.created_by().clone(), + orig_file_meta_data.key_value_metadata().clone(), + orig_file_meta_data.schema_descr_ptr(), + orig_file_meta_data.column_orders().cloned(), + ); + ParquetMetaData::new(file_meta_data, vec![row_group_meta_data.clone()]) + }; + + Self { + file_reader, + row_group_idx, + meta_data, + } + } +} + +impl FileReader for SingleRowGroupFileReader { + fn metadata(&self) -> &ParquetMetaData { + &self.meta_data + } + + fn num_row_groups(&self) -> usize { + 1 + } + + fn get_row_group(&self, i: usize) -> Result> { + self.file_reader.get_row_group(self.row_group_idx + i) + } + + fn get_row_iter(&self, projection: Option) -> Result { + RowIter::from_file(projection, self) + } +} + +/// Builder for [ReverseRecordBatchReader] from the `file_reader`. +#[must_use] +pub struct Builder { + file_reader: Arc, + batch_size: usize, + projection: Option>, +} + +impl Builder { + pub fn new(file_reader: Arc, batch_size: usize) -> Self { + Self { + file_reader, + batch_size, + projection: None, + } + } + + pub fn projection(mut self, projection: Option>) -> Self { + self.projection = projection; + + self + } + + pub fn build(self) -> Result { + let mut reversed_readers = Vec::with_capacity(self.file_reader.num_row_groups()); + for row_group_idx in (0..self.file_reader.num_row_groups()).rev() { + let row_group_file_reader = + SingleRowGroupFileReader::new(self.file_reader.clone(), row_group_idx); + let mut arrow_reader = ParquetFileArrowReader::new(Arc::new(row_group_file_reader)); + let batch_reader = if let Some(proj) = &self.projection { + arrow_reader.get_record_reader_by_columns(proj.iter().cloned(), self.batch_size)? 
+ } else { + arrow_reader.get_record_reader(self.batch_size)? + }; + reversed_readers.push(batch_reader); + } + + let schema = { + let file_metadata = self.file_reader.metadata().file_metadata(); + Arc::new(arrow::parquet_to_arrow_schema( + file_metadata.schema_descr(), + file_metadata.key_value_metadata(), + )?) + }; + + Ok(ReversedFileReader { + schema, + reversed_readers, + current_reversed_batches: VecDeque::new(), + next_reader_idx: 0, + }) + } +} + +#[cfg(test)] +mod tests { + use arrow_deps::parquet::file::reader::SerializedFileReader; + + use super::*; + + const TEST_FILE: &str = "binary.parquet"; + const TEST_BATCH_SIZE: usize = 1000; + + fn check_reversed_row_iter(original: RowIter, reversed: ReversedFileReader) { + let mut original_reversed_rows: Vec<_> = original.into_iter().collect(); + original_reversed_rows.reverse(); + + let reversed_record_batches: Vec<_> = reversed + .into_iter() + .map(|v| v.expect("Fail to fetch record batch")) + .collect(); + + crate::tests::check_rows_and_record_batches( + &original_reversed_rows, + &reversed_record_batches, + ); + } + + #[test] + fn test_reverse_file_reader() { + let test_file = crate::tests::get_test_file(TEST_FILE); + let file_reader: Arc = Arc::new( + SerializedFileReader::new(test_file).expect("Should succeed to init file reader"), + ); + let reversed_reader = Builder::new(file_reader.clone(), TEST_BATCH_SIZE) + .build() + .expect("Should succeed to build reversed file reader"); + check_reversed_row_iter(file_reader.get_row_iter(None).unwrap(), reversed_reader); + } +} diff --git a/components/parquet/src/serialized_reader.rs b/components/parquet/src/serialized_reader.rs new file mode 100644 index 0000000000..a79c13ed07 --- /dev/null +++ b/components/parquet/src/serialized_reader.rs @@ -0,0 +1,738 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! fork from https://github.com/apache/arrow-rs/blob/5.2.0/parquet/src/file/serialized_reader.rs + +//! Contains implementations of the reader traits FileReader, RowGroupReader and +//! PageReader Also contains implementations of the ChunkReader for files (with +//! 
buffering) and byte arrays (RAM) + +use std::{fs::File, io::Read, option::Option::Some, sync::Arc}; + +use arrow_deps::parquet::{ + basic::{Compression, Encoding, Type}, + column::page::{Page, PageReader}, + compression::{create_codec, Codec}, + errors::{ParquetError, Result}, + file::{footer, metadata::*, reader::*, statistics}, + record::{reader::RowIter, Row}, + schema::types::Type as SchemaType, + util::{cursor::SliceableCursor, memory::ByteBufferPtr}, +}; +use parquet_format::{PageHeader, PageType}; +use thrift::protocol::TCompactInputProtocol; + +use crate::{DataCacheRef, MetaCacheRef}; + +fn format_page_data_key(name: &str, col_start: u64, col_length: u64) -> String { + format!("{}_{}_{}", name, col_start, col_length) +} + +/// Conversion into a [`RowIter`](crate::record::reader::RowIter) +/// using the full file schema over all row groups. +impl IntoIterator for CachableSerializedFileReader { + type IntoIter = RowIter<'static>; + type Item = Row; + + fn into_iter(self) -> Self::IntoIter { + RowIter::from_file_into(Box::new(self)) + } +} + +// ---------------------------------------------------------------------- +// Implementations of file & row group readers + +/// A serialized with cache implementation for Parquet [`FileReader`]. +/// Two kinds of items are cacheable: +/// - [`ParquetMetaData`]: only used for creating the reader. +/// - Column chunk bytes: used for reading data by +/// [`SerializedRowGroupReader`]. +/// +/// Note: the implementation is based on the https://github.com/apache/arrow-rs/blob/5.2.0/parquet/src/file/serialized_reader.rs. +pub struct CachableSerializedFileReader { + name: String, + chunk_reader: Arc, + metadata: Arc, + data_cache: Option, +} + +impl CachableSerializedFileReader { + /// Creates file reader from a Parquet file. + /// Returns error if Parquet file does not exist or is corrupt. + pub fn new( + name: String, + chunk_reader: R, + meta_cache: Option, + data_cache: Option, + ) -> Result { + // MODIFICATION START: consider cache for meta data. + let metadata = if let Some(meta_cache) = meta_cache { + if let Some(v) = meta_cache.get(&name) { + v + } else { + let meta_data = Arc::new(footer::parse_metadata(&chunk_reader)?); + meta_cache.put(name.clone(), meta_data.clone()); + meta_data + } + } else { + Arc::new(footer::parse_metadata(&chunk_reader)?) + }; + // MODIFICATION END. + + Ok(Self { + name, + chunk_reader: Arc::new(chunk_reader), + metadata, + data_cache, + }) + } + + /// Filters row group metadata to only those row groups, + /// for which the predicate function returns true + pub fn filter_row_groups(&mut self, predicate: &dyn Fn(&RowGroupMetaData, usize) -> bool) { + let mut filtered_row_groups = Vec::::new(); + for (i, row_group_metadata) in self.metadata.row_groups().iter().enumerate() { + if predicate(row_group_metadata, i) { + filtered_row_groups.push(row_group_metadata.clone()); + } + } + self.metadata = Arc::new(ParquetMetaData::new( + self.metadata.file_metadata().clone(), + filtered_row_groups, + )); + } +} + +impl FileReader for CachableSerializedFileReader { + fn metadata(&self) -> &ParquetMetaData { + &self.metadata + } + + fn num_row_groups(&self) -> usize { + self.metadata.num_row_groups() + } + + fn get_row_group(&self, i: usize) -> Result> { + let row_group_metadata = self.metadata.row_group(i); + // Row groups should be processed sequentially. 
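+ // Each row group reader receives its own handle to the underlying chunk
+ // reader plus a clone of the shared data cache, so the column chunks it
+ // loads can be reused by later readers of the same file.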
+ let f = Arc::clone(&self.chunk_reader); + Ok(Box::new(SerializedRowGroupReader::new( + f, + row_group_metadata, + self.name.clone(), + self.data_cache.clone(), + ))) + } + + fn get_row_iter(&self, projection: Option) -> Result { + RowIter::from_file(projection, self) + } +} + +/// A serialized with cache implementation for Parquet [`RowGroupReader`]. +/// +/// The cache is used for column data chunk when building [`PageReader`]. +/// +/// NOTE: the implementation is based on the https://github.com/apache/arrow-rs/blob/5.2.0/parquet/src/file/serialized_reader.rs +pub struct SerializedRowGroupReader<'a, R: ChunkReader> { + chunk_reader: Arc, + metadata: &'a RowGroupMetaData, + name: String, + data_cache: Option, +} + +impl<'a, R: ChunkReader> SerializedRowGroupReader<'a, R> { + /// Creates new row group reader from a file and row group metadata. + fn new( + chunk_reader: Arc, + metadata: &'a RowGroupMetaData, + name: String, + data_cache: Option, + ) -> Self { + Self { + chunk_reader, + metadata, + name, + data_cache, + } + } + + fn get_data(&self, col_start: u64, col_length: u64) -> Result> { + let mut file_chunk = self.chunk_reader.get_read(col_start, col_length as usize)?; + let mut buf = Vec::with_capacity(col_length as usize); + file_chunk.read_to_end(&mut buf).unwrap(); + Ok(buf) + } + + fn get_file_chunk(&self, col_start: u64, col_length: u64) -> Result { + if let Some(data_cache) = &self.data_cache { + let key = format_page_data_key(&self.name, col_start, col_length); + if let Some(v) = data_cache.get(&key) { + Ok(SliceableCursor::new(v)) + } else { + let buf_arc = Arc::new(self.get_data(col_start, col_length)?); + data_cache.put(key, buf_arc.clone()); + let slice = SliceableCursor::new(buf_arc); + Ok(slice) + } + } else { + let buf_arc = Arc::new(self.get_data(col_start, col_length)?); + let slice = SliceableCursor::new(buf_arc); + Ok(slice) + } + } +} + +impl<'a, R: 'static + ChunkReader> RowGroupReader for SerializedRowGroupReader<'a, R> { + fn metadata(&self) -> &RowGroupMetaData { + self.metadata + } + + fn num_columns(&self) -> usize { + self.metadata.num_columns() + } + + // TODO: fix PARQUET-816 + fn get_column_page_reader(&self, i: usize) -> Result> { + let col = self.metadata.column(i); + let (col_start, col_length) = col.byte_range(); + + // MODIFICATION START: consider the cache for the data chunk: [col_start, + // col_start+col_length). + let file_chunk = self.get_file_chunk(col_start, col_length)?; + // MODIFICATION END. + + let page_reader = SerializedPageReader::new( + file_chunk, + col.num_values(), + col.compression(), + col.column_descr().physical_type(), + )?; + Ok(Box::new(page_reader)) + } + + fn get_row_iter(&self, projection: Option) -> Result { + RowIter::from_row_group(projection, self) + } +} + +/// A serialized implementation for Parquet [`PageReader`]. +pub struct SerializedPageReader { + // The file source buffer which references exactly the bytes for the column trunk + // to be read by this page reader. + buf: T, + + // The compression codec for this column chunk. Only set for non-PLAIN codec. + decompressor: Option>, + + // The number of values we have seen so far. + seen_num_values: i64, + + // The number of total values in this column chunk. + total_num_values: i64, + + // Column chunk type. + physical_type: Type, +} + +impl SerializedPageReader { + /// Creates a new serialized page reader from file source. 
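+ ///
+ /// `buf` must contain exactly the bytes of one column chunk; `total_num_values`
+ /// bounds how many values are read before the reader reports the end of the chunk.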
+ pub fn new( + buf: T, + total_num_values: i64, + compression: Compression, + physical_type: Type, + ) -> Result { + let decompressor = create_codec(compression)?; + let result = Self { + buf, + total_num_values, + seen_num_values: 0, + decompressor, + physical_type, + }; + Ok(result) + } + + /// Reads Page header from Thrift. + fn read_page_header(&mut self) -> Result { + let mut prot = TCompactInputProtocol::new(&mut self.buf); + let page_header = PageHeader::read_from_in_protocol(&mut prot)?; + Ok(page_header) + } +} + +impl Iterator for SerializedPageReader { + type Item = Result; + + fn next(&mut self) -> Option { + self.get_next_page().transpose() + } +} + +impl PageReader for SerializedPageReader { + fn get_next_page(&mut self) -> Result> { + while self.seen_num_values < self.total_num_values { + let page_header = self.read_page_header()?; + + // When processing data page v2, depending on enabled compression for the + // page, we should account for uncompressed data ('offset') of + // repetition and definition levels. + // + // We always use 0 offset for other pages other than v2, `true` flag means + // that compression will be applied if decompressor is defined + let mut offset: usize = 0; + let mut can_decompress = true; + + if let Some(ref header_v2) = page_header.data_page_header_v2 { + offset = (header_v2.definition_levels_byte_length + + header_v2.repetition_levels_byte_length) as usize; + // When is_compressed flag is missing the page is considered compressed + can_decompress = header_v2.is_compressed.unwrap_or(true); + } + + let compressed_len = page_header.compressed_page_size as usize - offset; + let uncompressed_len = page_header.uncompressed_page_size as usize - offset; + // We still need to read all bytes from buffered stream + let mut buffer = vec![0; offset + compressed_len]; + self.buf.read_exact(&mut buffer)?; + + // TODO: page header could be huge because of statistics. We should set a + // maximum page header size and abort if that is exceeded. 
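+ // Decompress the page payload unless this is a V2 data page explicitly marked
+ // as uncompressed; the repetition/definition level bytes (the `offset` prefix)
+ // are never compressed and are re-attached after decompression.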
+ if let Some(decompressor) = self.decompressor.as_mut() { + if can_decompress { + let mut decompressed_buffer = Vec::with_capacity(uncompressed_len); + let decompressed_size = + decompressor.decompress(&buffer[offset..], &mut decompressed_buffer)?; + if decompressed_size != uncompressed_len { + return Err(ParquetError::General(format!( + "Actual decompressed size doesn't match the expected one ({} vs {})", + decompressed_size, uncompressed_len, + ))); + } + if offset == 0 { + buffer = decompressed_buffer; + } else { + // Prepend saved offsets to the buffer + buffer.truncate(offset); + buffer.append(&mut decompressed_buffer); + } + } + } + + let result = match page_header.type_ { + PageType::DictionaryPage => { + assert!(page_header.dictionary_page_header.is_some()); + let dict_header = page_header.dictionary_page_header.as_ref().unwrap(); + let is_sorted = dict_header.is_sorted.unwrap_or(false); + Page::DictionaryPage { + buf: ByteBufferPtr::new(buffer), + num_values: dict_header.num_values as u32, + encoding: Encoding::from(dict_header.encoding), + is_sorted, + } + } + PageType::DataPage => { + assert!(page_header.data_page_header.is_some()); + let header = page_header.data_page_header.unwrap(); + self.seen_num_values += header.num_values as i64; + Page::DataPage { + buf: ByteBufferPtr::new(buffer), + num_values: header.num_values as u32, + encoding: Encoding::from(header.encoding), + def_level_encoding: Encoding::from(header.definition_level_encoding), + rep_level_encoding: Encoding::from(header.repetition_level_encoding), + statistics: statistics::from_thrift(self.physical_type, header.statistics), + } + } + PageType::DataPageV2 => { + assert!(page_header.data_page_header_v2.is_some()); + let header = page_header.data_page_header_v2.unwrap(); + let is_compressed = header.is_compressed.unwrap_or(true); + self.seen_num_values += header.num_values as i64; + Page::DataPageV2 { + buf: ByteBufferPtr::new(buffer), + num_values: header.num_values as u32, + encoding: Encoding::from(header.encoding), + num_nulls: header.num_nulls as u32, + num_rows: header.num_rows as u32, + def_levels_byte_len: header.definition_levels_byte_length as u32, + rep_levels_byte_len: header.repetition_levels_byte_length as u32, + is_compressed, + statistics: statistics::from_thrift(self.physical_type, header.statistics), + } + } + _ => { + // For unknown page type (e.g., INDEX_PAGE), skip and read next. + continue; + } + }; + return Ok(Some(result)); + } + + // We are at the end of this column chunk and no more page left. Return None. 
+ Ok(None) + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use arrow_deps::parquet::basic::ColumnOrder; + + use super::*; + use crate::cache::{LruDataCache, LruMetaCache}; + + #[test] + fn test_cursor_and_file_has_the_same_behaviour() { + let mut buf: Vec = Vec::new(); + crate::tests::get_test_file("alltypes_plain.parquet") + .read_to_end(&mut buf) + .unwrap(); + let cursor = SliceableCursor::new(buf); + let read_from_cursor = + CachableSerializedFileReader::new("read_from_cursor".to_string(), cursor, None, None) + .unwrap(); + + let test_file = crate::tests::get_test_file("alltypes_plain.parquet"); + let read_from_file = + CachableSerializedFileReader::new("read_from_file".to_string(), test_file, None, None) + .unwrap(); + + let file_iter = read_from_file.get_row_iter(None).unwrap(); + let cursor_iter = read_from_cursor.get_row_iter(None).unwrap(); + + assert!(file_iter.eq(cursor_iter)); + } + + #[test] + fn test_reuse_file_chunk() { + // This test covers the case of maintaining the correct start position in a file + // stream for each column reader after initializing and moving to the next one + // (without necessarily reading the entire column). + let test_file = crate::tests::get_test_file("alltypes_plain.parquet"); + let reader = + CachableSerializedFileReader::new("test".to_string(), test_file, None, None).unwrap(); + let row_group = reader.get_row_group(0).unwrap(); + + let mut page_readers = Vec::new(); + for i in 0..row_group.num_columns() { + page_readers.push(row_group.get_column_page_reader(i).unwrap()); + } + + // Now buffer each col reader, we do not expect any failures like: + // General("underlying Thrift error: end of file") + for mut page_reader in page_readers { + assert!(page_reader.get_next_page().is_ok()); + } + } + + fn new_filer_reader_with_cache() -> CachableSerializedFileReader { + let data_cache: Option = Some(Arc::new(LruDataCache::new(1000))); + let meta_cache: Option = Some(Arc::new(LruMetaCache::new(1000))); + let test_file = crate::tests::get_test_file("alltypes_plain.parquet"); + let reader_result = CachableSerializedFileReader::new( + "test".to_string(), + test_file, + meta_cache.clone(), + data_cache.clone(), + ); + assert!(reader_result.is_ok()); + reader_result.unwrap() + } + + fn test_with_file_reader(reader: &CachableSerializedFileReader) { + // Test contents in Parquet metadata + let metadata = reader.metadata(); + assert_eq!(metadata.num_row_groups(), 1); + + // Test contents in file metadata + let file_metadata = metadata.file_metadata(); + assert!(file_metadata.created_by().is_some()); + assert_eq!( + file_metadata.created_by().as_ref().unwrap(), + "impala version 1.3.0-INTERNAL (build 8a48ddb1eff84592b3fc06bc6f51ec120e1fffc9)" + ); + assert!(file_metadata.key_value_metadata().is_none()); + assert_eq!(file_metadata.num_rows(), 8); + assert_eq!(file_metadata.version(), 1); + assert_eq!(file_metadata.column_orders(), None); + + // Test contents in row group metadata + let row_group_metadata = metadata.row_group(0); + assert_eq!(row_group_metadata.num_columns(), 11); + assert_eq!(row_group_metadata.num_rows(), 8); + assert_eq!(row_group_metadata.total_byte_size(), 671); + // Check each column order + for i in 0..row_group_metadata.num_columns() { + assert_eq!(file_metadata.column_order(i), ColumnOrder::UNDEFINED); + } + + // Test row group reader + let row_group_reader_result = reader.get_row_group(0); + assert!(row_group_reader_result.is_ok()); + let row_group_reader: Box = row_group_reader_result.unwrap(); + assert_eq!( + 
row_group_reader.num_columns(), + row_group_metadata.num_columns() + ); + assert_eq!( + row_group_reader.metadata().total_byte_size(), + row_group_metadata.total_byte_size() + ); + + // Test page readers + // TODO: test for every column + let page_reader_0_result = row_group_reader.get_column_page_reader(0); + assert!(page_reader_0_result.is_ok()); + let mut page_reader_0: Box = page_reader_0_result.unwrap(); + let mut page_count = 0; + while let Ok(Some(page)) = page_reader_0.get_next_page() { + let is_expected_page = match page { + Page::DictionaryPage { + buf, + num_values, + encoding, + is_sorted, + } => { + assert_eq!(buf.len(), 32); + assert_eq!(num_values, 8); + assert_eq!(encoding, Encoding::PLAIN_DICTIONARY); + assert!(!is_sorted); + true + } + Page::DataPage { + buf, + num_values, + encoding, + def_level_encoding, + rep_level_encoding, + statistics, + } => { + assert_eq!(buf.len(), 11); + assert_eq!(num_values, 8); + assert_eq!(encoding, Encoding::PLAIN_DICTIONARY); + assert_eq!(def_level_encoding, Encoding::RLE); + assert_eq!(rep_level_encoding, Encoding::BIT_PACKED); + assert!(statistics.is_none()); + true + } + _ => false, + }; + assert!(is_expected_page); + page_count += 1; + } + assert_eq!(page_count, 2); + } + + #[test] + fn test_file_reader() { + let test_file = crate::tests::get_test_file("alltypes_plain.parquet"); + let reader = CachableSerializedFileReader::new("test".to_string(), test_file, None, None) + .expect("Should succeed to build test reader"); + test_with_file_reader(&reader); + } + + #[test] + fn test_file_reader_with_cache() { + let reader = new_filer_reader_with_cache(); + let test_num = 10usize; + for _ in 0..test_num { + test_with_file_reader(&reader); + } + } + + #[test] + fn test_file_reader_datapage_v2() { + let test_file = crate::tests::get_test_file("datapage_v2.snappy.parquet"); + let reader_result = + CachableSerializedFileReader::new("test".to_string(), test_file, None, None); + assert!(reader_result.is_ok()); + let reader = reader_result.unwrap(); + + // Test contents in Parquet metadata + let metadata = reader.metadata(); + assert_eq!(metadata.num_row_groups(), 1); + + // Test contents in file metadata + let file_metadata = metadata.file_metadata(); + assert!(file_metadata.created_by().is_some()); + assert_eq!( + file_metadata.created_by().as_ref().unwrap(), + "parquet-mr version 1.8.1 (build 4aba4dae7bb0d4edbcf7923ae1339f28fd3f7fcf)" + ); + assert!(file_metadata.key_value_metadata().is_some()); + assert_eq!( + file_metadata.key_value_metadata().to_owned().unwrap().len(), + 1 + ); + + assert_eq!(file_metadata.num_rows(), 5); + assert_eq!(file_metadata.version(), 1); + assert_eq!(file_metadata.column_orders(), None); + + let row_group_metadata = metadata.row_group(0); + + // Check each column order + for i in 0..row_group_metadata.num_columns() { + assert_eq!(file_metadata.column_order(i), ColumnOrder::UNDEFINED); + } + + // Test row group reader + let row_group_reader_result = reader.get_row_group(0); + assert!(row_group_reader_result.is_ok()); + let row_group_reader: Box = row_group_reader_result.unwrap(); + assert_eq!( + row_group_reader.num_columns(), + row_group_metadata.num_columns() + ); + assert_eq!( + row_group_reader.metadata().total_byte_size(), + row_group_metadata.total_byte_size() + ); + + // Test page readers + // TODO: test for every column + let page_reader_0_result = row_group_reader.get_column_page_reader(0); + assert!(page_reader_0_result.is_ok()); + let mut page_reader_0: Box = page_reader_0_result.unwrap(); + let mut 
page_count = 0; + while let Ok(Some(page)) = page_reader_0.get_next_page() { + let is_expected_page = match page { + Page::DictionaryPage { + buf, + num_values, + encoding, + is_sorted, + } => { + assert_eq!(buf.len(), 7); + assert_eq!(num_values, 1); + assert_eq!(encoding, Encoding::PLAIN); + assert!(!is_sorted); + true + } + Page::DataPageV2 { + buf, + num_values, + encoding, + num_nulls, + num_rows, + def_levels_byte_len, + rep_levels_byte_len, + is_compressed, + statistics, + } => { + assert_eq!(buf.len(), 4); + assert_eq!(num_values, 5); + assert_eq!(encoding, Encoding::RLE_DICTIONARY); + assert_eq!(num_nulls, 1); + assert_eq!(num_rows, 5); + assert_eq!(def_levels_byte_len, 2); + assert_eq!(rep_levels_byte_len, 0); + assert!(is_compressed); + assert!(statistics.is_some()); + true + } + _ => false, + }; + assert!(is_expected_page); + page_count += 1; + } + assert_eq!(page_count, 2); + } + + #[test] + fn test_page_iterator() { + let file = crate::tests::get_test_file("alltypes_plain.parquet"); + let file_reader = Arc::new( + CachableSerializedFileReader::new("test".to_string(), file, None, None).unwrap(), + ); + + let mut page_iterator = FilePageIterator::new(0, file_reader.clone()).unwrap(); + + // read first page + let page = page_iterator.next(); + assert!(page.is_some()); + assert!(page.unwrap().is_ok()); + + // reach end of file + let page = page_iterator.next(); + assert!(page.is_none()); + + let row_group_indices = Box::new(0..1); + let mut page_iterator = + FilePageIterator::with_row_groups(0, row_group_indices, file_reader).unwrap(); + + // read first page + let page = page_iterator.next(); + assert!(page.is_some()); + assert!(page.unwrap().is_ok()); + + // reach end of file + let page = page_iterator.next(); + assert!(page.is_none()); + } + + #[test] + fn test_file_reader_key_value_metadata() { + let file = crate::tests::get_test_file("binary.parquet"); + let file_reader = Arc::new( + CachableSerializedFileReader::new("test".to_string(), file, None, None).unwrap(), + ); + + let metadata = file_reader + .metadata + .file_metadata() + .key_value_metadata() + .as_ref() + .unwrap(); + + assert_eq!(metadata.len(), 3); + + assert_eq!(metadata.get(0).unwrap().key, "parquet.proto.descriptor"); + + assert_eq!(metadata.get(1).unwrap().key, "writer.model.name"); + assert_eq!(metadata.get(1).unwrap().value, Some("protobuf".to_owned())); + + assert_eq!(metadata.get(2).unwrap().key, "parquet.proto.class"); + assert_eq!( + metadata.get(2).unwrap().value, + Some("foo.baz.Foobaz$Event".to_owned()) + ); + } + + #[test] + fn test_file_reader_filter_row_groups() -> Result<()> { + let test_file = crate::tests::get_test_file("alltypes_plain.parquet"); + let mut reader = + CachableSerializedFileReader::new("test".to_string(), test_file, None, None)?; + + // test initial number of row groups + let metadata = reader.metadata(); + assert_eq!(metadata.num_row_groups(), 1); + + // test filtering out all row groups + reader.filter_row_groups(&|_, _| false); + let metadata = reader.metadata(); + assert_eq!(metadata.num_row_groups(), 0); + + Ok(()) + } +} diff --git a/components/parquet/src/tests.rs b/components/parquet/src/tests.rs new file mode 100644 index 0000000000..69d6904e8f --- /dev/null +++ b/components/parquet/src/tests.rs @@ -0,0 +1,118 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
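+
+//! Test helpers shared by this crate: `get_test_file` resolves files from the
+//! `parquet-testing` data set (overridable via the `PARQUET_TEST_DATA` env var),
+//! and `check_rows_and_record_batches` compares row-oriented reads against
+//! record batches.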
+ +use std::{env, error::Error, fs, path::PathBuf, str::FromStr}; + +use arrow_deps::{ + arrow::{array::*, datatypes::DataType, record_batch::RecordBatch}, + parquet::record::{Field, Row}, +}; + +fn get_data_dir( + udf_env: &str, + submodule_data: &str, +) -> std::result::Result> { + // Try user defined env. + if let Ok(dir) = env::var(udf_env) { + let trimmed = dir.trim().to_string(); + if !trimmed.is_empty() { + let pb = PathBuf::from(trimmed); + if pb.is_dir() { + return Ok(pb); + } else { + return Err(format!( + "the data dir `{}` defined by env {} not found", + pb.display(), + udf_env + ) + .into()); + } + } + } + + // The env is undefined or its value is trimmed to empty, let's try default dir. + + // env "CARGO_MANIFEST_DIR" is "the directory containing the manifest of your + // package", set by `cargo run` or `cargo test`, see: + // https://doc.rust-lang.org/cargo/reference/environment-variables.html + let dir = env!("CARGO_MANIFEST_DIR"); + + let pb = PathBuf::from(dir).join(submodule_data); + if pb.is_dir() { + Ok(pb) + } else { + Err(format!( + "env `{}` is undefined or has empty value, and the pre-defined data dir `{}` not found\n\ + HINT: try running `git submodule update --init`", + udf_env, + pb.display(), + ).into()) + } +} + +fn parquet_test_data() -> String { + match get_data_dir("PARQUET_TEST_DATA", "../parquet-testing/data") { + Ok(pb) => pb.display().to_string(), + Err(err) => panic!("failed to get parquet data dir: {}", err), + } +} + +/// Returns path to the test parquet file in 'data' directory +fn get_test_path(file_name: &str) -> PathBuf { + let mut pathbuf = PathBuf::from_str(&parquet_test_data()).unwrap(); + pathbuf.push(file_name); + pathbuf +} + +/// Returns file handle for a test parquet file from 'data' directory +pub fn get_test_file(file_name: &str) -> fs::File { + let path = get_test_path(file_name); + fs::File::open(path.as_path()).unwrap_or_else(|err| { + panic!( + "Test file {} could not be opened, did you do `git submodule update`?: {}", + path.display(), + err + ) + }) +} + +struct RowViewOfRecordBatch<'a> { + record_batch: &'a RecordBatch, + row_idx: usize, +} + +impl<'a> RowViewOfRecordBatch<'a> { + fn check_row(&self, expect_row: &Row) { + for (col_idx, (_, field)) in expect_row.get_column_iter().enumerate() { + let array_ref = self.record_batch.column(col_idx); + + match array_ref.data_type() { + DataType::Binary => { + let array = array_ref.as_any().downcast_ref::().unwrap(); + let v = array.value(self.row_idx); + + if let Field::Bytes(field_value) = field { + assert_eq!(v, field_value.data()); + } else { + panic!("different value type"); + } + } + _ => unimplemented!("not support {:?}", array_ref.data_type()), + } + } + } +} + +pub fn check_rows_and_record_batches(rows: &[Row], record_batches: &[RecordBatch]) { + let mut row_idx = 0; + for record_batch in record_batches { + for row_idx_in_batch in 0..record_batch.num_rows() { + let expect_row = &rows[row_idx]; + let row_view = RowViewOfRecordBatch { + record_batch, + row_idx: row_idx_in_batch, + }; + row_view.check_row(expect_row); + row_idx += 1; + } + } +} diff --git a/components/profile/Cargo.toml b/components/profile/Cargo.toml new file mode 100644 index 0000000000..044fb5685a --- /dev/null +++ b/components/profile/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "profile" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html +[dependencies] +log = "0.4" +tempfile = "3.0" 
+jemallocator = "0.3.2" +jemalloc-ctl = "0.3.2" + +[dependencies.jemalloc-sys] +version = "0.3.2" +features = ["stats", "profiling", "unprefixed_malloc_on_supported_platforms"] diff --git a/components/profile/src/lib.rs b/components/profile/src/lib.rs new file mode 100644 index 0000000000..2f63f8c536 --- /dev/null +++ b/components/profile/src/lib.rs @@ -0,0 +1,142 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Memory profiler for running application based on jemalloc features. + +use std::{ + fmt::Formatter, + fs::File, + io, + io::Read, + sync::{Mutex, MutexGuard}, + thread, time, +}; + +use jemalloc_ctl::{Access, AsName}; +use jemallocator; +use log::{error, info}; + +#[derive(Debug)] +pub enum Error { + Internal { msg: String }, + IO(io::Error), + Jemalloc(jemalloc_ctl::Error), +} + +impl std::fmt::Display for Error { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "Profile Error: {:?}", self) + } +} + +impl std::error::Error for Error {} + +pub type Result = std::result::Result; + +#[global_allocator] +static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; + +const PROF_ACTIVE: &'static [u8] = b"prof.active\0"; +const PROF_DUMP: &'static [u8] = b"prof.dump\0"; +const PROFILE_OUTPUT: &'static [u8] = b"profile.out\0"; +const PROFILE_OUTPUT_FILE_PATH: &str = "/tmp/profile.out"; + +fn set_prof_active(active: bool) -> Result<()> { + let name = PROF_ACTIVE.name(); + name.write(active).map_err(|e| Error::Jemalloc(e)) +} + +fn dump_profile() -> Result<()> { + let name = PROF_DUMP.name(); + name.write(PROFILE_OUTPUT).map_err(|e| Error::Jemalloc(e)) +} + +struct ProfLockGuard<'a>(MutexGuard<'a, ()>); + +/// ProfLockGuard hold the profile lock and take responsibilities for +/// (de)activating mem profiling. NOTE: Keeping mem profiling on may cause some +/// extra runtime cost so we choose to activating it dynamically. +impl<'a> ProfLockGuard<'a> { + pub fn new(guard: MutexGuard<'a, ()>) -> Result { + set_prof_active(true)?; + Ok(Self(guard)) + } +} + +impl<'a> Drop for ProfLockGuard<'a> { + fn drop(&mut self) { + if let Err(e) = set_prof_active(false) { + error!("Fail to deactivate profiling, err:{}", e); + } + } +} + +pub struct Profiler { + mem_prof_lock: Mutex<()>, +} + +impl Default for Profiler { + fn default() -> Self { + Self::new() + } +} + +impl Profiler { + pub fn new() -> Self { + Self { + mem_prof_lock: Mutex::new(()), + } + } + + // dump_mem_prof collects mem profiling data in `seconds`. + // TODO(xikai): limit the profiling duration + pub fn dump_mem_prof(&self, seconds: u64) -> Result> { + // concurrent profiling is disabled. + let lock_guard = self.mem_prof_lock.try_lock().map_err(|e| Error::Internal { + msg: format!("failed to acquire mem_prof_lock, err:{}", e), + })?; + + let _guard = ProfLockGuard::new(lock_guard)?; + + info!( + "Profiler::dump_mem_prof start memory profiling {} seconds", + seconds + ); + // wait for seconds for collect the profiling data + thread::sleep(time::Duration::from_secs(seconds)); + + // clearing the profile output file before dumping profile results. + { + let f = File::open(PROFILE_OUTPUT_FILE_PATH).map_err(|e| { + error!("Failed to open prof data file, err:{}", e); + Error::IO(e) + })?; + f.set_len(0).map_err(|e| { + error!("Failed to truncate profile output file, err:{}", e); + Error::IO(e) + })?; + } + + // dump the profile results to profile output file. 
+ dump_profile().map_err(|e| { + error!( + "Failed to dump prof to {}, err:{}", + PROFILE_OUTPUT_FILE_PATH, e + ); + e + })?; + + // read the profile results into buffer + let mut f = File::open(PROFILE_OUTPUT_FILE_PATH).map_err(|e| { + error!("Failed to open prof data file, err:{}", e); + Error::IO(e) + })?; + + let mut buffer = Vec::new(); + f.read_to_end(&mut buffer).map_err(|e| { + error!("Failed to read prof data file, err:{}", e); + Error::IO(e) + })?; + + Ok(buffer) + } +} diff --git a/components/rust-hyperloglog/.github/dependabot.yml b/components/rust-hyperloglog/.github/dependabot.yml new file mode 100644 index 0000000000..66cef947a2 --- /dev/null +++ b/components/rust-hyperloglog/.github/dependabot.yml @@ -0,0 +1,10 @@ +# // Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +version: 2 +updates: +- package-ecosystem: cargo + directory: "/" + schedule: + interval: daily + time: "04:00" + open-pull-requests-limit: 10 diff --git a/components/rust-hyperloglog/.gitignore b/components/rust-hyperloglog/.gitignore new file mode 100644 index 0000000000..4468cbfb1c --- /dev/null +++ b/components/rust-hyperloglog/.gitignore @@ -0,0 +1,7 @@ +*.dSYM +*~ +.rust +build +Cargo.lock +src/hyperloglog/hyperloglog +target diff --git a/components/rust-hyperloglog/.travis.yml b/components/rust-hyperloglog/.travis.yml new file mode 100644 index 0000000000..52635e58a8 --- /dev/null +++ b/components/rust-hyperloglog/.travis.yml @@ -0,0 +1,6 @@ +# // Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +language: rust +rust: + - nightly + - stable diff --git a/components/rust-hyperloglog/Cargo.toml b/components/rust-hyperloglog/Cargo.toml new file mode 100644 index 0000000000..40c7cb83f1 --- /dev/null +++ b/components/rust-hyperloglog/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "hyperloglog" +version = "1.0.0" +authors = ["Frank Denis "] +description = "Hyperloglog implementation in Rust" +license = "ISC" +homepage = "https://github.com/jedisct1/rust-hyperloglog" +repository = "https://github.com/jedisct1/rust-hyperloglog" +edition = "2018" + +[lib] +name = "hyperloglog" +path = "src/hyperloglog/lib.rs" + +[dependencies] +bytecount = "0.6" +bytes = { path = "../bytes" } +rand = "0.8.0" +siphasher = "0.3" +snafu = { version ="0.6.10", features = ["backtraces"]} diff --git a/components/rust-hyperloglog/LICENSE b/components/rust-hyperloglog/LICENSE new file mode 100644 index 0000000000..ab647ead82 --- /dev/null +++ b/components/rust-hyperloglog/LICENSE @@ -0,0 +1,23 @@ +Copyright (c) 2013-2016, Frank Denis +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + + Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/components/rust-hyperloglog/README.md b/components/rust-hyperloglog/README.md new file mode 100644 index 0000000000..f104f9d59a --- /dev/null +++ b/components/rust-hyperloglog/README.md @@ -0,0 +1,27 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +hyperloglog +=========== + +A [HyperLogLog](https://static.googleusercontent.com/media/research.google.com/en/us/pubs/archive/40671.pdf) implementation in Rust, with bias correction. + +Installation: use [Cargo](http://crates.io): + +```toml +[dependencies] +hyperloglog = "0" +``` + +Usage: + +```rust +let mut hll = HyperLogLog::new(error_rate); +hll.insert(&"test1"); +hll.insert(&"test2"); +let card_estimation = hll.len(); + +let mut hll2 = HyperLogLog::new_from_template(&hll); +hll2.insert(&"test3"); + +hll.merge(&hll2); +``` diff --git a/components/rust-hyperloglog/THANKS b/components/rust-hyperloglog/THANKS new file mode 100644 index 0000000000..091c37fc33 --- /dev/null +++ b/components/rust-hyperloglog/THANKS @@ -0,0 +1,3 @@ +Nelson Gonçalves (@goncalvesnelson) +Vasily Evseenko (@svpcom) +for Python's hyperloglog implementation this code is based on. diff --git a/components/rust-hyperloglog/src/hyperloglog/lib.rs b/components/rust-hyperloglog/src/hyperloglog/lib.rs new file mode 100644 index 0000000000..242ae9980e --- /dev/null +++ b/components/rust-hyperloglog/src/hyperloglog/lib.rs @@ -0,0 +1,4264 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +// (C)opyleft 2013-2019 Frank Denis + +//! HyperLogLog implementation for Rust +//! +//! 
Forked from +#![crate_name = "hyperloglog"] +#![warn(non_camel_case_types, non_upper_case_globals, unused_qualifications)] +#![allow(non_snake_case)] +#![allow(clippy::unreadable_literal)] + +use std::{ + cmp::Ordering::{Equal, Greater, Less}, + hash::{Hash, Hasher}, + iter::repeat, +}; + +use bytes::{MemBuf, MemBufMut}; +use siphasher::sip::SipHasher13; +use snafu::{ResultExt, Snafu}; + +static TRESHOLD_DATA: [f64; 15] = [ + 10.0, 20.0, 40.0, 80.0, 220.0, 400.0, 900.0, 1800.0, 3100.0, 6500.0, 11500.0, 20000.0, 50000.0, + 120000.0, 350000.0, +]; + +static RAW_ESTIMATE_DATA: &[&[f64]] = &[ + &[ + 11.0, 11.717, 12.207, 12.7896, 13.2882, 13.8204, 14.3772, 14.9342, 15.5202, 16.161, + 16.7722, 17.4636, 18.0396, 18.6766, 19.3566, 20.0454, 20.7936, 21.4856, 22.2666, 22.9946, + 23.766, 24.4692, 25.3638, 26.0764, 26.7864, 27.7602, 28.4814, 29.433, 30.2926, 31.0664, + 31.9996, 32.7956, 33.5366, 34.5894, 35.5738, 36.2698, 37.3682, 38.0544, 39.2342, 40.0108, + 40.7966, 41.9298, 42.8704, 43.6358, 44.5194, 45.773, 46.6772, 47.6174, 48.4888, 49.3304, + 50.2506, 51.4996, 52.3824, 53.3078, 54.3984, 55.5838, 56.6618, 57.2174, 58.3514, 59.0802, + 60.1482, 61.0376, 62.3598, 62.8078, 63.9744, 64.914, 65.781, 67.1806, 68.0594, 68.8446, + 69.7928, 70.8248, 71.8324, 72.8598, 73.6246, 74.7014, 75.393, 76.6708, 77.2394, + ], + &[ + 23.0, 23.1194, 23.8208, 24.2318, 24.77, 25.2436, 25.7774, 26.2848, 26.8224, 27.3742, + 27.9336, 28.503, 29.0494, 29.6292, 30.2124, 30.798, 31.367, 31.9728, 32.5944, 33.217, + 33.8438, 34.3696, 35.0956, 35.7044, 36.324, 37.0668, 37.6698, 38.3644, 39.049, 39.6918, + 40.4146, 41.082, 41.687, 42.5398, 43.2462, 43.857, 44.6606, 45.4168, 46.1248, 46.9222, + 47.6804, 48.447, 49.3454, 49.9594, 50.7636, 51.5776, 52.331, 53.19, 53.9676, 54.7564, + 55.5314, 56.4442, 57.3708, 57.9774, 58.9624, 59.8796, 60.755, 61.472, 62.2076, 63.1024, + 63.8908, 64.7338, 65.7728, 66.629, 67.413, 68.3266, 69.1524, 70.2642, 71.1806, 72.0566, + 72.9192, 73.7598, 74.3516, 75.5802, 76.4386, 77.4916, 78.1524, 79.1892, 79.8414, 80.8798, + 81.8376, 82.4698, 83.7656, 84.331, 85.5914, 86.6012, 87.7016, 88.5582, 89.3394, 90.3544, + 91.4912, 92.308, 93.3552, 93.9746, 95.2052, 95.727, 97.1322, 98.3944, 98.7588, 100.242, + 101.1914, 102.2538, 102.8776, 103.6292, 105.1932, 105.9152, 107.0868, 107.6728, 108.7144, + 110.3114, 110.8716, 111.245, 112.7908, 113.7064, 114.636, 115.7464, 116.1788, 117.7464, + 118.4896, 119.6166, 120.5082, 121.7798, 122.9028, 123.4426, 124.8854, 125.705, 126.4652, + 128.3464, 128.3462, 130.0398, 131.0342, 131.0042, 132.4766, 133.511, 134.7252, 135.425, + 136.5172, 138.0572, 138.6694, 139.3712, 140.8598, 141.4594, 142.554, 143.4006, 144.7374, + 146.1634, 146.8994, 147.605, 147.9304, 149.1636, 150.2468, 151.5876, 152.2096, 153.7032, + 154.7146, 155.807, 156.9228, 157.0372, 158.5852, + ], + &[ + 46.0, 46.1902, 47.271, 47.8358, 48.8142, 49.2854, 50.317, 51.354, 51.8924, 52.9436, + 53.4596, 54.5262, 55.6248, 56.1574, 57.2822, 57.837, 58.9636, 60.074, 60.7042, 61.7976, + 62.4772, 63.6564, 64.7942, 65.5004, 66.686, 67.291, 68.5672, 69.8556, 70.4982, 71.8204, + 72.4252, 73.7744, 75.0786, 75.8344, 77.0294, 77.8098, 79.0794, 80.5732, 81.1878, 82.5648, + 83.2902, 84.6784, 85.3352, 86.8946, 88.3712, 89.0852, 90.499, 91.2686, 92.6844, 94.2234, + 94.9732, 96.3356, 97.2286, 98.7262, 100.3284, 101.1048, 102.5962, 103.3562, 105.1272, + 106.4184, 107.4974, 109.0822, 109.856, 111.48, 113.2834, 114.0208, 115.637, 116.5174, + 118.0576, 119.7476, 120.427, 122.1326, 123.2372, 125.2788, 126.6776, 127.7926, 129.1952, + 129.9564, 
131.6454, 133.87, 134.5428, 136.2, 137.0294, 138.6278, 139.6782, 141.792, + 143.3516, 144.2832, 146.0394, 147.0748, 148.4912, 150.849, 151.696, 153.5404, 154.073, + 156.3714, 157.7216, 158.7328, 160.4208, 161.4184, 163.9424, 165.2772, 166.411, 168.1308, + 168.769, 170.9258, 172.6828, 173.7502, 175.706, 176.3886, 179.0186, 180.4518, 181.927, + 183.4172, 184.4114, 186.033, 188.5124, 189.5564, 191.6008, 192.4172, 193.8044, 194.997, + 197.4548, 198.8948, 200.2346, 202.3086, 203.1548, 204.8842, 206.6508, 206.6772, 209.7254, + 210.4752, 212.7228, 214.6614, 215.1676, 217.793, 218.0006, 219.9052, 221.66, 223.5588, + 225.1636, 225.6882, 227.7126, 229.4502, 231.1978, 232.9756, 233.1654, 236.727, 238.1974, + 237.7474, 241.1346, 242.3048, 244.1948, 245.3134, 246.879, 249.1204, 249.853, 252.6792, + 253.857, 254.4486, 257.2362, 257.9534, 260.0286, 260.5632, 262.663, 264.723, 265.7566, + 267.2566, 267.1624, 270.62, 272.8216, 273.2166, 275.2056, 276.2202, 278.3726, 280.3344, + 281.9284, 283.9728, 284.1924, 286.4872, 287.587, 289.807, 291.1206, 292.769, 294.8708, + 296.665, 297.1182, 299.4012, 300.6352, 302.1354, 304.1756, 306.1606, 307.3462, 308.5214, + 309.4134, 310.8352, 313.9684, 315.837, 316.7796, 318.9858, + ], + &[ + 92.0, 93.4934, 94.9758, 96.4574, 97.9718, 99.4954, 101.5302, 103.0756, 104.6374, 106.1782, + 107.7888, 109.9522, 111.592, 113.2532, 114.9086, 116.5938, 118.9474, 120.6796, 122.4394, + 124.2176, 125.9768, 128.4214, 130.2528, 132.0102, 133.8658, 135.7278, 138.3044, 140.1316, + 142.093, 144.0032, 145.9092, 148.6306, 150.5294, 152.5756, 154.6508, 156.662, 159.552, + 161.3724, 163.617, 165.5754, 167.7872, 169.8444, 172.7988, 174.8606, 177.2118, 179.3566, + 181.4476, 184.5882, 186.6816, 189.0824, 191.0258, 193.6048, 196.4436, 198.7274, 200.957, + 203.147, 205.4364, 208.7592, 211.3386, 213.781, 215.8028, 218.656, 221.6544, 223.996, + 226.4718, 229.1544, 231.6098, 234.5956, 237.0616, 239.5758, 242.4878, 244.5244, 248.2146, + 250.724, 252.8722, 255.5198, 258.0414, 261.941, 264.9048, 266.87, 269.4304, 272.028, + 274.4708, 278.37, 281.0624, 283.4668, 286.5532, 289.4352, 293.2564, 295.2744, 298.2118, + 300.7472, 304.1456, 307.2928, 309.7504, 312.5528, 315.979, 318.2102, 322.1834, 324.3494, + 327.325, 330.6614, 332.903, 337.2544, 339.9042, 343.215, 345.2864, 348.0814, 352.6764, + 355.301, 357.139, 360.658, 363.1732, 366.5902, 369.9538, 373.0828, 375.922, 378.9902, + 382.7328, 386.4538, 388.1136, 391.2234, 394.0878, 396.708, 401.1556, 404.1852, 406.6372, + 409.6822, 412.7796, 416.6078, 418.4916, 422.131, 424.5376, 428.1988, 432.211, 434.4502, + 438.5282, 440.912, 444.0448, 447.7432, 450.8524, 453.7988, 456.7858, 458.8868, 463.9886, + 466.5064, 468.9124, 472.6616, 475.4682, 478.582, 481.304, 485.2738, 488.6894, 490.329, + 496.106, 497.6908, 501.1374, 504.5322, 506.8848, 510.3324, 513.4512, 516.179, 520.4412, + 522.6066, 526.167, 528.7794, 533.379, 536.067, 538.46, 542.9116, 545.692, 547.9546, + 552.493, 555.2722, 557.335, 562.449, 564.2014, 569.0738, 571.0974, 574.8564, 578.2996, + 581.409, 583.9704, 585.8098, 589.6528, 594.5998, 595.958, 600.068, 603.3278, 608.2016, + 609.9632, 612.864, 615.43, 620.7794, 621.272, 625.8644, 629.206, 633.219, 634.5154, + 638.6102, + ], + &[ + 184.2152, 187.2454, 190.2096, 193.6652, 196.6312, 199.6822, 203.249, 206.3296, 210.0038, + 213.2074, 216.4612, 220.27, 223.5178, 227.4412, 230.8032, 234.1634, 238.1688, 241.6074, + 245.6946, 249.2664, 252.8228, 257.0432, 260.6824, 264.9464, 268.6268, 272.2626, 276.8376, + 280.4034, 284.8956, 288.8522, 292.7638, 297.3552, 301.3556, 
305.7526, 309.9292, 313.8954, + 318.8198, 322.7668, 327.298, 331.6688, 335.9466, 340.9746, 345.1672, 349.3474, 354.3028, + 358.8912, 364.114, 368.4646, 372.9744, 378.4092, 382.6022, 387.843, 392.5684, 397.1652, + 402.5426, 407.4152, 412.5388, 417.3592, 422.1366, 427.486, 432.3918, 437.5076, 442.509, + 447.3834, 453.3498, 458.0668, 463.7346, 469.1228, 473.4528, 479.7, 484.644, 491.0518, + 495.5774, 500.9068, 506.432, 512.1666, 517.434, 522.6644, 527.4894, 533.6312, 538.3804, + 544.292, 550.5496, 556.0234, 562.8206, 566.6146, 572.4188, 579.117, 583.6762, 590.6576, + 595.7864, 601.509, 607.5334, 612.9204, 619.772, 624.2924, 630.8654, 636.1836, 642.745, + 649.1316, 655.0386, 660.0136, 666.6342, 671.6196, 678.1866, 684.4282, 689.3324, 695.4794, + 702.5038, 708.129, 713.528, 720.3204, 726.463, 732.7928, 739.123, 744.7418, 751.2192, + 756.5102, 762.6066, 769.0184, 775.2224, 781.4014, 787.7618, 794.1436, 798.6506, 805.6378, + 811.766, 819.7514, 824.5776, 828.7322, 837.8048, 843.6302, 849.9336, 854.4798, 861.3388, + 867.9894, 873.8196, 880.3136, 886.2308, 892.4588, 899.0816, 905.4076, 912.0064, 917.3878, + 923.619, 929.998, 937.3482, 943.9506, 947.991, 955.1144, 962.203, 968.8222, 975.7324, + 981.7826, 988.7666, 994.2648, 1000.3128, 1007.4082, 1013.7536, 1020.3376, 1026.7156, + 1031.7478, 1037.4292, 1045.393, 1051.2278, 1058.3434, 1062.8726, 1071.884, 1076.806, + 1082.9176, 1089.1678, 1095.5032, 1102.525, 1107.2264, 1115.315, 1120.93, 1127.252, + 1134.1496, 1139.0408, 1147.5448, 1153.3296, 1158.1974, 1166.5262, 1174.3328, 1175.657, + 1184.4222, 1190.9172, 1197.1292, 1204.4606, 1210.4578, 1218.8728, 1225.3336, 1226.6592, + 1236.5768, 1241.363, 1249.4074, 1254.6566, 1260.8014, 1266.5454, 1274.5192, + ], + &[ + 369.0, 374.8294, 381.2452, 387.6698, 394.1464, 400.2024, 406.8782, 413.6598, 420.462, + 427.2826, 433.7102, 440.7416, 447.9366, 455.1046, 462.285, 469.0668, 476.306, 483.8448, + 491.301, 498.9886, 506.2422, 513.8138, 521.7074, 529.7428, 537.8402, 545.1664, 553.3534, + 561.594, 569.6886, 577.7876, 585.65, 594.228, 602.8036, 611.1666, 620.0818, 628.0824, + 637.2574, 646.302, 655.1644, 664.0056, 672.3802, 681.7192, 690.5234, 700.2084, 708.831, + 718.485, 728.1112, 737.4764, 746.76, 756.3368, 766.5538, 775.5058, 785.2646, 795.5902, + 804.3818, 814.8998, 824.9532, 835.2062, 845.2798, 854.4728, 864.9582, 875.3292, 886.171, + 896.781, 906.5716, 916.7048, 927.5322, 937.875, 949.3972, 958.3464, 969.7274, 980.2834, + 992.1444, 1003.4264, 1013.0166, 1024.018, 1035.0438, 1046.34, 1057.6856, 1068.9836, + 1079.0312, 1091.677, 1102.3188, 1113.4846, 1124.4424, 1135.739, 1147.1488, 1158.9202, + 1169.406, 1181.5342, 1193.2834, 1203.8954, 1216.3286, 1226.2146, 1239.6684, 1251.9946, + 1262.123, 1275.4338, 1285.7378, 1296.076, 1308.9692, 1320.4964, 1333.0998, 1343.9864, + 1357.7754, 1368.3208, 1380.4838, 1392.7388, 1406.0758, 1416.9098, 1428.9728, 1440.9228, + 1453.9292, 1462.617, 1476.05, 1490.2996, 1500.6128, 1513.7392, 1524.5174, 1536.6322, + 1548.2584, 1562.3766, 1572.423, 1587.1232, 1596.5164, 1610.5938, 1622.5972, 1633.1222, + 1647.7674, 1658.5044, 1671.57, 1683.7044, 1695.4142, 1708.7102, 1720.6094, 1732.6522, + 1747.841, 1756.4072, 1769.9786, 1782.3276, 1797.5216, 1808.3186, 1819.0694, 1834.354, + 1844.575, 1856.2808, 1871.1288, 1880.7852, 1893.9622, 1906.3418, 1920.6548, 1932.9302, + 1945.8584, 1955.473, 1968.8248, 1980.6446, 1995.9598, 2008.349, 2019.8556, 2033.0334, + 2044.0206, 2059.3956, 2069.9174, 2082.6084, 2093.7036, 2106.6108, 2118.9124, 2132.301, + 2144.7628, 2159.8422, 2171.0212, 2183.101, 
2193.5112, 2208.052, 2221.3194, 2233.3282, + 2247.295, 2257.7222, 2273.342, 2286.5638, 2299.6786, 2310.8114, 2322.3312, 2335.516, + 2349.874, 2363.5968, 2373.865, 2387.1918, 2401.8328, 2414.8496, 2424.544, 2436.7592, + 2447.1682, 2464.1958, 2474.3438, 2489.0006, 2497.4526, 2513.6586, 2527.19, 2540.7028, + 2553.768, + ], + &[ + 738.1256, 750.4234, 763.1064, 775.4732, 788.4636, 801.0644, 814.488, 827.9654, 841.0832, + 854.7864, 868.1992, 882.2176, 896.5228, 910.1716, 924.7752, 938.899, 953.6126, 968.6492, + 982.9474, 998.5214, 1013.1064, 1028.6364, 1044.2468, 1059.4588, 1075.3832, 1091.0584, + 1106.8606, 1123.3868, 1139.5062, 1156.1862, 1172.463, 1189.339, 1206.1936, 1223.1292, + 1240.1854, 1257.2908, 1275.3324, 1292.8518, 1310.5204, 1328.4854, 1345.9318, 1364.552, + 1381.4658, 1400.4256, 1419.849, 1438.152, 1456.8956, 1474.8792, 1494.118, 1513.62, + 1532.5132, 1551.9322, 1570.7726, 1590.6086, 1610.5332, 1630.5918, 1650.4294, 1669.7662, + 1690.4106, 1710.7338, 1730.9012, 1750.4486, 1770.1556, 1791.6338, 1812.7312, 1833.6264, + 1853.9526, 1874.8742, 1896.8326, 1918.1966, 1939.5594, 1961.07, 1983.037, 2003.1804, + 2026.071, 2047.4884, 2070.0848, 2091.2944, 2114.333, 2135.9626, 2158.2902, 2181.0814, + 2202.0334, 2224.4832, 2246.39, 2269.7202, 2292.1714, 2314.2358, 2338.9346, 2360.891, + 2384.0264, 2408.3834, 2430.1544, 2454.8684, 2476.9896, 2501.4368, 2522.8702, 2548.0408, + 2570.6738, 2593.5208, 2617.0158, 2640.2302, 2664.0962, 2687.4986, 2714.2588, 2735.3914, + 2759.6244, 2781.8378, 2808.0072, 2830.6516, 2856.2454, 2877.2136, 2903.4546, 2926.785, + 2951.2294, 2976.468, 3000.867, 3023.6508, 3049.91, 3073.5984, 3098.162, 3121.5564, + 3146.2328, 3170.9484, 3195.5902, 3221.3346, 3242.7032, 3271.6112, 3296.5546, 3317.7376, + 3345.072, 3369.9518, 3394.326, 3418.1818, 3444.6926, 3469.086, 3494.2754, 3517.8698, + 3544.248, 3565.3768, 3588.7234, 3616.979, 3643.7504, 3668.6812, 3695.72, 3719.7392, + 3742.6224, 3770.4456, 3795.6602, 3819.9058, 3844.002, 3869.517, 3895.6824, 3920.8622, + 3947.1364, 3973.985, 3995.4772, 4021.62, 4046.628, 4074.65, 4096.2256, 4121.831, 4146.6406, + 4173.276, 4195.0744, 4223.9696, 4251.3708, 4272.9966, 4300.8046, 4326.302, 4353.1248, + 4374.312, 4403.0322, 4426.819, 4450.0598, 4478.5206, 4504.8116, 4528.8928, 4553.9584, + 4578.8712, 4603.8384, 4632.3872, 4655.5128, 4675.821, 4704.6222, 4731.9862, 4755.4174, + 4781.2628, 4804.332, 4832.3048, 4862.8752, 4883.4148, 4906.9544, 4935.3516, 4954.3532, + 4984.0248, 5011.217, 5035.3258, 5057.3672, 5084.1828, + ], + &[ + 1477.0, 1501.6014, 1526.5802, 1551.7942, 1577.3042, 1603.2062, 1629.8402, 1656.2292, + 1682.9462, 1709.9926, 1737.3026, 1765.4252, 1793.0578, 1821.6092, 1849.626, 1878.5568, + 1908.527, 1937.5154, 1967.1874, 1997.3878, 2027.37, 2058.1972, 2089.5728, 2120.1012, + 2151.9668, 2183.292, 2216.0772, 2247.8578, 2280.6562, 2313.041, 2345.714, 2380.3112, + 2414.1806, 2447.9854, 2481.656, 2516.346, 2551.5154, 2586.8378, 2621.7448, 2656.6722, + 2693.5722, 2729.1462, 2765.4124, 2802.8728, 2838.898, 2876.408, 2913.4926, 2951.4938, + 2989.6776, 3026.282, 3065.7704, 3104.1012, 3143.7388, 3181.6876, 3221.1872, 3261.5048, + 3300.0214, 3339.806, 3381.409, 3421.4144, 3461.4294, 3502.2286, 3544.651, 3586.6156, + 3627.337, 3670.083, 3711.1538, 3753.5094, 3797.01, 3838.6686, 3882.1678, 3922.8116, + 3967.9978, 4009.9204, 4054.3286, 4097.5706, 4140.6014, 4185.544, 4229.5976, 4274.583, + 4316.9438, 4361.672, 4406.2786, 4451.8628, 4496.1834, 4543.505, 4589.1816, 4632.5188, + 4678.2294, 4724.8908, 4769.0194, 4817.052, 4861.4588, 4910.1596, 
4956.4344, 5002.5238, + 5048.13, 5093.6374, 5142.8162, 5187.7894, 5237.3984, 5285.6078, 5331.0858, 5379.1036, + 5428.6258, 5474.6018, 5522.7618, 5571.5822, 5618.59, 5667.9992, 5714.88, 5763.454, + 5808.6982, 5860.3644, 5910.2914, 5953.571, 6005.9232, 6055.1914, 6104.5882, 6154.5702, + 6199.7036, 6251.1764, 6298.7596, 6350.0302, 6398.061, 6448.4694, 6495.933, 6548.0474, + 6597.7166, 6646.9416, 6695.9208, 6742.6328, 6793.5276, 6842.1934, 6894.2372, 6945.3864, + 6996.9228, 7044.2372, 7094.1374, 7142.2272, 7192.2942, 7238.8338, 7288.9006, 7344.0908, + 7394.8544, 7443.5176, 7490.4148, 7542.9314, 7595.6738, 7641.9878, 7694.3688, 7743.0448, + 7797.522, 7845.53, 7899.594, 7950.3132, 7996.455, 8050.9442, 8092.9114, 8153.1374, + 8197.4472, 8252.8278, 8301.8728, 8348.6776, 8401.4698, 8453.551, 8504.6598, 8553.8944, + 8604.1276, 8657.6514, 8710.3062, 8758.908, 8807.8706, 8862.1702, 8910.4668, 8960.77, + 9007.2766, 9063.164, 9121.0534, 9164.1354, 9218.1594, 9267.767, 9319.0594, 9372.155, + 9419.7126, 9474.3722, 9520.1338, 9572.368, 9622.7702, 9675.8448, 9726.5396, 9778.7378, + 9827.6554, 9878.1922, 9928.7782, 9978.3984, 10026.578, 10076.5626, 10137.1618, 10177.5244, + 10229.9176, + ], + &[ + 2954.0, 3003.4782, 3053.3568, 3104.3666, 3155.324, 3206.9598, 3259.648, 3312.539, + 3366.1474, 3420.2576, 3474.8376, 3530.6076, 3586.451, 3643.38, 3700.4104, 3757.5638, + 3815.9676, 3875.193, 3934.838, 3994.8548, 4055.018, 4117.1742, 4178.4482, 4241.1294, + 4304.4776, 4367.4044, 4431.8724, 4496.3732, 4561.4304, 4627.5326, 4693.949, 4761.5532, + 4828.7256, 4897.6182, 4965.5186, 5034.4528, 5104.865, 5174.7164, 5244.6828, 5316.6708, + 5387.8312, 5459.9036, 5532.476, 5604.8652, 5679.6718, 5753.757, 5830.2072, 5905.2828, + 5980.0434, 6056.6264, 6134.3192, 6211.5746, 6290.0816, 6367.1176, 6447.9796, 6526.5576, + 6606.1858, 6686.9144, 6766.1142, 6847.0818, 6927.9664, 7010.9096, 7091.0816, 7175.3962, + 7260.3454, 7344.018, 7426.4214, 7511.3106, 7596.0686, 7679.8094, 7765.818, 7852.4248, + 7936.834, 8022.363, 8109.5066, 8200.4554, 8288.5832, 8373.366, 8463.4808, 8549.7682, + 8642.0522, 8728.3288, 8820.9528, 8907.727, 9001.0794, 9091.2522, 9179.988, 9269.852, + 9362.6394, 9453.642, 9546.9024, 9640.6616, 9732.6622, 9824.3254, 9917.7484, 10007.9392, + 10106.7508, 10196.2152, 10289.8114, 10383.5494, 10482.3064, 10576.8734, 10668.7872, + 10764.7156, 10862.0196, 10952.793, 11049.9748, 11146.0702, 11241.4492, 11339.2772, + 11434.2336, 11530.741, 11627.6136, 11726.311, 11821.5964, 11918.837, 12015.3724, + 12113.0162, 12213.0424, 12306.9804, 12408.4518, 12504.8968, 12604.586, 12700.9332, + 12798.705, 12898.5142, 12997.0488, 13094.788, 13198.475, 13292.7764, 13392.9698, + 13486.8574, 13590.1616, 13686.5838, 13783.6264, 13887.2638, 13992.0978, 14081.0844, + 14189.9956, 14280.0912, 14382.4956, 14486.4384, 14588.1082, 14686.2392, 14782.276, + 14888.0284, 14985.1864, 15088.8596, 15187.0998, 15285.027, 15383.6694, 15495.8266, + 15591.3736, 15694.2008, 15790.3246, 15898.4116, 15997.4522, 16095.5014, 16198.8514, + 16291.7492, 16402.6424, 16499.1266, 16606.2436, 16697.7186, 16796.3946, 16902.3376, + 17005.7672, 17100.814, 17206.8282, 17305.8262, 17416.0744, 17508.4092, 17617.0178, + 17715.4554, 17816.758, 17920.1748, 18012.9236, 18119.7984, 18223.2248, 18324.2482, + 18426.6276, 18525.0932, 18629.8976, 18733.2588, 18831.0466, 18940.1366, 19032.2696, + 19131.729, 19243.4864, 19349.6932, 19442.866, 19547.9448, 19653.2798, 19754.4034, + 19854.0692, 19965.1224, 20065.1774, 20158.2212, 20253.353, 20366.3264, 20463.22, + ], + &[ + 5908.5052, 
6007.2672, 6107.347, 6208.5794, 6311.2622, 6414.5514, 6519.3376, 6625.6952, + 6732.5988, 6841.3552, 6950.5972, 7061.3082, 7173.5646, 7287.109, 7401.8216, 7516.4344, + 7633.3802, 7751.2962, 7870.3784, 7990.292, 8110.79, 8233.4574, 8356.6036, 8482.2712, + 8607.7708, 8735.099, 8863.1858, 8993.4746, 9123.8496, 9255.6794, 9388.5448, 9522.7516, + 9657.3106, 9792.6094, 9930.5642, 10068.794, 10206.7256, 10347.81, 10490.3196, 10632.0778, + 10775.9916, 10920.4662, 11066.124, 11213.073, 11358.0362, 11508.1006, 11659.1716, + 11808.7514, 11959.4884, 12112.1314, 12265.037, 12420.3756, 12578.933, 12734.311, + 12890.0006, 13047.2144, 13207.3096, 13368.5144, 13528.024, 13689.847, 13852.7528, + 14018.3168, 14180.5372, 14346.9668, 14513.5074, 14677.867, 14846.2186, 15017.4186, + 15184.9716, 15356.339, 15529.2972, 15697.3578, 15871.8686, 16042.187, 16216.4094, + 16389.4188, 16565.9126, 16742.3272, 16919.0042, 17094.7592, 17273.965, 17451.8342, + 17634.4254, 17810.5984, 17988.9242, 18171.051, 18354.7938, 18539.466, 18721.0408, + 18904.9972, 19081.867, 19271.9118, 19451.8694, 19637.9816, 19821.2922, 20013.1292, + 20199.3858, 20387.8726, 20572.9514, 20770.7764, 20955.1714, 21144.751, 21329.9952, + 21520.709, 21712.7016, 21906.3868, 22096.2626, 22286.0524, 22475.051, 22665.5098, + 22862.8492, 23055.5294, 23249.6138, 23437.848, 23636.273, 23826.093, 24020.3296, + 24213.3896, 24411.7392, 24602.9614, 24805.7952, 24998.1552, 25193.9588, 25389.0166, + 25585.8392, 25780.6976, 25981.2728, 26175.977, 26376.5252, 26570.1964, 26773.387, + 26962.9812, 27163.0586, 27368.164, 27565.0534, 27758.7428, 27961.1276, 28163.2324, + 28362.3816, 28565.7668, 28758.644, 28956.9768, 29163.4722, 29354.7026, 29561.1186, + 29767.9948, 29959.9986, 30164.0492, 30366.9818, 30562.5338, 30762.9928, 30976.1592, + 31166.274, 31376.722, 31570.3734, 31770.809, 31974.8934, 32179.5286, 32387.5442, + 32582.3504, 32794.076, 32989.9528, 33191.842, 33392.4684, 33595.659, 33801.8672, + 34000.3414, 34200.0922, 34402.6792, 34610.0638, 34804.0084, 35011.13, 35218.669, + 35418.6634, 35619.0792, 35830.6534, 36028.4966, 36229.7902, 36438.6422, 36630.7764, + 36833.3102, 37048.6728, 37247.3916, 37453.5904, 37669.3614, 37854.5526, 38059.305, + 38268.0936, 38470.2516, 38674.7064, 38876.167, 39068.3794, 39281.9144, 39492.8566, + 39684.8628, 39898.4108, 40093.1836, 40297.6858, 40489.7086, 40717.2424, + ], + &[ + 11817.475, 12015.0046, 12215.3792, 12417.7504, 12623.1814, 12830.0086, 13040.0072, + 13252.503, 13466.178, 13683.2738, 13902.0344, 14123.9798, 14347.394, 14573.7784, + 14802.6894, 15033.6824, 15266.9134, 15502.8624, 15741.4944, 15980.7956, 16223.8916, + 16468.6316, 16715.733, 16965.5726, 17217.204, 17470.666, 17727.8516, 17986.7886, + 18247.6902, 18510.9632, 18775.304, 19044.7486, 19314.4408, 19587.202, 19862.2576, + 20135.924, 20417.0324, 20697.9788, 20979.6112, 21265.0274, 21550.723, 21841.6906, + 22132.162, 22428.1406, 22722.127, 23020.5606, 23319.7394, 23620.4014, 23925.2728, + 24226.9224, 24535.581, 24845.505, 25155.9618, 25470.3828, 25785.9702, 26103.7764, + 26420.4132, 26742.0186, 27062.8852, 27388.415, 27714.6024, 28042.296, 28365.4494, + 28701.1526, 29031.8008, 29364.2156, 29704.497, 30037.1458, 30380.111, 30723.8168, + 31059.5114, 31404.9498, 31751.6752, 32095.2686, 32444.7792, 32794.767, 33145.204, + 33498.4226, 33847.6502, 34209.006, 34560.849, 34919.4838, 35274.9778, 35635.1322, + 35996.3266, 36359.1394, 36722.8266, 37082.8516, 37447.7354, 37815.9606, 38191.0692, + 38559.4106, 38924.8112, 39294.6726, 39663.973, 40042.261, 40416.2036, 40779.2036, + 
41161.6436, 41540.9014, 41921.1998, 42294.7698, 42678.5264, 43061.3464, 43432.375, + 43818.432, 44198.6598, 44583.0138, 44970.4794, 45353.924, 45729.858, 46118.2224, + 46511.5724, 46900.7386, 47280.6964, 47668.1472, 48055.6796, 48446.9436, 48838.7146, + 49217.7296, 49613.7796, 50010.7508, 50410.0208, 50793.7886, 51190.2456, 51583.1882, + 51971.0796, 52376.5338, 52763.319, 53165.5534, 53556.5594, 53948.2702, 54346.352, + 54748.7914, 55138.577, 55543.4824, 55941.1748, 56333.7746, 56745.1552, 57142.7944, + 57545.2236, 57935.9956, 58348.5268, 58737.5474, 59158.5962, 59542.6896, 59958.8004, + 60349.3788, 60755.0212, 61147.6144, 61548.194, 61946.0696, 62348.6042, 62763.603, + 63162.781, 63560.635, 63974.3482, 64366.4908, 64771.5876, 65176.7346, 65597.3916, + 65995.915, 66394.0384, 66822.9396, 67203.6336, 67612.2032, 68019.0078, 68420.0388, + 68821.22, 69235.8388, 69640.0724, 70055.155, 70466.357, 70863.4266, 71276.2482, 71677.0306, + 72080.2006, 72493.0214, 72893.5952, 73314.5856, 73714.9852, 74125.3022, 74521.2122, + 74933.6814, 75341.5904, 75743.0244, 76166.0278, 76572.1322, 76973.1028, 77381.6284, + 77800.6092, 78189.328, 78607.0962, 79012.2508, 79407.8358, 79825.725, 80238.701, 80646.891, + 81035.6436, 81460.0448, 81876.3884, + ], + &[ + 23635.0036, + 24030.8034, + 24431.4744, + 24837.1524, + 25246.7928, + 25661.326, + 26081.3532, + 26505.2806, + 26933.9892, + 27367.7098, + 27805.318, + 28248.799, + 28696.4382, + 29148.8244, + 29605.5138, + 30066.8668, + 30534.2344, + 31006.32, + 31480.778, + 31962.2418, + 32447.3324, + 32938.0232, + 33432.731, + 33930.728, + 34433.9896, + 34944.1402, + 35457.5588, + 35974.5958, + 36497.3296, + 37021.9096, + 37554.326, + 38088.0826, + 38628.8816, + 39171.3192, + 39723.2326, + 40274.5554, + 40832.3142, + 41390.613, + 41959.5908, + 42532.5466, + 43102.0344, + 43683.5072, + 44266.694, + 44851.2822, + 45440.7862, + 46038.0586, + 46640.3164, + 47241.064, + 47846.155, + 48454.7396, + 49076.9168, + 49692.542, + 50317.4778, + 50939.65, + 51572.5596, + 52210.2906, + 52843.7396, + 53481.3996, + 54127.236, + 54770.406, + 55422.6598, + 56078.7958, + 56736.7174, + 57397.6784, + 58064.5784, + 58730.308, + 59404.9784, + 60077.0864, + 60751.9158, + 61444.1386, + 62115.817, + 62808.7742, + 63501.4774, + 64187.5454, + 64883.6622, + 65582.7468, + 66274.5318, + 66976.9276, + 67688.7764, + 68402.138, + 69109.6274, + 69822.9706, + 70543.6108, + 71265.5202, + 71983.3848, + 72708.4656, + 73433.384, + 74158.4664, + 74896.4868, + 75620.9564, + 76362.1434, + 77098.3204, + 77835.7662, + 78582.6114, + 79323.9902, + 80067.8658, + 80814.9246, + 81567.0136, + 82310.8536, + 83061.9952, + 83821.4096, + 84580.8608, + 85335.547, + 86092.5802, + 86851.6506, + 87612.311, + 88381.2016, + 89146.3296, + 89907.8974, + 90676.846, + 91451.4152, + 92224.5518, + 92995.8686, + 93763.5066, + 94551.2796, + 95315.1944, + 96096.1806, + 96881.0918, + 97665.679, + 98442.68, + 99229.3002, + 100011.0994, + 100790.6386, + 101580.1564, + 102377.7484, + 103152.1392, + 103944.2712, + 104730.216, + 105528.6336, + 106324.9398, + 107117.6706, + 107890.3988, + 108695.2266, + 109485.238, + 110294.7876, + 111075.0958, + 111878.0496, + 112695.2864, + 113464.5486, + 114270.0474, + 115068.608, + 115884.3626, + 116673.2588, + 117483.3716, + 118275.097, + 119085.4092, + 119879.2808, + 120687.5868, + 121499.9944, + 122284.916, + 123095.9254, + 123912.5038, + 124709.0454, + 125503.7182, + 126323.259, + 127138.9412, + 127943.8294, + 128755.646, + 129556.5354, + 130375.3298, + 131161.4734, + 131971.1962, + 132787.5458, + 
133588.1056, + 134431.351, + 135220.2906, + 136023.398, + 136846.6558, + 137667.0004, + 138463.663, + 139283.7154, + 140074.6146, + 140901.3072, + 141721.8548, + 142543.2322, + 143356.1096, + 144173.7412, + 144973.0948, + 145794.3162, + 146609.5714, + 147420.003, + 148237.9784, + 149050.5696, + 149854.761, + 150663.1966, + 151494.0754, + 152313.1416, + 153112.6902, + 153935.7206, + 154746.9262, + 155559.547, + 156401.9746, + 157228.7036, + 158008.7254, + 158820.75, + 159646.9184, + 160470.4458, + 161279.5348, + 162093.3114, + 162918.542, + 163729.2842, + ], + &[ + 47271.0, + 48062.3584, + 48862.7074, + 49673.152, + 50492.8416, + 51322.9514, + 52161.03, + 53009.407, + 53867.6348, + 54734.206, + 55610.5144, + 56496.2096, + 57390.795, + 58297.268, + 59210.6448, + 60134.665, + 61068.0248, + 62010.4472, + 62962.5204, + 63923.5742, + 64895.0194, + 65876.4182, + 66862.6136, + 67862.6968, + 68868.8908, + 69882.8544, + 70911.271, + 71944.0924, + 72990.0326, + 74040.692, + 75100.6336, + 76174.7826, + 77252.5998, + 78340.2974, + 79438.2572, + 80545.4976, + 81657.2796, + 82784.6336, + 83915.515, + 85059.7362, + 86205.9368, + 87364.4424, + 88530.3358, + 89707.3744, + 90885.9638, + 92080.197, + 93275.5738, + 94479.391, + 95695.918, + 96919.2236, + 98148.4602, + 99382.3474, + 100625.6974, + 101878.0284, + 103141.6278, + 104409.4588, + 105686.2882, + 106967.5402, + 108261.6032, + 109548.1578, + 110852.0728, + 112162.231, + 113479.0072, + 114806.2626, + 116137.9072, + 117469.5048, + 118813.5186, + 120165.4876, + 121516.2556, + 122875.766, + 124250.5444, + 125621.2222, + 127003.2352, + 128387.848, + 129775.2644, + 131181.7776, + 132577.3086, + 133979.9458, + 135394.1132, + 136800.9078, + 138233.217, + 139668.5308, + 141085.212, + 142535.2122, + 143969.0684, + 145420.2872, + 146878.1542, + 148332.7572, + 149800.3202, + 151269.66, + 152743.6104, + 154213.0948, + 155690.288, + 157169.4246, + 158672.1756, + 160160.059, + 161650.6854, + 163145.7772, + 164645.6726, + 166159.1952, + 167682.1578, + 169177.3328, + 170700.0118, + 172228.8964, + 173732.6664, + 175265.5556, + 176787.799, + 178317.111, + 179856.6914, + 181400.865, + 182943.4612, + 184486.742, + 186033.4698, + 187583.7886, + 189148.1868, + 190688.4526, + 192250.1926, + 193810.9042, + 195354.2972, + 196938.7682, + 198493.5898, + 200079.2824, + 201618.912, + 203205.5492, + 204765.5798, + 206356.1124, + 207929.3064, + 209498.7196, + 211086.229, + 212675.1324, + 214256.7892, + 215826.2392, + 217412.8474, + 218995.6724, + 220618.6038, + 222207.1166, + 223781.0364, + 225387.4332, + 227005.7928, + 228590.4336, + 230217.8738, + 231805.1054, + 233408.9, + 234995.3432, + 236601.4956, + 238190.7904, + 239817.2548, + 241411.2832, + 243002.4066, + 244640.1884, + 246255.3128, + 247849.3508, + 249479.9734, + 251106.8822, + 252705.027, + 254332.9242, + 255935.129, + 257526.9014, + 259154.772, + 260777.625, + 262390.253, + 264004.4906, + 265643.59, + 267255.4076, + 268873.426, + 270470.7252, + 272106.4804, + 273722.4456, + 275337.794, + 276945.7038, + 278592.9154, + 280204.3726, + 281841.1606, + 283489.171, + 285130.1716, + 286735.3362, + 288364.7164, + 289961.1814, + 291595.5524, + 293285.683, + 294899.6668, + 296499.3434, + 298128.0462, + 299761.8946, + 301394.2424, + 302997.6748, + 304615.1478, + 306269.7724, + 307886.114, + 309543.1028, + 311153.2862, + 312782.8546, + 314421.2008, + 316033.2438, + 317692.9636, + 319305.2648, + 320948.7406, + 322566.3364, + 324228.4224, + 325847.1542, + ], + &[ + 94542.0, + 96125.811, + 97728.019, + 99348.558, + 100987.9705, + 
102646.7565, + 104324.5125, + 106021.7435, + 107736.7865, + 109469.272, + 111223.9465, + 112995.219, + 114787.432, + 116593.152, + 118422.71, + 120267.2345, + 122134.6765, + 124020.937, + 125927.2705, + 127851.255, + 129788.9485, + 131751.016, + 133726.8225, + 135722.592, + 137736.789, + 139770.568, + 141821.518, + 143891.343, + 145982.1415, + 148095.387, + 150207.526, + 152355.649, + 154515.6415, + 156696.05, + 158887.7575, + 161098.159, + 163329.852, + 165569.053, + 167837.4005, + 170121.6165, + 172420.4595, + 174732.6265, + 177062.77, + 179412.502, + 181774.035, + 184151.939, + 186551.6895, + 188965.691, + 191402.8095, + 193857.949, + 196305.0775, + 198774.6715, + 201271.2585, + 203764.78, + 206299.3695, + 208818.1365, + 211373.115, + 213946.7465, + 216532.076, + 219105.541, + 221714.5375, + 224337.5135, + 226977.5125, + 229613.0655, + 232270.2685, + 234952.2065, + 237645.3555, + 240331.1925, + 243034.517, + 245756.0725, + 248517.6865, + 251232.737, + 254011.3955, + 256785.995, + 259556.44, + 262368.335, + 265156.911, + 267965.266, + 270785.583, + 273616.0495, + 276487.4835, + 279346.639, + 282202.509, + 285074.3885, + 287942.2855, + 290856.018, + 293774.0345, + 296678.5145, + 299603.6355, + 302552.6575, + 305492.9785, + 308466.8605, + 311392.581, + 314347.538, + 317319.4295, + 320285.9785, + 323301.7325, + 326298.3235, + 329301.3105, + 332301.987, + 335309.791, + 338370.762, + 341382.923, + 344431.1265, + 347464.1545, + 350507.28, + 353619.2345, + 356631.2005, + 359685.203, + 362776.7845, + 365886.488, + 368958.2255, + 372060.6825, + 375165.4335, + 378237.935, + 381328.311, + 384430.5225, + 387576.425, + 390683.242, + 393839.648, + 396977.8425, + 400101.9805, + 403271.296, + 406409.8425, + 409529.5485, + 412678.7, + 415847.423, + 419020.8035, + 422157.081, + 425337.749, + 428479.6165, + 431700.902, + 434893.1915, + 438049.582, + 441210.5415, + 444379.2545, + 447577.356, + 450741.931, + 453959.548, + 457137.0935, + 460329.846, + 463537.4815, + 466732.3345, + 469960.5615, + 473164.681, + 476347.6345, + 479496.173, + 482813.1645, + 486025.6995, + 489249.4885, + 492460.1945, + 495675.8805, + 498908.0075, + 502131.802, + 505374.3855, + 508550.9915, + 511806.7305, + 515026.776, + 518217.0005, + 521523.9855, + 524705.9855, + 527950.997, + 531210.0265, + 534472.497, + 537750.7315, + 540926.922, + 544207.094, + 547429.4345, + 550666.3745, + 553975.3475, + 557150.7185, + 560399.6165, + 563662.697, + 566916.7395, + 570146.1215, + 573447.425, + 576689.6245, + 579874.5745, + 583202.337, + 586503.0255, + 589715.635, + 592910.161, + 596214.3885, + 599488.035, + 602740.92, + 605983.0685, + 609248.67, + 612491.3605, + 615787.912, + 619107.5245, + 622307.9555, + 625577.333, + 628840.4385, + 632085.2155, + 635317.6135, + 638691.7195, + 641887.467, + 645139.9405, + 648441.546, + 651666.252, + 654941.845, + ], + &[ + 189084.0, + 192250.913, + 195456.774, + 198696.946, + 201977.762, + 205294.444, + 208651.754, + 212042.099, + 215472.269, + 218941.91, + 222443.912, + 225996.845, + 229568.199, + 233193.568, + 236844.457, + 240543.233, + 244279.475, + 248044.27, + 251854.588, + 255693.2, + 259583.619, + 263494.621, + 267445.385, + 271454.061, + 275468.769, + 279549.456, + 283646.446, + 287788.198, + 291966.099, + 296181.164, + 300431.469, + 304718.618, + 309024.004, + 313393.508, + 317760.803, + 322209.731, + 326675.061, + 331160.627, + 335654.47, + 340241.442, + 344841.833, + 349467.132, + 354130.629, + 358819.432, + 363574.626, + 368296.587, + 373118.482, + 377914.93, + 382782.301, + 387680.669, + 
392601.981, + 397544.323, + 402529.115, + 407546.018, + 412593.658, + 417638.657, + 422762.865, + 427886.169, + 433017.167, + 438213.273, + 443441.254, + 448692.421, + 453937.533, + 459239.049, + 464529.569, + 469910.083, + 475274.03, + 480684.473, + 486070.26, + 491515.237, + 496995.651, + 502476.617, + 507973.609, + 513497.19, + 519083.233, + 524726.509, + 530305.505, + 535945.728, + 541584.404, + 547274.055, + 552967.236, + 558667.862, + 564360.216, + 570128.148, + 575965.08, + 581701.952, + 587532.523, + 593361.144, + 599246.128, + 605033.418, + 610958.779, + 616837.117, + 622772.818, + 628672.04, + 634675.369, + 640574.831, + 646585.739, + 652574.547, + 658611.217, + 664642.684, + 670713.914, + 676737.681, + 682797.313, + 688837.897, + 694917.874, + 701009.882, + 707173.648, + 713257.254, + 719415.392, + 725636.761, + 731710.697, + 737906.209, + 744103.074, + 750313.39, + 756504.185, + 762712.579, + 768876.985, + 775167.859, + 781359.0, + 787615.959, + 793863.597, + 800245.477, + 806464.582, + 812785.294, + 819005.925, + 825403.057, + 831676.197, + 837936.284, + 844266.968, + 850642.711, + 856959.756, + 863322.774, + 869699.931, + 876102.478, + 882355.787, + 888694.463, + 895159.952, + 901536.143, + 907872.631, + 914293.672, + 920615.14, + 927130.974, + 933409.404, + 939922.178, + 946331.47, + 952745.93, + 959209.264, + 965590.224, + 972077.284, + 978501.961, + 984953.19, + 991413.271, + 997817.479, + 1004222.658, + 1010725.676, + 1017177.138, + 1023612.529, + 1030098.236, + 1036493.719, + 1043112.207, + 1049537.036, + 1056008.096, + 1062476.184, + 1068942.337, + 1075524.95, + 1081932.864, + 1088426.025, + 1094776.005, + 1101327.448, + 1107901.673, + 1114423.639, + 1120884.602, + 1127324.923, + 1133794.24, + 1140328.886, + 1146849.376, + 1153346.682, + 1159836.502, + 1166478.703, + 1172953.304, + 1179391.502, + 1185950.982, + 1192544.052, + 1198913.41, + 1205430.994, + 1212015.525, + 1218674.042, + 1225121.683, + 1231551.101, + 1238126.379, + 1244673.795, + 1251260.649, + 1257697.86, + 1264320.983, + 1270736.319, + 1277274.694, + 1283804.95, + 1290211.514, + 1296858.568, + 1303455.691, + ], +]; + +static BIAS_DATA: &[&[f64]] = &[ + &[ + 10.0, + 9.717, + 9.207, + 8.7896, + 8.2882, + 7.8204, + 7.3772, + 6.9342, + 6.5202, + 6.161, + 5.7722, + 5.4636, + 5.0396, + 4.6766, + 4.3566, + 4.0454, + 3.7936, + 3.4856, + 3.2666, + 2.9946, + 2.766, + 2.4692, + 2.3638, + 2.0764, + 1.7864, + 1.7602, + 1.4814, + 1.433, + 1.2926, + 1.0664, + 0.999600000000001, + 0.7956, + 0.5366, + 0.589399999999998, + 0.573799999999999, + 0.269799999999996, + 0.368200000000002, + 0.0544000000000011, + 0.234200000000001, + 0.0108000000000033, + -0.203400000000002, + -0.0701999999999998, + -0.129600000000003, + -0.364199999999997, + -0.480600000000003, + -0.226999999999997, + -0.322800000000001, + -0.382599999999996, + -0.511200000000002, + -0.669600000000003, + -0.749400000000001, + -0.500399999999999, + -0.617600000000003, + -0.6922, + -0.601599999999998, + -0.416200000000003, + -0.338200000000001, + -0.782600000000002, + -0.648600000000002, + -0.919800000000002, + -0.851799999999997, + -0.962400000000002, + -0.6402, + -1.1922, + -1.0256, + -1.086, + -1.21899999999999, + -0.819400000000002, + -0.940600000000003, + -1.1554, + -1.2072, + -1.1752, + -1.16759999999999, + -1.14019999999999, + -1.3754, + -1.29859999999999, + -1.607, + -1.3292, + -1.7606, + ], + &[ + 22.0, + 21.1194, + 20.8208, + 20.2318, + 19.77, + 19.2436, + 18.7774, + 18.2848, + 17.8224, + 17.3742, + 16.9336, + 16.503, + 16.0494, + 15.6292, + 15.2124, + 
14.798, + 14.367, + 13.9728, + 13.5944, + 13.217, + 12.8438, + 12.3696, + 12.0956, + 11.7044, + 11.324, + 11.0668, + 10.6698, + 10.3644, + 10.049, + 9.6918, + 9.4146, + 9.082, + 8.687, + 8.5398, + 8.2462, + 7.857, + 7.6606, + 7.4168, + 7.1248, + 6.9222, + 6.6804, + 6.447, + 6.3454, + 5.9594, + 5.7636, + 5.5776, + 5.331, + 5.19, + 4.9676, + 4.7564, + 4.5314, + 4.4442, + 4.3708, + 3.9774, + 3.9624, + 3.8796, + 3.755, + 3.472, + 3.2076, + 3.1024, + 2.8908, + 2.7338, + 2.7728, + 2.629, + 2.413, + 2.3266, + 2.1524, + 2.2642, + 2.1806, + 2.0566, + 1.9192, + 1.7598, + 1.3516, + 1.5802, + 1.43859999999999, + 1.49160000000001, + 1.1524, + 1.1892, + 0.841399999999993, + 0.879800000000003, + 0.837599999999995, + 0.469800000000006, + 0.765600000000006, + 0.331000000000003, + 0.591399999999993, + 0.601200000000006, + 0.701599999999999, + 0.558199999999999, + 0.339399999999998, + 0.354399999999998, + 0.491200000000006, + 0.308000000000007, + 0.355199999999996, + -0.0254000000000048, + 0.205200000000005, + -0.272999999999996, + 0.132199999999997, + 0.394400000000005, + -0.241200000000006, + 0.242000000000004, + 0.191400000000002, + 0.253799999999998, + -0.122399999999999, + -0.370800000000003, + 0.193200000000004, + -0.0848000000000013, + 0.0867999999999967, + -0.327200000000005, + -0.285600000000002, + 0.311400000000006, + -0.128399999999999, + -0.754999999999995, + -0.209199999999996, + -0.293599999999998, + -0.364000000000004, + -0.253600000000006, + -0.821200000000005, + -0.253600000000006, + -0.510400000000004, + -0.383399999999995, + -0.491799999999998, + -0.220200000000006, + -0.0972000000000008, + -0.557400000000001, + -0.114599999999996, + -0.295000000000002, + -0.534800000000004, + 0.346399999999988, + -0.65379999999999, + 0.0398000000000138, + 0.0341999999999985, + -0.995800000000003, + -0.523400000000009, + -0.489000000000004, + -0.274799999999999, + -0.574999999999989, + -0.482799999999997, + 0.0571999999999946, + -0.330600000000004, + -0.628800000000012, + -0.140199999999993, + -0.540600000000012, + -0.445999999999998, + -0.599400000000003, + -0.262599999999992, + 0.163399999999996, + -0.100599999999986, + -0.39500000000001, + -1.06960000000001, + -0.836399999999998, + -0.753199999999993, + -0.412399999999991, + -0.790400000000005, + -0.29679999999999, + -0.28540000000001, + -0.193000000000012, + -0.0772000000000048, + -0.962799999999987, + -0.414800000000014, + ], + &[ + 45.0, + 44.1902, + 43.271, + 42.8358, + 41.8142, + 41.2854, + 40.317, + 39.354, + 38.8924, + 37.9436, + 37.4596, + 36.5262, + 35.6248, + 35.1574, + 34.2822, + 33.837, + 32.9636, + 32.074, + 31.7042, + 30.7976, + 30.4772, + 29.6564, + 28.7942, + 28.5004, + 27.686, + 27.291, + 26.5672, + 25.8556, + 25.4982, + 24.8204, + 24.4252, + 23.7744, + 23.0786, + 22.8344, + 22.0294, + 21.8098, + 21.0794, + 20.5732, + 20.1878, + 19.5648, + 19.2902, + 18.6784, + 18.3352, + 17.8946, + 17.3712, + 17.0852, + 16.499, + 16.2686, + 15.6844, + 15.2234, + 14.9732, + 14.3356, + 14.2286, + 13.7262, + 13.3284, + 13.1048, + 12.5962, + 12.3562, + 12.1272, + 11.4184, + 11.4974, + 11.0822, + 10.856, + 10.48, + 10.2834, + 10.0208, + 9.637, + 9.51739999999999, + 9.05759999999999, + 8.74760000000001, + 8.42700000000001, + 8.1326, + 8.2372, + 8.2788, + 7.6776, + 7.79259999999999, + 7.1952, + 6.9564, + 6.6454, + 6.87, + 6.5428, + 6.19999999999999, + 6.02940000000001, + 5.62780000000001, + 5.6782, + 5.792, + 5.35159999999999, + 5.28319999999999, + 5.0394, + 5.07480000000001, + 4.49119999999999, + 4.84899999999999, + 4.696, + 4.54040000000001, + 
4.07300000000001, + 4.37139999999999, + 3.7216, + 3.7328, + 3.42080000000001, + 3.41839999999999, + 3.94239999999999, + 3.27719999999999, + 3.411, + 3.13079999999999, + 2.76900000000001, + 2.92580000000001, + 2.68279999999999, + 2.75020000000001, + 2.70599999999999, + 2.3886, + 3.01859999999999, + 2.45179999999999, + 2.92699999999999, + 2.41720000000001, + 2.41139999999999, + 2.03299999999999, + 2.51240000000001, + 2.5564, + 2.60079999999999, + 2.41720000000001, + 1.80439999999999, + 1.99700000000001, + 2.45480000000001, + 1.8948, + 2.2346, + 2.30860000000001, + 2.15479999999999, + 1.88419999999999, + 1.6508, + 0.677199999999999, + 1.72540000000001, + 1.4752, + 1.72280000000001, + 1.66139999999999, + 1.16759999999999, + 1.79300000000001, + 1.00059999999999, + 0.905200000000008, + 0.659999999999997, + 1.55879999999999, + 1.1636, + 0.688199999999995, + 0.712600000000009, + 0.450199999999995, + 1.1978, + 0.975599999999986, + 0.165400000000005, + 1.727, + 1.19739999999999, + -0.252600000000001, + 1.13460000000001, + 1.3048, + 1.19479999999999, + 0.313400000000001, + 0.878999999999991, + 1.12039999999999, + 0.853000000000009, + 1.67920000000001, + 0.856999999999999, + 0.448599999999999, + 1.2362, + 0.953399999999988, + 1.02859999999998, + 0.563199999999995, + 0.663000000000011, + 0.723000000000013, + 0.756599999999992, + 0.256599999999992, + -0.837600000000009, + 0.620000000000005, + 0.821599999999989, + 0.216600000000028, + 0.205600000000004, + 0.220199999999977, + 0.372599999999977, + 0.334400000000016, + 0.928400000000011, + 0.972800000000007, + 0.192400000000021, + 0.487199999999973, + -0.413000000000011, + 0.807000000000016, + 0.120600000000024, + 0.769000000000005, + 0.870799999999974, + 0.66500000000002, + 0.118200000000002, + 0.401200000000017, + 0.635199999999998, + 0.135400000000004, + 0.175599999999974, + 1.16059999999999, + 0.34620000000001, + 0.521400000000028, + -0.586599999999976, + -1.16480000000001, + 0.968399999999974, + 0.836999999999989, + 0.779600000000016, + 0.985799999999983, + ], + &[ + 91.0, + 89.4934, + 87.9758, + 86.4574, + 84.9718, + 83.4954, + 81.5302, + 80.0756, + 78.6374, + 77.1782, + 75.7888, + 73.9522, + 72.592, + 71.2532, + 69.9086, + 68.5938, + 66.9474, + 65.6796, + 64.4394, + 63.2176, + 61.9768, + 60.4214, + 59.2528, + 58.0102, + 56.8658, + 55.7278, + 54.3044, + 53.1316, + 52.093, + 51.0032, + 49.9092, + 48.6306, + 47.5294, + 46.5756, + 45.6508, + 44.662, + 43.552, + 42.3724, + 41.617, + 40.5754, + 39.7872, + 38.8444, + 37.7988, + 36.8606, + 36.2118, + 35.3566, + 34.4476, + 33.5882, + 32.6816, + 32.0824, + 31.0258, + 30.6048, + 29.4436, + 28.7274, + 27.957, + 27.147, + 26.4364, + 25.7592, + 25.3386, + 24.781, + 23.8028, + 23.656, + 22.6544, + 21.996, + 21.4718, + 21.1544, + 20.6098, + 19.5956, + 19.0616, + 18.5758, + 18.4878, + 17.5244, + 17.2146, + 16.724, + 15.8722, + 15.5198, + 15.0414, + 14.941, + 14.9048, + 13.87, + 13.4304, + 13.028, + 12.4708, + 12.37, + 12.0624, + 11.4668, + 11.5532, + 11.4352, + 11.2564, + 10.2744, + 10.2118, + 9.74720000000002, + 10.1456, + 9.2928, + 8.75040000000001, + 8.55279999999999, + 8.97899999999998, + 8.21019999999999, + 8.18340000000001, + 7.3494, + 7.32499999999999, + 7.66140000000001, + 6.90300000000002, + 7.25439999999998, + 6.9042, + 7.21499999999997, + 6.28640000000001, + 6.08139999999997, + 6.6764, + 6.30099999999999, + 5.13900000000001, + 5.65800000000002, + 5.17320000000001, + 4.59019999999998, + 4.9538, + 5.08280000000002, + 4.92200000000003, + 4.99020000000002, + 4.7328, + 5.4538, + 4.11360000000002, + 
4.22340000000003, + 4.08780000000002, + 3.70800000000003, + 4.15559999999999, + 4.18520000000001, + 3.63720000000001, + 3.68220000000002, + 3.77960000000002, + 3.6078, + 2.49160000000001, + 3.13099999999997, + 2.5376, + 3.19880000000001, + 3.21100000000001, + 2.4502, + 3.52820000000003, + 2.91199999999998, + 3.04480000000001, + 2.7432, + 2.85239999999999, + 2.79880000000003, + 2.78579999999999, + 1.88679999999999, + 2.98860000000002, + 2.50639999999999, + 1.91239999999999, + 2.66160000000002, + 2.46820000000002, + 1.58199999999999, + 1.30399999999997, + 2.27379999999999, + 2.68939999999998, + 1.32900000000001, + 3.10599999999999, + 1.69080000000002, + 2.13740000000001, + 2.53219999999999, + 1.88479999999998, + 1.33240000000001, + 1.45119999999997, + 1.17899999999997, + 2.44119999999998, + 1.60659999999996, + 2.16700000000003, + 0.77940000000001, + 2.37900000000002, + 2.06700000000001, + 1.46000000000004, + 2.91160000000002, + 1.69200000000001, + 0.954600000000028, + 2.49300000000005, + 2.2722, + 1.33500000000004, + 2.44899999999996, + 1.20140000000004, + 3.07380000000001, + 2.09739999999999, + 2.85640000000001, + 2.29960000000005, + 2.40899999999999, + 1.97040000000004, + 0.809799999999996, + 1.65279999999996, + 2.59979999999996, + 0.95799999999997, + 2.06799999999998, + 2.32780000000002, + 4.20159999999998, + 1.96320000000003, + 1.86400000000003, + 1.42999999999995, + 3.77940000000001, + 1.27200000000005, + 1.86440000000005, + 2.20600000000002, + 3.21900000000005, + 1.5154, + 2.61019999999996, + ], + &[ + 183.2152, + 180.2454, + 177.2096, + 173.6652, + 170.6312, + 167.6822, + 164.249, + 161.3296, + 158.0038, + 155.2074, + 152.4612, + 149.27, + 146.5178, + 143.4412, + 140.8032, + 138.1634, + 135.1688, + 132.6074, + 129.6946, + 127.2664, + 124.8228, + 122.0432, + 119.6824, + 116.9464, + 114.6268, + 112.2626, + 109.8376, + 107.4034, + 104.8956, + 102.8522, + 100.7638, + 98.3552, + 96.3556, + 93.7526, + 91.9292, + 89.8954, + 87.8198, + 85.7668, + 83.298, + 81.6688, + 79.9466, + 77.9746, + 76.1672, + 74.3474, + 72.3028, + 70.8912, + 69.114, + 67.4646, + 65.9744, + 64.4092, + 62.6022, + 60.843, + 59.5684, + 58.1652, + 56.5426, + 55.4152, + 53.5388, + 52.3592, + 51.1366, + 49.486, + 48.3918, + 46.5076, + 45.509, + 44.3834, + 43.3498, + 42.0668, + 40.7346, + 40.1228, + 38.4528, + 37.7, + 36.644, + 36.0518, + 34.5774, + 33.9068, + 32.432, + 32.1666, + 30.434, + 29.6644, + 28.4894, + 27.6312, + 26.3804, + 26.292, + 25.5496000000001, + 25.0234, + 24.8206, + 22.6146, + 22.4188, + 22.117, + 20.6762, + 20.6576, + 19.7864, + 19.509, + 18.5334, + 17.9204, + 17.772, + 16.2924, + 16.8654, + 15.1836, + 15.745, + 15.1316, + 15.0386, + 14.0136, + 13.6342, + 12.6196, + 12.1866, + 12.4281999999999, + 11.3324, + 10.4794000000001, + 11.5038, + 10.129, + 9.52800000000002, + 10.3203999999999, + 9.46299999999997, + 9.79280000000006, + 9.12300000000005, + 8.74180000000001, + 9.2192, + 7.51020000000005, + 7.60659999999996, + 7.01840000000004, + 7.22239999999999, + 7.40139999999997, + 6.76179999999999, + 7.14359999999999, + 5.65060000000005, + 5.63779999999997, + 5.76599999999996, + 6.75139999999999, + 5.57759999999996, + 3.73220000000003, + 5.8048, + 5.63019999999995, + 4.93359999999996, + 3.47979999999995, + 4.33879999999999, + 3.98940000000005, + 3.81960000000004, + 3.31359999999995, + 3.23080000000004, + 3.4588, + 3.08159999999998, + 3.4076, + 3.00639999999999, + 2.38779999999997, + 2.61900000000003, + 1.99800000000005, + 3.34820000000002, + 2.95060000000001, + 0.990999999999985, + 2.11440000000005, + 
2.20299999999997, + 2.82219999999995, + 2.73239999999998, + 2.7826, + 3.76660000000004, + 2.26480000000004, + 2.31280000000004, + 2.40819999999997, + 2.75360000000001, + 3.33759999999995, + 2.71559999999999, + 1.7478000000001, + 1.42920000000004, + 2.39300000000003, + 2.22779999999989, + 2.34339999999997, + 0.87259999999992, + 3.88400000000001, + 1.80600000000004, + 1.91759999999999, + 1.16779999999994, + 1.50320000000011, + 2.52500000000009, + 0.226400000000012, + 2.31500000000005, + 0.930000000000064, + 1.25199999999995, + 2.14959999999996, + 0.0407999999999902, + 2.5447999999999, + 1.32960000000003, + 0.197400000000016, + 2.52620000000002, + 3.33279999999991, + -1.34300000000007, + 0.422199999999975, + 0.917200000000093, + 1.12920000000008, + 1.46060000000011, + 1.45779999999991, + 2.8728000000001, + 3.33359999999993, + -1.34079999999994, + 1.57680000000005, + 0.363000000000056, + 1.40740000000005, + 0.656600000000026, + 0.801400000000058, + -0.454600000000028, + 1.51919999999996, + ], + &[ + 368.0, + 361.8294, + 355.2452, + 348.6698, + 342.1464, + 336.2024, + 329.8782, + 323.6598, + 317.462, + 311.2826, + 305.7102, + 299.7416, + 293.9366, + 288.1046, + 282.285, + 277.0668, + 271.306, + 265.8448, + 260.301, + 254.9886, + 250.2422, + 244.8138, + 239.7074, + 234.7428, + 229.8402, + 225.1664, + 220.3534, + 215.594, + 210.6886, + 205.7876, + 201.65, + 197.228, + 192.8036, + 188.1666, + 184.0818, + 180.0824, + 176.2574, + 172.302, + 168.1644, + 164.0056, + 160.3802, + 156.7192, + 152.5234, + 149.2084, + 145.831, + 142.485, + 139.1112, + 135.4764, + 131.76, + 129.3368, + 126.5538, + 122.5058, + 119.2646, + 116.5902, + 113.3818, + 110.8998, + 107.9532, + 105.2062, + 102.2798, + 99.4728, + 96.9582, + 94.3292, + 92.171, + 89.7809999999999, + 87.5716, + 84.7048, + 82.5322, + 79.875, + 78.3972, + 75.3464, + 73.7274, + 71.2834, + 70.1444, + 68.4263999999999, + 66.0166, + 64.018, + 62.0437999999999, + 60.3399999999999, + 58.6856, + 57.9836, + 55.0311999999999, + 54.6769999999999, + 52.3188, + 51.4846, + 49.4423999999999, + 47.739, + 46.1487999999999, + 44.9202, + 43.4059999999999, + 42.5342000000001, + 41.2834, + 38.8954000000001, + 38.3286000000001, + 36.2146, + 36.6684, + 35.9946, + 33.123, + 33.4338, + 31.7378000000001, + 29.076, + 28.9692, + 27.4964, + 27.0998, + 25.9864, + 26.7754, + 24.3208, + 23.4838, + 22.7388000000001, + 24.0758000000001, + 21.9097999999999, + 20.9728, + 19.9228000000001, + 19.9292, + 16.617, + 17.05, + 18.2996000000001, + 15.6128000000001, + 15.7392, + 14.5174, + 13.6322, + 12.2583999999999, + 13.3766000000001, + 11.423, + 13.1232, + 9.51639999999998, + 10.5938000000001, + 9.59719999999993, + 8.12220000000002, + 9.76739999999995, + 7.50440000000003, + 7.56999999999994, + 6.70440000000008, + 6.41419999999994, + 6.71019999999999, + 5.60940000000005, + 4.65219999999999, + 6.84099999999989, + 3.4072000000001, + 3.97859999999991, + 3.32760000000007, + 5.52160000000003, + 3.31860000000006, + 2.06940000000009, + 4.35400000000004, + 1.57500000000005, + 0.280799999999999, + 2.12879999999996, + -0.214799999999968, + -0.0378000000000611, + -0.658200000000079, + 0.654800000000023, + -0.0697999999999865, + 0.858400000000074, + -2.52700000000004, + -2.1751999999999, + -3.35539999999992, + -1.04019999999991, + -0.651000000000067, + -2.14439999999991, + -1.96659999999997, + -3.97939999999994, + -0.604400000000169, + -3.08260000000018, + -3.39159999999993, + -5.29640000000018, + -5.38920000000007, + -5.08759999999984, + -4.69900000000007, + -5.23720000000003, + -3.15779999999995, + 
-4.97879999999986, + -4.89899999999989, + -7.48880000000008, + -5.94799999999987, + -5.68060000000014, + -6.67180000000008, + -4.70499999999993, + -7.27779999999984, + -4.6579999999999, + -4.4362000000001, + -4.32139999999981, + -5.18859999999995, + -6.66879999999992, + -6.48399999999992, + -5.1260000000002, + -4.4032000000002, + -6.13500000000022, + -5.80819999999994, + -4.16719999999987, + -4.15039999999999, + -7.45600000000013, + -7.24080000000004, + -9.83179999999993, + -5.80420000000004, + -8.6561999999999, + -6.99940000000015, + -10.5473999999999, + -7.34139999999979, + -6.80999999999995, + -6.29719999999998, + -6.23199999999997, + ], + &[ + 737.1256, + 724.4234, + 711.1064, + 698.4732, + 685.4636, + 673.0644, + 660.488, + 647.9654, + 636.0832, + 623.7864, + 612.1992, + 600.2176, + 588.5228, + 577.1716, + 565.7752, + 554.899, + 543.6126, + 532.6492, + 521.9474, + 511.5214, + 501.1064, + 490.6364, + 480.2468, + 470.4588, + 460.3832, + 451.0584, + 440.8606, + 431.3868, + 422.5062, + 413.1862, + 404.463, + 395.339, + 386.1936, + 378.1292, + 369.1854, + 361.2908, + 353.3324, + 344.8518, + 337.5204, + 329.4854, + 321.9318, + 314.552, + 306.4658, + 299.4256, + 292.849, + 286.152, + 278.8956, + 271.8792, + 265.118, + 258.62, + 252.5132, + 245.9322, + 239.7726, + 233.6086, + 227.5332, + 222.5918, + 216.4294, + 210.7662, + 205.4106, + 199.7338, + 194.9012, + 188.4486, + 183.1556, + 178.6338, + 173.7312, + 169.6264, + 163.9526, + 159.8742, + 155.8326, + 151.1966, + 147.5594, + 143.07, + 140.037, + 134.1804, + 131.071, + 127.4884, + 124.0848, + 120.2944, + 117.333, + 112.9626, + 110.2902, + 107.0814, + 103.0334, + 99.4832000000001, + 96.3899999999999, + 93.7202000000002, + 90.1714000000002, + 87.2357999999999, + 85.9346, + 82.8910000000001, + 80.0264000000002, + 78.3834000000002, + 75.1543999999999, + 73.8683999999998, + 70.9895999999999, + 69.4367999999999, + 64.8701999999998, + 65.0408000000002, + 61.6738, + 59.5207999999998, + 57.0158000000001, + 54.2302, + 53.0962, + 50.4985999999999, + 52.2588000000001, + 47.3914, + 45.6244000000002, + 42.8377999999998, + 43.0072, + 40.6516000000001, + 40.2453999999998, + 35.2136, + 36.4546, + 33.7849999999999, + 33.2294000000002, + 32.4679999999998, + 30.8670000000002, + 28.6507999999999, + 28.9099999999999, + 27.5983999999999, + 26.1619999999998, + 24.5563999999999, + 23.2328000000002, + 21.9484000000002, + 21.5902000000001, + 21.3346000000001, + 17.7031999999999, + 20.6111999999998, + 19.5545999999999, + 15.7375999999999, + 17.0720000000001, + 16.9517999999998, + 15.326, + 13.1817999999998, + 14.6925999999999, + 13.0859999999998, + 13.2754, + 10.8697999999999, + 11.248, + 7.3768, + 4.72339999999986, + 7.97899999999981, + 8.7503999999999, + 7.68119999999999, + 9.7199999999998, + 7.73919999999998, + 5.6224000000002, + 7.44560000000001, + 6.6601999999998, + 5.9058, + 4.00199999999995, + 4.51699999999983, + 4.68240000000014, + 3.86220000000003, + 5.13639999999987, + 5.98500000000013, + 2.47719999999981, + 2.61999999999989, + 1.62800000000016, + 4.65000000000009, + 0.225599999999758, + 0.831000000000131, + -0.359400000000278, + 1.27599999999984, + -2.92559999999958, + -0.0303999999996449, + 2.37079999999969, + -2.0033999999996, + 0.804600000000391, + 0.30199999999968, + 1.1247999999996, + -2.6880000000001, + 0.0321999999996478, + -1.18099999999959, + -3.9402, + -1.47940000000017, + -0.188400000000001, + -2.10720000000038, + -2.04159999999956, + -3.12880000000041, + -4.16160000000036, + -0.612799999999879, + -3.48719999999958, + -8.17900000000009, + 
-5.37780000000021, + -4.01379999999972, + -5.58259999999973, + -5.73719999999958, + -7.66799999999967, + -5.69520000000011, + -1.1247999999996, + -5.58520000000044, + -8.04560000000038, + -4.64840000000004, + -11.6468000000004, + -7.97519999999986, + -5.78300000000036, + -7.67420000000038, + -10.6328000000003, + -9.81720000000041, + ], + &[ + 1476.0, + 1449.6014, + 1423.5802, + 1397.7942, + 1372.3042, + 1347.2062, + 1321.8402, + 1297.2292, + 1272.9462, + 1248.9926, + 1225.3026, + 1201.4252, + 1178.0578, + 1155.6092, + 1132.626, + 1110.5568, + 1088.527, + 1066.5154, + 1045.1874, + 1024.3878, + 1003.37, + 982.1972, + 962.5728, + 942.1012, + 922.9668, + 903.292, + 884.0772, + 864.8578, + 846.6562, + 828.041, + 809.714, + 792.3112, + 775.1806, + 757.9854, + 740.656, + 724.346, + 707.5154, + 691.8378, + 675.7448, + 659.6722, + 645.5722, + 630.1462, + 614.4124, + 600.8728, + 585.898, + 572.408, + 558.4926, + 544.4938, + 531.6776, + 517.282, + 505.7704, + 493.1012, + 480.7388, + 467.6876, + 456.1872, + 445.5048, + 433.0214, + 420.806, + 411.409, + 400.4144, + 389.4294, + 379.2286, + 369.651, + 360.6156, + 350.337, + 342.083, + 332.1538, + 322.5094, + 315.01, + 305.6686, + 298.1678, + 287.8116, + 280.9978, + 271.9204, + 265.3286, + 257.5706, + 249.6014, + 242.544, + 235.5976, + 229.583, + 220.9438, + 214.672, + 208.2786, + 201.8628, + 195.1834, + 191.505, + 186.1816, + 178.5188, + 172.2294, + 167.8908, + 161.0194, + 158.052, + 151.4588, + 148.1596, + 143.4344, + 138.5238, + 133.13, + 127.6374, + 124.8162, + 118.7894, + 117.3984, + 114.6078, + 109.0858, + 105.1036, + 103.6258, + 98.6018000000004, + 95.7618000000002, + 93.5821999999998, + 88.5900000000001, + 86.9992000000002, + 82.8800000000001, + 80.4539999999997, + 74.6981999999998, + 74.3644000000004, + 73.2914000000001, + 65.5709999999999, + 66.9232000000002, + 65.1913999999997, + 62.5882000000001, + 61.5702000000001, + 55.7035999999998, + 56.1764000000003, + 52.7596000000003, + 53.0302000000001, + 49.0609999999997, + 48.4694, + 44.933, + 46.0474000000004, + 44.7165999999997, + 41.9416000000001, + 39.9207999999999, + 35.6328000000003, + 35.5276000000003, + 33.1934000000001, + 33.2371999999996, + 33.3864000000003, + 33.9228000000003, + 30.2371999999996, + 29.1373999999996, + 25.2272000000003, + 24.2942000000003, + 19.8338000000003, + 18.9005999999999, + 23.0907999999999, + 21.8544000000002, + 19.5176000000001, + 15.4147999999996, + 16.9314000000004, + 18.6737999999996, + 12.9877999999999, + 14.3688000000002, + 12.0447999999997, + 15.5219999999999, + 12.5299999999997, + 14.5940000000001, + 14.3131999999996, + 9.45499999999993, + 12.9441999999999, + 3.91139999999996, + 13.1373999999996, + 5.44720000000052, + 9.82779999999912, + 7.87279999999919, + 3.67760000000089, + 5.46980000000076, + 5.55099999999948, + 5.65979999999945, + 3.89439999999922, + 3.1275999999998, + 5.65140000000065, + 6.3062000000009, + 3.90799999999945, + 1.87060000000019, + 5.17020000000048, + 2.46680000000015, + 0.770000000000437, + -3.72340000000077, + 1.16400000000067, + 8.05340000000069, + 0.135399999999208, + 2.15940000000046, + 0.766999999999825, + 1.0594000000001, + 3.15500000000065, + -0.287399999999252, + 2.37219999999979, + -2.86620000000039, + -1.63199999999961, + -2.22979999999916, + -0.15519999999924, + -1.46039999999994, + -0.262199999999211, + -2.34460000000036, + -2.8078000000005, + -3.22179999999935, + -5.60159999999996, + -8.42200000000048, + -9.43740000000071, + 0.161799999999857, + -10.4755999999998, + -10.0823999999993, + ], + &[ + 2953.0, + 2900.4782, + 
2848.3568, + 2796.3666, + 2745.324, + 2694.9598, + 2644.648, + 2595.539, + 2546.1474, + 2498.2576, + 2450.8376, + 2403.6076, + 2357.451, + 2311.38, + 2266.4104, + 2221.5638, + 2176.9676, + 2134.193, + 2090.838, + 2048.8548, + 2007.018, + 1966.1742, + 1925.4482, + 1885.1294, + 1846.4776, + 1807.4044, + 1768.8724, + 1731.3732, + 1693.4304, + 1657.5326, + 1621.949, + 1586.5532, + 1551.7256, + 1517.6182, + 1483.5186, + 1450.4528, + 1417.865, + 1385.7164, + 1352.6828, + 1322.6708, + 1291.8312, + 1260.9036, + 1231.476, + 1201.8652, + 1173.6718, + 1145.757, + 1119.2072, + 1092.2828, + 1065.0434, + 1038.6264, + 1014.3192, + 988.5746, + 965.0816, + 940.1176, + 917.9796, + 894.5576, + 871.1858, + 849.9144, + 827.1142, + 805.0818, + 783.9664, + 763.9096, + 742.0816, + 724.3962, + 706.3454, + 688.018, + 667.4214, + 650.3106, + 633.0686, + 613.8094, + 597.818, + 581.4248, + 563.834, + 547.363, + 531.5066, + 520.455400000001, + 505.583199999999, + 488.366, + 476.480799999999, + 459.7682, + 450.0522, + 434.328799999999, + 423.952799999999, + 408.727000000001, + 399.079400000001, + 387.252200000001, + 373.987999999999, + 360.852000000001, + 351.6394, + 339.642, + 330.902400000001, + 322.661599999999, + 311.662200000001, + 301.3254, + 291.7484, + 279.939200000001, + 276.7508, + 263.215200000001, + 254.811400000001, + 245.5494, + 242.306399999999, + 234.8734, + 223.787200000001, + 217.7156, + 212.0196, + 200.793, + 195.9748, + 189.0702, + 182.449199999999, + 177.2772, + 170.2336, + 164.741, + 158.613600000001, + 155.311, + 147.5964, + 142.837, + 137.3724, + 132.0162, + 130.0424, + 121.9804, + 120.451800000001, + 114.8968, + 111.585999999999, + 105.933199999999, + 101.705, + 98.5141999999996, + 95.0488000000005, + 89.7880000000005, + 91.4750000000004, + 83.7764000000006, + 80.9698000000008, + 72.8574000000008, + 73.1615999999995, + 67.5838000000003, + 62.6263999999992, + 63.2638000000006, + 66.0977999999996, + 52.0843999999997, + 58.9956000000002, + 47.0912000000008, + 46.4956000000002, + 48.4383999999991, + 47.1082000000006, + 43.2392, + 37.2759999999998, + 40.0283999999992, + 35.1864000000005, + 35.8595999999998, + 32.0998, + 28.027, + 23.6694000000007, + 33.8266000000003, + 26.3736000000008, + 27.2008000000005, + 21.3245999999999, + 26.4115999999995, + 23.4521999999997, + 19.5013999999992, + 19.8513999999996, + 10.7492000000002, + 18.6424000000006, + 13.1265999999996, + 18.2436000000016, + 6.71860000000015, + 3.39459999999963, + 6.33759999999893, + 7.76719999999841, + 0.813999999998487, + 3.82819999999992, + 0.826199999999517, + 8.07440000000133, + -1.59080000000176, + 5.01780000000144, + 0.455399999998917, + -0.24199999999837, + 0.174800000000687, + -9.07640000000174, + -4.20160000000033, + -3.77520000000004, + -4.75179999999818, + -5.3724000000002, + -8.90680000000066, + -6.10239999999976, + -5.74120000000039, + -9.95339999999851, + -3.86339999999836, + -13.7304000000004, + -16.2710000000006, + -7.51359999999841, + -3.30679999999847, + -13.1339999999982, + -10.0551999999989, + -6.72019999999975, + -8.59660000000076, + -10.9307999999983, + -1.8775999999998, + -4.82259999999951, + -13.7788, + -21.6470000000008, + -10.6735999999983, + -15.7799999999988, + ], + &[ + 5907.5052, + 5802.2672, + 5697.347, + 5593.5794, + 5491.2622, + 5390.5514, + 5290.3376, + 5191.6952, + 5093.5988, + 4997.3552, + 4902.5972, + 4808.3082, + 4715.5646, + 4624.109, + 4533.8216, + 4444.4344, + 4356.3802, + 4269.2962, + 4183.3784, + 4098.292, + 4014.79, + 3932.4574, + 3850.6036, + 3771.2712, + 3691.7708, + 3615.099, + 3538.1858, + 
3463.4746, + 3388.8496, + 3315.6794, + 3244.5448, + 3173.7516, + 3103.3106, + 3033.6094, + 2966.5642, + 2900.794, + 2833.7256, + 2769.81, + 2707.3196, + 2644.0778, + 2583.9916, + 2523.4662, + 2464.124, + 2406.073, + 2347.0362, + 2292.1006, + 2238.1716, + 2182.7514, + 2128.4884, + 2077.1314, + 2025.037, + 1975.3756, + 1928.933, + 1879.311, + 1831.0006, + 1783.2144, + 1738.3096, + 1694.5144, + 1649.024, + 1606.847, + 1564.7528, + 1525.3168, + 1482.5372, + 1443.9668, + 1406.5074, + 1365.867, + 1329.2186, + 1295.4186, + 1257.9716, + 1225.339, + 1193.2972, + 1156.3578, + 1125.8686, + 1091.187, + 1061.4094, + 1029.4188, + 1000.9126, + 972.3272, + 944.004199999999, + 915.7592, + 889.965, + 862.834200000001, + 840.4254, + 812.598399999999, + 785.924200000001, + 763.050999999999, + 741.793799999999, + 721.466, + 699.040799999999, + 677.997200000002, + 649.866999999998, + 634.911800000002, + 609.8694, + 591.981599999999, + 570.2922, + 557.129199999999, + 538.3858, + 521.872599999999, + 502.951400000002, + 495.776399999999, + 475.171399999999, + 459.751, + 439.995200000001, + 426.708999999999, + 413.7016, + 402.3868, + 387.262599999998, + 372.0524, + 357.050999999999, + 342.5098, + 334.849200000001, + 322.529399999999, + 311.613799999999, + 295.848000000002, + 289.273000000001, + 274.093000000001, + 263.329600000001, + 251.389599999999, + 245.7392, + 231.9614, + 229.7952, + 217.155200000001, + 208.9588, + 199.016599999999, + 190.839199999999, + 180.6976, + 176.272799999999, + 166.976999999999, + 162.5252, + 151.196400000001, + 149.386999999999, + 133.981199999998, + 130.0586, + 130.164000000001, + 122.053400000001, + 110.7428, + 108.1276, + 106.232400000001, + 100.381600000001, + 98.7668000000012, + 86.6440000000002, + 79.9768000000004, + 82.4722000000002, + 68.7026000000005, + 70.1186000000016, + 71.9948000000004, + 58.998599999999, + 59.0492000000013, + 56.9818000000014, + 47.5338000000011, + 42.9928, + 51.1591999999982, + 37.2740000000013, + 42.7220000000016, + 31.3734000000004, + 26.8090000000011, + 25.8934000000008, + 26.5286000000015, + 29.5442000000003, + 19.3503999999994, + 26.0760000000009, + 17.9527999999991, + 14.8419999999969, + 10.4683999999979, + 8.65899999999965, + 9.86720000000059, + 4.34139999999752, + -0.907800000000861, + -3.32080000000133, + -0.936199999996461, + -11.9916000000012, + -8.87000000000262, + -6.33099999999831, + -11.3366000000024, + -15.9207999999999, + -9.34659999999712, + -15.5034000000014, + -19.2097999999969, + -15.357799999998, + -28.2235999999975, + -30.6898000000001, + -19.3271999999997, + -25.6083999999973, + -24.409599999999, + -13.6385999999984, + -33.4473999999973, + -32.6949999999997, + -28.9063999999998, + -31.7483999999968, + -32.2935999999972, + -35.8329999999987, + -47.620600000002, + -39.0855999999985, + -33.1434000000008, + -46.1371999999974, + -37.5892000000022, + -46.8164000000033, + -47.3142000000007, + -60.2914000000019, + -37.7575999999972, + ], + &[ + 11816.475, + 11605.0046, + 11395.3792, + 11188.7504, + 10984.1814, + 10782.0086, + 10582.0072, + 10384.503, + 10189.178, + 9996.2738, + 9806.0344, + 9617.9798, + 9431.394, + 9248.7784, + 9067.6894, + 8889.6824, + 8712.9134, + 8538.8624, + 8368.4944, + 8197.7956, + 8031.8916, + 7866.6316, + 7703.733, + 7544.5726, + 7386.204, + 7230.666, + 7077.8516, + 6926.7886, + 6778.6902, + 6631.9632, + 6487.304, + 6346.7486, + 6206.4408, + 6070.202, + 5935.2576, + 5799.924, + 5671.0324, + 5541.9788, + 5414.6112, + 5290.0274, + 5166.723, + 5047.6906, + 4929.162, + 4815.1406, + 4699.127, + 4588.5606, + 4477.7394, 
+ 4369.4014, + 4264.2728, + 4155.9224, + 4055.581, + 3955.505, + 3856.9618, + 3761.3828, + 3666.9702, + 3575.7764, + 3482.4132, + 3395.0186, + 3305.8852, + 3221.415, + 3138.6024, + 3056.296, + 2970.4494, + 2896.1526, + 2816.8008, + 2740.2156, + 2670.497, + 2594.1458, + 2527.111, + 2460.8168, + 2387.5114, + 2322.9498, + 2260.6752, + 2194.2686, + 2133.7792, + 2074.767, + 2015.204, + 1959.4226, + 1898.6502, + 1850.006, + 1792.849, + 1741.4838, + 1687.9778, + 1638.1322, + 1589.3266, + 1543.1394, + 1496.8266, + 1447.8516, + 1402.7354, + 1361.9606, + 1327.0692, + 1285.4106, + 1241.8112, + 1201.6726, + 1161.973, + 1130.261, + 1094.2036, + 1048.2036, + 1020.6436, + 990.901400000002, + 961.199800000002, + 924.769800000002, + 899.526400000002, + 872.346400000002, + 834.375, + 810.432000000001, + 780.659800000001, + 756.013800000001, + 733.479399999997, + 707.923999999999, + 673.858, + 652.222399999999, + 636.572399999997, + 615.738599999997, + 586.696400000001, + 564.147199999999, + 541.679600000003, + 523.943599999999, + 505.714599999999, + 475.729599999999, + 461.779600000002, + 449.750800000002, + 439.020799999998, + 412.7886, + 400.245600000002, + 383.188199999997, + 362.079599999997, + 357.533799999997, + 334.319000000003, + 327.553399999997, + 308.559399999998, + 291.270199999999, + 279.351999999999, + 271.791400000002, + 252.576999999997, + 247.482400000001, + 236.174800000001, + 218.774599999997, + 220.155200000001, + 208.794399999999, + 201.223599999998, + 182.995600000002, + 185.5268, + 164.547400000003, + 176.5962, + 150.689599999998, + 157.8004, + 138.378799999999, + 134.021200000003, + 117.614399999999, + 108.194000000003, + 97.0696000000025, + 89.6042000000016, + 95.6030000000028, + 84.7810000000027, + 72.635000000002, + 77.3482000000004, + 59.4907999999996, + 55.5875999999989, + 50.7346000000034, + 61.3916000000027, + 50.9149999999936, + 39.0384000000049, + 58.9395999999979, + 29.633600000001, + 28.2032000000036, + 26.0078000000067, + 17.0387999999948, + 9.22000000000116, + 13.8387999999977, + 8.07240000000456, + 14.1549999999988, + 15.3570000000036, + 3.42660000000615, + 6.24820000000182, + -2.96940000000177, + -8.79940000000352, + -5.97860000000219, + -14.4048000000039, + -3.4143999999942, + -13.0148000000045, + -11.6977999999945, + -25.7878000000055, + -22.3185999999987, + -24.409599999999, + -31.9756000000052, + -18.9722000000038, + -22.8678000000073, + -30.8972000000067, + -32.3715999999986, + -22.3907999999938, + -43.6720000000059, + -35.9038, + -39.7492000000057, + -54.1641999999993, + -45.2749999999942, + -42.2989999999991, + -44.1089999999967, + -64.3564000000042, + -49.9551999999967, + -42.6116000000038, + ], + &[ + 23634.0036, + 23210.8034, + 22792.4744, + 22379.1524, + 21969.7928, + 21565.326, + 21165.3532, + 20770.2806, + 20379.9892, + 19994.7098, + 19613.318, + 19236.799, + 18865.4382, + 18498.8244, + 18136.5138, + 17778.8668, + 17426.2344, + 17079.32, + 16734.778, + 16397.2418, + 16063.3324, + 15734.0232, + 15409.731, + 15088.728, + 14772.9896, + 14464.1402, + 14157.5588, + 13855.5958, + 13559.3296, + 13264.9096, + 12978.326, + 12692.0826, + 12413.8816, + 12137.3192, + 11870.2326, + 11602.5554, + 11340.3142, + 11079.613, + 10829.5908, + 10583.5466, + 10334.0344, + 10095.5072, + 9859.694, + 9625.2822, + 9395.7862, + 9174.0586, + 8957.3164, + 8738.064, + 8524.155, + 8313.7396, + 8116.9168, + 7913.542, + 7718.4778, + 7521.65, + 7335.5596, + 7154.2906, + 6968.7396, + 6786.3996, + 6613.236, + 6437.406, + 6270.6598, + 6107.7958, + 5945.7174, + 5787.6784, + 5635.5784, + 
5482.308, + 5337.9784, + 5190.0864, + 5045.9158, + 4919.1386, + 4771.817, + 4645.7742, + 4518.4774, + 4385.5454, + 4262.6622, + 4142.74679999999, + 4015.5318, + 3897.9276, + 3790.7764, + 3685.13800000001, + 3573.6274, + 3467.9706, + 3368.61079999999, + 3271.5202, + 3170.3848, + 3076.4656, + 2982.38400000001, + 2888.4664, + 2806.4868, + 2711.9564, + 2634.1434, + 2551.3204, + 2469.7662, + 2396.61139999999, + 2318.9902, + 2243.8658, + 2171.9246, + 2105.01360000001, + 2028.8536, + 1960.9952, + 1901.4096, + 1841.86079999999, + 1777.54700000001, + 1714.5802, + 1654.65059999999, + 1596.311, + 1546.2016, + 1492.3296, + 1433.8974, + 1383.84600000001, + 1339.4152, + 1293.5518, + 1245.8686, + 1193.50659999999, + 1162.27959999999, + 1107.19439999999, + 1069.18060000001, + 1035.09179999999, + 999.679000000004, + 957.679999999993, + 925.300199999998, + 888.099400000006, + 848.638600000006, + 818.156400000007, + 796.748399999997, + 752.139200000005, + 725.271200000003, + 692.216, + 671.633600000001, + 647.939799999993, + 621.670599999998, + 575.398799999995, + 561.226599999995, + 532.237999999998, + 521.787599999996, + 483.095799999996, + 467.049599999998, + 465.286399999997, + 415.548599999995, + 401.047399999996, + 380.607999999993, + 377.362599999993, + 347.258799999996, + 338.371599999999, + 310.096999999994, + 301.409199999995, + 276.280799999993, + 265.586800000005, + 258.994399999996, + 223.915999999997, + 215.925399999993, + 213.503800000006, + 191.045400000003, + 166.718200000003, + 166.259000000005, + 162.941200000001, + 148.829400000002, + 141.645999999993, + 123.535399999993, + 122.329800000007, + 89.473399999988, + 80.1962000000058, + 77.5457999999926, + 59.1056000000099, + 83.3509999999951, + 52.2906000000075, + 36.3979999999865, + 40.6558000000077, + 42.0003999999899, + 19.6630000000005, + 19.7153999999864, + -8.38539999999921, + -0.692799999989802, + 0.854800000000978, + 3.23219999999856, + -3.89040000000386, + -5.25880000001052, + -24.9052000000083, + -22.6837999999989, + -26.4286000000138, + -34.997000000003, + -37.0216000000073, + -43.430400000012, + -58.2390000000014, + -68.8034000000043, + -56.9245999999985, + -57.8583999999973, + -77.3097999999882, + -73.2793999999994, + -81.0738000000129, + -87.4530000000086, + -65.0254000000132, + -57.296399999992, + -96.2746000000043, + -103.25, + -96.081600000005, + -91.5542000000132, + -102.465200000006, + -107.688599999994, + -101.458000000013, + -109.715800000005, + ], + &[ + 47270.0, + 46423.3584, + 45585.7074, + 44757.152, + 43938.8416, + 43130.9514, + 42330.03, + 41540.407, + 40759.6348, + 39988.206, + 39226.5144, + 38473.2096, + 37729.795, + 36997.268, + 36272.6448, + 35558.665, + 34853.0248, + 34157.4472, + 33470.5204, + 32793.5742, + 32127.0194, + 31469.4182, + 30817.6136, + 30178.6968, + 29546.8908, + 28922.8544, + 28312.271, + 27707.0924, + 27114.0326, + 26526.692, + 25948.6336, + 25383.7826, + 24823.5998, + 24272.2974, + 23732.2572, + 23201.4976, + 22674.2796, + 22163.6336, + 21656.515, + 21161.7362, + 20669.9368, + 20189.4424, + 19717.3358, + 19256.3744, + 18795.9638, + 18352.197, + 17908.5738, + 17474.391, + 17052.918, + 16637.2236, + 16228.4602, + 15823.3474, + 15428.6974, + 15043.0284, + 14667.6278, + 14297.4588, + 13935.2882, + 13578.5402, + 13234.6032, + 12882.1578, + 12548.0728, + 12219.231, + 11898.0072, + 11587.2626, + 11279.9072, + 10973.5048, + 10678.5186, + 10392.4876, + 10105.2556, + 9825.766, + 9562.5444, + 9294.2222, + 9038.2352, + 8784.848, + 8533.2644, + 8301.7776, + 8058.30859999999, + 7822.94579999999, + 
7599.11319999999, + 7366.90779999999, + 7161.217, + 6957.53080000001, + 6736.212, + 6548.21220000001, + 6343.06839999999, + 6156.28719999999, + 5975.15419999999, + 5791.75719999999, + 5621.32019999999, + 5451.66, + 5287.61040000001, + 5118.09479999999, + 4957.288, + 4798.4246, + 4662.17559999999, + 4512.05900000001, + 4364.68539999999, + 4220.77720000001, + 4082.67259999999, + 3957.19519999999, + 3842.15779999999, + 3699.3328, + 3583.01180000001, + 3473.8964, + 3338.66639999999, + 3233.55559999999, + 3117.799, + 3008.111, + 2909.69140000001, + 2814.86499999999, + 2719.46119999999, + 2624.742, + 2532.46979999999, + 2444.7886, + 2370.1868, + 2272.45259999999, + 2196.19260000001, + 2117.90419999999, + 2023.2972, + 1969.76819999999, + 1885.58979999999, + 1833.2824, + 1733.91200000001, + 1682.54920000001, + 1604.57980000001, + 1556.11240000001, + 1491.3064, + 1421.71960000001, + 1371.22899999999, + 1322.1324, + 1264.7892, + 1196.23920000001, + 1143.8474, + 1088.67240000001, + 1073.60380000001, + 1023.11660000001, + 959.036400000012, + 927.433199999999, + 906.792799999996, + 853.433599999989, + 841.873800000001, + 791.1054, + 756.899999999994, + 704.343200000003, + 672.495599999995, + 622.790399999998, + 611.254799999995, + 567.283200000005, + 519.406599999988, + 519.188400000014, + 495.312800000014, + 451.350799999986, + 443.973399999988, + 431.882199999993, + 392.027000000002, + 380.924200000009, + 345.128999999986, + 298.901400000002, + 287.771999999997, + 272.625, + 247.253000000026, + 222.490600000019, + 223.590000000026, + 196.407599999977, + 176.425999999978, + 134.725199999986, + 132.4804, + 110.445599999977, + 86.7939999999944, + 56.7038000000175, + 64.915399999998, + 38.3726000000024, + 37.1606000000029, + 46.170999999973, + 49.1716000000015, + 15.3362000000197, + 6.71639999997569, + -34.8185999999987, + -39.4476000000141, + 12.6830000000191, + -12.3331999999937, + -50.6565999999875, + -59.9538000000175, + -65.1054000000004, + -70.7576000000117, + -106.325200000021, + -126.852200000023, + -110.227599999984, + -132.885999999999, + -113.897200000007, + -142.713800000027, + -151.145399999979, + -150.799200000009, + -177.756200000003, + -156.036399999983, + -182.735199999996, + -177.259399999981, + -198.663600000029, + -174.577600000019, + -193.84580000001, + ], + &[ + 94541.0, + 92848.811, + 91174.019, + 89517.558, + 87879.9705, + 86262.7565, + 84663.5125, + 83083.7435, + 81521.7865, + 79977.272, + 78455.9465, + 76950.219, + 75465.432, + 73994.152, + 72546.71, + 71115.2345, + 69705.6765, + 68314.937, + 66944.2705, + 65591.255, + 64252.9485, + 62938.016, + 61636.8225, + 60355.592, + 59092.789, + 57850.568, + 56624.518, + 55417.343, + 54231.1415, + 53067.387, + 51903.526, + 50774.649, + 49657.6415, + 48561.05, + 47475.7575, + 46410.159, + 45364.852, + 44327.053, + 43318.4005, + 42325.6165, + 41348.4595, + 40383.6265, + 39436.77, + 38509.502, + 37594.035, + 36695.939, + 35818.6895, + 34955.691, + 34115.8095, + 33293.949, + 32465.0775, + 31657.6715, + 30877.2585, + 30093.78, + 29351.3695, + 28594.1365, + 27872.115, + 27168.7465, + 26477.076, + 25774.541, + 25106.5375, + 24452.5135, + 23815.5125, + 23174.0655, + 22555.2685, + 21960.2065, + 21376.3555, + 20785.1925, + 20211.517, + 19657.0725, + 19141.6865, + 18579.737, + 18081.3955, + 17578.995, + 17073.44, + 16608.335, + 16119.911, + 15651.266, + 15194.583, + 14749.0495, + 14343.4835, + 13925.639, + 13504.509, + 13099.3885, + 12691.2855, + 12328.018, + 11969.0345, + 11596.5145, + 11245.6355, + 10917.6575, + 10580.9785, + 10277.8605, + 
9926.58100000001, + 9605.538, + 9300.42950000003, + 8989.97850000003, + 8728.73249999998, + 8448.3235, + 8175.31050000002, + 7898.98700000002, + 7629.79100000003, + 7413.76199999999, + 7149.92300000001, + 6921.12650000001, + 6677.1545, + 6443.28000000003, + 6278.23450000002, + 6014.20049999998, + 5791.20299999998, + 5605.78450000001, + 5438.48800000001, + 5234.2255, + 5059.6825, + 4887.43349999998, + 4682.935, + 4496.31099999999, + 4322.52250000002, + 4191.42499999999, + 4021.24200000003, + 3900.64799999999, + 3762.84250000003, + 3609.98050000001, + 3502.29599999997, + 3363.84250000003, + 3206.54849999998, + 3079.70000000001, + 2971.42300000001, + 2867.80349999998, + 2727.08100000001, + 2630.74900000001, + 2496.6165, + 2440.902, + 2356.19150000002, + 2235.58199999999, + 2120.54149999999, + 2012.25449999998, + 1933.35600000003, + 1820.93099999998, + 1761.54800000001, + 1663.09350000002, + 1578.84600000002, + 1509.48149999999, + 1427.3345, + 1379.56150000001, + 1306.68099999998, + 1212.63449999999, + 1084.17300000001, + 1124.16450000001, + 1060.69949999999, + 1007.48849999998, + 941.194499999983, + 879.880500000028, + 836.007500000007, + 782.802000000025, + 748.385499999975, + 647.991500000004, + 626.730500000005, + 570.776000000013, + 484.000500000024, + 513.98550000001, + 418.985499999952, + 386.996999999974, + 370.026500000036, + 355.496999999974, + 356.731499999994, + 255.92200000002, + 259.094000000041, + 205.434499999974, + 165.374500000034, + 197.347500000033, + 95.718499999959, + 67.6165000000037, + 54.6970000000438, + 31.7395000000251, + -15.8784999999916, + 8.42500000004657, + -26.3754999999655, + -118.425500000012, + -66.6629999999423, + -42.9745000000112, + -107.364999999991, + -189.839000000036, + -162.611499999999, + -164.964999999967, + -189.079999999958, + -223.931499999948, + -235.329999999958, + -269.639500000048, + -249.087999999989, + -206.475499999942, + -283.04449999996, + -290.667000000016, + -304.561499999953, + -336.784499999951, + -380.386500000022, + -283.280499999993, + -364.533000000054, + -389.059499999974, + -364.454000000027, + -415.748000000021, + -417.155000000028, + ], + &[ + 189083.0, + 185696.913, + 182348.774, + 179035.946, + 175762.762, + 172526.444, + 169329.754, + 166166.099, + 163043.269, + 159958.91, + 156907.912, + 153906.845, + 150924.199, + 147996.568, + 145093.457, + 142239.233, + 139421.475, + 136632.27, + 133889.588, + 131174.2, + 128511.619, + 125868.621, + 123265.385, + 120721.061, + 118181.769, + 115709.456, + 113252.446, + 110840.198, + 108465.099, + 106126.164, + 103823.469, + 101556.618, + 99308.004, + 97124.508, + 94937.803, + 92833.731, + 90745.061, + 88677.627, + 86617.47, + 84650.442, + 82697.833, + 80769.132, + 78879.629, + 77014.432, + 75215.626, + 73384.587, + 71652.482, + 69895.93, + 68209.301, + 66553.669, + 64921.981, + 63310.323, + 61742.115, + 60205.018, + 58698.658, + 57190.657, + 55760.865, + 54331.169, + 52908.167, + 51550.273, + 50225.254, + 48922.421, + 47614.533, + 46362.049, + 45098.569, + 43926.083, + 42736.03, + 41593.473, + 40425.26, + 39316.237, + 38243.651, + 37170.617, + 36114.609, + 35084.19, + 34117.233, + 33206.509, + 32231.505, + 31318.728, + 30403.404, + 29540.0550000001, + 28679.236, + 27825.862, + 26965.216, + 26179.148, + 25462.08, + 24645.952, + 23922.523, + 23198.144, + 22529.128, + 21762.4179999999, + 21134.779, + 20459.117, + 19840.818, + 19187.04, + 18636.3689999999, + 17982.831, + 17439.7389999999, + 16874.547, + 16358.2169999999, + 15835.684, + 15352.914, + 14823.681, + 14329.313, + 13816.897, + 
13342.874, + 12880.882, + 12491.648, + 12021.254, + 11625.392, + 11293.7610000001, + 10813.697, + 10456.209, + 10099.074, + 9755.39000000001, + 9393.18500000006, + 9047.57900000003, + 8657.98499999999, + 8395.85900000005, + 8033.0, + 7736.95900000003, + 7430.59699999995, + 7258.47699999996, + 6924.58200000005, + 6691.29399999999, + 6357.92500000005, + 6202.05700000003, + 5921.19700000004, + 5628.28399999999, + 5404.96799999999, + 5226.71100000001, + 4990.75600000005, + 4799.77399999998, + 4622.93099999998, + 4472.478, + 4171.78700000001, + 3957.46299999999, + 3868.95200000005, + 3691.14300000004, + 3474.63100000005, + 3341.67200000002, + 3109.14000000001, + 3071.97400000005, + 2796.40399999998, + 2756.17799999996, + 2611.46999999997, + 2471.93000000005, + 2382.26399999997, + 2209.22400000005, + 2142.28399999999, + 2013.96100000001, + 1911.18999999994, + 1818.27099999995, + 1668.47900000005, + 1519.65800000005, + 1469.67599999998, + 1367.13800000004, + 1248.52899999998, + 1181.23600000003, + 1022.71900000004, + 1088.20700000005, + 959.03600000008, + 876.095999999903, + 791.183999999892, + 703.337000000058, + 731.949999999953, + 586.86400000006, + 526.024999999907, + 323.004999999888, + 320.448000000091, + 340.672999999952, + 309.638999999966, + 216.601999999955, + 102.922999999952, + 19.2399999999907, + -0.114000000059605, + -32.6240000000689, + -89.3179999999702, + -153.497999999905, + -64.2970000000205, + -143.695999999996, + -259.497999999905, + -253.017999999924, + -213.948000000091, + -397.590000000084, + -434.006000000052, + -403.475000000093, + -297.958000000101, + -404.317000000039, + -528.898999999976, + -506.621000000043, + -513.205000000075, + -479.351000000024, + -596.139999999898, + -527.016999999993, + -664.681000000099, + -680.306000000099, + -704.050000000047, + -850.486000000034, + -757.43200000003, + -713.308999999892, + ], +]; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to write hll to bytes, err:{}", source))] + WriteHll { source: bytes::Error }, + + #[snafu(display("Failed to write hll to bytes, err:{}", source))] + ReadHll { source: bytes::Error }, +} + +pub type Result = std::result::Result; + +pub struct HyperLogLog { + alpha: f64, + p: u8, + m: usize, + M: Vec, + sip: SipHasher13, +} + +impl HyperLogLog { + pub fn new(error_rate: f64) -> Self { + Self::new_with_keys(error_rate, rand::random(), rand::random()) + } + + pub fn new_with_keys(error_rate: f64, key0: u64, key1: u64) -> Self { + assert!(error_rate > 0.0 && error_rate < 1.0); + let sr = 1.04 / error_rate; + let p = f64::ln(sr * sr).ceil() as u8; + assert!(p <= 64); + let alpha = Self::get_alpha(p); + let m = 1usize << p; + HyperLogLog { + alpha, + p, + m, + M: repeat(0u8).take(m).collect(), + sip: SipHasher13::new_with_keys(key0, key1), + } + } + + pub fn new_from_template(hll: &HyperLogLog) -> Self { + HyperLogLog { + alpha: hll.alpha, + p: hll.p, + m: hll.m, + M: repeat(0u8).take(hll.m).collect(), + sip: hll.sip, + } + } + + pub fn insert(&mut self, value: &V) { + let sip = &mut self.sip.clone(); + value.hash(sip); + let x = sip.finish(); + self.insert_by_hash_value(x); + } + + pub fn insert_by_hash_value(&mut self, x: u64) { + let j = x as usize & (self.m - 1); + let w = x >> self.p; + let rho = Self::get_rho(w, 64 - self.p); + let mjr = &mut self.M[j]; + if rho > *mjr { + *mjr = rho; + } + } + + pub fn len(&self) -> f64 { + let V = Self::vec_count_zero(&self.M); + if V > 0 { + let H = self.m as f64 * (self.m as f64 / V as f64).ln(); + if H <= Self::get_treshold(self.p) { + H + 
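+            // `H` above is the linear-counting estimate m * ln(m / V), used while
+            // some registers are still zero and the estimate stays below the
+            // precision-specific threshold; otherwise we fall back to the
+            // bias-corrected raw estimate computed by `ep()`.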
} else { + self.ep() + } + } else { + self.ep() + } + } + + pub fn is_empty(&self) -> bool { + self.len() == 0.0 + } + + pub fn merge(&mut self, src: &HyperLogLog) { + assert!(src.p == self.p); + assert!(src.m == self.m); + let sip1 = &mut src.sip.clone(); + let sip2 = &mut self.sip.clone(); + 42.hash(sip1); + 42.hash(sip2); + assert!(sip1.finish() == sip2.finish()); + for i in 0..self.m { + let (src_mir, mir) = (src.M[i], &mut self.M[i]); + if src_mir > *mir { + *mir = src_mir; + } + } + } + + pub fn clear(&mut self) { + self.M.iter_mut().all(|x| { + *x = 0; + true + }); + } + + pub fn write_to_buf(&self, buf: &mut B) -> Result<()> { + buf.write_f64(self.alpha).context(WriteHll)?; + buf.write_u8(self.p).context(WriteHll)?; + // self.m is the length of self.M + buf.write_u64(self.m as u64).context(WriteHll)?; + buf.write_slice(&self.M).context(WriteHll)?; + // Store keys of hasher + let (key0, key1) = self.sip.keys(); + buf.write_u64(key0).context(WriteHll)?; + buf.write_u64(key1).context(WriteHll) + } + + pub fn read_from_buf(buf: &mut B) -> Result { + let alpha = buf.read_f64().context(ReadHll)?; + let p = buf.read_u8().context(ReadHll)?; + let m = buf.read_u64().context(ReadHll)? as usize; + let mut m_buf = vec![0u8; m]; + buf.read_to_slice(&mut m_buf).context(ReadHll)?; + let key0 = buf.read_u64().context(ReadHll)?; + let key1 = buf.read_u64().context(ReadHll)?; + + Ok(HyperLogLog { + alpha, + p, + m, + M: m_buf, + sip: SipHasher13::new_with_keys(key0, key1), + }) + } + + fn get_treshold(p: u8) -> f64 { + TRESHOLD_DATA[p as usize] + } + + fn get_alpha(p: u8) -> f64 { + assert!((4..=16).contains(&p)); + match p { + 4 => 0.673, + 5 => 0.697, + 6 => 0.709, + _ => 0.7213 / (1.0 + 1.079 / (1usize << (p as usize)) as f64), + } + } + + fn bit_length(x: u64) -> u8 { + let mut bits: u8 = 0; + let mut xm = x; + while xm != 0 { + bits += 1; + xm >>= 1; + } + bits + } + + fn get_rho(w: u64, max_width: u8) -> u8 { + let rho = max_width - Self::bit_length(w) + 1; + assert!(rho > 0); + rho + } + + fn vec_count_zero(v: &[u8]) -> usize { + bytecount::count(v, 0) + } + + fn estimate_bias(E: f64, p: u8) -> f64 { + let bias_vector = BIAS_DATA[(p - 4) as usize]; + let nearest_neighbors = Self::get_nearest_neighbors(E, RAW_ESTIMATE_DATA[(p - 4) as usize]); + let sum = nearest_neighbors + .iter() + .fold(0.0, |acc, &neighbor| acc + bias_vector[neighbor]); + sum / nearest_neighbors.len() as f64 + } + + fn get_nearest_neighbors(E: f64, estimate_vector: &[f64]) -> Vec { + let ev_len = estimate_vector.len(); + let mut r: Vec<(f64, usize)> = repeat((0.0f64, 0usize)).take(ev_len).collect(); + for i in 0..ev_len { + let dr = E - estimate_vector[i]; + r[i] = (dr * dr, i); + } + r.sort_by(|a, b| { + if a < b { + Less + } else if a > b { + Greater + } else { + Equal + } + }); + r.truncate(6); + r.iter() + .map(|&ez| { + let (_, b) = ez; + b + }) + .collect() + } + + fn ep(&self) -> f64 { + let sum = self + .M + .iter() + .fold(0.0, |acc, &x| acc + 2.0f64.powi(-(x as i32))); + let E = self.alpha * (self.m * self.m) as f64 / sum; + if E <= (5 * self.m) as f64 { + E - Self::estimate_bias(E, self.p) + } else { + E + } + } +} + +#[test] +fn hyperloglog_test_simple() { + let mut hll = HyperLogLog::new(0.00408); + let keys = ["test1", "test2", "test3", "test2", "test2", "test2"]; + for k in &keys { + hll.insert(k); + } + assert!((hll.len().round() - 3.0).abs() < std::f64::EPSILON); + assert!(!hll.is_empty()); + hll.clear(); + assert!(hll.is_empty()); + assert!(hll.len() == 0.0); +} + +#[test] +fn hyperloglog_test_merge() { + 
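+    // Merging takes the element-wise max of the two register arrays, so the
+    // merged sketch estimates the size of the union: {test1, test2, test3}
+    // from `hll` plus {test3, test4, test1} from `hll2` gives 4 distinct keys.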
let mut hll = HyperLogLog::new(0.00408); + let keys = ["test1", "test2", "test3", "test2", "test2", "test2"]; + for k in &keys { + hll.insert(k); + } + assert!((hll.len().round() - 3.0).abs() < std::f64::EPSILON); + + let mut hll2 = HyperLogLog::new_from_template(&hll); + let keys2 = ["test3", "test4", "test4", "test4", "test4", "test1"]; + for k in &keys2 { + hll2.insert(k); + } + assert!((hll2.len().round() - 3.0).abs() < std::f64::EPSILON); + + hll.merge(&hll2); + assert!((hll.len().round() - 4.0).abs() < std::f64::EPSILON); +} + +#[test] +fn hyperloglog_test_write_read() { + let mut hll = HyperLogLog::new(0.00408); + hll.insert(&123); + + let mut write_buf = Vec::new(); + hll.write_to_buf(&mut write_buf).unwrap(); + + let mut buf = &write_buf[..]; + let hll2 = HyperLogLog::read_from_buf(&mut buf).unwrap(); + + let error_margin = f64::EPSILON; + assert!((hll.alpha - hll2.alpha).abs() < error_margin); + assert_eq!(hll.p, hll2.p); + assert_eq!(hll.m, hll2.m); + assert_eq!(hll.M, hll2.M); + assert_eq!(hll.sip.keys(), hll2.sip.keys()); +} diff --git a/components/skiplist/Cargo.toml b/components/skiplist/Cargo.toml new file mode 100644 index 0000000000..f56e48d122 --- /dev/null +++ b/components/skiplist/Cargo.toml @@ -0,0 +1,21 @@ +[package] +name = "skiplist" +version = "0.1.0" +authors = ["Jay Lee "] +edition = "2018" + +[dependencies] +rand = "0.7" +bytes = "1.0" +arena = { path = "../arena" } + +[dev-dependencies] +yatp = { git = "https://github.com/tikv/yatp.git", rev = "4b71f8abd86890f0d1e95778c2b6bf5a9ee4c502" } +criterion = "0.3" + +# [target.'cfg(not(target_env = "msvc"))'.dev-dependencies] +# tikv-jemallocator = "0.4.0" + +[[bench]] +name = "bench" +harness = false diff --git a/components/skiplist/benches/bench.rs b/components/skiplist/benches/bench.rs new file mode 100644 index 0000000000..4744bb558c --- /dev/null +++ b/components/skiplist/benches/bench.rs @@ -0,0 +1,181 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
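+
+// The benchmarks below compare the concurrent skiplist against an
+// Arc<Mutex<HashMap>> under a mixed workload: each benchmark spawns a background
+// thread that keeps issuing random operations while criterion measures the
+// foreground thread. The `frac` parameter (0..=10) controls the read ratio; an
+// operation is a `get` with probability roughly frac/11, otherwise a `put`.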
+ +use std::{ + collections::*, + sync::{atomic::*, *}, + thread, +}; + +use arena::MonoIncArena; +use bytes::*; +use criterion::*; +use rand::prelude::*; +use skiplist::*; + +// #[cfg(not(target_env = "msvc"))] +// use tikv_jemallocator::Jemalloc; + +// #[cfg(not(target_env = "msvc"))] +// #[global_allocator] +// static GLOBAL: Jemalloc = Jemalloc; + +fn skiplist_round( + l: &Skiplist, + case: &(Bytes, bool), + exp: &Bytes, +) { + if case.1 { + if let Some(v) = l.get(&case.0) { + assert_eq!(v, exp); + } + } else { + l.put(&case.0, exp); + } +} + +fn append_ts(key: &mut BytesMut, ts: u64) { + key.put_u64(ts); +} + +fn random_key(rng: &mut ThreadRng) -> Bytes { + let mut key = BytesMut::with_capacity(16); + unsafe { + rng.fill_bytes(&mut *(&mut key.chunk_mut()[..8] as *mut _ as *mut [u8])); + key.advance_mut(8); + } + append_ts(&mut key, 0); + key.freeze() +} + +fn bench_read_write_skiplist_frac(b: &mut Bencher<'_>, frac: &usize) { + let frac = *frac; + let value = Bytes::from_static(b"00123"); + let comp = FixedLengthSuffixComparator::new(8); + let arena = MonoIncArena::new(1 << 10); + let list = Skiplist::with_arena(comp, arena); + let l = list.clone(); + let stop = Arc::new(AtomicBool::new(false)); + let s = stop.clone(); + let v = value.clone(); + let handle = thread::spawn(move || { + let mut rng = rand::thread_rng(); + while !s.load(Ordering::SeqCst) { + let key = random_key(&mut rng); + let case = (key, frac > rng.gen_range(0, 11)); + skiplist_round(&l, &case, &v); + } + }); + let mut rng = rand::thread_rng(); + b.iter_batched_ref( + || (random_key(&mut rng), frac > rng.gen_range(0, 11)), + |case| skiplist_round(&list, case, &value), + BatchSize::SmallInput, + ); + stop.store(true, Ordering::SeqCst); + handle.join().unwrap(); +} + +fn bench_read_write_skiplist(c: &mut Criterion) { + let mut group = c.benchmark_group("skiplist_read_write"); + for i in 0..=10 { + group.bench_with_input( + BenchmarkId::from_parameter(i), + &i, + bench_read_write_skiplist_frac, + ); + } + group.finish(); +} + +fn map_round(m: &Mutex>, case: &(Bytes, bool), exp: &Bytes) { + if case.1 { + let rm = m.lock().unwrap(); + let value = rm.get(&case.0); + if let Some(v) = value { + assert_eq!(v, exp); + } + } else { + let mut rm = m.lock().unwrap(); + rm.insert(case.0.clone(), exp.clone()); + } +} + +fn bench_read_write_map_frac(b: &mut Bencher<'_>, frac: &usize) { + let frac = *frac; + let value = Bytes::from_static(b"00123"); + let map = Arc::new(Mutex::new(HashMap::with_capacity(512 << 10))); + let map_in_thread = map.clone(); + let stop = Arc::new(AtomicBool::new(false)); + let thread_stop = stop.clone(); + + let v = value.clone(); + let handle = thread::spawn(move || { + let mut rng = rand::thread_rng(); + while !thread_stop.load(Ordering::SeqCst) { + let f = rng.gen_range(0, 11); + let case = (random_key(&mut rng), f < frac); + map_round(&map_in_thread, &case, &v); + } + }); + let mut rng = rand::thread_rng(); + b.iter_batched_ref( + || { + let f = rng.gen_range(0, 11); + (random_key(&mut rng), f < frac) + }, + |case| map_round(&map, case, &value), + BatchSize::SmallInput, + ); + stop.store(true, Ordering::SeqCst); + handle.join().unwrap(); +} + +fn bench_read_write_map(c: &mut Criterion) { + let mut group = c.benchmark_group("map_read_write"); + for i in 0..=10 { + group.bench_with_input( + BenchmarkId::from_parameter(i), + &i, + bench_read_write_map_frac, + ); + } + group.finish(); +} + +fn bench_write_skiplist(c: &mut Criterion) { + let comp = FixedLengthSuffixComparator::new(8); + let arena = 
MonoIncArena::new(1 << 10); + let list = Skiplist::with_arena(comp, arena); + let value = Bytes::from_static(b"00123"); + let l = list.clone(); + let stop = Arc::new(AtomicBool::new(false)); + let s = stop.clone(); + let v = value.clone(); + let handle = thread::spawn(move || { + let mut rng = rand::thread_rng(); + while !s.load(Ordering::SeqCst) { + let case = (random_key(&mut rng), false); + skiplist_round(&l, &case, &v); + } + }); + let mut rng = rand::thread_rng(); + c.bench_function("skiplist_write", |b| { + b.iter_batched( + || random_key(&mut rng), + |key| { + list.put(&key, &value); + }, + BatchSize::SmallInput, + ) + }); + stop.store(true, Ordering::SeqCst); + handle.join().unwrap(); +} + +criterion_group!( + benches, + bench_read_write_skiplist, + bench_read_write_map, + bench_write_skiplist +); +criterion_main!(benches); diff --git a/components/skiplist/src/key.rs b/components/skiplist/src/key.rs new file mode 100644 index 0000000000..297e4e446d --- /dev/null +++ b/components/skiplist/src/key.rs @@ -0,0 +1,55 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::cmp::Ordering; + +use bytes::Bytes; + +pub trait KeyComparator: Clone { + fn compare_key(&self, lhs: &[u8], rhs: &[u8]) -> Ordering; + fn same_key(&self, lhs: &[u8], rhs: &[u8]) -> bool; +} + +#[derive(Default, Debug, Clone, Copy)] +pub struct FixedLengthSuffixComparator { + len: usize, +} + +impl FixedLengthSuffixComparator { + pub const fn new(len: usize) -> FixedLengthSuffixComparator { + FixedLengthSuffixComparator { len } + } +} + +impl KeyComparator for FixedLengthSuffixComparator { + #[inline] + fn compare_key(&self, lhs: &[u8], rhs: &[u8]) -> Ordering { + if lhs.len() < self.len { + panic!( + "cannot compare with suffix {}: {:?}", + self.len, + Bytes::copy_from_slice(lhs) + ); + } + if rhs.len() < self.len { + panic!( + "cannot compare with suffix {}: {:?}", + self.len, + Bytes::copy_from_slice(rhs) + ); + } + let (l_p, l_s) = lhs.split_at(lhs.len() - self.len); + let (r_p, r_s) = rhs.split_at(rhs.len() - self.len); + let res = l_p.cmp(r_p); + match res { + Ordering::Greater | Ordering::Less => res, + Ordering::Equal => l_s.cmp(r_s), + } + } + + #[inline] + fn same_key(&self, lhs: &[u8], rhs: &[u8]) -> bool { + let (l_p, _) = lhs.split_at(lhs.len() - self.len); + let (r_p, _) = rhs.split_at(rhs.len() - self.len); + l_p == r_p + } +} diff --git a/components/skiplist/src/lib.rs b/components/skiplist/src/lib.rs new file mode 100644 index 0000000000..ca7d13b1a8 --- /dev/null +++ b/components/skiplist/src/lib.rs @@ -0,0 +1,21 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Forked from +//! +//! Differences: +//! 1. Inline key and value in Node, so all memory of skiplist is allocated from +//! arena. Drawback: we have to copy the content of key/value +//! 2. Tower stores pointer to Node instead of offset, so we can use other arena +//! implementation +//! 3. Use [ArenaSlice] to replace Bytes +//! 4. impl Send/Sync for the iterator + +mod key; +mod list; +mod slice; + +const MAX_HEIGHT: usize = 20; + +pub use key::{FixedLengthSuffixComparator, KeyComparator}; +pub use list::{IterRef, Skiplist}; +pub use slice::ArenaSlice; diff --git a/components/skiplist/src/list.rs b/components/skiplist/src/list.rs new file mode 100644 index 0000000000..ae84d2c3e7 --- /dev/null +++ b/components/skiplist/src/list.rs @@ -0,0 +1,698 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
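+
+// A minimal usage sketch (illustrative only; the arena capacity and keys are
+// arbitrary, and keys must be at least as long as the comparator's fixed suffix):
+//
+//     let arena = MonoIncArena::new(1 << 20);
+//     let list = Skiplist::with_arena(FixedLengthSuffixComparator::new(8), arena);
+//     list.put(b"key-0000000042", b"value");
+//     assert_eq!(list.get(b"key-0000000042"), Some(&b"value"[..]));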
+
+use std::{
+    alloc::Layout,
+    convert::TryInto,
+    mem, ptr,
+    ptr::NonNull,
+    slice,
+    sync::{
+        atomic::{AtomicPtr, AtomicUsize, Ordering},
+        Arc,
+    },
+};
+
+use arena::{Arena, BasicStats};
+use rand::Rng;
+
+use super::{slice::ArenaSlice, KeyComparator, MAX_HEIGHT};
+
+const HEIGHT_INCREASE: u32 = u32::MAX / 3;
+
+type KeySize = u16;
+type ValueSize = u32;
+
+/// The layout of Node
+/// 1. height: usize
+/// 2. tower: AtomicPtr<Node> x (height + 1)
+/// 3. key_size: KeySize
+/// 4. key: u8 x key_size
+/// 5. value_size: ValueSize
+/// 6. value: u8 x value_size
+// Uses C layout to make sure tower is at the bottom
+#[derive(Debug)]
+#[repr(C)]
+pub struct Node {
+    /// Height of node. Different from badger, the valid range of tower is
+    /// [0, height]
+    height: usize,
+    /// The node tower
+    ///
+    /// Only the [0, height] part is utilized to store node pointers; the key
+    /// and value blocks start from tower[height + 1]
+    tower: [AtomicPtr<Node>; MAX_HEIGHT],
+}
+
+impl Node {
+    /// Allocate a new node from the arena, and copy the content of key/value
+    /// into the node
+    /// # Safety
+    /// - from_size_align_unchecked: the align is obtained from [mem::align_of].
+    /// # Notice
+    /// This will only allocate the *exact* amount of memory needed within the
+    /// given height.
+    fn alloc<A>(arena: &A, key: &[u8], value: &[u8], height: usize) -> *mut Node
+    where
+        A: Arena<Stats = BasicStats>,
+    {
+        // Calculate node size to alloc
+        let size = mem::size_of::<Node>();
+        // Not all values in Node::tower will be utilized.
+        let not_used = (MAX_HEIGHT - height - 1) * mem::size_of::<AtomicPtr<Node>>();
+        // Space to store key/value: (key size) + key + (value size) + value
+        let kv_used =
+            mem::size_of::<KeySize>() + key.len() + mem::size_of::<ValueSize>() + value.len();
+        // UB in fact: the `not_used` size is able to be accessed in a "safe" way.
+        // It is guaranteed by the user not to use that memory.
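+        // For example (assuming 8-byte pointers): with MAX_HEIGHT = 20, height = 2,
+        // a 5-byte key and a 7-byte value, size_of::<Node>() = 8 + 20 * 8 = 168,
+        // not_used = (20 - 2 - 1) * 8 = 136 and kv_used = 2 + 5 + 4 + 7 = 18,
+        // so only 168 - 136 + 18 = 50 bytes are requested from the arena.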
+ let alloc_size = size - not_used + kv_used; + let layout = + unsafe { Layout::from_size_align_unchecked(alloc_size, mem::align_of::()) }; + let node_ptr = arena.alloc(layout).as_ptr() as *mut Node; + unsafe { + let node = &mut *node_ptr; + node.height = height; + ptr::write_bytes(node.tower.as_mut_ptr(), 0, height + 1); + Self::init_key_value(node, key, value); + + node_ptr + } + } + + /// Fetch next node ptr in given height + fn next_ptr(&self, height: usize) -> *mut Node { + self.tower[height].load(Ordering::SeqCst) + } + + /// Get key + /// + /// REQUIRE: This Node is created via `Node::alloc()` + unsafe fn key(&self) -> &[u8] { + let (key_block, key_size) = self.load_key_size(); + + slice::from_raw_parts(key_block, key_size as usize) + } + + /// Get value + /// + /// REQUIRE: This Node is created via `Node::alloc()` + unsafe fn value(&self) -> &[u8] { + let (key_block, key_size) = self.load_key_size(); + let (value_block, value_size) = self.load_value_size(key_block, key_size); + + slice::from_raw_parts(value_block, value_size as usize) + } + + /// Set key and value parts of Node during creating Node + /// + /// Will copy the content of key and value to the Node + /// + /// REQUIRE: This Node is created via Arena and node.tower and node.height + /// is already set to correct value + /// Panic: The size of key/value must less than max value of + /// KeySize/ValueSize (u16/u32), otherwise this function will panic + unsafe fn init_key_value(node: &mut Node, key: &[u8], value: &[u8]) { + let key_block = node.tower.as_mut_ptr().add(node.height + 1) as *mut u8; + let key_size: KeySize = key.len().try_into().unwrap(); + let key_size_bytes = key_size.to_ne_bytes(); + + ptr::copy_nonoverlapping( + key_size_bytes.as_ptr(), + key_block, + mem::size_of::(), + ); + let key_block = key_block.add(mem::size_of::()); + ptr::copy_nonoverlapping(key.as_ptr(), key_block, key.len()); + + let value_block = key_block.add(key.len()); + let value_size: ValueSize = value.len().try_into().unwrap(); + let value_size_bytes = value_size.to_ne_bytes(); + + ptr::copy_nonoverlapping( + value_size_bytes.as_ptr(), + value_block, + mem::size_of::(), + ); + let value_block = value_block.add(mem::size_of::()); + ptr::copy_nonoverlapping(value.as_ptr(), value_block, value.len()); + } + + /// Load key pointer and size of key + /// + /// REQUIRE: This Node is created via `Node::alloc()` + unsafe fn load_key_size(&self) -> (*const u8, KeySize) { + let tower = self.tower.as_ptr(); + // Move to key block + let key_block = tower.add(self.height + 1) as *const u8; + // Load key size from key block + let key_size = u16::from_ne_bytes(*(key_block as *const [u8; mem::size_of::()])); + // Move key block to the start of key + let key_block = key_block.add(mem::size_of::()); + + (key_block, key_size) + } + + /// Load value pointer and size of value + /// + /// Given key_block and key_size returned from `load_key_size()`, loads + /// value pointer and value size + /// + /// REQUIRE: This Node is created via `Node::alloc()` + unsafe fn load_value_size( + &self, + key_block: *const u8, + key_size: KeySize, + ) -> (*const u8, ValueSize) { + // Move to value block + let value_block = key_block.add(key_size as usize); + // Load value size from value block + let value_size = + u32::from_ne_bytes(*(value_block as *const [u8; mem::size_of::()])); + // Move value block to the start of value + let value_block = value_block.add(mem::size_of::()); + + (value_block, value_size) + } + + /// Get key with arena + /// + /// REQUIRE: This Node is 
created via `Node::alloc()` + unsafe fn key_with_arena(&self, arena: A) -> ArenaSlice + where + A: Arena, + { + let (key_block, key_size) = self.load_key_size(); + + ArenaSlice::from_raw_parts(arena, key_block, key_size as usize) + } + + /// Get value with arena + /// + /// REQUIRE: This Node is created via `Node::alloc()` + unsafe fn value_with_arena(&self, arena: A) -> ArenaSlice + where + A: Arena, + { + let (key_block, key_size) = self.load_key_size(); + let (value_block, value_size) = self.load_value_size(key_block, key_size); + + ArenaSlice::from_raw_parts(arena, value_block, value_size as usize) + } +} + +struct SkiplistCore> { + height: AtomicUsize, + head: NonNull, + arena: A, +} + +/// FIXME(yingwen): Modify the skiplist to support arena that supports growth, +/// otherwise it is hard to avoid memory usage not out of the arena capacity +#[derive(Clone)] +pub struct Skiplist + Clone> { + core: Arc>, + c: C, +} + +impl + Clone> Skiplist { + pub fn with_arena(c: C, arena: A) -> Skiplist { + let head = Node::alloc(&arena, &[], &[], MAX_HEIGHT - 1); + let head = unsafe { NonNull::new_unchecked(head) }; + Skiplist { + core: Arc::new(SkiplistCore { + height: AtomicUsize::new(0), + head, + arena, + }), + c, + } + } + + fn random_height(&self) -> usize { + let mut rng = rand::thread_rng(); + for h in 0..(MAX_HEIGHT - 1) { + if !rng.gen_ratio(HEIGHT_INCREASE, u32::MAX) { + return h; + } + } + MAX_HEIGHT - 1 + } + + fn height(&self) -> usize { + self.core.height.load(Ordering::SeqCst) + } +} + +impl + Clone> Skiplist { + /// Finds the node near to key. + /// + /// If less=true, it finds rightmost node such that node.key < key (if + /// allow_equal=false) or node.key <= key (if allow_equal=true). + /// If less=false, it finds leftmost node such that node.key > key (if + /// allowEqual=false) or node.key >= key (if allow_equal=true). + /// Returns the node found. + unsafe fn find_near(&self, key: &[u8], less: bool, allow_equal: bool) -> *const Node { + let mut cursor: *const Node = self.core.head.as_ptr(); + let mut level = self.height(); + loop { + // Assume cursor.key < key + let next_ptr = (&*cursor).next_ptr(level); + if next_ptr.is_null() { + // cursor.key < key < END OF LIST + if level > 0 { + // Can descend further to iterate closer to the end + level -= 1; + continue; + } + // 1. Level=0. Cannot descend further. Let's return something that makes sense + // 2. Try to return cursor. Make sure it is not a head node + if !less || cursor == self.core.head.as_ptr() { + return ptr::null(); + } + return cursor; + } + + let next = &*next_ptr; + let res = self.c.compare_key(key, next.key()); + if res == std::cmp::Ordering::Greater { + // cursor.key < next.key < key. We can continue to move right + cursor = next_ptr; + continue; + } + if res == std::cmp::Ordering::Equal { + // cursor.key < key == next.key + if allow_equal { + return next; + } + if !less { + // We want >, so go to base level to grab the next bigger node + return next.next_ptr(0); + } + // We want <. If not base level, we should go closer in the next level. + if level > 0 { + level -= 1; + continue; + } + // On base level. Return cursor + if cursor == self.core.head.as_ptr() { + return ptr::null(); + } + return cursor; + } + // cursor.key < key < next.key + if level > 0 { + level -= 1; + continue; + } + // At base level. Need to return something + if !less { + return next; + } + // Try to return cursor. 
Make sure it is not a head node + if cursor == self.core.head.as_ptr() { + return ptr::null(); + } + return cursor; + } + } + + /// Returns (out_before, out_after) with out_before.key <= key <= + /// out_after.key + /// + /// The input `before` tells us where to start looking + /// If we found a node with the same key, then we return out_before = + /// out_after. Otherwise, out_before.key < key < out_after.key + unsafe fn find_splice_for_level( + &self, + key: &[u8], + mut before: *mut Node, + level: usize, + ) -> (*mut Node, *mut Node) { + loop { + // Assume before.key < key + let next_ptr = (&*before).next_ptr(level); + if next_ptr.is_null() { + return (before, ptr::null_mut()); + } + let next_node = &*next_ptr; + match self.c.compare_key(key, next_node.key()) { + // Equality case + std::cmp::Ordering::Equal => return (next_ptr, next_ptr), + // before.key < key < next.key. We are done for this level + std::cmp::Ordering::Less => return (before, next_ptr), + // Keep moving right on this level + _ => before = next_ptr, + } + } + } + + /// Put the key-value into the skiplist if the key does not exists. + /// + /// The content of key and value will be copied into the list. Returns true + /// if the node is inserted, otherwise return false (key is duplicated) + /// + /// Panic: The skiplist will panic if the allocated memory + /// out of the capacity + pub fn put(&self, key: &[u8], value: &[u8]) -> bool { + let mut list_height = self.height(); + let mut prev = [ptr::null_mut(); MAX_HEIGHT + 1]; + let mut next = [ptr::null_mut(); MAX_HEIGHT + 1]; + prev[list_height + 1] = self.core.head.as_ptr(); + // Recompute splice levels + for i in (0..=list_height).rev() { + // Use higher level to speed up for current level + let (p, n) = unsafe { self.find_splice_for_level(key, prev[i + 1], i) }; + prev[i] = p; + next[i] = n; + if p == n { + // Key already exists + return false; + } + } + + // Create a new node + let height = self.random_height(); + let node_ptr = Node::alloc(&self.core.arena, key, value, height); + + // Try to increase skiplist height via CAS + while height > list_height { + match self.core.height.compare_exchange_weak( + list_height, + height, + Ordering::SeqCst, + Ordering::SeqCst, + ) { + // Successfully increased skiplist height + Ok(_) => break, + Err(h) => list_height = h, + } + } + + // We always insert from the base level and up. After you add a node in base + // leve, we cannot create a node in the level above because it would + // have discovered the node in the base level + let x: &mut Node = unsafe { &mut *node_ptr }; + for i in 0..=height { + loop { + if prev[i].is_null() { + // This cannot happen in base level + assert!(i > 1); + // We haven't computed prev, next for this level because height exceeds old + // list_height. For these levels, we expect the lists to be + // sparse, so we can just search from head. + let (p, n) = + unsafe { self.find_splice_for_level(x.key(), self.core.head.as_ptr(), i) }; + prev[i] = p; + next[i] = n; + // Someone adds the exact same key before we are able to do so. This can only + // happen on the base level. But we know we are not on the + // base level. + assert_ne!(p, n); + } + x.tower[i].store(next[i], Ordering::SeqCst); + match unsafe { &*prev[i] }.tower[i].compare_exchange( + next[i], + node_ptr, + Ordering::SeqCst, + Ordering::SeqCst, + ) { + // Managed to insert x between prev[i] and next[i]. Go to the next level. + Ok(_) => break, + Err(_) => { + // CAS failed. We need to recompute prev and next. 
+ // It is unlikely to be helpful to try to use a different level as we redo + // the search, because it is unlikely that lots of + // nodes are inserted between prev[i] and next[i]. + let (p, n) = unsafe { self.find_splice_for_level(x.key(), prev[i], i) }; + if p == n { + assert_eq!(i, 0); + return false; + } + prev[i] = p; + next[i] = n; + } + } + } + } + true + } + + /// Returns if the skiplist is empty + pub fn is_empty(&self) -> bool { + let node = self.core.head.as_ptr(); + let next_ptr = unsafe { (&*node).next_ptr(0) }; + next_ptr.is_null() + } + + /// Returns len of the skiplist + pub fn len(&self) -> usize { + let mut node = self.core.head.as_ptr(); + let mut count = 0; + loop { + let next_ptr = unsafe { (&*node).next_ptr(0) }; + if !next_ptr.is_null() { + count += 1; + node = next_ptr; + continue; + } + return count; + } + } + + /// Returns the last element. If head (empty list), we return null. All the + /// find functions will NEVER return the head nodes + fn find_last(&self) -> *const Node { + let mut node = self.core.head.as_ptr(); + let mut level = self.height(); + loop { + let next_ptr = unsafe { (&*node).next_ptr(level) }; + if !next_ptr.is_null() { + node = next_ptr; + continue; + } + // next is null + if level == 0 { + if node == self.core.head.as_ptr() { + return ptr::null(); + } + return node; + } + level -= 1; + } + } + + /// Gets the value associated with the key. It returns a valid value if it + /// finds equal or earlier version of the same key. + pub fn get(&self, key: &[u8]) -> Option<&[u8]> { + if let Some((_, value)) = self.get_with_key(key) { + Some(value) + } else { + None + } + } + + /// Gets the key and value associated with the key. It returns a valid value + /// if it finds equal or earlier version of the same key. + pub fn get_with_key(&self, key: &[u8]) -> Option<(&[u8], &[u8])> { + // Find greater or equal + let node = unsafe { self.find_near(key, false, true) }; + if node.is_null() { + return None; + } + if self.c.same_key(unsafe { (*node).key() }, key) { + return Some(unsafe { ((*node).key(), (*node).value()) }); + } + None + } + + /// Returns a skiplist iterator + pub fn iter_ref(&self) -> IterRef<&Skiplist, C, A> { + IterRef { + list: self, + cursor: ptr::null(), + _key_cmp: std::marker::PhantomData, + _arena: std::marker::PhantomData, + } + } + + /// Returns a skiplist iterator + pub fn iter(&self) -> IterRef, C, A> { + IterRef { + list: self.clone(), + cursor: ptr::null(), + _key_cmp: std::marker::PhantomData, + _arena: std::marker::PhantomData, + } + } + + /// Consider the total bytes allocated by the arena (not the bytes used). 
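+    ///
+    /// Note that the result is truncated to `u32`, so an arena larger than
+    /// 4 GiB would not be reported accurately.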
+ pub fn mem_size(&self) -> u32 { + self.core.arena.stats().bytes_allocated() as u32 + } +} + +impl + Clone> AsRef> for Skiplist { + fn as_ref(&self) -> &Skiplist { + self + } +} + +unsafe impl + Clone + Send> Send for Skiplist {} +unsafe impl + Clone + Sync> Sync for Skiplist {} + +pub struct IterRef +where + T: AsRef>, + A: Arena + Clone, +{ + list: T, + cursor: *const Node, + _key_cmp: std::marker::PhantomData, + _arena: std::marker::PhantomData, +} + +impl>, C: KeyComparator, A: Arena + Clone> + IterRef +{ + pub fn valid(&self) -> bool { + !self.cursor.is_null() + } + + pub fn key(&self) -> &[u8] { + assert!(self.valid()); + unsafe { (*self.cursor).key() } + } + + pub fn value(&self) -> &[u8] { + assert!(self.valid()); + unsafe { (*self.cursor).value() } + } + + pub fn next(&mut self) { + assert!(self.valid()); + unsafe { + self.cursor = (&*self.cursor).next_ptr(0); + } + } + + pub fn prev(&mut self) { + assert!(self.valid()); + unsafe { + self.cursor = self.list.as_ref().find_near(self.key(), true, false); + } + } + + pub fn seek(&mut self, target: &[u8]) { + unsafe { + self.cursor = self.list.as_ref().find_near(target, false, true); + } + } + + pub fn seek_for_prev(&mut self, target: &[u8]) { + unsafe { + self.cursor = self.list.as_ref().find_near(target, true, true); + } + } + + pub fn seek_to_first(&mut self) { + unsafe { + self.cursor = (&*self.list.as_ref().core.head.as_ptr()).next_ptr(0); + } + } + + pub fn seek_to_last(&mut self) { + self.cursor = self.list.as_ref().find_last(); + } + + pub fn key_with_arena(&self) -> ArenaSlice { + assert!(self.valid()); + unsafe { (*self.cursor).key_with_arena(self.list.as_ref().core.arena.clone()) } + } + + pub fn value_with_arena(&self) -> ArenaSlice { + assert!(self.valid()); + unsafe { (*self.cursor).value_with_arena(self.list.as_ref().core.arena.clone()) } + } +} + +unsafe impl>, C: Send, A: Arena + Clone + Send> Send + for IterRef +{ +} +unsafe impl>, C: Sync, A: Arena + Clone + Sync> Sync + for IterRef +{ +} + +#[cfg(test)] +mod tests { + use arena::MonoIncArena; + use bytes::Bytes; + + use super::*; + use crate::FixedLengthSuffixComparator; + + #[test] + fn test_node_alloc() { + let arena = MonoIncArena::new(1 << 10); + let key = b"key of node"; + let value = b"value of node"; + let node_ptr = Node::alloc(&arena, key, value, 5); + unsafe { + let node = &*node_ptr; + assert_eq!(5, node.height); + for i in 0..=node.height { + assert!(node.tower[i].load(Ordering::SeqCst).is_null()); + } + assert_eq!(key, node.key()); + assert_eq!(value, node.value()); + } + } + + #[test] + fn test_find_near() { + let comp = FixedLengthSuffixComparator::new(8); + let arena = MonoIncArena::new(1 << 10); + let list = Skiplist::with_arena(comp, arena); + for i in 0..1000 { + let key = Bytes::from(format!("{:05}{:08}", i * 10 + 5, 0)); + let value = Bytes::from(format!("{:05}", i)); + list.put(&key, &value); + } + let mut cases = vec![ + ("00001", false, false, Some("00005")), + ("00001", false, true, Some("00005")), + ("00001", true, false, None), + ("00001", true, true, None), + ("00005", false, false, Some("00015")), + ("00005", false, true, Some("00005")), + ("00005", true, false, None), + ("00005", true, true, Some("00005")), + ("05555", false, false, Some("05565")), + ("05555", false, true, Some("05555")), + ("05555", true, false, Some("05545")), + ("05555", true, true, Some("05555")), + ("05558", false, false, Some("05565")), + ("05558", false, true, Some("05565")), + ("05558", true, false, Some("05555")), + ("05558", true, true, Some("05555")), + 
("09995", false, false, None), + ("09995", false, true, Some("09995")), + ("09995", true, false, Some("09985")), + ("09995", true, true, Some("09995")), + ("59995", false, false, None), + ("59995", false, true, None), + ("59995", true, false, Some("09995")), + ("59995", true, true, Some("09995")), + ]; + for (i, (key, less, allow_equal, exp)) in cases.drain(..).enumerate() { + let seek_key = Bytes::from(format!("{}{:08}", key, 0)); + let res = unsafe { list.find_near(&seek_key, less, allow_equal) }; + if exp.is_none() { + assert!(res.is_null(), "{}", i); + continue; + } + let e = format!("{}{:08}", exp.unwrap(), 0); + assert_eq!(unsafe { (*res).key() }, e.as_bytes(), "{}", i); + } + } +} diff --git a/components/skiplist/src/slice.rs b/components/skiplist/src/slice.rs new file mode 100644 index 0000000000..fb2fe9b0b1 --- /dev/null +++ b/components/skiplist/src/slice.rs @@ -0,0 +1,74 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Slice with arena + +use std::{fmt, ops::Deref, slice}; + +use arena::{Arena, BasicStats}; + +/// Arena slice +/// +/// A slice allocated from the arena, it will holds the reference to the arena +/// so it is safe to clone and deref the slice +#[derive(Clone)] +pub struct ArenaSlice> { + /// Arena the slice memory allocated from. + _arena: A, + /// The slice pointer. + slice_ptr: *const u8, + /// The slice len. + slice_len: usize, +} + +impl> ArenaSlice { + /// Create a [ArenaSlice] + /// + /// See the documentation of [`slice::from_raw_parts`] for slice safety + /// requirements. + pub(crate) unsafe fn from_raw_parts(_arena: A, slice_ptr: *const u8, slice_len: usize) -> Self { + Self { + _arena, + slice_ptr, + slice_len, + } + } +} + +unsafe impl + Send> Send for ArenaSlice {} +unsafe impl + Sync> Sync for ArenaSlice {} + +impl> Deref for ArenaSlice { + type Target = [u8]; + + fn deref(&self) -> &[u8] { + unsafe { slice::from_raw_parts(self.slice_ptr, self.slice_len) } + } +} + +impl> fmt::Debug for ArenaSlice { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_list().entries(self.iter()).finish() + } +} + +#[cfg(test)] +mod tests { + use std::{alloc::Layout, mem, ptr}; + + use arena::MonoIncArena; + + use super::*; + + #[test] + fn test_arena_slice() { + let hello = b"hello"; + let arena = MonoIncArena::new(1 << 10); + let slice = unsafe { + let data_ptr = arena + .alloc(Layout::from_size_align(hello.len(), mem::align_of_val(hello)).unwrap()); + ptr::copy_nonoverlapping(hello.as_ptr(), data_ptr.as_ptr(), hello.len()); + ArenaSlice::from_raw_parts(arena, data_ptr.as_ptr(), hello.len()) + }; + assert_eq!(hello, &slice[..]); + } +} diff --git a/components/skiplist/tests/tests.rs b/components/skiplist/tests/tests.rs new file mode 100644 index 0000000000..78a5d81f78 --- /dev/null +++ b/components/skiplist/tests/tests.rs @@ -0,0 +1,261 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
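The integration tests that follow exercise the skiplist API defined in `list.rs` above. For orientation, here is a minimal single-threaded sketch of that API; it only uses items visible in this patch (`Skiplist::with_arena`, `FixedLengthSuffixComparator`, `MonoIncArena`, and the borrowed iterator) and assumes the same workspace path dependencies the tests use, so treat it as an illustration rather than part of the patch:

```rust
use arena::MonoIncArena;
use bytes::Bytes;
use skiplist::{FixedLengthSuffixComparator, Skiplist};

fn main() {
    // The comparator is built for keys that end in a fixed 8-byte suffix; the
    // tests append a zero-padded number for that purpose.
    let comp = FixedLengthSuffixComparator::new(8);
    let arena = MonoIncArena::new(1 << 10);
    let list = Skiplist::with_arena(comp, arena);

    let key = Bytes::from(format!("{}{:08}", "key1", 0));

    // `put` copies key and value into the arena; it returns false and leaves
    // the list unchanged when the key already exists.
    assert!(list.put(&key, b"value1"));
    assert!(!list.put(&key, b"value2"));

    // Point lookup borrows the value slice from the arena-backed node.
    assert_eq!(list.get(&key), Some(&b"value1"[..]));

    // Ordered scan with the borrowed iterator; `seek` and `seek_for_prev` are
    // thin wrappers over `find_near` with `allow_equal = true`.
    let mut iter = list.iter_ref();
    iter.seek_to_first();
    while iter.valid() {
        println!("{:?} => {:?}", iter.key(), iter.value());
        iter.next();
    }
}
```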
+ +use std::{ + str, + sync::{atomic::*, *}, + thread::yield_now, + time::Duration, +}; + +use arena::MonoIncArena; +use bytes::*; +use skiplist::*; +use yatp::task::callback::Handle; + +fn new_value(v: usize) -> Bytes { + Bytes::from(format!("{:05}", v)) +} + +fn key_with_ts(key: &str, ts: u64) -> Bytes { + Bytes::from(format!("{}{:08}", key, ts)) +} + +#[test] +fn test_empty() { + let key = key_with_ts("aaa", 0); + let comp = FixedLengthSuffixComparator::new(8); + let arena = MonoIncArena::new(1 << 10); + let list = Skiplist::with_arena(comp, arena); + let v = list.get(&key); + assert!(v.is_none()); + + let mut iter = list.iter_ref(); + assert!(!iter.valid()); + iter.seek_to_first(); + assert!(!iter.valid()); + iter.seek_to_last(); + assert!(!iter.valid()); + iter.seek(&key); + assert!(!iter.valid()); + assert!(list.is_empty()); +} + +#[test] +fn test_basic() { + let comp = FixedLengthSuffixComparator::new(8); + let arena = MonoIncArena::new(1 << 10); + let list = Skiplist::with_arena(comp, arena); + let table = vec![ + ("key1", new_value(42)), + ("key2", new_value(52)), + ("key3", new_value(62)), + ("key5", Bytes::from(format!("{:0102400}", 1))), + ("key4", new_value(72)), + ]; + + for (key, value) in &table { + list.put(&key_with_ts(*key, 0), value); + } + + assert_eq!(list.get(&key_with_ts("key", 0)), None); + assert_eq!(list.len(), 5); + assert!(!list.is_empty()); + for (key, value) in &table { + let get_key = key_with_ts(*key, 0); + assert_eq!(list.get(&get_key), Some(&value[..]), "{}", key); + } +} + +fn test_concurrent_basic(n: usize, value_len: usize) { + let pool = yatp::Builder::new("concurrent_basic").build_callback_pool(); + let comp = FixedLengthSuffixComparator::new(8); + let arena = MonoIncArena::new(1 << 10); + let list = Skiplist::with_arena(comp, arena); + let kvs: Vec<_> = (0..n) + .map(|i| { + ( + key_with_ts(format!("{:05}", i).as_str(), 0), + Bytes::from(format!("{1:00$}", value_len, i)), + ) + }) + .collect(); + let (tx, rx) = mpsc::channel(); + for (k, v) in kvs.clone() { + let tx = tx.clone(); + let list = list.clone(); + pool.spawn(move |_: &mut Handle<'_>| { + list.put(&k, &v); + tx.send(()).unwrap(); + }) + } + for _ in 0..n { + rx.recv_timeout(Duration::from_secs(3)).unwrap(); + } + for (k, v) in kvs { + let tx = tx.clone(); + let list = list.clone(); + pool.spawn(move |_: &mut Handle<'_>| { + let val = list.get(&k); + assert_eq!(val, Some(&v[..]), "{:?}", k); + tx.send(()).unwrap(); + }); + } + for _ in 0..n { + rx.recv_timeout(Duration::from_secs(3)).unwrap(); + } + assert_eq!(list.len(), n); +} + +#[test] +fn test_concurrent_basic_small_value() { + test_concurrent_basic(1000, 5); +} + +#[test] +fn test_concurrent_basic_big_value() { + test_concurrent_basic(100, 1048576); +} + +#[test] +fn test_one_key() { + let n = 10000; + let write_pool = yatp::Builder::new("one_key_write").build_callback_pool(); + let read_pool = yatp::Builder::new("one_key_read").build_callback_pool(); + let comp = FixedLengthSuffixComparator::new(8); + let arena = MonoIncArena::new(1 << 10); + let list = Skiplist::with_arena(comp, arena); + let key = key_with_ts("thekey", 0); + let (tx, rx) = mpsc::channel(); + list.put(&key, &new_value(0)); + for i in 0..n { + let tx = tx.clone(); + let list = list.clone(); + let key = key.clone(); + let value = new_value(i); + write_pool.spawn(move |_: &mut Handle<'_>| { + list.put(&key, &value); + tx.send("w").unwrap(); + yield_now(); + }) + } + let mark = Arc::new(AtomicBool::new(false)); + for _ in 0..n { + let tx = tx.clone(); + let list = 
list.clone(); + let mark = mark.clone(); + let key = key.clone(); + read_pool.spawn(move |_: &mut Handle<'_>| { + let val = list.get(&key); + if val.is_none() { + return; + } + let s = unsafe { str::from_utf8_unchecked(val.unwrap()) }; + let val: usize = s.parse().unwrap(); + assert!(val < n); + mark.store(true, Ordering::SeqCst); + tx.send("r").unwrap(); + yield_now(); + }); + } + let mut r = 0; + let mut w = 0; + for _ in 0..(n * 2) { + match rx.recv_timeout(Duration::from_secs(3)) { + Ok("w") => w += 1, + Ok("r") => r += 1, + Err(err) => panic!("timeout on receiving r{} w{} msg {:?}", r, w, err), + _ => panic!("unexpected value"), + } + } + assert_eq!(list.len(), 1); + assert!(mark.load(Ordering::SeqCst)); +} + +#[test] +fn test_iterator_next() { + let n = 100; + let comp = FixedLengthSuffixComparator::new(8); + let arena = MonoIncArena::new(1 << 10); + let list = Skiplist::with_arena(comp, arena); + let mut iter_ref = list.iter_ref(); + assert!(!iter_ref.valid()); + iter_ref.seek_to_first(); + assert!(!iter_ref.valid()); + for i in (0..n).rev() { + let key = key_with_ts(format!("{:05}", i).as_str(), 0); + list.put(&key, &new_value(i)); + } + iter_ref.seek_to_first(); + for i in 0..n { + assert!(iter_ref.valid()); + let v = iter_ref.value(); + assert_eq!(v, &new_value(i)); + iter_ref.next(); + } + assert!(!iter_ref.valid()); +} + +#[test] +fn test_iterator_prev() { + let n = 100; + let comp = FixedLengthSuffixComparator::new(8); + let arena = MonoIncArena::new(1 << 10); + let list = Skiplist::with_arena(comp, arena); + let mut iter_ref = list.iter_ref(); + assert!(!iter_ref.valid()); + iter_ref.seek_to_last(); + assert!(!iter_ref.valid()); + for i in (0..n).rev() { + let key = key_with_ts(format!("{:05}", i).as_str(), 0); + list.put(&key, &new_value(i)); + } + iter_ref.seek_to_last(); + for i in (0..n).rev() { + assert!(iter_ref.valid()); + let v = iter_ref.value(); + assert_eq!(v, &new_value(i)); + iter_ref.prev(); + } + assert!(!iter_ref.valid()); +} + +#[test] +fn test_iterator_seek() { + let n = 100; + let comp = FixedLengthSuffixComparator::new(8); + let arena = MonoIncArena::new(1 << 10); + let list = Skiplist::with_arena(comp, arena); + let mut iter_ref = list.iter_ref(); + assert!(!iter_ref.valid()); + iter_ref.seek_to_first(); + assert!(!iter_ref.valid()); + for i in (0..n).rev() { + let v = i * 10 + 1000; + let key = key_with_ts(format!("{:05}", v).as_str(), 0); + list.put(&key, &new_value(v)); + } + iter_ref.seek_to_first(); + assert!(iter_ref.valid()); + assert_eq!(iter_ref.value(), b"01000" as &[u8]); + + let cases = vec![ + ("00000", Some(b"01000"), None), + ("01000", Some(b"01000"), Some(b"01000")), + ("01005", Some(b"01010"), Some(b"01000")), + ("01010", Some(b"01010"), Some(b"01010")), + ("99999", None, Some(b"01990")), + ]; + for (key, seek_expect, for_prev_expect) in cases { + let key = key_with_ts(key, 0); + iter_ref.seek(&key); + assert_eq!(iter_ref.valid(), seek_expect.is_some()); + if let Some(v) = seek_expect { + assert_eq!(iter_ref.value(), &v[..]); + } + iter_ref.seek_for_prev(&key); + assert_eq!(iter_ref.valid(), for_prev_expect.is_some()); + if let Some(v) = for_prev_expect { + assert_eq!(iter_ref.value(), &v[..]); + } + } +} diff --git a/components/tracing/Cargo.toml b/components/tracing/Cargo.toml new file mode 100644 index 0000000000..dc493f02cc --- /dev/null +++ b/components/tracing/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "tracing" +version = "0.1.0" +edition = "2018" + +# See more keys and their definitions at 
https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +upstream = { version = "0.1.26", package = "tracing" } diff --git a/components/tracing/src/lib.rs b/components/tracing/src/lib.rs new file mode 100644 index 0000000000..5cdff967b6 --- /dev/null +++ b/components/tracing/src/lib.rs @@ -0,0 +1,5 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Tracing core lib re-export. + +pub use upstream::*; diff --git a/components/tracing_examples/Cargo.toml b/components/tracing_examples/Cargo.toml new file mode 100644 index 0000000000..b8bea30722 --- /dev/null +++ b/components/tracing_examples/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "trace_examples" +version = "0.1.0" +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +tracing = { path = "../tracing" } +tracing_util = { path = "../tracing_util" } diff --git a/components/tracing_examples/examples/init_tracing_with_file.rs b/components/tracing_examples/examples/init_tracing_with_file.rs new file mode 100644 index 0000000000..75f89f6dca --- /dev/null +++ b/components/tracing_examples/examples/init_tracing_with_file.rs @@ -0,0 +1,41 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use tracing_util::{init_tracing_with_file, tracing_appender::rolling::Rotation}; + +#[tracing::instrument(level = "debug")] +fn nth_fibonacci(n: u64) -> u64 { + if n == 0 || n == 1 { + 1 + } else { + nth_fibonacci(n - 1) + nth_fibonacci(n - 2) + } +} + +// default leve info +#[tracing::instrument] +fn fibonacci_seq(to: u64) -> Vec { + let mut sequence = vec![]; + + for n in 0..=to { + sequence.push(nth_fibonacci(n)); + } + + sequence +} + +// cargo run --example init_tracing_with_file +// log file: /tmp/test_logs/init_tracing_with_file +// 2021-09-28T22:41:30.362078+08:00 INFO main ThreadId(01) fibonacci_seq{to=5}: +// init_tracing_with_file: enter 2021-09-28T22:41:30.364181+08:00 INFO main +// ThreadId(01) fibonacci_seq{to=5}: init_tracing_with_file: close +// time.busy=2.13ms time.idle=34.8µs +fn main() { + let _g = init_tracing_with_file( + "init_tracing_with_file", + "/tmp/test_logs", + "info", + Rotation::NEVER, + ); + let ret = fibonacci_seq(5); + println!("{:?}", ret); +} diff --git a/components/tracing_util/Cargo.toml b/components/tracing_util/Cargo.toml new file mode 100644 index 0000000000..15eb11520a --- /dev/null +++ b/components/tracing_util/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "tracing_util" +version = "0.1.0" +authors = ["Databend Authors "] +license = "Apache-2.0" +publish = false +edition = "2018" + +[dependencies] # In alphabetical order +lazy_static = "1.4.0" +tracing = "0.1.26" +tracing-appender = "0.1.2" +tracing-subscriber = "0.2.20" diff --git a/components/tracing_util/src/lib.rs b/components/tracing_util/src/lib.rs new file mode 100644 index 0000000000..69c7432fd4 --- /dev/null +++ b/components/tracing_util/src/lib.rs @@ -0,0 +1,22 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +// Copyright 2020 Datafuse Labs. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// fork from:https://github.com/datafuselabs/databend/tree/master/common/tracing + +mod logging; + +pub use logging::{init_default_tracing, init_default_ut_tracing, init_tracing_with_file}; +pub use tracing_appender; diff --git a/components/tracing_util/src/logging.rs b/components/tracing_util/src/logging.rs new file mode 100644 index 0000000000..7a314608f5 --- /dev/null +++ b/components/tracing_util/src/logging.rs @@ -0,0 +1,147 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +// Copyright 2020 Datafuse Labs. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::{ + fs::OpenOptions, + path::Path, + sync::{Arc, Mutex, Once}, +}; + +use fmt::format::FmtSpan; +use lazy_static::lazy_static; +use tracing::Subscriber; +use tracing_appender::{ + non_blocking::WorkerGuard, + rolling::{RollingFileAppender, Rotation}, +}; +use tracing_subscriber::{ + fmt, + fmt::{time::ChronoLocal, Layer}, + prelude::*, + registry::Registry, + EnvFilter, +}; + +/// Write logs to stdout. +pub fn init_default_tracing() { + static START: Once = Once::new(); + + START.call_once(|| { + init_tracing_stdout(); + }); +} + +/// Init tracing for unittest. +/// Write logs to file `unittest`. +pub fn init_default_ut_tracing() { + static START: Once = Once::new(); + + START.call_once(|| { + let mut g = GLOBAL_UT_LOG_GUARD.as_ref().lock().unwrap(); + let (work_guard, sub) = init_file_subscriber("unittest", "_logs"); + tracing::subscriber::set_global_default(sub) + .expect("error setting global tracing subscriber"); + + tracing::info!("init default ut tracing"); + *g = Some(work_guard); + }); +} + +lazy_static! { + static ref GLOBAL_UT_LOG_GUARD: Arc>> = Arc::new(Mutex::new(None)); +} + +fn init_tracing_stdout() { + let fmt_layer = Layer::default() + .with_thread_ids(true) + .with_thread_names(false) + .with_ansi(false) + .with_span_events(fmt::format::FmtSpan::FULL); + + let subscriber = Registry::default() + .with(EnvFilter::from_default_env()) + .with(fmt_layer); + + tracing::subscriber::set_global_default(subscriber) + .expect("error setting global tracing subscriber"); +} + +/// Write logs to file and rotation. 
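+/// The returned `WorkerGuard` must be held for as long as logging is needed:
+/// it keeps the non-blocking appender's background writer alive, and dropping
+/// it flushes buffered records and stops writing to the file.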
+pub fn init_tracing_with_file( + app_name: &str, + dir: impl AsRef, + level: &str, + rotation: Rotation, +) -> WorkerGuard { + let file_appender = RollingFileAppender::new(rotation, dir, app_name); + let (file_writer, file_guard) = tracing_appender::non_blocking(file_appender); + let f_layer = Layer::new() + .with_timer(ChronoLocal::rfc3339()) + .with_writer(file_writer) + .with_thread_ids(true) + .with_thread_names(true) + .with_ansi(false) + .with_span_events(FmtSpan::ENTER | FmtSpan::CLOSE); + + let subscriber = Registry::default() + .with(EnvFilter::new(level)) + .with(f_layer); + + tracing::subscriber::set_global_default(subscriber) + .expect("error setting global tracing subscriber"); + + file_guard +} + +/// Create a file based tracing/logging subscriber. +/// A guard must be held during using the logging. +fn init_file_subscriber(app_name: &str, dir: &str) -> (WorkerGuard, impl Subscriber) { + let path_str = dir.to_string() + "/" + app_name; + let path: &Path = path_str.as_ref(); + + // open log file + + let mut open_options = OpenOptions::new(); + open_options.append(true).create(true); + + let mut open_res = open_options.open(path); + if open_res.is_err() { + if let Some(parent) = path.parent() { + std::fs::create_dir_all(parent).unwrap(); + open_res = open_options.open(path); + } + } + + let f = open_res.unwrap(); + + // build subscriber + + let (writer, writer_guard) = tracing_appender::non_blocking(f); + + let f_layer = Layer::new() + .with_timer(ChronoLocal::rfc3339()) + .with_writer(writer) + .with_thread_ids(true) + .with_thread_names(false) + .with_ansi(false) + .with_span_events(FmtSpan::ENTER | FmtSpan::CLOSE); + + let subscriber = Registry::default() + .with(EnvFilter::from_default_env()) + .with(f_layer); + + (writer_guard, subscriber) +} diff --git a/configs/ceresdb.toml b/configs/ceresdb.toml new file mode 100644 index 0000000000..7b1a216dfc --- /dev/null +++ b/configs/ceresdb.toml @@ -0,0 +1,23 @@ +bind_addr = "0.0.0.0" +http_port = ${HTTP_PORT} +grpc_port = ${GRPC_PORT} +log_level = "info" + +[analytic] +data_path = "${DATA_PATH}" + +[[meta_client.cluster_view.schema_shards]] +schema = 'public' +auto_create_tables = true + +[[meta_client.cluster_view.schema_shards.shard_views]] +shard_id = 0 + +[meta_client.cluster_view.schema_shards.shard_views.node] +addr = "${NODE_ADDR}" +port = ${GRPC_PORT} + +[[route_rules.prefix_rules]] +schema = 'public' +prefix = 'special_prefix' +shard = 0 diff --git a/docker/entrypoint.py b/docker/entrypoint.py new file mode 100755 index 0000000000..35b3e12cdf --- /dev/null +++ b/docker/entrypoint.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python2 +import os +import sys +import commands + +ENABLE_DATA_NODE = os.getenv('ENABLE_DATANODE') == 'true' +HTTP_PORT = os.getenv('CERESDB_HTTP_PORT', '5000') +GRPC_PORT = os.getenv('CERESDB_GRPC_PORT', '8831') +DATA_PATH = '/home/admin/data/ceresdb' + +# hostname maybe return some ip(array) +def get_local_ip(): + return commands.getoutput('/usr/bin/localip').strip().split()[0] + +def create_datanode_config(): + config = open('/etc/ceresdb/ceresdb.toml', 'r').read() + config = config.replace("${HTTP_PORT}", HTTP_PORT) + config = config.replace("${GRPC_PORT}", GRPC_PORT) + config = config.replace("${NODE_ADDR}", get_local_ip()) + config = config.replace("${DATA_PATH}", DATA_PATH) + open('/etc/ceresdb/ceresdb.toml', 'w').write(config) + +def start_datanode(): + create_datanode_config() + + cmd = ''' +# load env +. 
/ceresdb.env +env +exec /usr/bin/ceresdb-server --config /etc/ceresdb/ceresdb.toml +''' + open('/usr/bin/ceresdb-start.sh', 'w').write(cmd) + +def start_supervisord(): + port = int(os.getenv('SUPERVISORD_HTTP_PORT', '9001')) + conf = '/etc/supervisor/supervisord.conf' + if port: + os.system(''' sed -i 's/:9001/:%d/g' %s ''' % (port, conf)) + open('/etc/supervisor/conf.d/touch-admin-cron.conf', 'a').write('\nkillasgroup=true\nstopasgroup=true\n') + os.system('/usr/bin/supervisord -c %s --nodaemon' % conf) + +def copy_environ(): + envs = [] + for k, v in os.environ.items(): + envs.append('export %s="%s"' % (k, v)) + # copy DATANODE_ to CSE_ + if 'DATANODE_' in k: + envs.append('export %s="%s"' % (k.replace('DATANODE_', 'CSE_'), v)) + + envs.append('export LOCAL_IP=%s' % get_local_ip()) + # support register ceres meta + envs.append('export CSE_CERES_META_NODE_ADDR=%s' % (get_local_ip())) + + envs.append('export MALLOC_CONF=prof:true,prof_active:false,lg_prof_sample:19') + + open('/ceresdb.env', 'w').write('\n'.join(envs)) + +def init_dir(): + cmd = ''' +mkdir -p /home/admin/logs /home/admin/data + +# set logdir +mkdir -p /home/admin/logs/ceresdb + +ln -nsf /data /home/admin/data + +chmod +777 -R /data /home/admin/data /home/admin/logs +chown -R admin.admin /data /home/admin/data /home/admin/logs +''' + open('/ceresdb-init.sh', 'w').write(cmd) + os.system('sh /ceresdb-init.sh') + +def main(): + print "copy_environ" + copy_environ() + + print "init_dir" + init_dir() + + if ENABLE_DATA_NODE: + print "start_datanode" + start_datanode() + + print "start_datanode" + start_supervisord() + +if __name__ == '__main__': + main() diff --git a/docker/supervisor/conf.d/ceresdb.conf b/docker/supervisor/conf.d/ceresdb.conf new file mode 100644 index 0000000000..3b956c3118 --- /dev/null +++ b/docker/supervisor/conf.d/ceresdb.conf @@ -0,0 +1,17 @@ +[program:ceresdbx] +command=sh /usr/bin/ceresdb-start.sh +autostart=true +startsecs=3 +startretries=9999 +autorestart=true +;exitcodes=0,2 +;stopsignal=QUIT +;stopwaitsecs=10 +stopasgroup=true +killasgroup=true +user=admin +redirect_stderr=true +stdout_logfile=/home/admin/logs/ceresdb/out.log +stdout_logfile_maxbytes=200MB +stdout_logfile_backups=5 +;environment=A="1",B="2" diff --git a/docker/supervisor/supervisord.conf b/docker/supervisor/supervisord.conf new file mode 100644 index 0000000000..401fb2e363 --- /dev/null +++ b/docker/supervisor/supervisord.conf @@ -0,0 +1,24 @@ +[unix_http_server] +file=/tmp/supervisor.sock ; (the path to the socket file) + +[inet_http_server] ; inet (TCP) server disabled by default +port=127.0.0.1:9001 ; (ip_address:port specifier, *:port for all iface) + +[supervisord] +logfile=/tmp/supervisord.log ; (main log file;default $CWD/supervisord.log) +logfile_maxbytes=500MB ; (max main logfile bytes b4 rotation;default 50MB) +logfile_backups=10 ; (num of main logfile rotation backups;default 10) +loglevel=info ; (log level;default info; others: debug,warn,trace) +pidfile=/tmp/supervisord.pid ; (supervisord pidfile;default supervisord.pid) +nodaemon=false ; (start in foreground if true;default false) +minfds=1024 ; (min. avail startup file descriptors;default 1024) +minprocs=200 ; (min. 
avail process descriptors;default 200) + +[rpcinterface:supervisor] +supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface + +[supervisorctl] +serverurl=http://127.0.0.1:9001 + +[include] +files = /etc/supervisor/conf.d/*.conf diff --git a/docker/tini b/docker/tini new file mode 100644 index 0000000000000000000000000000000000000000..03af82f09e6484df819313cf9fec158d6b4a879d GIT binary patch literal 24064 zcmeHve|!|xx%VU_2mv+$L8H=o!q#N{<`HuEer$$? zd~V;*{k(s?pAVy(GtckmJm)#joS8Yy?VhHF+?*Uu9l6?-8bMtPT^uF;YTD#_4oY0L zHd}LPpU@U-Gl9v+&&4U!iZj(7m#S4eOY+S?n}=U@u0T}hy7-A|7Yj5jh(mcuE!mqo zT=G=3^;Zc{wVK8_g5=28qGk@F?cyh@{qz|UuN;(e8<=IQ5Nw zKG{ECt_u*sE^4%T{&%V+UOHO;VD{4a_B9+t>*6P>eNdn^)hfAbAV+@wy+@H`Y?Jn; zi^~z&Kh;W5yL2$ny=vv9gT3X0fnan~`KIbs<*QaYBO&K<-fz-Ryp7Er+>(kT14KSA z#E)`-=+#ejZ`!o*WzSbP?R~eg=<(m)z4kNf=YzHcKjNjB7EKhfbvbvZFZvBThpZ}9Xf4*w`=G*VTZTXY$FL-SH&yF8Ge%%YJzEvJOaLaXz_udlU`>$(X zpG>~~<=dWm{)^wdl_rbRx`R4lz=B>GRPU$D^T6~>mP?HY%_i@ys_LAhWW zd?|u|D*oPS_&dPQPNDzBY4|Cgr_ygtga6Vr`21=7TssZ_U#7udHx2&mY4jhMroZ*m z=%IW%Re#&2!Bak+ihs^D_(Rj+*);y&I}QIYros1M0y5CDk0=nQ@^kMr_-{;u|F3EA zw@rhen8u&KO~X&ufm8YUV;&!e#+JIkp=m3imyYFV7ibQxYmtk?6h}I?Nc^F5TpWH$ z;xFTR3bng05qM6-pV{D_ubrd0uIb{awN#LBERc357PvV4jMTFT(0uJ&?K>S^9530m z^SIqP+NL5G$E!b^q@E+^3p~w3I_{PDt|cx4wIt-}xDag-^e3c#fz`CNl7B+#*&qoY z06!lR+hxC$KNmOXXzi^IMAucrM9_#(tdlkt&g ztwrjYl>Eh#@I8qiJIlo}{|tFLZjpMj`}JLE_u<7Z&i6dz>G&yr6z5492f9YkQ7-u> zWPc^QHY)A5Uf|+<1=4N;EsgI=>A#B4e96zG9)nA1+H+FBS|8E1hK`5vBYz&w^5;Ln z&rT7a&rAO8S^SpN-zxiiEh)#bQTbEs;_wf_#}8bGd+4&eN{g7`NT5IHAJjs_W+2$B z^@YMWYd{T$Lp?+O2tV%&3=T#NQ|mX(;Xto;Ge8nFLxWn6e{e9=qYa09OyC0j#?Y|2 znRq)pjc_;^>IBtq20}p+h6&*NhQooN+1J@K0Oe+HC~9hb!%?$GGkYL7+#ep+cneFA zF@huHnm^o6%+N_zdJXQ6vB~He8P@tn!vWJEyuXK=8ubUbqrjMfA)}XIa=2#*s?9Kz zgu-xyqR<(EYO{059|&sQ{)iFu4;j3lY0ib^<6qJ4H)cn1@ zoyMkNqsJta8I8cn7M%75f}PQb(W|W6`-m=LkPKu(XJ>aLA{B>?h#9guM&5Vc8XD>j z81RzD6VdE3f)NbEuzxhDbzVKv*(O6#H|UQ6;e7^{x@}|Ff^B^&ehozj!G3_b8P>e2-zug zZl}_@fN5TdbE<9PFwY6hrWNZBI6=7Rxdi8GPm)tOj<K)vYXx;YY{Q?WkZ5<< z@MqiblQz7%wi0=-4S%kUf1eG1o(;d>hFAC6L_TE0f5OIZ+3=sV;g8tx%!WU1!(U*- z>#?_H>52U4RZP>jPMZ1FQDmtzrqrP{j-@vc zOsSvR$Kf`DDdkg>9KM=hO7+yk9IhppQarVt!)pkpX_cDb@N$AF)l*{}zL;Q2@zgMf zmk>;;o$BK7xdc;6r&>9@fM81Hl#9a!1XBv9Dma`=Fr{vaarlEP0aMDRia7i(!IY{g zjl*veOeH0Cn&9w{2&NQFjdA#U1XJpzhB^FAf+^)vT^zojU@QR9e-3|{U`oA|i^HEMm{KlP z!QsymOsSS)91hnot#1M_KlD4^+J?c?$D2wP>f^qW;yv{xJT7P#(kNRWAlAs3xd|Zj zJDw!&-IbHpZiE!*Gj+#=6 zw6CNB;P{r3N#Yv6t;D6r>r0q*CE~tyGWz{Q)z4~~w)L2K<4*2FA3TS#R_o^FB9`O?j%C`5SX_RQrozjszRSr z!M`AShknO9T*0o7XF!1-KcerluF`X!)}Q^@T$Ir^M`&c7%uPgJ0NHU z5#xdpW8`O&oVeZdI4}nz@0_7Yb5VC0BnHqF|?Ys7Fc9l9$d=GxH}L< zYc9x%YHKUbldel$Ncmmhei1cCCFTtv>4~b{K#Acx2O@mdY}nm5VIBT}>wggBEd3Xd z{(l7-`j?XaBBB5A1Biq5h>#mRSuO?rmMP@k#(8o;$fbG^srJfam1*lCpb?gB?@$lD z04me*r>$0wjqf%?RuyTF?@nm3mc{AYGua2_n_tQKdF4RJ}s0IJ*x*lgrs!^$cXn(aGw0B=thc4 zgOVrz`d7oh!aEy)w#tOMr!cdj&hBG3cVyo8K}phzQ}{SRK3*n#oB&A#VJ5Vsj$z7A zxI5fFZ~UWx;@J5tTBBF@A zRbrX^Z)O_cPlg45xq!a&3v&zwu|KR#8xV-VnWUj53AM^fK^@Ah=`EzKD?@wJuHG$Vd zDDxuzbg>xUUT>-a zI}TM-V!UHjLp$cr%lfX@^AcaNfQ>Ku8ETA}intPCp4lcJt{PMM1{=jmTGY_8W0sCp z8@~&*7E-XVNST$Sva4e$wRG%CEz0zoG~m%t76BiM_Ib$sYxus^gS?&Ie$Jb!c~uKc$lG`L0yT}e60dEC53n1NE7EqVeI{t+?f4-aZjO00k^dFYz?HIY zfD#1!OyO{y?VfCccV!+Gq~tQpEaB_+%iycQNmaiENyK0!X`xE0?jvRhwOGBjBQ}K5iSw1;WIJjEQGSB>$k3X{oZzddI!q-QkTNbNlf3qjucg z@1qc$1>apPCj-foyT82s^i(fr}@QKLrq0t)h+2OsD7NLqPbr)Kk&~(xf+R-rCem`plN3i@L&V~kU#&2ItV@KE z-(%iq%}+lxS#Q0?$JM}jax3&>RN>O2;8xXYjWGBvpoGD{LWEnAR&561-{YV{gL$&O zKMPN8AWAA3v?cCuIHk&!G4WMWa;iQ}SQ~`?g(vI(60}&0K}Pkx2j|K4qCcrikbP>s 
zV4WxA|2L{iRy?lZUd&5^Vhj2MP?EY6lUn3Ydy-w#rC|vjdvZ*VzhS+6SgtA(h<{>b zN$lhK=FPF1-(NzvV4r&kS6Kg{bu}EoL~gfkA<=3)kU%MYit=F$ z2!w9+$*w(0y}I1mi6*&eB_E~3oqXLDtJy3}y$%g@cl6d6-K%QWnWRkIA?KNI#E#D~ zU$f2^2KJ+rWc9s67`TxPlnMjzgzAw>gB2j5On)6Hkp}O+2?WfEq*HYyk^kn!RHzEK zgs5Qu8!;tn`h-P$v3i~RGOr1gB$Yu?tXg5)+QCUxl^Ii?gBDzU&nJx=gvR%rybty| z^(CS4B~X$@8IAieepa8*xDzDde3f**a+0D&^JIq5`}LFcu7)t~9IbB%j$6S&weK}3 zR>3(-=zW3Vs4OkJs4Dbc;PIN78wZHp*&*a5mJ>DE!=Oa0O{!gljX_m~wq-ecz%iEg6C<#UhFw^-=2U8k+D zK}fAL{)sE7IaVgd=sQR!HAZ2f`yfc7pOvKXr2c&PNACCCV8bBW{jTH+=G_X4&F?du z@O!4v-I>w-0%@`7&ODE_{!!>(DfHX(_gT@ybBThxNa$6r|CCdzw2b-@C_y4@eIMzS zz?0zkzSyDL3%7oeC(FP^SRIaBV^s=E&!9e_C-#Rg{m@tVXj92N4AA#MPVTeC>1tu@ zE5M65&4wN_Quyf8@h9BzypmlX2=$L-)X#>h)C@k6 zU7Gt^tYG44y4`FVU;5i63=fyb%c;c^v18u&3r)-ZK=;-9u8(F|7ySWU-TJo4Di5l7 zG{xU+iXW?ohV;Uh`Azrg!_ha9D)OP0`$l)?Zrl3t=x&;H&<75jYV;m@LxbDrZerNe z5n!cD1IsotwX1^l`2&MS?;5r=qHU=6)OIwUz?IpBVdtY=A$FK{UO?WC+Td+&XkjO) z0z>K2$g%>hE;KZRy$cN6J`F}2O+vxQnt}q2vGTz+tbHIf3KHKwz=+M*6fm6ynkP6C z2#110M$lv<{&0Y{QgAV!x7pj--cj4;akqNfG!Cx!wAHq>dwtgnaILqgsk61MrOwmd z-r3mJ($R|8H|l80%SR0CdBKj9rI7-yIbJkwDPP90t|Ff!Ud zXskgTeQoYK&k3DZyB~~R=8rJGfq_l!``B=;yRG@eXkpWb+-Q>t>qSc=7ftPVv7rMT zt@h~H2;gtv!^_MN!!8nR+F6jC(9)r`89hc|1df8UPH(E$TCrgx5bS4SAIciW z{b&x!UO+S7QDaBY-VW%&o|az8H5xDn7`6+MG~D%IOrdg(qz)h^G|bq90;Aq$A2ru> zczKJlBM>p=qvoz{NEEBM?^ZR-g3c|=>+ zB8;{BTDW#!yN|aWZH>fsU0rJjZyVZgpmsfGkq|^SH4x@nZ*!eaC+*j_^YVp$)9x4w zm>g!leTUKZsS}4ru-o{dg@%R=AngZ@a2EHJUe4FjF%93zQ#Kf4rn#AwhHj=jpgb{* z-euDK)JCg}J@7}OJp-)Q7{NT8POhD{_GAW_YmG#BaxgX8Vz8wlUVP+iV-?QIYG(y2 zXZ@JmeMZlURn;pbUkB|1qAA5TwBU7LOLO~1hG;jox!1Asa<;+U)(FgPIG4&$I_U7A z3Pnu=6Bxeqhy6n=8pIAPW)2wW4z^c~1ZbkOP+w+_)AlA24|vXt6I7uLDb2DPDW`4K z2=>4?AQ{YoK!gn#gTrjdh>-jC!Uev_8m0;WO@HoT4>+ev04og}w?qTjS;d-YH$1zt zhGApVG88_@4>JsTv%6^x^MwQb{pgf{+30`~WY~2^MX?vVugaL=?02#yLV_m65@-+y z^o*=wjYiN2V-pn{qGEuab%!FT!ch~O?ws&)7<<`RbeIxl1u7W&qZB}XbPa2z?Oo^r zcAW(&$At%!Qnag!!4-ymdNjEi-{IA(#Hk;c@a@=4CM!vVjL_)td2&%CGZOvbP;}TX zXE4uoUSDTjOTC98IpiOnLa@@laA=5Ckh0}fRm-TYAYq0S`k%#}yC?bB7#N++{MsXG2yOb4g;#5B>umQR7`nHq ze(Ji%kb%*le+1>8u8a(c&a-kDJLYf=8*(x?hGugp%A&lO+p~;bmT0yMg4%p{ic1y0 z)N%F``2DKRXRc|?Z<^siHzuSK=?IG5hbS3*&m{{4DG@SzGFAWTXm=m-D}wq1y*$C> zHb=@TaTS6KC+Ryir&6nVXrm|8VY$PvSfq@u?zk#tVoN<+huxpDbc<^e!*z2AV}lCH z3!B(I$!i-G(UbWFmni#EL;!T<%+%eJ<;a_8@aAA>G)R*tf>j9)c^wt|D>HLMmDrl4 z0b;^2l}Ff=N>flVeR3H+(2qd{b7t@npqU7R5?M-$T z+`u>Zssu!>AH~jZuC?2Mbh$`3h@$|L%`z4VF`2b=Dd1O&5Sbdq{e!c>7h-*ZO}x0` zf{mIrREFAQa71}Tt-moPD4pptfqm5{-2t!@SN~*Nb%Z-{%D$8_XhkS896T1zQ^bx? zkCwSO1y54=(Tv+{I%qcz9UT&$AwX@Tg!3faD`9%zOvj*v=|MCdriAG!H65RpFg=^5 zG)8>iv_6VUFzaz=Sw)}ycw6%voY@Pi|O4(v>*V5@j1fxBU_F=RYNPwc>;dUMEk>7(4Z7Xiy_o96m4*-s! 
zoy3!YVkE{Go|sLbox~&2y#(XUhYR+Ppxq`DMa3`Y);29?Q&G-&bI+JHk#j~7;i+%O z_overqgBVL`8BNs48%X{1>v*iC~9yNUtKt7bk>-5)w!2nyrN_Y=^(y({I(+o41Lnv zj-ps@ZexLCmOdwUBbqv%Qlx7Tzxxm){%>sr-@Tj<_$K`B!ms${bejL0l3GX6cy6tu z_^uiC4t8gr+ff?JzuHle8!m8^x*g2zCKKR+J6Nu(z)=hg z^fb(=cN8~dRFb|=L2eV|t|dA2BDc*ki*%U4PXf>X4UxxD^o3lHqj-Eqt%Kc_SMMmj zGr!(Z5u2HNjp!+2AbMIqC-)wpU^atloYR_xsh1O&?7vL?L2hjj_KR}YJ7&>%y$Adl z@ciF+HPU!_#c+KV4Ue82f8^ed*VAeKZcO`ixy%1;7<~N# zjobJP5A_G*Ci~Oq7|WfZ=M*?98u$oiI?*sEw+C2oVq7y=&m0OsHgBz}Pc$w+gIx=A zg#TMRo=n5Bv@I?Fp&g!UhKFGZXxg&Vm$p1|l{Jo3v;!eu5SY88C z?9l=TtK;K}{A!$&yE_X;!cEcF3Xdqin=mK$KA%oMgZOHVj*0vrBIOoznj?dtS7YuP zIM-nJixj6XP@Kl|Y8|C_ew0U9ImjV+RdYayvRg_|9j;hLr)j7;^r={iSFa+Q1riioDLCuU#^Rq%s1aAXA4+8 zf1Q{iVD&t8pS(F&&nwh3CG|X$bkTuZd2zIM@mAf_)At?dpyxPrsOP%sdFSLj*9mxC z!piSWoQm%T$yV9NhEm?2V#%MDu!<)hC35=2T%liz|37ckckHqaJz6CEcv`kE%JvP} z9+U0N`GW5(*-H)MNEwlfz<`?9@I zwyR{@Alvn_?UU_@Y_t2F4Xg1{c!JmFDG1;5!>f&TULW53i#<=J0G1UL)P;sO<7NMV z$x3^cvE>z&Rm?XK!Y&fl5)R$k6Jl4G9CU>^=f$;)x@>j@_P@jp{ z{bExC_5y`^0(eQ?%X&jSQQD5hzvzQ^IoOZIO6f(-l6EPvWEnSt_qc-v0eT^g?MF%( zy}-n_1H7)mM|cAKQ$1yPCEPO@#RqAyuRxLq14996fy8-4A_edquZqgJpJmv}&>QI6 zjMm`Y8jf}k1|kDx*bzah=^ZrQMG=w*xQt$UUCQ4p4GtEdr`R_^QL%f+c_==J4eZMh z@eK>3i24b&*v} zzdJO7T@aZ8$D2CZcSGAv#5??{QI({~0NzJ4ymu1;gWW)|fKaMCcxx8z#*6Bp8SoFX z;ZT^{%kHMLpunelSbIx@4?EC2%-c>oZPt71J@ssfyB+Y7GIp)kr?+(Y7&zM8&A#hd zO9OK^U(eQho9oN452LjWyXsg=TY-07Ym?Ul9JWL?b<}&C8(A&nnp=?DUL-Y?`dSzn zl&ZWQ$ThHap0+w2n%%YDCT!U&D`@cgnn`CvOB-{uR(G4vTi4O#Zey(-ZLKZs9+-BBe_UB`IOH;iE$XX9P zbJsR`gei2YuF36PSH|ky>)egxT^nnGmbL<76W+0Fbq}G)o*Vzy(H14@25lw7zCT|X zdfDd7h+gY$_mnYro41_;($LnjuB?E<2?=P2638`sgcb@l4^U=Az(N7*K&O>2tlr~p zf-VeC^9jS}ENHB&Tf<5_x}!le%Bq|zofYM)q8zIHOyw$P#Y*SuWdctHWmUzhiq)Ft zj0|9!nEr0fiLE^bzI5e8mKjcTxZL#jOQ=5>b#_OwGow7vt2qh6ro92p*}FLy**qj# zGb|`_ztjmpCum_~&`$#LYH!^^$bMwYuLKeQ+&84`-&LW>&e zyOLMyng-yARmrRM*`!mpIW?*S8q~z85XF@J8w@5kl{$25@mfo?^+DgG{ zeN`+Mq!m&=+kc7HZo&!0RLQILoG#@zNk(NqyZ@$?ua$gieK;xQ)q0ZTDQ4O7^yvYL zi41dQJ-w+2NE~XNEFj^lI;VCses=loa$UAv${)%KxI|~ZYew<4%a1K^0fYCH3wV?9 zS3pz#D|j3yc6s$Z%n2!XTzmg@d?jP>HL~y8ewBpas|F21THGb+lm)oWM z2_ItvFH?$S`wg_@e@1?di&ND1mmlQN)SqnoKLE)tuV3!s%=+a*=5LuB4FRk17ZAQn zgcI7!q1e>@lKLKVHh(t!6l4o+^6Gx7m?jpEZ2oNcS@4km%07MGh7R_3%D)I+%3UR| z?kCjuviFg494fz+eU%5>K(pId-_bkX!o{ZbU+H-@OI~a35`wLw;mYMHHA+sk7I1d^ z+BIFohVPk*CXqGZOrSG=-@zH_QvFxo`%82PmNC&}{<6%OI5~=j>{n#rl|0pV`ys=b W>DCFp+ZcCB`+u|H=v2uo^#1{xm}Ef! literal 0 HcmV?d00001 diff --git a/docs/crate-deps.dot b/docs/crate-deps.dot new file mode 100644 index 0000000000..95622dc25d --- /dev/null +++ b/docs/crate-deps.dot @@ -0,0 +1,93 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +// to update svg, run: +// ```bash +// dot -Tsvg crate-deps.dot > ./crate-deps.svg +// ``` + +digraph G { + + arrow_deps + + analytic_engine -> arrow_deps + analytic_engine -> proto + analytic_engine -> table_engine + analytic_engine -> wal + + catalog -> table_engine + + catalog_impls -> catalog + catalog_impls -> system_catalog + catalog_impls -> table_engine + + cluster -> analytic_engine + cluster -> catalog + cluster -> meta_client_v2 + + interpreters -> catalog + interpreters -> sql + interpreters -> table_engine + interpreters -> udf + interpreters -> query_engine + interpreters -> arrow_deps + + meta_client -> catalog + meta_client -> table_engine + + meta_client_v2 -> catalog + meta_client_v2 -> table_engine + + query_engine -> arrow_deps + query_engine -> sql + query_engine -> table_engine + query_engine -> udf + + server -> analytic_engine + server -> arrow_deps + server -> catalog + server -> interpreters + server -> meta_client + server -> query_engine + server -> sql + server -> system_catalog + server -> table_engine + server -> udf + + sql -> arrow_deps + sql -> catalog + sql -> table_engine + sql -> udf + + system_catalog -> arrow_deps + system_catalog -> catalog + system_catalog -> proto + system_catalog -> table_engine + + table_engine -> arrow_deps + table_engine -> proto + + udf -> arrow_deps + + ceresdb -> analytic_engine + ceresdb -> catalog + ceresdb -> catalog_impls + ceresdb -> query_engine + ceresdb -> server + ceresdb -> table_engine + ceresdb -> udf +} diff --git a/docs/crate-deps.svg b/docs/crate-deps.svg new file mode 100644 index 0000000000..a52863ea12 --- /dev/null +++ b/docs/crate-deps.svg @@ -0,0 +1,433 @@ + + + + + + +G + + + +arrow_deps + +arrow_deps + + + +analytic_engine + +analytic_engine + + + +analytic_engine->arrow_deps + + + + + +proto + +proto + + + +analytic_engine->proto + + + + + +table_engine + +table_engine + + + +analytic_engine->table_engine + + + + + +wal + +wal + + + +analytic_engine->wal + + + + + +table_engine->arrow_deps + + + + + +table_engine->proto + + + + + +catalog + +catalog + + + +catalog->table_engine + + + + + +catalog_impls + +catalog_impls + + + +catalog_impls->table_engine + + + + + +catalog_impls->catalog + + + + + +system_catalog + +system_catalog + + + +catalog_impls->system_catalog + + + + + +system_catalog->arrow_deps + + + + + +system_catalog->proto + + + + + +system_catalog->table_engine + + + + + +system_catalog->catalog + + + + + +cluster + +cluster + + + +cluster->analytic_engine + + + + + +cluster->catalog + + + + + +meta_client_v2 + +meta_client_v2 + + + +cluster->meta_client_v2 + + + + + +meta_client_v2->table_engine + + + + + +meta_client_v2->catalog + + + + + +interpreters + +interpreters + + + +interpreters->arrow_deps + + + + + +interpreters->table_engine + + + + + +interpreters->catalog + + + + + +sql + +sql + + + +interpreters->sql + + + + + +udf + +udf + + + +interpreters->udf + + + + + +query_engine + +query_engine + + + +interpreters->query_engine + + + + + +sql->arrow_deps + + + + + +sql->table_engine + + + + + +sql->catalog + + + + + +sql->udf + + + + + +udf->arrow_deps + + + + + +query_engine->arrow_deps + + + + + +query_engine->table_engine + + + + + +query_engine->sql + + + + + +query_engine->udf + + + + + +meta_client + +meta_client + + + +meta_client->table_engine + + + + + +meta_client->catalog + + + + + +server + +server + + + +server->arrow_deps + + + + + +server->analytic_engine + + + + + +server->table_engine + + + + + +server->catalog + + + + + +server->system_catalog + + + + + 
+server->interpreters + + + + + +server->sql + + + + + +server->udf + + + + + +server->query_engine + + + + + +server->meta_client + + + + + +ceresdb + +ceresdb + + + +ceresdb->analytic_engine + + + + + +ceresdb->table_engine + + + + + +ceresdb->catalog + + + + + +ceresdb->catalog_impls + + + + + +ceresdb->udf + + + + + +ceresdb->query_engine + + + + + +ceresdb->server + + + + + diff --git a/docs/example.toml b/docs/example.toml new file mode 100644 index 0000000000..2e0fdc5064 --- /dev/null +++ b/docs/example.toml @@ -0,0 +1,20 @@ +bind_addr = "0.0.0.0" +http_port = 5440 +grpc_port = 8831 +log_level = "info" +enable_cluster = true + +[analytic] +data_path = "/tmp/ceresdbx" +sst_data_cache_cap = 10000 +sst_meta_cache_cap = 10000 + +[[meta_client.cluster_view.schema_shards]] +schema = 'public' + +[[meta_client.cluster_view.schema_shards.shard_views]] +shard_id = 0 + +[meta_client.cluster_view.schema_shards.shard_views.node] +addr = "127.0.0.1" +port = 8831 diff --git a/etc/license.template b/etc/license.template new file mode 100644 index 0000000000..377ec98bed --- /dev/null +++ b/etc/license.template @@ -0,0 +1 @@ +// Copyright {\d+} CeresDB Project Authors. Licensed under Apache-2.0. \ No newline at end of file diff --git a/grpcio/Cargo.toml b/grpcio/Cargo.toml new file mode 100644 index 0000000000..09a147a0d1 --- /dev/null +++ b/grpcio/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "grpcio" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +# Rename to workaround doctest bug +# See: https://github.com/rust-lang/cargo/issues/6819 + +[target.'cfg(target_os = "macos")'.dependencies] +upstream = { version = "0.9.1", package = "grpcio" } + +[target.'cfg(target_os = "linux")'.dependencies] +upstream = { version = "0.9.1", package = "grpcio", features = ["openssl"] } diff --git a/grpcio/src/lib.rs b/grpcio/src/lib.rs new file mode 100644 index 0000000000..99d9172ad7 --- /dev/null +++ b/grpcio/src/lib.rs @@ -0,0 +1,3 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +pub use upstream::*; diff --git a/interpreters/Cargo.toml b/interpreters/Cargo.toml new file mode 100644 index 0000000000..8d28241eef --- /dev/null +++ b/interpreters/Cargo.toml @@ -0,0 +1,27 @@ +[package] +name = "interpreters" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +# In alphabetical order +async-trait = "0.1.41" +catalog = { path = "../catalog" } +common_types = { path = "../common_types" } +common_util = { path = "../common_util" } +log = "0.4" +snafu = { version ="0.6.10", features = ["backtraces"]} +sql = { path = "../sql" } +table_engine = { path = "../table_engine" } +udf = { path = "../udf" } +query_engine = { path = "../query_engine" } +arrow_deps = { path = "../arrow_deps" } + +[dev-dependencies] +analytic_engine = { path = "../analytic_engine", features = ["test"] } +catalog_impls = { path = "../catalog_impls" } +sql = { path = "../sql", features = ["test"] } +tokio = { version = "1.0", features = ["sync", "time"] } diff --git a/interpreters/src/alter_table.rs b/interpreters/src/alter_table.rs new file mode 100644 index 0000000000..acfce81adc --- /dev/null +++ b/interpreters/src/alter_table.rs @@ -0,0 +1,132 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Interpreter for insert statement + +use async_trait::async_trait; +use common_types::{ + column_schema::{self, ColumnSchema}, + schema::{self, Schema}, +}; +use common_util::define_result; +use snafu::{ensure, ResultExt, Snafu}; +use sql::plan::{AlterTableOperation, AlterTablePlan}; +use table_engine::table::AlterSchemaRequest; + +use crate::interpreter::{self, AlterTable, Interpreter, InterpreterPtr, Output}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to add column to schema, err:{}", source))] + AddColumnSchema { source: common_types::schema::Error }, + + #[snafu(display("Failed to build schema, err:{}", source))] + BuildSchema { source: common_types::schema::Error }, + + #[snafu(display("Failed to alter table schema, err:{}", source))] + AlterSchema { source: table_engine::table::Error }, + + #[snafu(display("Failed to alter table options, err:{}", source))] + AlterOptions { source: table_engine::table::Error }, + + #[snafu(display("Not allow to add a not null column, name:{}", name))] + AddNotNull { name: String }, +} + +define_result!(Error); + +pub struct AlterTableInterpreter { + plan: AlterTablePlan, +} + +impl AlterTableInterpreter { + pub fn create(plan: AlterTablePlan) -> InterpreterPtr { + Box::new(Self { plan }) + } +} + +#[async_trait] +impl Interpreter for AlterTableInterpreter { + async fn execute(self: Box) -> interpreter::Result { + self.execute_alter().await.context(AlterTable) + } +} + +impl AlterTableInterpreter { + async fn execute_alter(self: Box) -> Result { + let AlterTablePlan { table, operations } = self.plan; + + match operations { + AlterTableOperation::AddColumn(columns) => { + let current_schema = table.schema(); + let new_schema = build_new_schema(¤t_schema, columns)?; + + let request = AlterSchemaRequest { + schema: new_schema, + pre_schema_version: current_schema.version(), + }; + + let num_rows = table.alter_schema(request).await.context(AlterSchema)?; + + Ok(Output::AffectedRows(num_rows)) + } + AlterTableOperation::ModifySetting(options) => { + let num_rows = table.alter_options(options).await.context(AlterOptions)?; + Ok(Output::AffectedRows(num_rows)) + } + } + } +} + +fn build_new_schema(current_schema: &Schema, column_schemas: Vec) -> Result { + let current_version = current_schema.version(); + + let mut builder = + schema::Builder::with_capacity(current_schema.num_columns() + column_schemas.len()) + // Increment the schema version. + .version(current_version + 1); + // Add existing columns to builder. + for key_column in current_schema.key_columns() { + builder = builder + .add_key_column(key_column.clone()) + .context(AddColumnSchema)?; + } + for normal_column in current_schema.normal_columns() { + builder = builder + .add_normal_column(normal_column.clone()) + .context(AddColumnSchema)?; + } + + builder = builder + // Enable column id generation of the schema builder. + .auto_increment_column_id(true) + .enable_tsid_primary_key(current_schema.index_of_tsid().is_some()); + + // Add new columns + for mut column_schema in column_schemas { + // Uninit the id of the column schema. + column_schema.id = column_schema::COLUMN_ID_UNINIT; + + validate_add_column(&column_schema)?; + + // Only allow to add normal column. + builder = builder + .add_normal_column(column_schema) + .context(AddColumnSchema)?; + } + + // Build the final schema. 
+ let new_schema = builder.build().context(BuildSchema)?; + + Ok(new_schema) +} + +fn validate_add_column(column_schema: &ColumnSchema) -> Result<()> { + ensure!( + column_schema.is_nullable, + AddNotNull { + name: &column_schema.name + } + ); + + Ok(()) +} diff --git a/interpreters/src/context.rs b/interpreters/src/context.rs new file mode 100644 index 0000000000..2e46f07082 --- /dev/null +++ b/interpreters/src/context.rs @@ -0,0 +1,79 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Interpreter context + +use std::sync::Arc; + +use common_types::request_id::RequestId; +use query_engine::context::{Context as QueryContext, ContextRef as QueryContextRef}; +use snafu::Snafu; + +#[derive(Debug, Snafu)] +pub enum Error {} + +define_result!(Error); + +/// Interpreter context +/// +/// Contains information that all interpreters need +pub struct Context { + request_id: RequestId, + default_catalog: String, + default_schema: String, +} + +impl Context { + pub fn builder(request_id: RequestId) -> Builder { + Builder { + request_id, + default_catalog: String::new(), + default_schema: String::new(), + } + } + + /// Create a new context of query executor + pub fn new_query_context(&self) -> Result { + let ctx = QueryContext::builder(self.request_id) + .default_catalog_and_schema(self.default_catalog.clone(), self.default_schema.clone()) + .build(); + Ok(Arc::new(ctx)) + } + + #[inline] + pub fn default_catalog(&self) -> &str { + &self.default_catalog + } + + #[inline] + pub fn default_schema(&self) -> &str { + &self.default_schema + } + + #[inline] + pub fn request_id(&self) -> RequestId { + self.request_id + } +} + +#[must_use] +pub struct Builder { + request_id: RequestId, + default_catalog: String, + default_schema: String, +} + +impl Builder { + pub fn default_catalog_and_schema(mut self, catalog: String, schema: String) -> Self { + self.default_catalog = catalog; + self.default_schema = schema; + self + } + + pub fn build(self) -> Context { + Context { + request_id: self.request_id, + default_catalog: self.default_catalog, + default_schema: self.default_schema, + } + } +} diff --git a/interpreters/src/create.rs b/interpreters/src/create.rs new file mode 100644 index 0000000000..252b459732 --- /dev/null +++ b/interpreters/src/create.rs @@ -0,0 +1,137 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Interpreter for create statements + +use async_trait::async_trait; +use catalog::{manager::Manager, schema::CreateOptions}; +use snafu::{Backtrace, OptionExt, ResultExt, Snafu}; +use sql::plan::CreateTablePlan; +use table_engine::engine::{CreateTableRequest, TableEngineRef, TableState}; + +use crate::{ + context::Context, + interpreter::{Create, Interpreter, InterpreterPtr, Output, Result as InterpreterResult}, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to find catalog, name:{}, err:{}", name, source))] + FindCatalog { + name: String, + source: catalog::manager::Error, + }, + + #[snafu(display("Catalog not exists, name:{}.\nBacktrace:\n{}", name, backtrace))] + CatalogNotExists { name: String, backtrace: Backtrace }, + + #[snafu(display("Failed to find schema, name:{}, err:{}", name, source))] + FindSchema { + name: String, + source: catalog::Error, + }, + + #[snafu(display("Schema not exists, name:{}.\nBacktrace:\n{}", name, backtrace))] + SchemaNotExists { name: String, backtrace: Backtrace }, + + #[snafu(display("Failed to create table, name:{}, err:{}", table, source))] + SchemaCreateTable { + table: String, + source: catalog::schema::Error, + }, + + #[snafu(display("Failed to allocate table id, err:{}", source))] + AllocTableId { source: catalog::schema::Error }, +} + +define_result!(Error); + +/// Create interpreter +pub struct CreateInterpreter { + ctx: Context, + plan: CreateTablePlan, + catalog_manager: C, + table_engine: TableEngineRef, +} + +impl CreateInterpreter { + pub fn create( + ctx: Context, + plan: CreateTablePlan, + catalog_manager: C, + table_engine: TableEngineRef, + ) -> InterpreterPtr { + Box::new(Self { + ctx, + plan, + catalog_manager, + table_engine, + }) + } +} + +impl CreateInterpreter { + async fn execute_create(self: Box) -> Result { + let default_catalog = self.ctx.default_catalog(); + let catalog = self + .catalog_manager + .catalog_by_name(default_catalog) + .context(FindCatalog { + name: default_catalog, + })? + .context(CatalogNotExists { + name: default_catalog, + })?; + + let default_schema = self.ctx.default_schema(); + let schema = catalog + .schema_by_name(default_schema) + .context(FindSchema { + name: default_schema, + })? + .context(SchemaNotExists { + name: default_schema, + })?; + + let CreateTablePlan { + engine, + table, + table_schema, + if_not_exists, + options, + } = self.plan; + + let table_id = schema.alloc_table_id(&table).context(AllocTableId)?; + let request = CreateTableRequest { + catalog_name: catalog.name().to_string(), + schema_name: schema.name().to_string(), + table_id, + table_name: table.clone(), + table_schema, + partition_info: None, + engine, + options, + state: TableState::Stable, + }; + + let opts = CreateOptions { + table_engine: self.table_engine, + create_if_not_exists: if_not_exists, + }; + + schema + .create_table(request, opts) + .await + .context(SchemaCreateTable { table })?; + + Ok(Output::AffectedRows(1)) + } +} + +// TODO(yingwen): Wrap a method that returns self::Result, simplify some code to +// converting self::Error to super::Error +#[async_trait] +impl Interpreter for CreateInterpreter { + async fn execute(self: Box) -> InterpreterResult { + self.execute_create().await.context(Create) + } +} diff --git a/interpreters/src/describe.rs b/interpreters/src/describe.rs new file mode 100644 index 0000000000..ca6266a872 --- /dev/null +++ b/interpreters/src/describe.rs @@ -0,0 +1,89 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
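+
+//! Interpreter for describe statements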
+ +use std::{convert::TryInto, sync::Arc}; + +use arrow_deps::arrow::{ + array::{BooleanArray, StringArray}, + datatypes::{DataType, Field, Schema}, + record_batch::RecordBatch, +}; +use async_trait::async_trait; +use query_engine::executor::RecordBatchVec; +use snafu::{ResultExt, Snafu}; +use sql::plan::DescribeTablePlan; +use table_engine::table::TableRef; + +use crate::interpreter::{ + Describe, Interpreter, InterpreterPtr, Output, Result as InterpreterResult, +}; + +#[derive(Debug, Snafu)] +pub enum Error {} + +define_result!(Error); + +pub struct DescribeInterpreter { + plan: DescribeTablePlan, +} + +impl DescribeInterpreter { + pub fn create(plan: DescribeTablePlan) -> InterpreterPtr { + Box::new(Self { plan }) + } + + async fn execute_describe(self: Box) -> Result { + let DescribeTablePlan { table } = self.plan; + + Self::table_ref_to_record_batch(table).map(Output::Records) + } + + fn table_ref_to_record_batch(table_ref: TableRef) -> Result { + let table_schema = table_ref.schema(); + let num_columns = table_schema.num_columns(); + let num_key_columns = table_schema.num_key_columns(); + + let mut names = Vec::with_capacity(num_columns); + let mut types = Vec::with_capacity(num_columns); + let mut is_primary_keys = Vec::with_capacity(num_columns); + let mut is_nullables = Vec::with_capacity(num_columns); + let mut is_tags = Vec::with_capacity(num_columns); + for (idx, col) in table_schema.columns().iter().enumerate() { + names.push(col.name.to_string()); + types.push(col.data_type.to_string()); + is_primary_keys.push(idx < num_key_columns); + is_nullables.push(col.is_nullable); + is_tags.push(col.is_tag); + } + + let schema = Schema::new(vec![ + Field::new("name", DataType::Utf8, false), + Field::new("type", DataType::Utf8, false), + Field::new("is_primary", DataType::Boolean, false), + Field::new("is_nullable", DataType::Boolean, false), + Field::new("is_tag", DataType::Boolean, false), + ]); + + let arrow_record_batch = RecordBatch::try_new( + Arc::new(schema), + vec![ + Arc::new(StringArray::from(names)), + Arc::new(StringArray::from(types)), + Arc::new(BooleanArray::from(is_primary_keys)), + Arc::new(BooleanArray::from(is_nullables)), + Arc::new(BooleanArray::from(is_tags)), + ], + ) + .unwrap(); + + let record_batch = arrow_record_batch.try_into().unwrap(); + + Ok(vec![record_batch]) + } +} + +#[async_trait] +impl Interpreter for DescribeInterpreter { + async fn execute(self: Box) -> InterpreterResult { + self.execute_describe().await.context(Describe) + } +} diff --git a/interpreters/src/drop.rs b/interpreters/src/drop.rs new file mode 100644 index 0000000000..7282ae3bc2 --- /dev/null +++ b/interpreters/src/drop.rs @@ -0,0 +1,126 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Interpreter for drop statements + +use async_trait::async_trait; +use catalog::{manager::Manager, schema::DropOptions}; +use snafu::{Backtrace, OptionExt, ResultExt, Snafu}; +use sql::plan::DropTablePlan; +use table_engine::engine::{DropTableRequest, TableEngineRef}; + +use crate::{ + context::Context, + interpreter::{Drop, Interpreter, InterpreterPtr, Output, Result as InterpreterResult}, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to find catalog, name:{}, err:{}", name, source))] + FindCatalog { + name: String, + source: catalog::manager::Error, + }, + + #[snafu(display("Catalog not exists, name:{}.\nBacktrace:\n{}", name, backtrace))] + CatalogNotExists { name: String, backtrace: Backtrace }, + + #[snafu(display("Failed to find schema, name:{}, err:{}", name, source))] + FindSchema { + name: String, + source: catalog::Error, + }, + + #[snafu(display("Schema not exists, name:{}.\nBacktrace:\n{}", name, backtrace))] + SchemaNotExists { name: String, backtrace: Backtrace }, + + #[snafu(display("Failed to drop table in schema, name:{}, err:{}", table, source))] + SchemaDropTable { + table: String, + source: catalog::schema::Error, + }, + + #[snafu(display("Failed to drop table, name:{}, err:{}", table, source))] + DropTable { + table: String, + source: table_engine::engine::Error, + }, +} + +define_result!(Error); + +/// Drop interpreter +pub struct DropInterpreter { + ctx: Context, + plan: DropTablePlan, + catalog_manager: C, + table_engine: TableEngineRef, +} + +impl DropInterpreter { + pub fn create( + ctx: Context, + plan: DropTablePlan, + catalog_manager: C, + table_engine: TableEngineRef, + ) -> InterpreterPtr { + Box::new(Self { + ctx, + plan, + catalog_manager, + table_engine, + }) + } +} + +impl DropInterpreter { + async fn execute_drop(self: Box) -> Result { + let default_catalog = self.ctx.default_catalog(); + let catalog = self + .catalog_manager + .catalog_by_name(default_catalog) + .context(FindCatalog { + name: default_catalog, + })? + .context(CatalogNotExists { + name: default_catalog, + })?; + + let default_schema = self.ctx.default_schema(); + let schema = catalog + .schema_by_name(default_schema) + .context(FindSchema { + name: default_schema, + })? + .context(SchemaNotExists { + name: default_schema, + })?; + + let table = self.plan.table; + let request = DropTableRequest { + catalog_name: catalog.name().to_string(), + schema_name: schema.name().to_string(), + table_name: table.clone(), + engine: self.plan.engine, + }; + + let opts = DropOptions { + table_engine: self.table_engine, + }; + + let dropped = schema + .drop_table(request, opts) + .await + .context(SchemaDropTable { table: &table })?; + + Ok(Output::AffectedRows(if dropped { 1 } else { 0 })) + } +} + +// TODO(yingwen): Wrap a method that returns self::Result, simplify some code to +// converting self::Error to super::Error +#[async_trait] +impl Interpreter for DropInterpreter { + async fn execute(self: Box) -> InterpreterResult { + self.execute_drop().await.context(Drop) + } +} diff --git a/interpreters/src/exists.rs b/interpreters/src/exists.rs new file mode 100644 index 0000000000..f926a700c4 --- /dev/null +++ b/interpreters/src/exists.rs @@ -0,0 +1,62 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
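+
+//! Interpreter for exists statements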
+ +use std::{convert::TryInto, sync::Arc}; + +use arrow_deps::arrow::{ + array::UInt8Array, + datatypes::{DataType, Field, Schema}, + record_batch::RecordBatch, +}; +use async_trait::async_trait; +use query_engine::executor::RecordBatchVec; +use snafu::{ResultExt, Snafu}; +use sql::plan::ExistsTablePlan; + +use crate::interpreter::{ + Exists, Interpreter, InterpreterPtr, Output, Result as InterpreterResult, +}; + +#[derive(Debug, Snafu)] +pub enum Error {} + +define_result!(Error); + +pub struct ExistsInterpreter { + plan: ExistsTablePlan, +} + +impl ExistsInterpreter { + pub fn create(plan: ExistsTablePlan) -> InterpreterPtr { + Box::new(Self { plan }) + } + + async fn execute_exists(self: Box) -> Result { + let ExistsTablePlan { exists } = self.plan; + + exists_table_result(exists).map(Output::Records) + } +} + +fn exists_table_result(exists: bool) -> Result { + let schema = Schema::new(vec![Field::new("result", DataType::UInt8, false)]); + + let arrow_record_batch = RecordBatch::try_new( + Arc::new(schema), + vec![Arc::new(UInt8Array::from_value( + if exists { 1u8 } else { 0u8 }, + 1, + ))], + ) + .unwrap(); + + let record_batch = arrow_record_batch.try_into().unwrap(); + + Ok(vec![record_batch]) +} + +#[async_trait] +impl Interpreter for ExistsInterpreter { + async fn execute(self: Box) -> InterpreterResult { + self.execute_exists().await.context(Exists) + } +} diff --git a/interpreters/src/factory.rs b/interpreters/src/factory.rs new file mode 100644 index 0000000000..26b858723c --- /dev/null +++ b/interpreters/src/factory.rs @@ -0,0 +1,49 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Interpreter factory + +use catalog::manager::Manager as CatalogManager; +use query_engine::executor::Executor; +use sql::plan::Plan; +use table_engine::engine::TableEngineRef; + +use crate::{ + alter_table::AlterTableInterpreter, context::Context, create::CreateInterpreter, + describe::DescribeInterpreter, drop::DropInterpreter, exists::ExistsInterpreter, + insert::InsertInterpreter, interpreter::InterpreterPtr, select::SelectInterpreter, + show_create::ShowCreateInInterpreter, +}; + +/// A factory to create interpreters +pub struct Factory { + query_executor: Q, + catalog_manager: C, + table_engine: TableEngineRef, +} + +impl Factory { + pub fn new(query_executor: Q, catalog_manager: C, table_engine: TableEngineRef) -> Self { + Self { + query_executor, + catalog_manager, + table_engine, + } + } + + pub fn create(self, ctx: Context, plan: Plan) -> InterpreterPtr { + match plan { + Plan::Query(p) => SelectInterpreter::create(ctx, p, self.query_executor), + Plan::Insert(p) => InsertInterpreter::create(ctx, p), + Plan::Create(p) => { + CreateInterpreter::create(ctx, p, self.catalog_manager, self.table_engine) + } + Plan::Drop(p) => { + DropInterpreter::create(ctx, p, self.catalog_manager, self.table_engine) + } + Plan::Describe(p) => DescribeInterpreter::create(p), + Plan::AlterTable(p) => AlterTableInterpreter::create(p), + Plan::ShowCreate(p) => ShowCreateInInterpreter::create(p), + Plan::Exists(p) => ExistsInterpreter::create(p), + } + } +} diff --git a/interpreters/src/insert.rs b/interpreters/src/insert.rs new file mode 100644 index 0000000000..c2a2ddf636 --- /dev/null +++ b/interpreters/src/insert.rs @@ -0,0 +1,138 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Interpreter for insert statement + +use async_trait::async_trait; +use common_types::{column_schema::ColumnId, datum::Datum, hash::hash64}; +use common_util::codec::{compact::MemCompactEncoder, Encoder}; +use snafu::{ResultExt, Snafu}; +use sql::plan::InsertPlan; +use table_engine::table::WriteRequest; + +use crate::{ + context::Context, + interpreter::{Insert, Interpreter, InterpreterPtr, Output, Result}, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to write table, err:{}", source))] + WriteTable { source: table_engine::table::Error }, + + #[snafu(display("Failed to encode tsid, err:{}", source))] + EncodeTsid { + source: common_util::codec::compact::Error, + }, +} + +pub struct InsertInterpreter { + ctx: Context, + plan: InsertPlan, +} + +impl InsertInterpreter { + pub fn create(ctx: Context, plan: InsertPlan) -> InterpreterPtr { + Box::new(Self { ctx, plan }) + } +} + +#[async_trait] +impl Interpreter for InsertInterpreter { + async fn execute(mut self: Box) -> Result { + // Generate tsid if needed. + self.maybe_generate_tsid()?; + let InsertPlan { table, rows } = self.plan; + + // Context is unused now + let _ctx = self.ctx; + + let request = WriteRequest { row_group: rows }; + + let num_rows = table + .write(request) + .await + .context(WriteTable) + .context(Insert)?; + + Ok(Output::AffectedRows(num_rows)) + } +} + +impl InsertInterpreter { + fn maybe_generate_tsid(&mut self) -> Result<()> { + let schema = self.plan.rows.schema(); + let tsid_idx = schema.index_of_tsid(); + + if let Some(idx) = tsid_idx { + // Vec of (`index of tag`, `column id of tag`). + let tag_idx_column_ids: Vec<_> = schema + .columns() + .iter() + .enumerate() + .filter_map(|(i, column)| { + if column.is_tag { + Some((i, column.id)) + } else { + None + } + }) + .collect(); + + let mut hash_bytes = Vec::new(); + for i in 0..self.plan.rows.num_rows() { + let row = self.plan.rows.get_row_mut(i).unwrap(); + + let mut tsid_builder = TsidBuilder::new(&mut hash_bytes); + + for (idx, column_id) in &tag_idx_column_ids { + tsid_builder.maybe_write_datum(*column_id, &row[*idx])?; + } + + let tsid = tsid_builder.finish(); + row[idx] = Datum::UInt64(tsid); + } + } + Ok(()) + } +} + +struct TsidBuilder<'a> { + encoder: MemCompactEncoder, + hash_bytes: &'a mut Vec, +} + +impl<'a> TsidBuilder<'a> { + fn new(hash_bytes: &'a mut Vec) -> Self { + // Clear the bytes buffer. + hash_bytes.clear(); + + Self { + encoder: MemCompactEncoder, + hash_bytes, + } + } + + fn maybe_write_datum(&mut self, column_id: ColumnId, datum: &Datum) -> Result<()> { + // Null datum will be ignored, so tsid remains unchanged after adding a null + // column. + if datum.is_null() { + return Ok(()); + } + + // Write column id first. + self.encoder + .encode(self.hash_bytes, &Datum::UInt64(u64::from(column_id))) + .context(EncodeTsid) + .context(Insert)?; + // Write datum. + self.encoder + .encode(self.hash_bytes, datum) + .context(EncodeTsid) + .context(Insert)?; + Ok(()) + } + + fn finish(self) -> u64 { + hash64(self.hash_bytes) + } +} diff --git a/interpreters/src/interpreter.rs b/interpreters/src/interpreter.rs new file mode 100644 index 0000000000..4591eb5df5 --- /dev/null +++ b/interpreters/src/interpreter.rs @@ -0,0 +1,56 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Interpreter trait + +use async_trait::async_trait; +use query_engine::executor::RecordBatchVec; +use snafu::Snafu; + +// Make the variant closer to actual error code like invalid arguments. 
+#[derive(Debug, Snafu)] +#[snafu(visibility(pub(crate)))] +pub enum Error { + #[snafu(display("Failed to execute select, err:{}", source))] + Select { source: crate::select::Error }, + + #[snafu(display("Failed to execute create table, err:{}", source))] + Create { source: crate::create::Error }, + + #[snafu(display("Failed to execute drop table, err:{}", source))] + Drop { source: crate::drop::Error }, + + #[snafu(display("Failed to execute insert, err:{}", source))] + Insert { source: crate::insert::Error }, + + #[snafu(display("Failed to execute describe, err:{}", source))] + Describe { source: crate::describe::Error }, + + #[snafu(display("Failed to execute alter table, err:{}", source))] + AlterTable { source: crate::alter_table::Error }, + + #[snafu(display("Failed to show create table, err:{}", source))] + ShowCreate { source: crate::show_create::Error }, + + #[snafu(display("Failed to execute exists, err:{}", source))] + Exists { source: crate::exists::Error }, +} + +define_result!(Error); + +// TODO(yingwen): Maybe add a stream variant for streaming result +/// The interpreter output +pub enum Output { + /// Affected rows number + AffectedRows(usize), + /// A vec of RecordBatch + Records(RecordBatchVec), +} + +/// Interpreter executes the plan it holds +#[async_trait] +pub trait Interpreter { + async fn execute(self: Box) -> Result; +} + +/// A pointer to Interpreter +pub type InterpreterPtr = Box; diff --git a/interpreters/src/lib.rs b/interpreters/src/lib.rs new file mode 100644 index 0000000000..6f3b888e6e --- /dev/null +++ b/interpreters/src/lib.rs @@ -0,0 +1,23 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Interpreters of query/insert/update/delete commands +//! +//! Inspired by fuse-query: and ClickHouse + +#[macro_use] +extern crate common_util; + +pub mod alter_table; +pub mod context; +pub mod create; +pub mod describe; +pub mod drop; +pub mod exists; +pub mod factory; +pub mod insert; +pub mod interpreter; +pub mod select; +pub mod show_create; + +#[cfg(test)] +mod tests; diff --git a/interpreters/src/select.rs b/interpreters/src/select.rs new file mode 100644 index 0000000000..97a0f84a57 --- /dev/null +++ b/interpreters/src/select.rs @@ -0,0 +1,75 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Interpreter for select statement + +use async_trait::async_trait; +use log::debug; +use query_engine::executor::{Executor, Query}; +use snafu::{ResultExt, Snafu}; +use sql::plan::QueryPlan; + +use crate::{ + context::Context, + interpreter::{Interpreter, InterpreterPtr, Output, Result as InterpreterResult, Select}, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to create query context, err:{}", source))] + CreateQueryContext { source: crate::context::Error }, + + #[snafu(display("Failed to execute logical plan, err:{}", source))] + ExecutePlan { + source: query_engine::executor::Error, + }, +} + +define_result!(Error); + +/// Select interpreter +pub struct SelectInterpreter { + ctx: Context, + plan: QueryPlan, + executor: T, +} + +impl SelectInterpreter { + pub fn create(ctx: Context, plan: QueryPlan, executor: T) -> InterpreterPtr { + Box::new(Self { + ctx, + plan, + executor, + }) + } +} + +#[async_trait] +impl Interpreter for SelectInterpreter { + async fn execute(self: Box) -> InterpreterResult { + let request_id = self.ctx.request_id(); + debug!( + "Interpreter execute select begin, request_id:{}, plan:{:?}", + request_id, self.plan + ); + + let query_ctx = self + .ctx + .new_query_context() + .context(CreateQueryContext) + .context(Select)?; + let query = Query::new(self.plan); + let record_batches = self + .executor + .execute_logical_plan(query_ctx, query) + .await + .context(ExecutePlan) + .context(Select)?; + + debug!( + "Interpreter execute select finish, request_id:{}", + request_id + ); + + Ok(Output::Records(record_batches)) + } +} diff --git a/interpreters/src/show_create.rs b/interpreters/src/show_create.rs new file mode 100644 index 0000000000..38d1747ab8 --- /dev/null +++ b/interpreters/src/show_create.rs @@ -0,0 +1,136 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
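With select in place, every plan variant dispatched by `Factory::create` resolves to an interpreter that reports through the same `Output` enum, so the calling side needs exactly one match regardless of the statement kind; show_create.rs below follows the same `Records` path. A hedged sketch of such a caller, assuming only the `interpreters` API introduced above (the `run` helper and the rendered strings are placeholders, not part of this commit):

use interpreters::interpreter::{Interpreter, InterpreterPtr, Output, Result};

// `interpreter` would typically come from `Factory::create(ctx, plan)` as defined above.
async fn run(interpreter: InterpreterPtr) -> Result<String> {
    match interpreter.execute().await? {
        // DDL/DML interpreters (create/drop/insert/alter) report an affected-row count.
        Output::AffectedRows(n) => Ok(format!("OK, {} rows affected", n)),
        // Query-style interpreters (select/describe/exists/show create) return record batches.
        Output::Records(batches) => Ok(format!("{} record batch(es)", batches.len())),
    }
}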
+ +use std::{collections::HashMap, convert::TryInto, sync::Arc}; + +use arrow_deps::arrow::{ + array::StringArray, + datatypes::{DataType, Field, Schema}, + record_batch::RecordBatch, +}; +use async_trait::async_trait; +use query_engine::executor::RecordBatchVec; +use snafu::{ensure, Backtrace, ResultExt, Snafu}; +use sql::{ast::ShowCreateObject, plan::ShowCreatePlan}; +use table_engine::table::TableRef; + +use crate::interpreter::{ + Interpreter, InterpreterPtr, Output, Result as InterpreterResult, ShowCreate, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "Unsupported show create type, type: {:?}, err:{}", + obj_type, + backtrace + ))] + UnsupportedType { + obj_type: ShowCreateObject, + backtrace: Backtrace, + }, +} + +define_result!(Error); + +pub struct ShowCreateInInterpreter { + plan: ShowCreatePlan, +} + +impl ShowCreateInInterpreter { + pub fn create(plan: ShowCreatePlan) -> InterpreterPtr { + Box::new(Self { plan }) + } + + async fn execute_show_create(self: Box) -> Result { + let ShowCreatePlan { table, obj_type } = self.plan; + + ensure!( + obj_type == ShowCreateObject::Table, + UnsupportedType { obj_type } + ); + + Self::table_ref_to_record_batch(table).map(Output::Records) + } + + fn table_ref_to_record_batch(table_ref: TableRef) -> Result { + let tables = vec![table_ref.name().to_string()]; + let sqls = vec![Self::render_table_sql(table_ref)]; + + let schema = Schema::new(vec![ + Field::new("Table", DataType::Utf8, false), + Field::new("Create Table", DataType::Utf8, false), + ]); + + let arrow_record_batch = RecordBatch::try_new( + Arc::new(schema), + vec![ + Arc::new(StringArray::from(tables)), + Arc::new(StringArray::from(sqls)), + ], + ) + .unwrap(); + + let record_batch = arrow_record_batch.try_into().unwrap(); + + Ok(vec![record_batch]) + } + + fn render_table_sql(table_ref: TableRef) -> String { + //TODO(boyan) pretty output + format!( + "CREATE TABLE `{}` ({}) ENGINE={}{}", + table_ref.name(), + Self::render_columns_and_constrains(&table_ref), + table_ref.engine_type(), + Self::render_options(table_ref.options()) + ) + } + + fn render_columns_and_constrains(table_ref: &TableRef) -> String { + let table_schema = table_ref.schema(); + let key_columns = table_schema.key_columns(); + let timestamp_key = table_schema.timestamp_name(); + + let mut res = String::new(); + for col in table_schema.columns() { + res += format!("`{}` {}", col.name, col.data_type).as_str(); + if col.is_tag { + res += " TAG"; + } + if !col.is_nullable { + res += " NOT NULL"; + } + + if !col.comment.is_empty() { + res += format!(" COMMENT '{}'", col.comment).as_str(); + } + res += ", "; + } + let keys: Vec = key_columns.iter().map(|col| col.name.to_string()).collect(); + res += format!("PRIMARY KEY({}), ", keys.join(",")).as_str(); + res += format!("TIMESTAMP KEY({})", timestamp_key).as_str(); + + res + } + + fn render_options(opts: HashMap) -> String { + if !opts.is_empty() { + let mut v: Vec = opts + .into_iter() + .map(|(k, v)| format!("{}='{}'", k, v)) + .collect(); + // sorted by option name + v.sort(); + format!(" WITH({})", v.join(", ")) + } else { + "".to_string() + } + } +} + +#[async_trait] +impl Interpreter for ShowCreateInInterpreter { + async fn execute(self: Box) -> InterpreterResult { + self.execute_show_create().await.context(ShowCreate) + } +} diff --git a/interpreters/src/tests.rs b/interpreters/src/tests.rs new file mode 100644 index 0000000000..4b05a239f8 --- /dev/null +++ b/interpreters/src/tests.rs @@ -0,0 +1,236 @@ +// Copyright 2022 CeresDB Project 
Authors. Licensed under Apache-2.0. + +use std::sync::Arc; + +use analytic_engine::tests::util::TestEnv; +use catalog::consts::{DEFAULT_CATALOG, DEFAULT_SCHEMA}; +use catalog_impls::table_based::TableBasedManager; +use common_types::request_id::RequestId; +use query_engine::executor::ExecutorImpl; +use sql::{ + parser::Parser, plan::Plan, planner::Planner, provider::MetaProvider, tests::MockMetaProvider, +}; +use table_engine::engine::TableEngine; + +use crate::{ + context::Context, + factory::Factory, + interpreter::{Output, Result}, +}; + +async fn build_catalog_manager(analytic: E) -> TableBasedManager +where + E: TableEngine + Clone + Send + Sync + 'static, +{ + // Create catalog manager, use analytic table as backend + TableBasedManager::new(&analytic.clone(), Arc::new(analytic)) + .await + .unwrap_or_else(|e| { + panic!("Failed to create catalog manager, err:{}", e); + }) +} + +fn sql_to_plan(meta_provider: &M, sql: &str) -> Plan { + let planner = Planner::new(meta_provider, RequestId::next_id(), 1); + let mut statements = Parser::parse_sql(sql).unwrap(); + assert_eq!(statements.len(), 1); + planner.statement_to_plan(statements.remove(0)).unwrap() +} + +async fn build_factory(env: &Env) -> Factory +where + E: TableEngine + Clone + Send + Sync + 'static, + M: MetaProvider, +{ + let catalog_manager = build_catalog_manager(env.engine()).await; + Factory::new(ExecutorImpl::new(), catalog_manager, Arc::new(env.engine())) +} + +async fn sql_to_output(env: &Env, sql: &str) -> Result +where + E: TableEngine + Clone + Send + Sync + 'static, + M: MetaProvider, +{ + let plan = sql_to_plan(&env.meta_provider, sql); + + let ctx = Context::builder(RequestId::next_id()) + .default_catalog_and_schema(DEFAULT_CATALOG.to_string(), DEFAULT_SCHEMA.to_string()) + .build(); + + let factory = build_factory(env).await; + let interpreter = factory.create(ctx, plan); + interpreter.execute().await +} + +async fn test_create_table(env: &Env) +where + E: TableEngine + Clone + Send + Sync + 'static, + M: MetaProvider, +{ + let sql="CREATE TABLE IF NOT EXISTS test_table(c1 string tag not null,ts timestamp not null, c3 string, timestamp key(ts),primary key(c1, ts)) \ + ENGINE=Analytic WITH (ttl='70d',update_mode='overwrite',arena_block_size='1KB')"; + + let output = sql_to_output(env, sql).await.unwrap(); + if let Output::AffectedRows(v) = output { + assert_eq!(v, 1); + } else { + panic!(); + } +} + +async fn test_desc_table(env: &Env) +where + E: TableEngine + Clone + Send + Sync + 'static, + M: MetaProvider, +{ + let sql = "desc table test_table"; + let output = sql_to_output(env, sql).await.unwrap(); + if let Output::Records(v) = output { + assert_eq!(v.len(), 1); + } else { + panic!(); + } +} + +async fn test_exists_table(env: &Env) +where + E: TableEngine + Clone + Send + Sync + 'static, + M: MetaProvider, +{ + let sql = "exists table test_table"; + let output = sql_to_output(env, sql).await.unwrap(); + if let Output::Records(v) = output { + assert_eq!(v.len(), 1); + } else { + panic!(); + } +} + +async fn test_insert_table(env: &Env) +where + E: TableEngine + Clone + Send + Sync + 'static, + M: MetaProvider, +{ + let sql = "INSERT INTO test_table(key1, key2, field1,field2) VALUES('tagk', 1638428434000,100, 'hello3'),('tagk2', 1638428434000,100, 'hello3');"; + let output = sql_to_output(env, sql).await.unwrap(); + if let Output::AffectedRows(v) = output { + assert_eq!(v, 2); + } else { + panic!(); + } +} + +async fn test_select_table(env: &Env) +where + E: TableEngine + Clone + Send + Sync + 'static, + M: 
MetaProvider, +{ + let sql = "select * from test_table"; + let output = sql_to_output(env, sql).await.unwrap(); + if let Output::Records(v) = output { + assert_eq!(v.len(), 1); + assert_eq!(v[0].num_rows(), 2); + } else { + panic!(); + } + + let sql = "select count(*) from test_table"; + let output = sql_to_output(env, sql).await.unwrap(); + if let Output::Records(v) = output { + assert_eq!(v.len(), 1); + assert_eq!(v[0].num_rows(), 1); + } else { + panic!(); + } +} + +async fn test_show_create_table(env: &Env) +where + E: TableEngine + Clone + Send + Sync + 'static, + M: MetaProvider, +{ + let sql = "show create table test_table"; + let output = sql_to_output(env, sql).await.unwrap(); + if let Output::Records(v) = output { + assert_eq!(v.len(), 1); + assert_eq!(v[0].num_rows(), 1); + } else { + panic!(); + } +} + +async fn test_alter_table(env: &Env) +where + E: TableEngine + Clone + Send + Sync + 'static, + M: MetaProvider, +{ + let sql = "alter table test_table add column add_col string"; + let output = sql_to_output(env, sql).await.unwrap(); + if let Output::AffectedRows(v) = output { + assert_eq!(v, 1); + } else { + panic!(); + } + + let sql = "alter table test_table modify SETTING ttl='9d'"; + let output = sql_to_output(env, sql).await.unwrap(); + if let Output::AffectedRows(v) = output { + assert_eq!(v, 1); + } else { + panic!(); + } +} + +async fn test_drop_table(env: &Env) +where + E: TableEngine + Clone + Send + Sync + 'static, + M: MetaProvider, +{ + let sql = "drop table test_table"; + let output = sql_to_output(env, sql).await.unwrap(); + if let Output::AffectedRows(v) = output { + assert_eq!(v, 1); + } else { + panic!(); + } +} + +struct Env +where + E: TableEngine + Clone + Send + Sync + 'static, + M: MetaProvider, +{ + pub engine: E, + pub meta_provider: M, +} + +impl Env +where + E: TableEngine + Clone + Send + Sync + 'static, + M: MetaProvider, +{ + fn engine(&self) -> E { + self.engine.clone() + } +} + +#[tokio::test] +async fn test_interpreters() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + test_ctx.open().await; + let mock = MockMetaProvider::default(); + let env = Env { + engine: test_ctx.engine(), + meta_provider: mock, + }; + + test_create_table(&env).await; + test_desc_table(&env).await; + test_exists_table(&env).await; + test_insert_table(&env).await; + test_select_table(&env).await; + test_show_create_table(&env).await; + test_alter_table(&env).await; + test_drop_table(&env).await; +} diff --git a/meta_client/Cargo.toml b/meta_client/Cargo.toml new file mode 100644 index 0000000000..bd544c5d7f --- /dev/null +++ b/meta_client/Cargo.toml @@ -0,0 +1,26 @@ +[package] +name = "meta_client" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +async-trait = "0.1.41" +catalog = { path = "../catalog" } +common_types = { path = "../common_types" } +table_engine = { path = "../table_engine" } +common_util = { path = "../common_util" } +ceresdbproto = { git = "https://github.com/CeresDB/ceresdbproto.git"} +futures = "0.3" +grpcio = { path = "../grpcio" } +log = "0.4" +rand = "0.7" +reqwest = "0.11" +serde = "1.0" +serde_derive = "1.0.81" +serde_json = "1.0.60" +snafu = { version ="0.6.10", features = ["backtraces"]} +tokio = { version = "1.0", features = ["full"] } +url = "2.2" diff --git a/meta_client/src/lib.rs b/meta_client/src/lib.rs new file mode 100644 index 0000000000..34563a1e71 --- /dev/null +++ 
b/meta_client/src/lib.rs @@ -0,0 +1,705 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Client to communicate with meta + +use std::{ + collections::HashMap, + convert::TryFrom, + sync::{Arc, RwLock}, + time::Duration, +}; + +use async_trait::async_trait; +use ceresdbproto::{ + meta::{CommonNodeInfo, NodeType}, + metagrpc::{ + ClusterViewResponse, FetchClusterViewRequest, NameSpace, RegisterNodeRequest, + RegisterNodeResponse, + }, + metagrpc_grpc::CeresmetaRpcServiceClient, +}; +use common_types::{bytes::Bytes, schema::TIMESTAMP_COLUMN}; +use common_util::{config::ReadableDuration, define_result, runtime::Runtime}; +use futures::TryStreamExt; +use grpcio::{ChannelBuilder, Environment}; +use load_balance::{LoadBalancer, RandomLoadBalancer}; +use log::{error, info}; +use reqwest::{self, StatusCode, Url}; +use serde::de::DeserializeOwned; +use serde_derive::Deserialize; +use snafu::{Backtrace, OptionExt, ResultExt, Snafu}; +use table_engine::ANALYTIC_ENGINE_TYPE; +use tokio::time; + +use crate::static_client::StaticMetaClient; + +mod load_balance; +mod static_client; + +#[derive(Debug, Snafu)] +#[snafu(visibility = "pub")] +pub enum Error { + #[snafu(display("Build http client failed, err:{}.\nBacktrace:\n{}", source, backtrace))] + BuildHttpClient { + source: reqwest::Error, + backtrace: Backtrace, + }, + + #[snafu(display( + "Invalid meta addr, addr:{}, err:{}.\nBacktrace:\n{}", + meta_addr, + source, + backtrace + ))] + InvalidMetaAddr { + meta_addr: String, + source: url::ParseError, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to join url, input:{}, err:{}.\nBacktrace:\n{}", + input, + source, + backtrace + ))] + JoinUrl { + input: String, + source: url::ParseError, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to send http request, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + SendHttp { + source: reqwest::Error, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to parse http text, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + ParseText { + source: reqwest::Error, + backtrace: Backtrace, + }, + + #[snafu(display( + "Bad http status, status:{}, url:{}, text:{:?}.\nBacktrace:\n{}", + status, + url, + text, + backtrace + ))] + BadHttpStatus { + status: StatusCode, + url: String, + text: Bytes, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to parse json text, text:{:?}, err:{}.\nBacktrace:\n{}", + text, + source, + backtrace + ))] + ParseJson { + text: Bytes, + source: serde_json::Error, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to fetch cluster view, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + FetchClusterViewError { + source: grpcio::Error, + backtrace: Backtrace, + }, + + #[snafu(display( + "Encountered register node, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + RegisterNodeError { + source: grpcio::Error, + backtrace: Backtrace, + }, + + #[snafu(display("Encountered build rpc client, err:{}", source))] + BuildRpcClientError { source: load_balance::Error }, + + #[snafu(display( + "Invalid node addr of cluster view, node:{}.\nBacktrace:\n{}", + node, + backtrace + ))] + InvalidNodeAddr { node: String, backtrace: Backtrace }, + + #[snafu(display( + "Invalid node port of cluster view, node:{}, err:{}.\nBacktrace:\n{}", + node, + source, + backtrace + ))] + InvalidNodePort { + node: String, + source: std::num::ParseIntError, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to create schema:{}, catalog:{}, err:{}", + schema, + catalog, 
+ source + ))] + FailOnChangeView { + schema: String, + catalog: String, + source: Box, + }, + + #[snafu(display("Failed to get catalog:{}, err:{}", catalog, source))] + FailGetCatalog { + catalog: String, + source: Box, + }, +} + +define_result!(Error); + +type ShardViewMap = HashMap; + +#[async_trait] +pub trait MetaWatcher { + async fn on_change(&self, view: ClusterViewRef) -> Result<()>; +} + +pub type MetaWatcherPtr = Box; + +/// Meta client abstraction +#[async_trait] +pub trait MetaClient { + /// Start the meta client + async fn start(&self) -> Result<()>; + + /// Get current cluster view. + /// + /// The cluster view is updated by background workers periodically + fn get_cluster_view(&self) -> ClusterViewRef; +} + +// TODO(yingwen): Now meta use i32 as shard id, maybe switch to unsigned number +pub type ShardId = i32; + +#[derive(Debug, Clone, Deserialize)] +pub struct Node { + pub addr: String, + pub port: u32, +} + +#[derive(Debug, Clone, Deserialize)] +pub struct ShardView { + pub shard_id: ShardId, + pub node: Node, +} + +fn default_engine_type() -> String { + ANALYTIC_ENGINE_TYPE.to_string() +} + +#[derive(Debug, Clone, Deserialize)] +#[serde(default)] +pub struct SchemaConfig { + pub auto_create_tables: bool, + pub default_engine_type: String, + pub default_timestamp_column_name: String, +} + +impl Default for SchemaConfig { + fn default() -> Self { + Self { + auto_create_tables: false, + default_engine_type: default_engine_type(), + default_timestamp_column_name: default_timestamp_column_name(), + } + } +} + +impl From for SchemaConfig { + fn from(view: SchemaShardView) -> Self { + Self { + auto_create_tables: view.auto_create_tables, + default_engine_type: view.default_engine_type, + default_timestamp_column_name: view.default_timestamp_column_name, + } + } +} + +#[derive(Debug, Default, Clone, Deserialize)] +pub struct ClusterView { + pub schema_shards: HashMap, + pub schema_configs: HashMap, +} + +impl TryFrom for ClusterView { + type Error = Error; + + fn try_from(result: ClusterViewResponse) -> Result { + let mut schema_shards = HashMap::with_capacity(result.schema_shards.len()); + let mut schema_configs = HashMap::with_capacity(result.schema_shards.len()); + + for (schema, shard_view) in result.schema_shards { + let mut schema_view = HashMap::with_capacity(shard_view.shard_nodes.len()); + for (shard_id, shard_node) in shard_view.shard_nodes { + let mut addr_port = shard_node.split(':'); + let addr = addr_port + .next() + .context(InvalidNodeAddr { node: &shard_node })?; + let port = addr_port + .next() + .context(InvalidNodeAddr { node: &shard_node })? + .parse() + .context(InvalidNodePort { node: &shard_node })?; + let node = Node { + addr: addr.to_string(), + port, + }; + schema_view.insert(shard_id, ShardView { shard_id, node }); + } + schema_shards.insert(schema.clone(), schema_view); + // TODO(boyan) support config in ClusterViewResponse + schema_configs.insert(schema, SchemaConfig::default()); + } + + Ok(ClusterView { + schema_shards, + schema_configs, + }) + } +} + +pub type ClusterViewRef = Arc; + +#[derive(Debug, Deserialize)] +#[serde(default)] +pub struct MetaClientConfig { + pub cluster: String, + pub meta_addr: String, + pub meta_version: String, + /// Local ip address of this node, used as endpoint ip in meta. + pub node: String, + /// Grpc port of this node, also used as endpoint port in meta. 
+ pub port: u16, + pub meta_members_url: String, + pub lease: ReadableDuration, + pub timeout: ReadableDuration, + pub cq_count: usize, + /// + /// - If `enable_meta` is true, the client will fetch cluster view from + /// remote meta ndoe. + /// - If `enable_meta` is false, the client will try to read cluster view + /// from `cluster_view`. + pub enable_meta: bool, + /// The static cluster view used by static meta client. + pub cluster_view: ClusterViewConfig, +} + +impl Default for MetaClientConfig { + fn default() -> Self { + Self { + cluster: String::new(), + meta_addr: "http://127.0.0.1:8080".to_string(), + meta_version: String::from("v1"), + node: String::new(), + port: 8831, + meta_members_url: "ceresmeta/members".to_string(), + lease: ReadableDuration::secs(10), + timeout: ReadableDuration::secs(5), + cq_count: 8, + enable_meta: false, + cluster_view: ClusterViewConfig { + schema_shards: Vec::new(), + }, + } + } +} + +impl From<&MetaClientConfig> for RegisterNodeRequest { + fn from(meta_config: &MetaClientConfig) -> Self { + let mut req = RegisterNodeRequest::new(); + req.set_node_type(NodeType::Data); + req.set_ns(NameSpace { + cluster: meta_config.cluster.to_string(), + version: meta_config.meta_version.to_string(), + ..Default::default() + }); + req.set_node_info(CommonNodeInfo { + node: format!("{}:{}", meta_config.node, meta_config.port), + lease: meta_config.lease.as_secs() as i32, + ..Default::default() + }); + req + } +} + +#[derive(Debug, Deserialize, Clone)] +#[serde(default)] +pub struct SchemaShardView { + schema: String, + auto_create_tables: bool, + pub default_engine_type: String, + default_timestamp_column_name: String, + shard_views: Vec, +} + +impl Default for SchemaShardView { + fn default() -> Self { + Self { + schema: "".to_string(), + auto_create_tables: false, + default_engine_type: default_engine_type(), + default_timestamp_column_name: default_timestamp_column_name(), + shard_views: Vec::default(), + } + } +} + +#[inline] +fn default_timestamp_column_name() -> String { + TIMESTAMP_COLUMN.to_string() +} + +#[derive(Debug, Deserialize, Clone)] +pub struct ClusterViewConfig { + schema_shards: Vec, +} + +impl ClusterViewConfig { + pub(crate) fn to_cluster_view(&self) -> ClusterView { + let mut schema_configs = HashMap::with_capacity(self.schema_shards.len()); + let mut schema_shards = HashMap::with_capacity(self.schema_shards.len()); + + for schema_shard_view in self.schema_shards.clone() { + let schema = schema_shard_view.schema.clone(); + schema_shards.insert( + schema.clone(), + schema_shard_view + .shard_views + .iter() + .map(|shard| (shard.shard_id, shard.clone())) + .collect(), + ); + schema_configs.insert(schema, SchemaConfig::from(schema_shard_view)); + } + ClusterView { + schema_shards, + schema_configs, + } + } +} + +struct MetaClientImplInner { + meta_grpc_address: RwLock>, + http_client: reqwest::Client, + balancer: Box, + meta_config: MetaClientConfig, + cluster_view: RwLock, + members_url: Url, + watcher: Option, +} + +impl MetaClientImplInner { + fn new(meta_config: MetaClientConfig, watcher: Option) -> Result { + let http_client = reqwest::Client::builder() + .timeout(Duration::from(meta_config.timeout)) + .build() + .context(BuildHttpClient)?; + + let members_url = Url::parse(&meta_config.meta_addr) + .context(InvalidMetaAddr { + meta_addr: &meta_config.meta_addr, + })? + .join(format!("{}/", meta_config.meta_version).as_str()) + .context(JoinUrl { + input: &meta_config.meta_version, + })? 
+ .join(&meta_config.meta_members_url) + .context(JoinUrl { + input: &meta_config.meta_members_url, + })?; + + Ok(Self { + meta_grpc_address: RwLock::new(Vec::new()), + http_client, + balancer: Box::new(RandomLoadBalancer), + meta_config, + cluster_view: RwLock::new(Arc::new(ClusterView::default())), + members_url, + watcher, + }) + } + + async fn fetch_cluster_view(&self) -> Result<()> { + let client = self.build_rpc_client()?; + let mut req = FetchClusterViewRequest::new(); + req.set_ns(NameSpace { + cluster: self.meta_config.cluster.to_string(), + version: self.meta_config.meta_version.to_string(), + ..Default::default() + }); + let mut receiver = client + .fetch_cluster_view(&req) + .context(FetchClusterViewError)?; + + while let Some(result) = receiver.try_next().await.context(FetchClusterViewError)? { + self.update_cluster_view_by_result(result).await?; + + info!( + "Fetch cluster view from meta, cluster:{}, view:{:#?}", + self.meta_config.cluster, + *self.cluster_view.read().unwrap(), + ); + } + + Ok(()) + } + + async fn update_cluster_view_by_result(&self, view_result: ClusterViewResponse) -> Result<()> { + let view = Arc::new(ClusterView::try_from(view_result)?); + + { + let mut cluster_view = self.cluster_view.write().unwrap(); + *cluster_view = view.clone(); + } + + if let Some(w) = &self.watcher { + w.on_change(view).await?; + } + + Ok(()) + } + + fn meta_addresses(&self) -> Vec { + self.meta_grpc_address.read().unwrap().clone() + } + + fn build_rpc_client(&self) -> Result { + let meta_addresses = self.meta_addresses(); + let meta_rpc_addr = self + .balancer + .select(&meta_addresses) + .context(BuildRpcClientError)?; + + let cb = ChannelBuilder::new(Arc::new(Environment::new(self.meta_config.cq_count))); + Ok(CeresmetaRpcServiceClient::new(cb.connect(meta_rpc_addr))) + } + + async fn register(&self, client: &CeresmetaRpcServiceClient) -> Result { + let req = RegisterNodeRequest::from(&self.meta_config); + client.register_node(&req).context(RegisterNodeError) + } + + async fn get_bytes_from_url(&self, url: Url) -> Result { + let resp = self + .http_client + .get(self.members_url.clone()) + .send() + .await + .context(SendHttp)?; + let status = resp.status(); + let text = resp.bytes().await.context(ParseText)?; + + if status.is_success() { + info!( + "Get bytes from url success, status:{}, url:{}, bytes:{:?}", + status, url, text + ); + + Ok(text) + } else { + error!( + "Failed to get bytes from url, status:{}, url:{}, bytes:{:?}", + status, url, text + ); + + BadHttpStatus { status, url, text }.fail() + } + } + + async fn get_from_url(&self, url: Url) -> Result { + let full = self.get_bytes_from_url(url).await?; + + serde_json::from_slice(&full).context(ParseJson { text: full }) + } + + async fn pull_meta_grpc_address(&self) -> Result<()> { + let addresses: Vec = self.get_from_url(self.members_url.clone()).await?; + + *self.meta_grpc_address.write().unwrap() = addresses; + + Ok(()) + } + + // TODO(yingwen): Store the value in field + fn error_wait_lease(&self) -> Duration { + Duration::from_secs(self.meta_config.lease.as_secs() / 2) + } + + // Register node every 2/3 lease + fn register_interval(&self) -> Duration { + Duration::from_secs(self.meta_config.lease.as_secs() * 2 / 3) + } + + fn fetch_view_interval(&self) -> Duration { + Duration::from_secs(self.meta_config.lease.as_secs() * 3) + } + + async fn start_fetch_cluster_view(&self) { + loop { + match self.fetch_cluster_view().await { + Ok(()) => { + info!( + "Fetch cluster view finished, cluster:{}", + 
self.meta_config.cluster + ); + } + Err(e) => { + error!( + "Failed to fetch cluster view from meta, cluster:{}, error:{}", + self.meta_config.cluster, e + ); + } + } + + time::sleep(self.error_wait_lease()).await; + } + } + + async fn register_loop(&self) -> Result<()> { + let mut interval = time::interval(self.register_interval()); + let rpc_client = self.build_rpc_client()?; + + loop { + let resp = self.register(&rpc_client).await?; + info!( + "Register node successfully, cluster:{}, response:{:#?}", + self.meta_config.cluster, resp + ); + + interval.tick().await; + } + } + + async fn start_register(&self) { + loop { + if let Err(e) = self.register_loop().await { + error!( + "Failed to register node to meta, cluster:{}, error:{}", + self.meta_config.cluster, e + ); + + time::sleep(self.error_wait_lease()).await; + } + } + } + + async fn start_refresh_meta_addresses(&self) { + let mut interval = time::interval(self.fetch_view_interval()); + + loop { + match self.pull_meta_grpc_address().await { + Ok(()) => { + interval.tick().await; + } + Err(e) => { + error!( + "Failed to refresh meta addresses from meta, url:{}, error:{}", + self.members_url, e + ); + + time::sleep(self.error_wait_lease()).await + } + } + } + } +} + +/// Default meta client impl, will interact with a remote meta node. +pub struct MetaClientImpl { + inner: Arc, + runtime: Arc, +} + +impl MetaClientImpl { + pub fn new( + config: MetaClientConfig, + runtime: Arc, + watcher: Option, + ) -> Result { + Ok(Self { + inner: Arc::new(MetaClientImplInner::new(config, watcher)?), + runtime, + }) + } +} + +#[async_trait] +impl MetaClient for MetaClientImpl { + async fn start(&self) -> Result<()> { + info!( + "Meta client is starting, config:{:?}", + self.inner.meta_config + ); + + self.inner.pull_meta_grpc_address().await?; + + let inner = self.inner.clone(); + self.runtime.spawn(async move { + inner.start_refresh_meta_addresses().await; + }); + + let inner = self.inner.clone(); + self.runtime.spawn(async move { + inner.start_register().await; + }); + + let inner = self.inner.clone(); + self.runtime.spawn(async move { + inner.start_fetch_cluster_view().await; + }); + + info!("Meta client has started"); + + Ok(()) + } + + fn get_cluster_view(&self) -> ClusterViewRef { + self.inner.cluster_view.read().unwrap().clone() + } +} + +/// Create a meta client with given `config`. +pub fn build_meta_client( + config: MetaClientConfig, + runtime: Arc, + watcher: Option, +) -> Result> { + if config.enable_meta { + let meta_client = MetaClientImpl::new(config, runtime, watcher)?; + Ok(Arc::new(meta_client)) + } else { + let meta_client = StaticMetaClient::new(config, watcher); + Ok(Arc::new(meta_client)) + } +} diff --git a/meta_client/src/load_balance.rs b/meta_client/src/load_balance.rs new file mode 100644 index 0000000000..707fb08d98 --- /dev/null +++ b/meta_client/src/load_balance.rs @@ -0,0 +1,65 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Load balancer + +use common_util::define_result; +use rand::Rng; +use snafu::{Backtrace, Snafu}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Meta Addresses empty.\nBacktrace:\n{}", backtrace))] + MetaAddressesEmpty { backtrace: Backtrace }, +} + +define_result!(Error); + +pub trait LoadBalancer { + fn select<'a>(&self, addresses: &'a [String]) -> Result<&'a String>; +} + +pub struct RandomLoadBalancer; + +impl LoadBalancer for RandomLoadBalancer { + fn select<'a>(&self, addresses: &'a [String]) -> Result<&'a String> { + if addresses.is_empty() { + return MetaAddressesEmpty.fail(); + } + + let len = addresses.len(); + if len == 1 { + return Ok(&addresses[0]); + } + let mut rng = rand::thread_rng(); + let idx = rng.gen_range(0, len); + + Ok(&addresses[idx]) + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_random_loadbalancer() { + let lb = RandomLoadBalancer; + let addresses = vec![ + "127.0.0.1:8080".to_string(), + "127.0.0.2:8080".to_string(), + "127.0.0.3:8080".to_string(), + "127.0.0.4:8080".to_string(), + "127.0.0.5:8080".to_string(), + ]; + for _idx in 0..100 { + let addr = lb.select(&addresses).unwrap(); + assert!(addresses.contains(addr)); + } + + // Empty case + assert!(lb.select(&[]).is_err()); + + let addresses = ["127.0.0.1:5000".to_string()]; + assert_eq!(&addresses[0], lb.select(&addresses).unwrap()); + } +} diff --git a/meta_client/src/static_client.rs b/meta_client/src/static_client.rs new file mode 100644 index 0000000000..8639100f53 --- /dev/null +++ b/meta_client/src/static_client.rs @@ -0,0 +1,86 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Static meta client. + +use std::{collections::HashMap, sync::Arc}; + +use async_trait::async_trait; +use log::info; + +use crate::{ + ClusterView, ClusterViewConfig, ClusterViewRef, MetaClient, MetaClientConfig, MetaWatcherPtr, + Node, Result, ShardView, +}; + +/// Static meta client. 
+pub struct StaticMetaClient { + cluster_view: ClusterViewRef, + watcher: Option, +} + +impl StaticMetaClient { + pub fn new(config: MetaClientConfig, watcher: Option) -> Self { + let cluster_view = match new_cluster_view(&config.cluster_view) { + Some(v) => v, + None => cluster_view_without_meta(&config.node, config.port), + }; + + Self { + cluster_view: Arc::new(cluster_view), + watcher, + } + } +} + +#[async_trait] +impl MetaClient for StaticMetaClient { + async fn start(&self) -> Result<()> { + info!( + "File meta client is starting, cluster_view:{:?}", + self.cluster_view + ); + + info!("File meta client invoke watcher"); + + if let Some(w) = &self.watcher { + w.on_change(self.cluster_view.clone()).await?; + } + + info!("File meta client has started"); + + Ok(()) + } + + fn get_cluster_view(&self) -> ClusterViewRef { + self.cluster_view.clone() + } +} + +fn new_cluster_view(config: &ClusterViewConfig) -> Option { + if config.schema_shards.is_empty() { + return None; + } + + Some(config.to_cluster_view()) +} + +fn cluster_view_without_meta(addr: &str, port: u16) -> ClusterView { + let shard_id = 0; + let mut static_shards = HashMap::new(); + static_shards.insert( + shard_id, + ShardView { + shard_id, + node: Node { + addr: addr.to_string(), + port: u32::from(port), + }, + }, + ); + let mut schema_shards = HashMap::new(); + schema_shards.insert(catalog::consts::DEFAULT_SCHEMA.to_string(), static_shards); + ClusterView { + schema_shards, + schema_configs: HashMap::default(), + } +} diff --git a/meta_client_v2/Cargo.toml b/meta_client_v2/Cargo.toml new file mode 100644 index 0000000000..6ca7a6338a --- /dev/null +++ b/meta_client_v2/Cargo.toml @@ -0,0 +1,26 @@ +[package] +name = "meta_client_v2" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +async-trait = "0.1.41" +catalog = { path = "../catalog" } +common_types = { path = "../common_types" } +table_engine = { path = "../table_engine" } +common_util = { path = "../common_util" } +ceresdbproto = { git = "https://github.com/CeresDB/ceresdbproto.git"} +futures = "0.3" +grpcio = { path = "../grpcio" } +log = "0.4" +protobuf = "2.20" +rand = "0.7" +reqwest = "0.11" +serde = "1.0" +serde_derive = "1.0.81" +serde_json = "1.0.60" +snafu = { version ="0.6.10", features = ["backtraces"]} +tokio = { version = "1.0", features = ["full"] } +url = "2.2" diff --git a/meta_client_v2/src/lib.rs b/meta_client_v2/src/lib.rs new file mode 100644 index 0000000000..4dd4244c12 --- /dev/null +++ b/meta_client_v2/src/lib.rs @@ -0,0 +1,676 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Client to communicate with meta + +use std::{ + sync::{Arc, RwLock as StdRwLock}, + time::Duration, +}; + +use async_trait::async_trait; +use ceresdbproto::{ + metagrpcV2::{ + AllocSchemaIdRequest as PbAllocSchemaIdRequest, + AllocTableIdRequest as PbAllocTableIdRequest, DropTableRequest as PbDropTableRequest, + GetTablesRequest as PbGetTablesRequest, NodeHeartbeatRequest as PbNodeHeartbeatRequest, + NodeHeartbeatResponse as PbNodeHeartbeatResponse, + }, + metagrpcV2_grpc::CeresmetaRpcServiceClient, +}; +use common_types::bytes::Bytes; +use common_util::{config::ReadableDuration, define_result, runtime::Runtime}; +use futures::{SinkExt, TryStreamExt}; +use grpcio::{ + CallOption, ChannelBuilder, ClientDuplexReceiver, ClientDuplexSender, Environment, WriteFlags, +}; +use load_balance::{LoadBalancer, RandomLoadBalancer}; +use log::{error, info, warn}; +use reqwest::{self, StatusCode, Url}; +use serde::de::DeserializeOwned; +use serde_derive::Deserialize; +use snafu::{Backtrace, ResultExt, Snafu}; +use tokio::{ + sync::{mpsc::Sender, RwLock}, + time, +}; +pub use types::*; + +mod load_balance; +mod types; + +#[derive(Debug, Snafu)] +#[snafu(visibility = "pub")] +pub enum Error { + #[snafu(display("Build http client failed, err:{}.\nBacktrace:\n{}", source, backtrace))] + BuildHttpClient { + source: reqwest::Error, + backtrace: Backtrace, + }, + + #[snafu(display( + "Invalid meta addr, addr:{}, err:{}.\nBacktrace:\n{}", + meta_addr, + source, + backtrace + ))] + InvalidMetaAddr { + meta_addr: String, + source: url::ParseError, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to join url, input:{}, err:{}.\nBacktrace:\n{}", + input, + source, + backtrace + ))] + JoinUrl { + input: String, + source: url::ParseError, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to send http request, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + SendHttp { + source: reqwest::Error, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to parse http text, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + ParseText { + source: reqwest::Error, + backtrace: Backtrace, + }, + + #[snafu(display( + "Bad http status, status:{}, url:{}, text:{:?}.\nBacktrace:\n{}", + status, + url, + text, + backtrace + ))] + BadHttpStatus { + status: StatusCode, + url: String, + text: Bytes, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to parse json text, text:{:?}, err:{}.\nBacktrace:\n{}", + text, + source, + backtrace + ))] + ParseJson { + text: Bytes, + source: serde_json::Error, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to fetch action cmd, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + FetchActionCmdError { + source: grpcio::Error, + backtrace: Backtrace, + }, + + #[snafu(display("Encountered build rpc client, err:{}", source))] + BuildRpcClientError { source: load_balance::Error }, + + #[snafu(display("Failed to get grpc client, grpc client is none, msg:{}", msg))] + FailGetGrpcClient { msg: String }, + + #[snafu(display("Failed to send heartbeat, cluster:{}, err:{}", cluster, source))] + FailSendHeartbeat { + cluster: String, + source: Box, + }, + + #[snafu(display( + "Failed to notify action cmd, action cmd:{:?}, err:{}", + action_cmd, + source + ))] + FailNotifyActionCmd { + action_cmd: ActionCmd, + source: Box, + }, + + #[snafu(display("Failed to alloc schema id, err:{}", source))] + FailAllocSchemaId { + source: Box, + }, + + #[snafu(display("Failed to alloc table id, err:{}", source))] + FailAllocTableId { + source: Box, + }, + + 
#[snafu(display("Failed to drop table, err:{}", source))] + FailDropTable { + source: Box, + }, + + #[snafu(display("Failed to get tables, err:{}", source))] + FailGetTables { + source: Box, + }, + + #[snafu(display("Meta error, resp header:{:?}.\nBacktrace:\n{}", header, backtrace))] + Meta { + header: ResponseHeader, + backtrace: Backtrace, + }, +} + +define_result!(Error); + +const DEFAULT_META_URL_VERSION: &str = "v1"; + +/// Meta client abstraction +#[async_trait] +pub trait MetaClient { + /// Start the meta client + async fn start(&self) -> Result<()>; + + async fn alloc_schema_id(&self, _: AllocSchemaIdRequest) -> Result; + + async fn alloc_table_id(&self, _: AllocTableIdRequest) -> Result; + + async fn drop_table(&self, _: DropTableRequest) -> Result; + + async fn get_tables(&self, _: GetTablesRequest) -> Result; + + async fn send_heartbeat(&self, _: Vec) -> Result<()>; +} + +#[derive(Debug, Deserialize, Clone)] +#[serde(default)] +pub struct MetaClientConfig { + pub cluster_name: String, + pub meta_addr: String, + pub meta_members_url: String, + pub lease: ReadableDuration, + pub timeout: ReadableDuration, + pub cq_count: usize, +} + +impl Default for MetaClientConfig { + fn default() -> Self { + Self { + cluster_name: String::new(), + meta_addr: "http://127.0.0.1:8080".to_string(), + meta_members_url: "ceresmeta/members".to_string(), + lease: ReadableDuration::secs(10), + timeout: ReadableDuration::secs(5), + cq_count: 8, + } + } +} + +struct NodeHeartbeatChannel { + heartbeat_sender: ClientDuplexSender, + action_cmd_receiver: Option>, +} + +struct GrpcClient { + client: CeresmetaRpcServiceClient, + heartbeat_channel: NodeHeartbeatChannel, +} + +struct MetaClientImplInner { + meta_grpc_address: StdRwLock>, + http_client: reqwest::Client, + balancer: Box, + meta_config: MetaClientConfig, + node_meta_info: NodeMetaInfo, + members_url: Url, + + grpc_client: RwLock>, + + notify_sender: Option>, +} + +impl MetaClientImplInner { + fn new( + meta_config: MetaClientConfig, + node_meta_info: NodeMetaInfo, + sender: Option>, + ) -> Result { + let http_client = reqwest::Client::builder() + .timeout(Duration::from(meta_config.timeout)) + .build() + .context(BuildHttpClient)?; + + let members_url = Url::parse(&meta_config.meta_addr) + .context(InvalidMetaAddr { + meta_addr: &meta_config.meta_addr, + })? 
+ .join(format!("{}/", DEFAULT_META_URL_VERSION).as_str()) + .unwrap() + .join(&meta_config.meta_members_url) + .context(JoinUrl { + input: &meta_config.meta_members_url, + })?; + + let client = Self { + meta_grpc_address: StdRwLock::new(Vec::new()), + http_client, + balancer: Box::new(RandomLoadBalancer), + meta_config, + node_meta_info, + members_url, + grpc_client: RwLock::new(None), + notify_sender: sender, + }; + + Ok(client) + } + + fn request_header(&self) -> RequestHeader { + RequestHeader { + node: self.node_meta_info.node.to_string(), + cluster_name: self.meta_config.cluster_name.clone(), + } + } + + fn node_meta_info(&self) -> NodeMetaInfo { + self.node_meta_info.clone() + } + + fn get_cluster_name(&self) -> &str { + // let a :Option=None; + + self.meta_config.cluster_name.as_str() + } + + fn connect_grpc_client(&self) -> Result { + let client = self.build_rpc_client()?; + let (sender, receiver) = client + .node_heartbeat_opt(CallOption::default()) + .context(FetchActionCmdError)?; + Ok(GrpcClient { + client, + heartbeat_channel: NodeHeartbeatChannel { + heartbeat_sender: sender, + action_cmd_receiver: Some(receiver), + }, + }) + } + + async fn reconnect_heartbeat_channel(&self) { + let grpc_client = &mut *self.grpc_client.write().await; + loop { + match self.connect_grpc_client() { + Ok(client) => { + *grpc_client = Some(client); + return; + } + Err(e) => { + error!("Grpc reconnect failed, error:{}", e); + time::sleep(self.error_wait_lease()).await; + } + } + } + } + + fn meta_addresses(&self) -> Vec { + self.meta_grpc_address.read().unwrap().clone() + } + + fn build_rpc_client(&self) -> Result { + let meta_addresses = self.meta_addresses(); + let meta_rpc_addr = self + .balancer + .select(&meta_addresses) + .context(BuildRpcClientError)?; + + let cb = ChannelBuilder::new(Arc::new(Environment::new(self.meta_config.cq_count))); + Ok(CeresmetaRpcServiceClient::new(cb.connect(meta_rpc_addr))) + } + + async fn get_bytes_from_url(&self, url: Url) -> Result { + let resp = self + .http_client + .get(self.members_url.clone()) + .send() + .await + .context(SendHttp)?; + let status = resp.status(); + let text = resp.bytes().await.context(ParseText)?; + + if status.is_success() { + info!( + "Get bytes from url success, status:{}, url:{}, bytes:{:?}", + status, url, text + ); + + Ok(text) + } else { + error!( + "Failed to get bytes from url, status:{}, url:{}, bytes:{:?}", + status, url, text + ); + + BadHttpStatus { status, url, text }.fail() + } + } + + async fn get_from_url(&self, url: Url) -> Result { + let full = self.get_bytes_from_url(url).await?; + + serde_json::from_slice(&full).context(ParseJson { text: full }) + } + + async fn pull_meta_grpc_address(&self) -> Result<()> { + let addresses: Vec = self.get_from_url(self.members_url.clone()).await?; + + *self.meta_grpc_address.write().unwrap() = addresses; + + Ok(()) + } + + // TODO(yingwen): Store the value in field + fn error_wait_lease(&self) -> Duration { + Duration::from_secs(self.meta_config.lease.as_secs() / 2) + } + + fn fetch_view_interval(&self) -> Duration { + Duration::from_secs(self.meta_config.lease.as_secs() * 3) + } + + async fn start_refresh_meta_addresses(&self) { + let mut interval = time::interval(self.fetch_view_interval()); + + loop { + match self.pull_meta_grpc_address().await { + Ok(()) => { + interval.tick().await; + } + Err(e) => { + error!( + "Failed to refresh meta addresses from meta, url:{}, error:{}", + self.members_url, e + ); + + time::sleep(self.error_wait_lease()).await; + } + } + } + } + + async 
fn start_fetch_action_cmd(&self) { + loop { + let mut receiver = None; + if let Some(client) = &mut *self.grpc_client.write().await { + receiver = client.heartbeat_channel.action_cmd_receiver.take(); + if receiver.is_none() { + error!("Failed to fetch action cmd receiver"); + } + } else { + error!("Grpc client is not inited"); + } + + if let Some(v) = receiver { + match self.fetch_action_cmd(v).await { + Ok(()) => { + info!( + "Fetch cluster view finished, cluster:{}", + self.get_cluster_name() + ); + } + Err(e) => { + self.reconnect_heartbeat_channel().await; + error!( + "Failed to get action cmd, cluster:{}, error:{}", + self.get_cluster_name(), + e + ); + } + } + } + + time::sleep(self.error_wait_lease()).await; + } + } + + async fn fetch_action_cmd( + &self, + mut receiver: ClientDuplexReceiver, + ) -> Result<()> { + while let Some(resp) = receiver.try_next().await.context(FetchActionCmdError)? { + info!( + "Fetch action cmd from meta, cluster:{}, action_cmd:{:?}", + self.get_cluster_name(), + resp, + ); + if let Some(notify_sender) = &self.notify_sender { + let resp: NodeHeartbeatResponse = resp.into(); + if let Err(e) = check_response_header(&resp.header) { + error!("Fetch action cmd failed, err:{}", e); + continue; + } + if let Some(action_cmd) = resp.action_cmd { + if let Err(e) = notify_sender.send(action_cmd.clone()).await { + error!( + "Notify sender send failed, action cmd:{:?}, err:{}", + action_cmd, e + ); + } + } else { + warn!("Fetch action cmd is empty, resp:{:?}", resp) + } + } + } + + Ok(()) + } +} + +/// Default meta client impl, will interact with a remote meta node. +pub struct MetaClientImpl { + inner: Arc, + runtime: Arc, +} + +impl MetaClientImpl { + pub fn new( + config: MetaClientConfig, + node_meta_info: NodeMetaInfo, + runtime: Arc, + sender: Option>, + ) -> Result { + Ok(Self { + inner: Arc::new(MetaClientImplInner::new(config, node_meta_info, sender)?), + runtime, + }) + } +} + +#[async_trait] +impl MetaClient for MetaClientImpl { + async fn start(&self) -> Result<()> { + info!( + "Meta client is starting, config:{:?}", + self.inner.meta_config + ); + + self.inner.pull_meta_grpc_address().await?; + self.inner.reconnect_heartbeat_channel().await; + + let inner = self.inner.clone(); + self.runtime.spawn(async move { + inner.start_refresh_meta_addresses().await; + }); + + let inner = self.inner.clone(); + self.runtime.spawn(async move { + inner.start_fetch_action_cmd().await; + }); + + info!("Meta client has started"); + + Ok(()) + } + + async fn alloc_schema_id(&self, req: AllocSchemaIdRequest) -> Result { + if let Some(grpc_client) = &mut *self.inner.grpc_client.write().await { + let mut pb_req: PbAllocSchemaIdRequest = req.into(); + pb_req.set_header(self.inner.request_header().into()); + let pb_resp = grpc_client + .client + .alloc_schema_id_async_opt(&pb_req, CallOption::default()) + .map_err(|e| Box::new(e) as _) + .context(FailAllocSchemaId)? 
+ .await + .map_err(|e| Box::new(e) as _) + .context(FailAllocSchemaId)?; + let resp: AllocSchemaIdResponse = pb_resp.into(); + check_response_header(&resp.header)?; + Ok(resp) + } else { + FailGetGrpcClient { + msg: "alloc schema id".to_string(), + } + .fail() + } + } + + async fn alloc_table_id(&self, req: AllocTableIdRequest) -> Result { + if let Some(grpc_client) = &mut *self.inner.grpc_client.write().await { + let mut pb_req: PbAllocTableIdRequest = req.into(); + pb_req.set_header(self.inner.request_header().into()); + let pb_resp = grpc_client + .client + .alloc_table_id_async_opt(&pb_req, CallOption::default()) + .map_err(|e| Box::new(e) as _) + .context(FailAllocTableId)? + .await + .map_err(|e| Box::new(e) as _) + .context(FailAllocTableId)?; + let resp: AllocTableIdResponse = pb_resp.into(); + check_response_header(&resp.header)?; + Ok(resp) + } else { + FailGetGrpcClient { + msg: "alloc table id".to_string(), + } + .fail() + } + } + + async fn drop_table(&self, req: DropTableRequest) -> Result { + if let Some(grpc_client) = &mut *self.inner.grpc_client.write().await { + let mut pb_req: PbDropTableRequest = req.into(); + pb_req.set_header(self.inner.request_header().into()); + let pb_resp = grpc_client + .client + .drop_table_async_opt(&pb_req, CallOption::default()) + .map_err(|e| Box::new(e) as _) + .context(FailDropTable)? + .await + .map_err(|e| Box::new(e) as _) + .context(FailDropTable)?; + let resp: DropTableResponse = pb_resp.into(); + check_response_header(&resp.header)?; + Ok(resp) + } else { + FailGetGrpcClient { + msg: "drop table".to_string(), + } + .fail() + } + } + + async fn get_tables(&self, req: GetTablesRequest) -> Result { + if let Some(grpc_client) = &mut *self.inner.grpc_client.write().await { + let mut pb_req: PbGetTablesRequest = req.into(); + pb_req.set_header(self.inner.request_header().into()); + let pb_resp = grpc_client + .client + .get_tables_async_opt(&pb_req, CallOption::default()) + .map_err(|e| Box::new(e) as _) + .context(FailGetTables)? + .await + .map_err(|e| Box::new(e) as _) + .context(FailGetTables)?; + let resp: GetTablesResponse = pb_resp.into(); + check_response_header(&resp.header)?; + Ok(resp) + } else { + FailGetGrpcClient { + msg: "get tables".to_string(), + } + .fail() + } + } + + async fn send_heartbeat(&self, shards_info: Vec) -> Result<()> { + if let Some(grpc_client) = &mut *self.inner.grpc_client.write().await { + info!( + "Meta client send heartbeat, cluster:{}, shards_info:{:?}", + self.inner.get_cluster_name(), + shards_info + ); + let mut pb_request = PbNodeHeartbeatRequest::new(); + pb_request.set_header(self.inner.request_header().into()); + let node_info = NodeInfo { + node_meta_info: self.inner.node_meta_info(), + shards_info, + }; + pb_request.set_info(node_info.into()); + if let Err(e) = grpc_client + .heartbeat_channel + .heartbeat_sender + .send((pb_request, WriteFlags::default())) + .await + .map_err(|e| Box::new(e) as _) + .context(FailSendHeartbeat { + cluster: self.inner.get_cluster_name(), + }) + { + self.inner.reconnect_heartbeat_channel().await; + return Err(e); + }; + } else { + error!("Grpc_client is none"); + } + + Ok(()) + } +} + +fn check_response_header(header: &ResponseHeader) -> Result<()> { + if header.success { + Ok(()) + } else { + Meta { + header: header.clone(), + } + .fail() + } +} + +/// Create a meta client with given `config`. 
+pub fn build_meta_client( + config: MetaClientConfig, + node_meta_info: NodeMetaInfo, + runtime: Arc, + sender: Option>, +) -> Result> { + let meta_client = MetaClientImpl::new(config, node_meta_info, runtime, sender)?; + Ok(Arc::new(meta_client)) +} diff --git a/meta_client_v2/src/load_balance.rs b/meta_client_v2/src/load_balance.rs new file mode 100644 index 0000000000..707fb08d98 --- /dev/null +++ b/meta_client_v2/src/load_balance.rs @@ -0,0 +1,65 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Load balancer + +use common_util::define_result; +use rand::Rng; +use snafu::{Backtrace, Snafu}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Meta Addresses empty.\nBacktrace:\n{}", backtrace))] + MetaAddressesEmpty { backtrace: Backtrace }, +} + +define_result!(Error); + +pub trait LoadBalancer { + fn select<'a>(&self, addresses: &'a [String]) -> Result<&'a String>; +} + +pub struct RandomLoadBalancer; + +impl LoadBalancer for RandomLoadBalancer { + fn select<'a>(&self, addresses: &'a [String]) -> Result<&'a String> { + if addresses.is_empty() { + return MetaAddressesEmpty.fail(); + } + + let len = addresses.len(); + if len == 1 { + return Ok(&addresses[0]); + } + let mut rng = rand::thread_rng(); + let idx = rng.gen_range(0, len); + + Ok(&addresses[idx]) + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_random_loadbalancer() { + let lb = RandomLoadBalancer; + let addresses = vec![ + "127.0.0.1:8080".to_string(), + "127.0.0.2:8080".to_string(), + "127.0.0.3:8080".to_string(), + "127.0.0.4:8080".to_string(), + "127.0.0.5:8080".to_string(), + ]; + for _idx in 0..100 { + let addr = lb.select(&addresses).unwrap(); + assert!(addresses.contains(addr)); + } + + // Empty case + assert!(lb.select(&[]).is_err()); + + let addresses = ["127.0.0.1:5000".to_string()]; + assert_eq!(&addresses[0], lb.select(&addresses).unwrap()); + } +} diff --git a/meta_client_v2/src/types.rs b/meta_client_v2/src/types.rs new file mode 100644 index 0000000000..7f558feec7 --- /dev/null +++ b/meta_client_v2/src/types.rs @@ -0,0 +1,458 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
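For context, a minimal sketch of the consuming side of the notification channel whose sender is handed to `build_meta_client` above. The channel type (tokio mpsc), the module path and the `open_shards`/`close_shards` helpers are assumptions; the real consumer lives in the `cluster` crate.

```rust
use meta_client_v2::types::{ActionCmd, ShardId};
use tokio::sync::mpsc::Receiver;

/// Hypothetical consumer reacting to commands pushed by the meta node.
async fn handle_action_cmds(mut rx: Receiver<ActionCmd>) {
    while let Some(cmd) = rx.recv().await {
        match cmd {
            ActionCmd::OpenCmd(open) => open_shards(&open.shard_ids),
            ActionCmd::CloseCmd(close) => close_shards(&close.shard_ids),
            // NoneCmd / SplitCmd / ChangeRoleCmd carry no payload yet.
            other => log::debug!("ignore action cmd:{:?}", other),
        }
    }
}

fn open_shards(_shard_ids: &[ShardId]) { /* open the given shards */ }
fn close_shards(_shard_ids: &[ShardId]) { /* close the given shards */ }
```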
+ +use std::collections::HashMap; + +use ceresdbproto::{ + metaV2::ShardRole as PbShardRole, + metagrpcV2::{ + AllocSchemaIdRequest as PbAllocSchemaIdRequest, + AllocSchemaIdResponse as PbAllocSchemaIdResponse, + AllocTableIdRequest as PbAllocTableIdRequest, + AllocTableIdResponse as PbAllocTableIdResponse, ChangeRoleCmd as PbChangeRoleCmd, + CloseCmd as PbCloseCmd, DropTableRequest as PbDropTableRequest, + DropTableResponse as PbDropTableResponse, Error as PbError, ErrorType as PbErrorType, + GetTablesRequest as PbGetTablesRequest, GetTablesResponse as PbGetTablesResponse, + NodeHeartbeatResponse as PbNodeHeartbeatResponse, NodeHeartbeatResponse_oneof_cmd, + NodeInfo as PbNodeInfo, NoneCmd as PbNoneCmd, OpenCmd as PbOpenCmd, + RequestHeader as PbRequestHeader, ResponseHeader as PbResponseHeader, + ShardInfo as PbShardInfo, ShardTables as PbShardTables, SplitCmd as PbSplitCmd, + TableInfo as PbTableInfo, + }, +}; +use common_util::config::ReadableDuration; +use serde_derive::Deserialize; + +pub type TableId = u64; +pub type ShardId = u32; +pub type SchemaId = u32; + +#[derive(Debug, Clone)] +pub struct RequestHeader { + pub node: String, + pub cluster_name: String, +} + +#[derive(Debug, Clone)] +pub struct ResponseHeader { + pub success: bool, + pub error: ResponseError, +} + +#[derive(Debug, Clone)] +pub struct ResponseError { + pub error_type: ErrorType, + pub message: String, +} + +#[derive(Debug, Clone)] +pub enum ErrorType { + UNKNOWN, +} + +pub struct AllocSchemaIdRequest { + pub name: String, +} + +pub struct AllocSchemaIdResponse { + pub header: ResponseHeader, + + pub name: String, + pub id: SchemaId, +} + +pub struct AllocTableIdRequest { + pub schema_name: String, + pub name: String, +} + +pub struct AllocTableIdResponse { + pub header: ResponseHeader, + + pub schema_name: String, + pub name: String, + pub shard_id: ShardId, + pub schema_id: SchemaId, + pub id: TableId, +} + +pub struct DropTableRequest { + pub schema_name: String, + pub name: String, +} + +pub struct DropTableResponse { + pub header: ResponseHeader, +} + +#[derive(Clone, Debug)] +pub struct GetTablesRequest { + pub shard_ids: Vec, +} + +#[derive(Clone, Debug)] +pub struct GetTablesResponse { + pub header: ResponseHeader, + + pub tables_map: HashMap, +} + +#[derive(Clone, Debug)] +pub struct TableInfo { + pub id: TableId, + pub name: String, + pub schema_id: SchemaId, + pub schema_name: String, +} + +#[derive(Clone, Debug)] +pub struct ShardTables { + pub role: ShardRole, + pub tables: Vec, +} + +#[derive(Debug)] +struct NodeHeartbeatRequest { + info: NodeInfo, +} + +#[derive(Debug, Clone, Default, Deserialize)] +pub struct Node { + pub addr: String, + pub port: u16, +} + +impl ToString for Node { + fn to_string(&self) -> String { + format!("{}:{}", self.addr, self.port) + } +} + +#[derive(Debug, Default, Clone, Deserialize)] +pub struct NodeMetaInfo { + pub node: String, + pub zone: String, + pub idc: String, + pub binary_version: String, +} + +#[derive(Debug, Clone)] +pub struct NodeInfo { + pub node_meta_info: NodeMetaInfo, + pub shards_info: Vec, +} + +#[derive(Debug)] +pub struct NodeHeartbeatResponse { + pub header: ResponseHeader, + + pub timestamp: u64, + pub action_cmd: Option, +} + +#[derive(Debug, Clone)] +pub struct ShardInfo { + pub shard_id: ShardId, + pub role: ShardRole, +} + +#[derive(Debug, Copy, Clone)] +pub enum ShardRole { + LEADER, + FOLLOWER, +} + +#[derive(Debug, Clone)] +pub enum ActionCmd { + NoneCmd(NoneCmd), + OpenCmd(OpenCmd), + SplitCmd(SplitCmd), + CloseCmd(CloseCmd), + 
ChangeRoleCmd(ChangeRoleCmd), +} + +#[derive(Debug, Clone)] +pub struct NoneCmd {} + +#[derive(Debug, Clone)] +pub struct OpenCmd { + pub shard_ids: Vec, +} + +#[derive(Debug, Clone)] +pub struct SplitCmd {} + +#[derive(Debug, Clone)] +pub struct CloseCmd { + pub shard_ids: Vec, +} + +#[derive(Debug, Clone)] +pub struct ChangeRoleCmd {} + +#[derive(Debug, Deserialize, Clone)] +#[serde(default)] +pub struct MetaClientConfig { + pub cluster_name: String, + pub meta_addr: String, + pub meta_members_url: String, + pub lease: ReadableDuration, + pub timeout: ReadableDuration, + pub cq_count: usize, + + /// + /// - If `enable_meta` is true, the client will fetch cluster view from + /// remote meta ndoe. + /// - If `enable_meta` is false, the client will try to read cluster view + /// from `cluster_view`. + pub enable_meta: bool, +} + +impl Default for MetaClientConfig { + fn default() -> Self { + Self { + cluster_name: String::new(), + meta_addr: "http://127.0.0.1:8080".to_string(), + meta_members_url: "ceresmeta/members".to_string(), + lease: ReadableDuration::secs(10), + timeout: ReadableDuration::secs(5), + cq_count: 8, + enable_meta: true, + } + } +} + +impl From for PbNodeInfo { + fn from(node_info: NodeInfo) -> Self { + let mut pb_node_info = PbNodeInfo::new(); + pb_node_info.set_node(node_info.node_meta_info.node.to_string()); + pb_node_info.set_zone(node_info.node_meta_info.zone); + pb_node_info.set_binary_version(node_info.node_meta_info.binary_version); + pb_node_info.set_shardsInfo(protobuf::RepeatedField::from_vec( + node_info + .shards_info + .into_iter() + .map(|v| v.into()) + .collect(), + )); + pb_node_info + } +} + +impl From for PbShardInfo { + fn from(shard_info: ShardInfo) -> Self { + let mut pb_shard_info = PbShardInfo::new(); + pb_shard_info.set_shard_id(shard_info.shard_id); + pb_shard_info.set_role(shard_info.role.into()); + pb_shard_info + } +} + +impl From for PbShardRole { + fn from(shard_role: ShardRole) -> Self { + match shard_role { + ShardRole::LEADER => PbShardRole::LEADER, + ShardRole::FOLLOWER => PbShardRole::FOLLOWER, + } + } +} + +impl From for ShardRole { + fn from(pb: PbShardRole) -> Self { + match pb { + PbShardRole::LEADER => ShardRole::LEADER, + PbShardRole::FOLLOWER => ShardRole::FOLLOWER, + } + } +} + +impl From for NodeHeartbeatResponse { + fn from(mut pb: PbNodeHeartbeatResponse) -> Self { + let timestamp = pb.get_timestamp(); + NodeHeartbeatResponse { + header: pb.take_header().into(), + timestamp, + action_cmd: pb.cmd.map(|v| v.into()), + } + } +} + +impl From for ActionCmd { + fn from(pb: NodeHeartbeatResponse_oneof_cmd) -> Self { + match pb { + NodeHeartbeatResponse_oneof_cmd::none_cmd(_) => ActionCmd::NoneCmd(NoneCmd {}), + NodeHeartbeatResponse_oneof_cmd::open_cmd(v) => ActionCmd::OpenCmd(v.into()), + NodeHeartbeatResponse_oneof_cmd::split_cmd(v) => ActionCmd::SplitCmd(v.into()), + NodeHeartbeatResponse_oneof_cmd::close_cmd(v) => ActionCmd::CloseCmd(v.into()), + NodeHeartbeatResponse_oneof_cmd::change_role_cmd(v) => { + ActionCmd::ChangeRoleCmd(v.into()) + } + } + } +} + +impl From for NoneCmd { + fn from(_pb: PbNoneCmd) -> Self { + Self {} + } +} + +impl From for OpenCmd { + fn from(mut pb: PbOpenCmd) -> Self { + Self { + shard_ids: pb.take_shard_ids(), + } + } +} + +impl From for SplitCmd { + fn from(_pb: PbSplitCmd) -> Self { + Self {} + } +} + +impl From for CloseCmd { + fn from(mut pb: PbCloseCmd) -> Self { + Self { + shard_ids: pb.take_shard_ids(), + } + } +} + +impl From for ChangeRoleCmd { + fn from(_pb: PbChangeRoleCmd) -> Self { + Self 
{} + } +} + +impl From for PbGetTablesRequest { + fn from(req: GetTablesRequest) -> Self { + let mut pb = PbGetTablesRequest::new(); + pb.set_shard_id(req.shard_ids); + pb + } +} + +impl From for GetTablesResponse { + fn from(mut pb: PbGetTablesResponse) -> Self { + Self { + header: pb.take_header().into(), + tables_map: pb + .take_tables_map() + .into_iter() + .map(|(k, v)| (k, v.into())) + .collect(), + } + } +} + +impl From for ShardTables { + fn from(mut pb: PbShardTables) -> Self { + Self { + role: pb.get_role().into(), + tables: pb.take_tables().into_iter().map(|v| v.into()).collect(), + } + } +} + +impl From for TableInfo { + fn from(mut pb: PbTableInfo) -> Self { + TableInfo { + id: pb.get_id(), + name: pb.take_name(), + schema_id: pb.get_schema_id(), + schema_name: pb.take_schema_name(), + } + } +} + +impl From for PbRequestHeader { + fn from(req: RequestHeader) -> Self { + let mut pb = PbRequestHeader::new(); + pb.set_node(req.node); + pb.set_cluster_name(req.cluster_name); + pb + } +} + +impl From for ResponseHeader { + fn from(mut pb: PbResponseHeader) -> Self { + Self { + success: pb.get_success(), + error: pb.take_error().into(), + } + } +} + +impl From for ErrorType { + fn from(pb: PbErrorType) -> Self { + match pb { + PbErrorType::UNKNOWN => ErrorType::UNKNOWN, + } + } +} + +impl From for ResponseError { + fn from(mut pb: PbError) -> Self { + Self { + error_type: pb.get_error_type().into(), + message: pb.take_message(), + } + } +} + +impl From for PbAllocSchemaIdRequest { + fn from(req: AllocSchemaIdRequest) -> Self { + let mut pb = PbAllocSchemaIdRequest::new(); + pb.set_name(req.name); + pb + } +} + +impl From for AllocSchemaIdResponse { + fn from(mut pb: PbAllocSchemaIdResponse) -> Self { + Self { + header: pb.take_header().into(), + name: pb.take_name(), + id: pb.get_id(), + } + } +} + +impl From for PbAllocTableIdRequest { + fn from(req: AllocTableIdRequest) -> Self { + let mut pb = PbAllocTableIdRequest::new(); + pb.set_schema_name(req.schema_name); + pb.set_name(req.name); + pb + } +} + +impl From for AllocTableIdResponse { + fn from(mut pb: PbAllocTableIdResponse) -> Self { + Self { + header: pb.take_header().into(), + schema_name: pb.take_schema_name(), + name: pb.take_name(), + shard_id: pb.get_shard_id(), + schema_id: pb.get_schema_id(), + id: pb.get_id(), + } + } +} + +impl From for PbDropTableRequest { + fn from(req: DropTableRequest) -> Self { + let mut pb = PbDropTableRequest::new(); + pb.set_schema_name(req.schema_name); + pb.set_name(req.name); + pb + } +} + +impl From for DropTableResponse { + fn from(mut pb: PbDropTableResponse) -> Self { + Self { + header: pb.take_header().into(), + } + } +} diff --git a/proto/.gitignore b/proto/.gitignore new file mode 100644 index 0000000000..5eb2f8833d --- /dev/null +++ b/proto/.gitignore @@ -0,0 +1 @@ +src/protos diff --git a/proto/Cargo.toml b/proto/Cargo.toml new file mode 100644 index 0000000000..609680dd7f --- /dev/null +++ b/proto/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "proto" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +protobuf = "2.20" + +[build-dependencies.protobuf-builder] +git = "https://github.com/CeresDB/protobuf-builder.git" +rev = "745cc8527d1c5eb48745f5ce74b2b5bdb75c3bf2" diff --git a/proto/build.rs b/proto/build.rs new file mode 100644 index 0000000000..e992a9163c --- /dev/null +++ b/proto/build.rs @@ -0,0 +1,11 @@ +// Copyright 2022 CeresDB Project 
Authors. Licensed under Apache-2.0. + +use protobuf_builder::Builder; + +fn generate_pb() { + Builder::new().search_dir_for_protos("protos").generate(); +} + +fn main() { + generate_pb(); +} diff --git a/proto/protos/analytic_common.proto b/proto/protos/analytic_common.proto new file mode 100644 index 0000000000..c418296f99 --- /dev/null +++ b/proto/protos/analytic_common.proto @@ -0,0 +1,62 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +// Common protos of analytic engine +syntax = "proto3"; +package analytic_common; + +// Options of a table that need to persist +message TableOptions { + // Segment duration in ms. + uint64 segment_duration = 1; + bool enable_ttl = 2; + uint64 ttl = 3; + uint32 arena_block_size = 4; + uint64 num_rows_per_row_group = 5; + CompactionStrategy compaction_strategy= 6; + CompactionOptions compaction_options = 7; + UpdateMode update_mode = 8; + uint32 write_buffer_size = 9; + Compression compression = 10; + // If sampling_segment_duration is true, then the segment duration + // is still unknown. + bool sampling_segment_duration = 11; +} + +enum UpdateMode { + Overwrite = 0; + Append = 1; +} + +message CompactionOptions { + // Options for STCS + float bucket_low = 1; + float bucket_high = 2; + uint32 min_sstable_size = 3; + uint32 min_threshold = 4; + uint32 max_threshold = 5; + // Options for TWCS + TimeUnit timestamp_resolution = 6; +} + +enum TimeUnit { + NANOSECONDS = 0; + MICROSECONDS = 1; + MILLISECONDS = 2; + SECONDS = 3; + MINUTES = 4; + HOURS = 5; + DAYS = 6; +} + +enum CompactionStrategy { + DEFAULT = 0; + SIZE_TIERED = 1; + TIME_WINDOW = 2; +} + +enum Compression { + UNCOMPRESSED = 0; + LZ4 = 1; + SNAPPY = 2; + ZSTD = 3; +} diff --git a/proto/protos/common.proto b/proto/protos/common.proto new file mode 100644 index 0000000000..dc917685a7 --- /dev/null +++ b/proto/protos/common.proto @@ -0,0 +1,63 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +// Common types +syntax = "proto3"; +package common; + +// Data type of column +// TODO(yingwen): Do we need a null type? +enum DataType { + NULL = 0; + TIMESTAMP = 1; + DOUBLE = 2; + VARBINARY = 3; + STRING = 4; + UINT64 = 5; + FLOAT = 6; + INT64 = 7; + INT32 = 8; + INT16 = 9; + INT8 = 10; + UINT32 = 11; + UINT16 = 12; + UINT8 = 13; + BOOL = 14; +} + +// Column schema +message ColumnSchema { + // Column name + string name = 1; + // Column type + DataType data_type = 2; + // Is the column nullable + bool is_nullable = 3; + // Id of the column + uint32 id = 4; + // Is the column used as tag + bool is_tag = 5; + // Comment of the column + string comment = 6; +} + +// Table Schema +message TableSchema { + // Schema of each column + repeated ColumnSchema columns = 1; + // Version of the schema + uint32 version = 2; + // Key column num + uint32 num_key_columns = 3; + // Timestamp index in columns + uint32 timestamp_index = 4; + // Enable auto generated tsid as primary key + bool enable_tsid_primary_key = 5; +} + +// Time range of [start, end) +message TimeRange { + // inclusive start + int64 start = 1; + // exclusive end + int64 end = 2; +} diff --git a/proto/protos/meta_update.proto b/proto/protos/meta_update.proto new file mode 100644 index 0000000000..64c2b384ad --- /dev/null +++ b/proto/protos/meta_update.proto @@ -0,0 +1,101 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
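The `common.proto` messages above are compiled by `protobuf-builder` (see `proto/build.rs`) into Rust structs with `new()`/`set_*` accessors, the same rust-protobuf 2.x style the meta client code earlier in this patch already uses. A minimal sketch of building a two-column `TableSchema` by hand, assuming the generated module is exposed as `proto::common`:

```rust
use proto::common::{ColumnSchema, DataType, TableSchema};
use protobuf::RepeatedField;

fn example_table_schema() -> TableSchema {
    // Key column: the timestamp.
    let mut ts_col = ColumnSchema::new();
    ts_col.set_name("timestamp".to_string());
    ts_col.set_data_type(DataType::TIMESTAMP);
    ts_col.set_id(1);

    // A nullable double field column.
    let mut value_col = ColumnSchema::new();
    value_col.set_name("value".to_string());
    value_col.set_data_type(DataType::DOUBLE);
    value_col.set_is_nullable(true);
    value_col.set_id(2);

    let mut schema = TableSchema::new();
    schema.set_columns(RepeatedField::from_vec(vec![ts_col, value_col]));
    schema.set_version(1);
    schema.set_num_key_columns(1);
    schema.set_timestamp_index(0);
    schema
}
```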
+ +// Meta Updates of analytic engine +syntax = "proto3"; +package meta_update; + +import "analytic_common.proto"; +import "common.proto"; + +// Meta update for a new space +message AddSpaceMeta { + uint32 space_id = 1; + string space_name = 2; +} + +// Meta update for a new table +message AddTableMeta { + uint32 space_id = 1; + uint64 table_id = 2; + string table_name = 3; + // Schema of the table + common.TableSchema schema = 4; + // Options of the table + analytic_common.TableOptions options = 5; +} + +// Meta update for dropping a table +message DropTableMeta { + uint32 space_id = 1; + uint64 table_id = 2; + string table_name = 3; +} + +// Meta data of a sst file +message AddFileMeta { + // Level of the file + uint32 level = 1; + // Id of the file + uint64 file_id = 2; + bytes min_key = 3; + bytes max_key = 4; + uint64 max_seq = 5; + common.TimeRange time_range = 6; + common.TableSchema schema = 7; + uint64 size = 8; + uint64 row_num = 9; +} + +// Meta data of the file to delete +message DeleteFileMeta { + // Level of the file + uint32 level = 1; + // Id of the file + uint64 file_id = 2; +} + +// Meta data of version edit to table +message VersionEditMeta { + uint32 space_id = 1; + uint64 table_id = 2; + uint64 flushed_sequence = 3; + repeated AddFileMeta files_to_add = 4; + repeated DeleteFileMeta files_to_delete = 5; +} + +// Meta data of schema update. +message AlterSchemaMeta { + uint32 space_id = 1; + uint64 table_id = 2; + // New schema of the table. + common.TableSchema schema = 3; + // Previous schema version. + uint32 pre_schema_version = 4; +} + +// Meta data of schema update. +message AlterOptionsMeta { + uint32 space_id = 1; + uint64 table_id = 2; + // New options of the table. + analytic_common.TableOptions options = 3; +} + +// Meta data of manifest snapshot. +message SnapshotManifestMeta { + uint64 region_id = 1; + uint64 sequence = 2; +} + +// Meta update data to persist +message MetaUpdate { + oneof meta { + AddSpaceMeta add_space = 1; + AddTableMeta add_table = 2; + VersionEditMeta version_edit = 3; + AlterSchemaMeta alter_schema = 4; + AlterOptionsMeta alter_options = 5; + DropTableMeta drop_table = 6; + SnapshotManifestMeta snapshot_manifest = 7; + } +} diff --git a/proto/protos/sst.proto b/proto/protos/sst.proto new file mode 100644 index 0000000000..a1ab16e9a7 --- /dev/null +++ b/proto/protos/sst.proto @@ -0,0 +1,21 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +// Sst types +syntax = "proto3"; +package sst; + +import "common.proto"; + +message SstMetaData { + // Min key in the sst + bytes min_key = 1; + // Max key in the sst + bytes max_key = 2; + // Max sequence number in the sst + uint64 max_sequence = 3; + // The time range of the sst + common.TimeRange time_range = 4; + common.TableSchema schema = 5; + uint64 size = 6; + uint64 row_num = 7; +} diff --git a/proto/protos/sys_catalog.proto b/proto/protos/sys_catalog.proto new file mode 100644 index 0000000000..11cce62d06 --- /dev/null +++ b/proto/protos/sys_catalog.proto @@ -0,0 +1,55 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
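`MetaUpdate` above is the record persisted by the manifest; each record carries exactly one arm of the `meta` oneof. A sketch of wrapping an `AddTableMeta` into it, again assuming the usual rust-protobuf 2.x accessors (including the per-arm `set_add_table`) are generated:

```rust
use proto::meta_update::{AddTableMeta, MetaUpdate};

fn example_add_table_update() -> MetaUpdate {
    let mut add_table = AddTableMeta::new();
    add_table.set_space_id(1);
    add_table.set_table_id(42);
    add_table.set_table_name("cpu".to_string());
    // Schema and options are omitted here for brevity.

    let mut update = MetaUpdate::new();
    // Selects the `add_table` arm of the oneof.
    update.set_add_table(add_table);
    update
}
```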
+ +// Types for sys catalog +syntax = "proto3"; +package sys_catalog; + +import "common.proto"; + +// Catalog entry +message CatalogEntry { + // Name of catalog + string catalog_name = 1; + // Created time: ms + int64 created_time = 2; +} + +// Schema entry +message SchemaEntry { + // Name of catalog + string catalog_name = 1; + // Name of schema + string schema_name = 2; + // Id of the schema + uint32 schema_id = 3; + // Created time: ms + int64 created_time = 4; +} + +// State of the table +enum TableState { + STABLE = 0; + DROPPING = 1; + DROPPED = 2; +} + +// Table entry +// TODO(yingwen): Add PartitionInfo +message TableEntry { + // Name of catalog + string catalog_name = 1; + // Name of schema + string schema_name = 2; + // Table id + uint64 table_id = 3; + // Table name + string table_name = 4; + // Table engine type + string engine = 5; + // The state of the table. + TableState state = 6; + // Created time: ms + int64 created_time = 7; + // Modified time: ms + int64 modified_time = 8; +} diff --git a/proto/protos/table_requests.proto b/proto/protos/table_requests.proto new file mode 100644 index 0000000000..a379299ef5 --- /dev/null +++ b/proto/protos/table_requests.proto @@ -0,0 +1,19 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +// Types for table requests +syntax = "proto3"; +package table_requests; + +import "common.proto"; + +// Write table request +message WriteRequest { + // Version of row encoding method + uint32 version = 1; + // Schema of rows + common.TableSchema schema = 2; + // Rows in bytes + // + // Each row is encoded in the same format as memtable + repeated bytes rows = 3; +} diff --git a/proto/src/lib.rs b/proto/src/lib.rs new file mode 100644 index 0000000000..d9d1e95e10 --- /dev/null +++ b/proto/src/lib.rs @@ -0,0 +1,10 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Protobuf messages + +// TODO(yingwen): All the protos need review +mod protos { + include!(concat!(env!("OUT_DIR"), "/protos/mod.rs")); +} + +pub use protos::*; diff --git a/query_engine/Cargo.toml b/query_engine/Cargo.toml new file mode 100644 index 0000000000..232992401c --- /dev/null +++ b/query_engine/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "query_engine" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +# In alphabetical order +arrow_deps = { path = "../arrow_deps" } +async-trait = "0.1.41" +common_types = { path = "../common_types" } +common_util = { path = "../common_util" } +futures = "0.3" +log = "0.4" +snafu = { version ="0.6.10", features = ["backtraces"]} +sql = { path = "../sql" } +table_engine = { path = "../table_engine" } +udf = { path = "../udf" } diff --git a/query_engine/src/context.rs b/query_engine/src/context.rs new file mode 100644 index 0000000000..9ebc825f84 --- /dev/null +++ b/query_engine/src/context.rs @@ -0,0 +1,121 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Query context + +use std::sync::Arc; + +use arrow_deps::datafusion::{ + execution::context::{ExecutionConfig, ExecutionContext}, + optimizer::{ + common_subexpr_eliminate::CommonSubexprEliminate, eliminate_limit::EliminateLimit, + filter_push_down::FilterPushDown, limit_push_down::LimitPushDown, optimizer::OptimizerRule, + projection_push_down::ProjectionPushDown, simplify_expressions::SimplifyExpressions, + }, + physical_optimizer::optimizer::PhysicalOptimizerRule, +}; +use common_types::request_id::RequestId; + +use crate::{ + df_planner_extension::QueryPlannerAdapter, + logical_optimizer::{ + order_by_primary_key::OrderByPrimaryKeyRule, type_conversion::TypeConversion, + }, + physical_optimizer, +}; + +/// Query context +pub struct Context { + request_id: RequestId, + df_exec_ctx: ExecutionContext, +} + +impl Context { + // For datafusion, internal use only + #[inline] + pub(crate) fn df_exec_ctx(&self) -> &ExecutionContext { + &self.df_exec_ctx + } + + #[inline] + pub fn request_id(&self) -> RequestId { + self.request_id + } + + pub fn builder(request_id: RequestId) -> Builder { + Builder { + request_id, + df_exec_config: ExecutionConfig::new(), + } + } +} + +pub type ContextRef = Arc; + +#[must_use] +pub struct Builder { + request_id: RequestId, + df_exec_config: ExecutionConfig, +} + +impl Builder { + /// Set default catalog and schema of this query context + pub fn default_catalog_and_schema(mut self, catalog: String, schema: String) -> Self { + self.df_exec_config = self + .df_exec_config + .with_default_catalog_and_schema(catalog, schema); + + self + } + + pub fn build(self) -> Context { + // Always create default catalog and schema now + let df_exec_config = { + let adapted_physical_optimize_rules = Self::apply_adapters_for_physical_optimize_rules( + &self.df_exec_config.physical_optimizers, + ); + let logical_optimize_rules = Self::logical_optimize_rules(); + self.df_exec_config + .with_query_planner(Arc::new(QueryPlannerAdapter)) + .with_optimizer_rules(logical_optimize_rules) + .with_physical_optimizer_rules(adapted_physical_optimize_rules) + }; + + Context { + request_id: self.request_id, + df_exec_ctx: ExecutionContext::with_config(df_exec_config), + } + } + + fn apply_adapters_for_physical_optimize_rules( + default_rules: &[Arc], + ) -> Vec> { + let mut new_rules = Vec::with_capacity(default_rules.len()); + for rule in default_rules { + new_rules.push(physical_optimizer::may_adapt_optimize_rule(rule.clone())) + } + + new_rules + } + + fn logical_optimize_rules() -> Vec> { + let mut optimizers: Vec> = vec![ + Arc::new(TypeConversion), + // These rules are the default settings of the datafusion. + Arc::new(SimplifyExpressions::new()), + Arc::new(CommonSubexprEliminate::new()), + Arc::new(EliminateLimit::new()), + Arc::new(ProjectionPushDown::new()), + Arc::new(FilterPushDown::new()), + Arc::new(LimitPushDown::new()), + // TODO(xikai): restore this rule after the bug of df is fixed. + // Arc::new(SingleDistinctToGroupBy::new()), + ]; + + // FIXME(xikai): use config to control the optimize rule. + if std::env::var("ENABLE_CUSTOM_OPTIMIZE").is_ok() { + optimizers.push(Arc::new(OrderByPrimaryKeyRule)); + } + + optimizers + } +} diff --git a/query_engine/src/df_execution_extension/mod.rs b/query_engine/src/df_execution_extension/mod.rs new file mode 100644 index 0000000000..746499e79a --- /dev/null +++ b/query_engine/src/df_execution_extension/mod.rs @@ -0,0 +1,4 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
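As a usage sketch, this is how a caller obtains a per-query `Context` from the builder defined in `context.rs` above; the catalog/schema names are placeholders and `RequestId` construction is left to the caller:

```rust
use std::sync::Arc;

use common_types::request_id::RequestId;
use query_engine::context::Context;

/// Build a per-query context; the catalog/schema names here are placeholders.
fn query_context(request_id: RequestId) -> Arc<Context> {
    // The builder wires in the CeresDB query planner adapter plus the custom
    // logical/physical optimizer rules before handing back a Context.
    let ctx = Context::builder(request_id)
        .default_catalog_and_schema("ceresdb".to_string(), "public".to_string())
        .build();

    Arc::new(ctx)
}
```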
+ +pub mod prom_align; +pub use prom_align::PromAlignExec; diff --git a/query_engine/src/df_execution_extension/prom_align.rs b/query_engine/src/df_execution_extension/prom_align.rs new file mode 100644 index 0000000000..5e41f6e9af --- /dev/null +++ b/query_engine/src/df_execution_extension/prom_align.rs @@ -0,0 +1,931 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::{ + any::Any, + collections::{hash_map, BTreeMap, HashMap, VecDeque}, + fmt, mem, + pin::Pin, + sync::Arc, + task::{Context, Poll}, +}; + +use arrow_deps::{ + arrow::{ + array::{ + new_empty_array, Float64Array, StringArray, TimestampMillisecondArray, UInt64Array, + }, + error::ArrowError, + record_batch::RecordBatch, + }, + datafusion::{ + error::{DataFusionError, Result as ArrowResult}, + execution::runtime_env::RuntimeEnv, + physical_plan::{ + repartition::RepartitionExec, ColumnarValue, DisplayFormatType, ExecutionPlan, + Partitioning, PhysicalExpr, RecordBatchStream, + SendableRecordBatchStream as DfSendableRecordBatchStream, Statistics, + }, + }, +}; +use async_trait::async_trait; +use common_types::{ + schema::{ArrowSchema, ArrowSchemaRef, DataType, TSID_COLUMN}, + time::{TimeRange, Timestamp}, +}; +use futures::{Stream, StreamExt}; +use log::debug; +use snafu::{OptionExt, ResultExt, Snafu}; +use sql::promql::{AlignParameter, ColumnNames, Func as PromFunc}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Internal err, source:{:?}", source))] + Internal { source: DataFusionError }, + + #[snafu(display("Invalid schema, source:{:?}", source))] + InvalidSchema { source: common_types::schema::Error }, + + #[snafu(display("Tsid column is required"))] + TsidRequired, + + #[snafu(display("Invalid column type, required:{:?}", required_type))] + InvalidColumnType { required_type: String }, + + #[snafu(display("{} column type cannot be null", name))] + NullColumn { name: String }, + + #[snafu(display("timestamp out of range"))] + TimestampOutOfRange {}, +} + +define_result!(Error); + +/// Limits Extrapolation range. 
+/// Refer to https://github.com/prometheus/prometheus/pull/1295 +const PROMTHEUS_EXTRAPOLATION_THRESHOLD_COEFFICIENT: f64 = 1.1; + +#[derive(Debug)] +struct ExtractTsidExpr {} + +impl fmt::Display for ExtractTsidExpr { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "(ExtractTsid)") + } +} + +impl PhysicalExpr for ExtractTsidExpr { + fn as_any(&self) -> &dyn Any { + &*self + } + + fn data_type(&self, _input_schema: &ArrowSchema) -> ArrowResult { + Ok(DataType::UInt64) + } + + fn nullable(&self, _input_schema: &ArrowSchema) -> ArrowResult { + Ok(false) + } + + fn evaluate(&self, batch: &RecordBatch) -> ArrowResult { + let tsid_idx = batch + .schema() + .index_of(TSID_COLUMN) + .expect("checked in plan build"); + Ok(ColumnarValue::Array(batch.column(tsid_idx).clone())) + } +} + +/// Note: caller should ensure data[tail_index] is valid +pub(crate) trait AlignFunc: fmt::Debug { + fn call( + &self, + data: &VecDeque, + tail_index: usize, + timestamp: Timestamp, + param: &AlignParameter, + ) -> Result>; +} + +/// PromAlignExec will group data by tsid and align sample based on align_param +#[derive(Debug)] +pub struct PromAlignExec { + input: Arc, + column_name: Arc, + align_func: Arc, + align_param: AlignParameter, +} + +impl PromAlignExec { + pub fn try_new( + input: Arc, + column_name: Arc, + func: PromFunc, + align_param: AlignParameter, + read_parallelism: usize, + ) -> Result { + let extract_tsid: Arc = Arc::new(ExtractTsidExpr {}); + let input = Arc::new( + RepartitionExec::try_new( + input, + Partitioning::Hash(vec![extract_tsid], read_parallelism), + ) + .context(Internal)?, + ) as Arc; + let align_func: Arc = match func { + PromFunc::Instant => Arc::new(InstantFunc {}), + PromFunc::Rate => Arc::new(RateFunc {}), + PromFunc::Irate => Arc::new(IrateFunc {}), + PromFunc::Delta => Arc::new(DeltaFunc {}), + PromFunc::Idelta => Arc::new(IdeltaFunc {}), + PromFunc::Increase => Arc::new(IncreaseFunc {}), + }; + Ok(Self { + input, + column_name, + align_func, + align_param, + }) + } +} + +#[async_trait] +impl ExecutionPlan for PromAlignExec { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> ArrowSchemaRef { + self.input.schema() + } + + fn output_partitioning(&self) -> Partitioning { + self.input.output_partitioning() + } + + fn children(&self) -> Vec> { + vec![self.input.clone()] + } + + fn with_new_children( + &self, + children: Vec>, + ) -> ArrowResult> { + match children.len() { + 1 => Ok(Arc::new(PromAlignExec { + input: children[0].clone(), + column_name: self.column_name.clone(), + align_func: self.align_func.clone(), + align_param: self.align_param, + })), + _ => Err(DataFusionError::Internal( + "PromAlignExec wrong number of children".to_string(), + )), + } + } + + async fn execute( + &self, + partition: usize, + runtime: Arc, + ) -> ArrowResult { + debug!("PromAlignExec: partition:{}", partition); + Ok(Box::pin(PromAlignReader { + input: self.input.execute(partition, runtime).await?, + done: false, + column_name: self.column_name.clone(), + align_func: self.align_func.clone(), + align_param: self.align_param, + tsid_to_tags: HashMap::default(), + tsid_to_stepper: HashMap::default(), + record_schema: None, + })) + } + + fn fmt_as(&self, _t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "PromAlignExec: align_param={:?}, func={:?}, partition_count={}", + self.align_param, + self.align_func, + self.output_partitioning().partition_count(), + ) + } + + fn statistics(&self) -> Statistics { + // TODO(chenxiang) + 
Statistics::default() + } +} + +struct PromAlignReader { + /// The input to read data from + input: DfSendableRecordBatchStream, + /// Have we produced the output yet? + done: bool, + column_name: Arc, + align_func: Arc, + align_param: AlignParameter, + tsid_to_tags: HashMap>, + tsid_to_stepper: HashMap>, + record_schema: Option, +} + +impl PromAlignReader { + fn step_helper(&mut self, tsid: u64, samples: Vec) -> Result>> { + let start_timestamp = self.align_param.align_range.inclusive_start(); + let offset = self.align_param.offset; + let stepper = self.tsid_to_stepper.entry(tsid).or_insert_with(|| { + Box::new(FixedStepper::new(start_timestamp)) as Box + }); + let samples = samples + .into_iter() + .map(|Sample { timestamp, value }| { + Ok(Sample { + timestamp: timestamp + .checked_add(offset) + .context(TimestampOutOfRange {})?, + value, + }) + }) + .collect::>>()?; + let sample_range = if samples.is_empty() { + TimeRange::min_to_max() + } else { + TimeRange::new_unchecked( + samples.front().unwrap().timestamp, // we have at least one samples here + samples + .back() + .unwrap() + .timestamp + .checked_add_i64(1) + .context(TimestampOutOfRange {})?, + ) + }; + stepper.step( + samples, + sample_range, + &self.align_param, + self.align_func.clone(), + ) + } + + fn accumulate_record_batch( + &mut self, + record_batch: RecordBatch, + ) -> Result>> { + let schema = record_batch.schema(); + let tsid_idx = schema.index_of(TSID_COLUMN).expect("checked in plan build"); + let field_idx = schema + .index_of(&self.column_name.field) + .expect("checked in plan build"); + let timestamp_idx = schema + .index_of(&self.column_name.timestamp) + .expect("checked in plan build"); + + let mut tsid_samples = HashMap::new(); + let tsid_array = record_batch + .column(tsid_idx) + .as_any() + .downcast_ref::() + .expect("checked in build plan"); + if tsid_array.is_empty() { + // empty array means end of data, but maybe there are still pending samples, so + // step one more time + let tsids = self.tsid_to_stepper.keys().cloned().collect::>(); + for tsid in tsids { + if let Some(result) = self.step_helper(tsid, vec![])? { + tsid_samples.insert(tsid, result); + } + } + return Ok(tsid_samples); + } + + let mut previous_tsid = tsid_array.value(0); + let mut duplicated_tsids = vec![(previous_tsid, 0)]; + for row_idx in 1..tsid_array.len() { + let tsid = tsid_array.value(row_idx); + if tsid != previous_tsid { + previous_tsid = tsid; + duplicated_tsids.push((tsid, row_idx)); + } + } + let mut step_helper = |tsid, batch| { + if let hash_map::Entry::Vacant(e) = self.tsid_to_tags.entry(tsid) { + e.insert(Self::build_tags( + &self.column_name.tag_keys, + schema.clone(), + &batch, + )?); + } + if let Some(result) = + self.step_helper(tsid, self.build_sample(field_idx, timestamp_idx, batch)?)? 
+ { + tsid_samples.insert(tsid, result); + } + Ok(()) + }; + if duplicated_tsids.len() == 1 { + // fast path, when there is only one tsid in record_batch + step_helper(duplicated_tsids[0].0, record_batch)?; + } else { + debug!("duplicated_tsids:{:?}", duplicated_tsids); + for i in 0..duplicated_tsids.len() { + let (tsid, offset) = duplicated_tsids[i]; + let length = if i == duplicated_tsids.len() - 1 { + tsid_array.len() - offset + } else { + duplicated_tsids[i + 1].1 - offset + }; + let current_batch = record_batch.slice(offset, length); + step_helper(tsid, current_batch)?; + } + } + + Ok(tsid_samples) + } + + fn build_tags( + tag_keys: &[String], + schema: ArrowSchemaRef, + record_batch: &RecordBatch, + ) -> Result> { + tag_keys + .iter() + .map(|key| { + let v = record_batch + .column(schema.index_of(key).expect("checked in build plan")) + .as_any() + .downcast_ref::() + .context(InvalidColumnType { + required_type: "StringArray", + })? + .value(0); + Ok((key.to_owned(), v.to_string())) + }) + .collect::>>() + } + + fn build_sample( + &self, + field_idx: usize, + timestamp_idx: usize, + record_batch: RecordBatch, + ) -> Result> { + let field_array = record_batch + .column(field_idx) + .as_any() + .downcast_ref::() + .context(InvalidColumnType { + required_type: "Float64Array", + })?; + let timestamp_array = record_batch + .column(timestamp_idx) + .as_any() + .downcast_ref::() + .context(InvalidColumnType { + required_type: "TimestampMillisecondArray", + })?; + field_array + .into_iter() + .zip(timestamp_array.into_iter()) + .map(|(field, timestamp)| { + Ok(Sample { + value: field.context(NullColumn { name: "field" })?, + timestamp: Timestamp::new(timestamp.context(NullColumn { name: "timestamp" })?), + }) + }) + .collect::>>() + } + + fn samples_to_record_batch( + &self, + schema: ArrowSchemaRef, + tsid_samples: HashMap>, + ) -> std::result::Result { + let tsid_idx = schema.index_of(TSID_COLUMN).expect("checked in plan build"); + let field_idx = schema + .index_of(&self.column_name.field) + .expect("checked in plan build"); + let timestamp_idx = schema + .index_of(&self.column_name.timestamp) + .expect("checked in plan build"); + let mut batches = Vec::with_capacity(tsid_samples.len()); + for (tsid, samples) in tsid_samples { + let record_batch_len = samples.len(); + let tags = self + .tsid_to_tags + .get(&tsid) + .expect("tags are ensured in accumulated_record_batch"); + let mut arrays = vec![new_empty_array(&DataType::Int32); schema.fields().len()]; + arrays[tsid_idx] = Arc::new(UInt64Array::from(vec![tsid; record_batch_len])); + let mut fields = Vec::with_capacity(record_batch_len); + let mut timestamps = Vec::with_capacity(record_batch_len); + for Sample { + timestamp, + value: field, + } in samples + { + fields.push(field); + timestamps.push(timestamp.as_i64()); + } + arrays[timestamp_idx] = Arc::new(TimestampMillisecondArray::from(timestamps)); + arrays[field_idx] = Arc::new(Float64Array::from(fields)); + + for tag_key in &self.column_name.tag_keys { + let tag_idx = schema + .index_of(tag_key.as_str()) + .expect("checked in plan build"); + arrays[tag_idx] = Arc::new(StringArray::from(vec![ + tags.get(tag_key) + .expect("tag_key are ensured in accmulate_record_batch") + .to_string(); + record_batch_len + ])); + } + batches.push(RecordBatch::try_new(schema.clone(), arrays)?); + } + + RecordBatch::concat(&schema, &batches) + } +} + +impl Stream for PromAlignReader { + type Item = std::result::Result; + + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + if 
self.done { + return Poll::Ready(None); + } + + match self.input.poll_next_unpin(cx) { + Poll::Ready(Some(Ok(batch))) => { + let schema = batch.schema(); + if self.record_schema.is_none() { + self.record_schema = Some(schema.clone()); + } + let tsid_samples = self + .accumulate_record_batch(batch) + .map_err(|e| ArrowError::SchemaError(e.to_string()))?; // convert all Error enum to SchemaError + if !tsid_samples.is_empty() { + Poll::Ready(Some(self.samples_to_record_batch(schema, tsid_samples))) + } else { + Poll::Ready(Some(Ok(RecordBatch::new_empty(schema)))) + } + } + Poll::Ready(None) => { + self.done = true; + if let Some(schema) = mem::take(&mut self.record_schema) { + let tsid_samples = self + .accumulate_record_batch(RecordBatch::new_empty(schema.clone())) + .map_err(|e| ArrowError::SchemaError(e.to_string()))?; + if !tsid_samples.is_empty() { + return Poll::Ready(Some( + self.samples_to_record_batch(schema, tsid_samples), + )); + } + } + Poll::Ready(None) + } + other => other, + } + } +} + +impl RecordBatchStream for PromAlignReader { + fn schema(&self) -> ArrowSchemaRef { + self.input.schema() + } +} + +#[derive(Debug)] +pub(crate) struct Sample { + timestamp: Timestamp, + value: f64, +} + +/// `Stepper` is used for align samples, specified by [range queries](https://prometheus.io/docs/prometheus/latest/querying/api/#range-queries). +/// Note: [instant queries](https://prometheus.io/docs/prometheus/latest/querying/api/#instant-queries) are models as range queries with step is 1. +/// +/// # Diagram +/// ```plaintext +/// range +/// +-------------+ +/// v | +/// |------|-----|-----|-----|-----|--------> +/// start step end +/// ``` +trait Stepper: fmt::Debug { + /// Calculate current sample based on new input samples. + /// Samples maybe kept since some function require large time range input, + /// such as rate(metric[1d]) + fn step( + &mut self, + input: VecDeque, + range: TimeRange, + param: &AlignParameter, + align_func: Arc, + ) -> Result>>; + + // Returns size of samples kept during query, mainly used for metrics + fn pending_column_bytes(&self) -> usize; +} + +/// `FixedStepper` is one implemention of `Stepper`, which will accumulate all +/// samples within each step before pass control to next execution node. +/// This implemention will consume high memory in large range query, such as +/// rate(metric[30d]) + +/// TODO(chenxiang): A streaming implemention is required for those large range +/// query. 
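+///
+/// For example (illustrative numbers): with an `align_range` of [0s, 100s),
+/// `step` = 30s, `lookback_delta` = 60s and input samples at 10s, 40s and 70s,
+/// aligned points are eventually emitted at 30s, 60s and 90s, each produced by
+/// calling the `AlignFunc` over the retained samples whose timestamps fall in
+/// `[t - lookback_delta, t]`, i.e. {10s}, {10s, 40s} and {40s, 70s}; the 0s
+/// step yields nothing because the first sample only arrives at 10s.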
+#[derive(Debug)] +struct FixedStepper { + /// accumulated samples used for calculate sample for current step + entries: VecDeque, + /// tail index of entries for processing current step, which means + /// [0, tail_index] is used + tail_index: usize, + /// timestamp of current step sample + timestamp: Timestamp, +} + +impl Stepper for FixedStepper { + fn step( + &mut self, + mut column: VecDeque, + column_range: TimeRange, + param: &AlignParameter, + align_func: Arc, + ) -> Result>> { + self.entries.append(&mut column); + debug!( + "column_range:{:?}, param:{:?}, ts:{:?}", + column_range, param, self.timestamp + ); + let curr_range = param.align_range.intersected_range(column_range); + if curr_range.is_none() { + return Ok(None); + } + let curr_range = curr_range.unwrap(); + let mut result = vec![]; + + // self.timestamp = self.timestamp.max(start); + while self.timestamp < curr_range.inclusive_start() { + self.timestamp = self + .timestamp + .checked_add(param.step) + .context(TimestampOutOfRange {})?; + } + + while curr_range.contains(self.timestamp) { + // push `tail_index`. In look ahead (by increasing index by 1) way. + while self.tail_index + 1 < self.entries.len() + && self.entries[self.tail_index + 1].timestamp <= self.timestamp + { + self.tail_index += 1; + } + let mint = self + .timestamp + .checked_sub(param.lookback_delta) + .context(TimestampOutOfRange {})?; + // drop some unneeded entries from begining of `entries` + while let Some(entry) = self.entries.front() { + if entry.timestamp < mint { + self.entries.pop_front(); + if let Some(index) = self.tail_index.checked_sub(1) { + self.tail_index = index + } + } else { + break; + } + } + // [mint, self.timestamp] has no data, skip to next step. + let skip = { + if let Some(first_entry) = self.entries.get(0) { + first_entry.timestamp > self.timestamp + } else { + true + } + }; + if skip { + self.timestamp = self + .timestamp + .checked_add(param.step) + .context(TimestampOutOfRange {})?; + continue; + } + + // call range function + if let Some(value) = + align_func.call(&self.entries, self.tail_index, self.timestamp, param)? + { + result.push(value); + } + + self.timestamp = self + .timestamp + .checked_add(param.step) + .context(TimestampOutOfRange {})?; + } + + if !result.is_empty() { + Ok(Some(result)) + } else { + Ok(None) + } + } + + fn pending_column_bytes(&self) -> usize { + self.entries.len() * 16 // timestamp + float value + } +} + +impl FixedStepper { + fn new(start_timestamp: Timestamp) -> FixedStepper { + Self { + entries: VecDeque::new(), + tail_index: 0, + timestamp: start_timestamp, + } + } +} + +/// Helper for Promtheus functions which needs extrapolation. [Rate][rate], +/// [Increase][increase] and [Delta][delta] for now. +/// +/// Since "range" is not always equals to `data_duration`, extrapolation needs +/// to be performed to estimate absent data. Extrapolation is named by +/// Prometheus. This function is ported from [here][prom_extrapolate_code]. +/// "extrapolate" assumes absenting data is following the same distribution with +/// existing data. Thus it simply zooms result calculated from existing data to +/// required extrapolation time range. 
+/// +/// [rate]: https://prometheus.io/docs/prometheus/latest/querying/functions/#rate +/// [increase]: https://prometheus.io/docs/prometheus/latest/querying/functions/#increase +/// [delta]: https://prometheus.io/docs/prometheus/latest/querying/functions/#delta +/// [prom_extrapolate_code]: https://github.com/prometheus/prometheus/blob/063154eab720d8c3d495bd78312c0df090d0bf23/promql/functions.go#L59 +/// +/// This function can be roughly divided into three parts: +/// - Calculate result from real data +/// - Calculate time range needs extrapolate to. +/// - Calculate extrapolated result. +/// +/// The outputs of above three steps are `difference`, `extrapolated_duration` +/// and `extrapolated_result`. +/// +/// # Diagram +/// ```plaintext +/// range_start first_timestamp last_timestamp range_end +/// └─────────────────────┴────────────────────┴──────────────────┘ +/// range_to_start data_duration range_to_end +/// ``` +/// +/// Legends: +/// - `range_end` is the timestamp passed in +/// - `range_start` is calculated by `timestamp` - `lookback_range`. +/// - "range" here stands for `range_end` - `range_start`, which is equals to +/// `range_to_start` + `data_duration` + `range_to_end`. +/// - `first/last_timestamp` is the timestamp of provided data. +/// - `data_duration` is a time range covered by data. +fn extrapolate_fn_helper( + data: &VecDeque, + tail_index: usize, + timestamp: Timestamp, + lookback_delta: Timestamp, + is_counter: bool, + is_rate: bool, +) -> Result> { + // no sence to calculate rate on one single item. + if tail_index < 1 { + return Ok(None); + } + + let first_data = data[0].value; + + // calculate `counter_reset_correction` for counter type. + let mut counter_reset_correction = 0.0; + if is_counter { + let mut last_data = first_data; + for Sample { value, .. } in data.iter().take(tail_index + 1) { + if *value < last_data { + counter_reset_correction += last_data; + } + last_data = *value; + } + } + + let difference = data[tail_index].value - first_data + counter_reset_correction; + + // `average_duration_between_data` assumes all data is distributed evenly. + let first_timestamp = data[0].timestamp; + let last_timestamp = data[tail_index].timestamp; + let data_duration = (last_timestamp + .checked_sub(first_timestamp) + .context(TimestampOutOfRange {})?) + .as_i64() as f64; + let average_duration_between_data = data_duration / tail_index as f64; + + let range_start = timestamp + .checked_sub(lookback_delta) + .context(TimestampOutOfRange {})?; + let range_end = timestamp; + let mut range_to_start = (first_timestamp + .checked_sub(range_start) + .context(TimestampOutOfRange)?) + .as_i64() as f64; + let mut range_to_end = (range_end + .checked_sub(last_timestamp) + .context(TimestampOutOfRange {})?) + .as_i64() as f64; + + // Prometheus shorten forward-extrapolation to zero point. + if is_counter && difference > 0.0 && first_data >= 0.0 { + let range_to_zero_point = data_duration * (first_data / difference); + range_to_start = range_to_start.min(range_to_zero_point); + } + + let extrapolation_threshold = + average_duration_between_data * PROMTHEUS_EXTRAPOLATION_THRESHOLD_COEFFICIENT; + + // if lots of data is absent (`range_to_start` or `range_to_end` is longer than + // `extrapolation_threshold`), Prometheus will not estimate all time range. Use + // half of `average_duration_between_data` instead. 
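+    // For illustration (hypothetical numbers): a rate over a 60s window ending
+    // at t = 60s, with counter samples (10s, 10), (20s, 20), (30s, 30), (50s, 50):
+    //   difference = 50 - 10 = 40, data_duration = 40s,
+    //   average_duration_between_data = 40s / 3 ≈ 13.3s,
+    //   range_to_start = range_to_end = 10s,
+    //   extrapolation_threshold ≈ 13.3s * 1.1 ≈ 14.7s,
+    // so neither gap is shortened below; extrapolated_duration = 60s and
+    // extrapolated_result = 40 * 60s / 40s = 60, which divided by the 60s
+    // window gives a rate of 1.0/s, matching a counter growing by one per second.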
+ if range_to_start > extrapolation_threshold { + range_to_start = average_duration_between_data / 2.0; + } + if range_to_end > extrapolation_threshold { + range_to_end = average_duration_between_data / 2.0; + } + + // `difference` is the real result calculated by existing data. Prometheus will + // zoom it to `extrapolated_duration` to get extrapolated estimated result. + let extrapolated_duration = data_duration + range_to_start + range_to_end; + let mut extrapolated_result = difference * extrapolated_duration / data_duration; + + if is_rate { + // `lookback_delta` here is in millisecond. + extrapolated_result /= lookback_delta.as_i64() as f64 / 1000.0; + } + + Ok(Some(Sample { + timestamp, + value: extrapolated_result, + })) +} + +/// Implementation of `Rate` function in `Prometheus`. More +/// [details](https://prometheus.io/docs/prometheus/latest/querying/functions/#rate) +#[derive(Debug)] +struct RateFunc {} + +impl AlignFunc for RateFunc { + fn call( + &self, + data: &VecDeque, + tail_index: usize, + timestamp: Timestamp, + param: &AlignParameter, + ) -> Result> { + extrapolate_fn_helper( + data, + tail_index, + timestamp, + param.lookback_delta, + true, + true, + ) + } +} + +#[derive(Debug)] +struct DeltaFunc {} + +impl AlignFunc for DeltaFunc { + fn call( + &self, + data: &VecDeque, + tail_index: usize, + timestamp: Timestamp, + param: &AlignParameter, + ) -> Result> { + extrapolate_fn_helper( + data, + tail_index, + timestamp, + param.lookback_delta, + false, + false, + ) + } +} + +#[derive(Debug)] +struct IncreaseFunc {} + +impl AlignFunc for IncreaseFunc { + fn call( + &self, + data: &VecDeque, + tail_index: usize, + timestamp: Timestamp, + param: &AlignParameter, + ) -> Result> { + extrapolate_fn_helper( + data, + tail_index, + timestamp, + param.lookback_delta, + true, + false, + ) + } +} + +// Port from https://github.com/prometheus/prometheus/blob/063154eab720d8c3d495bd78312c0df090d0bf23/promql/functions.go#L159 +fn instant_value( + data: &VecDeque, + tail_index: usize, + timestamp: Timestamp, + is_rate: bool, +) -> Result> { + if tail_index < 2 { + return Ok(None); + } + + let last_entry = &data[tail_index]; + let previous_entry = &data[tail_index - 1]; + + let mut result = if is_rate && last_entry.value < previous_entry.value { + last_entry.value + } else { + last_entry.value - previous_entry.value + }; + + let interval = last_entry + .timestamp + .checked_sub(previous_entry.timestamp) + .context(TimestampOutOfRange {})?; + assert!(interval.as_i64() > 0); + + if is_rate { + // Convert to per-second. + result /= interval.as_i64() as f64 / 1000.0; + } + + Ok(Some(Sample { + value: result, + timestamp, + })) +} + +#[derive(Debug)] +pub struct IdeltaFunc; + +impl AlignFunc for IdeltaFunc { + fn call( + &self, + data: &VecDeque, + tail_index: usize, + timestamp: Timestamp, + _param: &AlignParameter, + ) -> Result> { + instant_value(data, tail_index, timestamp, false) + } +} + +#[derive(Debug)] +struct IrateFunc; + +impl AlignFunc for IrateFunc { + fn call( + &self, + data: &VecDeque, + tail_index: usize, + timestamp: Timestamp, + _param: &AlignParameter, + ) -> Result> { + instant_value(data, tail_index, timestamp, true) + } +} + +/// This function is not in Promtheus' functions list. +/// +/// It simulates the behavior of `Instant Selector` by finding the newest point +/// from the input. Thus `Instant Selector` can be represented by [PromAlignOp] +/// + [InstantFn]. 
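+///
+/// For example, given retained samples at 10s, 20s and 35s and a step
+/// timestamp of 30s, the emitted sample carries the value observed at 20s
+/// (the newest sample not later than the step) stamped with the 30s timestamp.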
+#[derive(Debug)] +pub struct InstantFunc; + +impl AlignFunc for InstantFunc { + fn call( + &self, + data: &VecDeque, + tail_index: usize, + timestamp: Timestamp, + _param: &AlignParameter, + ) -> Result> { + Ok(Some(Sample { + timestamp, + value: data[tail_index].value, + })) + } +} diff --git a/query_engine/src/df_planner_extension/mod.rs b/query_engine/src/df_planner_extension/mod.rs new file mode 100644 index 0000000000..336cd128f5 --- /dev/null +++ b/query_engine/src/df_planner_extension/mod.rs @@ -0,0 +1,40 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! The query planner adapter provides some planner extensions of datafusion. + +use std::sync::Arc; + +use arrow_deps::datafusion::{ + execution::context::{ExecutionContextState, QueryPlanner}, + logical_plan::LogicalPlan, + physical_plan::{ + planner::{DefaultPhysicalPlanner, ExtensionPlanner}, + ExecutionPlan, PhysicalPlanner, + }, +}; + +pub mod prom_align; +pub mod table_scan_by_primary_key; +use async_trait::async_trait; + +/// The adapter for extending the default datafusion planner. +pub struct QueryPlannerAdapter; + +#[async_trait] +impl QueryPlanner for QueryPlannerAdapter { + async fn create_physical_plan( + &self, + logical_plan: &LogicalPlan, + ctx_state: &ExecutionContextState, + ) -> arrow_deps::datafusion::error::Result> { + let extension_planners: Vec> = vec![ + Arc::new(table_scan_by_primary_key::Planner), + Arc::new(prom_align::PromAlignPlanner), + ]; + + let physical_planner = DefaultPhysicalPlanner::with_extension_planners(extension_planners); + physical_planner + .create_physical_plan(logical_plan, ctx_state) + .await + } +} diff --git a/query_engine/src/df_planner_extension/prom_align.rs b/query_engine/src/df_planner_extension/prom_align.rs new file mode 100644 index 0000000000..f55b7042e4 --- /dev/null +++ b/query_engine/src/df_planner_extension/prom_align.rs @@ -0,0 +1,53 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::sync::Arc; + +use arrow_deps::datafusion::{ + error::DataFusionError, + execution::context::ExecutionContextState, + logical_plan::{LogicalPlan, UserDefinedLogicalNode}, + physical_plan::{planner::ExtensionPlanner, ExecutionPlan, PhysicalPlanner}, +}; +use snafu::Snafu; +use sql::promql::PromAlignNode; + +use crate::df_execution_extension::prom_align::{Error as ExecError, PromAlignExec}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Build execution failed. err:{:?}", source))] + ExecutionError { source: ExecError }, +} + +pub struct PromAlignPlanner; + +impl ExtensionPlanner for PromAlignPlanner { + fn plan_extension( + &self, + _planner: &dyn PhysicalPlanner, + node: &dyn UserDefinedLogicalNode, + logical_inputs: &[&LogicalPlan], + physical_inputs: &[Arc], + _ctx_state: &ExecutionContextState, + ) -> arrow_deps::datafusion::error::Result>> { + Ok( + if let Some(node) = node.as_any().downcast_ref::() { + assert_eq!(logical_inputs.len(), 1, "Inconsistent number of inputs"); + assert_eq!(physical_inputs.len(), 1, "Inconsistent number of inputs"); + Some(Arc::new( + PromAlignExec::try_new( + physical_inputs[0].clone(), + node.column_name.clone(), + node.func, + node.align_param, + node.read_parallelism, + ) + // DataFusionError is lost when wrapped, use string instead. 
+ .map_err(|e| DataFusionError::Plan(e.to_string()))?, + )) + } else { + None + }, + ) + } +} diff --git a/query_engine/src/df_planner_extension/table_scan_by_primary_key.rs b/query_engine/src/df_planner_extension/table_scan_by_primary_key.rs new file mode 100644 index 0000000000..c864270aaa --- /dev/null +++ b/query_engine/src/df_planner_extension/table_scan_by_primary_key.rs @@ -0,0 +1,141 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::{ + any::Any, + fmt::{Debug, Formatter}, + sync::Arc, +}; + +use arrow_deps::datafusion::{ + error::DataFusionError, + execution::context::ExecutionContextState, + logical_plan::{self, DFSchemaRef, Expr, LogicalPlan, TableScan, UserDefinedLogicalNode}, + physical_plan::{planner::ExtensionPlanner, ExecutionPlan, PhysicalPlanner}, +}; +use table_engine::{provider::TableProviderAdapter, table::ReadOrder}; + +/// The extension planner creates physical plan for the +/// [`TableScanByPrimaryKey`] which is a logical plan node. +pub struct Planner; + +impl ExtensionPlanner for Planner { + fn plan_extension( + &self, + _planner: &dyn PhysicalPlanner, + node: &dyn UserDefinedLogicalNode, + _logical_inputs: &[&LogicalPlan], + _physical_inputs: &[Arc], + _ctx_state: &ExecutionContextState, + ) -> arrow_deps::datafusion::error::Result>> { + node.as_any() + .downcast_ref::() + .map(|order_by_node| order_by_node.build_scan_table_exec_plan()) + .transpose() + } +} + +/// TableScanInPrimaryKeyOrder is a [`UserDefinedLogicalNode`] of datafusion +/// which normally is generated during logical plan optimization. +/// +/// It differs from the default [`TableScan`] in its corresponding +/// [`ExecutionPlan`] is a special [`ScanTable`] which can controls the scan +/// order. +#[derive(Clone)] +pub struct TableScanByPrimaryKey { + asc: bool, + scan_plan: Arc, +} + +impl Debug for TableScanByPrimaryKey { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + self.fmt_for_explain(f) + } +} + +impl TableScanByPrimaryKey { + /// Build the node from a [TableScan] node + /// + /// Note it panics if the plan node is not a LogicalPlan::TableScan. + pub fn new_from_scan_plan(asc: bool, scan_plan: Arc) -> Self { + // TODO(xikai): should ensure the scan_plan is a real TableScan. + Self { asc, scan_plan } + } + + /// Build the scan table [ExecutionPlan]. + fn build_scan_table_exec_plan( + &self, + ) -> arrow_deps::datafusion::error::Result> { + match self.scan_plan.as_ref() { + LogicalPlan::TableScan(TableScan { + source, + projection, + filters, + limit, + .. 
+ }) => { + let table_provider = + if let Some(v) = source.as_any().downcast_ref::() { + v + } else { + return Err(DataFusionError::Internal(format!( + "expect table provider adapter, given plan:{:?}", + self.scan_plan, + ))); + }; + + // Remove all qualifiers from the scan as the provider + // doesn't know (nor should care) how the relation was + // referred to in the query + let filters = logical_plan::unnormalize_cols(filters.iter().cloned()); + + table_provider.scan_table( + projection, + &filters, + *limit, + ReadOrder::from_is_asc(Some(self.asc)), + ) + } + _ => Err(DataFusionError::Internal(format!( + "expect scan plan, given plan:{:?}", + self.scan_plan + ))), + } + } +} + +impl UserDefinedLogicalNode for TableScanByPrimaryKey { + fn as_any(&self) -> &dyn Any { + self + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![] + } + + fn schema(&self) -> &DFSchemaRef { + self.scan_plan.schema() + } + + fn expressions(&self) -> Vec { + vec![] + } + + fn fmt_for_explain(&self, f: &mut Formatter) -> std::fmt::Result { + write!( + f, + "ScanTableInPrimaryKeyOrder, asc:{}, table_scan:{:?}", + self.asc, self.scan_plan + ) + } + + fn from_template( + &self, + _exprs: &[Expr], + _inputs: &[LogicalPlan], + ) -> Arc { + Arc::new(Self { + asc: self.asc, + scan_plan: self.scan_plan.clone(), + }) + } +} diff --git a/query_engine/src/executor.rs b/query_engine/src/executor.rs new file mode 100644 index 0000000000..99d8a637bf --- /dev/null +++ b/query_engine/src/executor.rs @@ -0,0 +1,138 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Query executor + +use std::sync::Arc; + +use async_trait::async_trait; +use common_types::record_batch::RecordBatch; +use futures::TryStreamExt; +use log::debug; +use snafu::{ResultExt, Snafu}; +use sql::{plan::QueryPlan, provider::CatalogProviderAdapter}; +use table_engine::stream::SendableRecordBatchStream; + +use crate::{ + context::ContextRef, + logical_optimizer::{LogicalOptimizer, LogicalOptimizerImpl}, + physical_optimizer::{PhysicalOptimizer, PhysicalOptimizerImpl}, + physical_plan::PhysicalPlanPtr, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to do logical optimization, err:{}", source))] + LogicalOptimize { + source: crate::logical_optimizer::Error, + }, + + #[snafu(display("Failed to do physical optimization, err:{}", source))] + PhysicalOptimize { + source: crate::physical_optimizer::Error, + }, + + #[snafu(display("Failed to execute physical plan, err:{}", source))] + ExecutePhysical { source: crate::physical_plan::Error }, + + #[snafu(display("Failed to collect record batch stream, err:{}", source,))] + Collect { source: table_engine::stream::Error }, +} + +define_result!(Error); + +// Use a type alias so that we are able to replace the implementation +pub type RecordBatchVec = Vec; + +/// Query to execute +/// +/// Contains the query plan and other infos +#[derive(Debug)] +pub struct Query { + /// The query plan + plan: QueryPlan, +} + +impl Query { + pub fn new(plan: QueryPlan) -> Self { + Self { plan } + } +} + +/// Query executor +/// +/// Executes the logical plan +#[async_trait] +pub trait Executor: Clone + Send + Sync { + // TODO(yingwen): Maybe return a stream + /// Execute the query, returning the query results as RecordBatchVec + /// + /// REQUIRE: The meta data of tables in query should be found from + /// ContextRef + async fn execute_logical_plan(&self, ctx: ContextRef, query: Query) -> Result; +} + +#[derive(Clone, Default)] +pub struct ExecutorImpl; + +impl ExecutorImpl { + pub 
fn new() -> Self { + Self::default() + } +} + +#[async_trait] +impl Executor for ExecutorImpl { + async fn execute_logical_plan(&self, ctx: ContextRef, query: Query) -> Result { + let plan = query.plan; + + // Register catalogs to datafusion execution context. + let catalogs = CatalogProviderAdapter::new_adapters(plan.tables.clone()); + let df_ctx = ctx.df_exec_ctx(); + for (name, catalog) in catalogs { + df_ctx.register_catalog(&name, Arc::new(catalog)); + } + let request_id = ctx.request_id(); + + let physical_plan = optimize_plan(ctx, plan).await?; + + debug!( + "Executor physical optimization finished, request_id:{}, physical_plan: {:?}", + request_id, physical_plan + ); + + let stream = physical_plan.execute().await.context(ExecutePhysical)?; + + // Collect all records in the pool, as the stream may perform some costly + // calculation + let record_batches = collect(stream).await?; + + debug!( + "Executor executed plan, request_id:{}, plan_and_metrics: {}", + request_id, + physical_plan.metrics_to_string() + ); + + Ok(record_batches) + } +} + +async fn optimize_plan(ctx: ContextRef, plan: QueryPlan) -> Result { + let mut logical_optimizer = LogicalOptimizerImpl::with_context(ctx.clone()); + let plan = logical_optimizer.optimize(plan).context(LogicalOptimize)?; + + debug!( + "Executor logical optimization finished, request_id:{}, plan: {:#?}", + ctx.request_id(), + plan + ); + + let mut physical_optimizer = PhysicalOptimizerImpl::with_context(ctx); + physical_optimizer + .optimize(plan) + .await + .context(PhysicalOptimize) +} + +async fn collect(stream: SendableRecordBatchStream) -> Result { + stream.try_collect().await.context(Collect) +} diff --git a/query_engine/src/lib.rs b/query_engine/src/lib.rs new file mode 100644 index 0000000000..36440dbb11 --- /dev/null +++ b/query_engine/src/lib.rs @@ -0,0 +1,19 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Query engine +//! +//! Optimizes and executes logical plan + +// TODO(yingwen): Maybe renamed to query_executor or query_backend? +// TODO(yingwen): Use datafusion or fuse-query as query backend + +#[macro_use] +extern crate common_util; + +pub mod context; +pub mod df_execution_extension; +pub mod df_planner_extension; +pub mod executor; +pub mod logical_optimizer; +pub mod physical_optimizer; +pub mod physical_plan; diff --git a/query_engine/src/logical_optimizer/mod.rs b/query_engine/src/logical_optimizer/mod.rs new file mode 100644 index 0000000000..2bcad7955f --- /dev/null +++ b/query_engine/src/logical_optimizer/mod.rs @@ -0,0 +1,61 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Logical optimizer + +pub mod order_by_primary_key; +#[cfg(test)] +pub mod tests; +pub mod type_conversion; + +use arrow_deps::datafusion::error::DataFusionError; +use snafu::{Backtrace, ResultExt, Snafu}; +use sql::plan::QueryPlan; + +use crate::context::ContextRef; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "DataFusion Failed to optimize logical plan, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + // TODO(yingwen): Should we carry plan in this context? 
+ DataFusionOptimize { + source: DataFusionError, + backtrace: Backtrace, + }, +} + +define_result!(Error); + +/// LogicalOptimizer transform the QueryPlan into a potentially more efficient +/// plan +pub trait LogicalOptimizer { + // TODO(yingwen): Maybe support other plans + fn optimize(&mut self, plan: QueryPlan) -> Result; +} + +pub struct LogicalOptimizerImpl { + ctx: ContextRef, +} + +impl LogicalOptimizerImpl { + pub fn with_context(ctx: ContextRef) -> Self { + Self { ctx } + } +} + +impl LogicalOptimizer for LogicalOptimizerImpl { + fn optimize(&mut self, plan: QueryPlan) -> Result { + // TODO(yingwen): Avoid clone the plan multiple times during optimization + let QueryPlan { + mut df_plan, + tables, + } = plan; + let exec_ctx = self.ctx.df_exec_ctx(); + df_plan = exec_ctx.optimize(&df_plan).context(DataFusionOptimize)?; + + Ok(QueryPlan { df_plan, tables }) + } +} diff --git a/query_engine/src/logical_optimizer/order_by_primary_key.rs b/query_engine/src/logical_optimizer/order_by_primary_key.rs new file mode 100644 index 0000000000..ef7942bbd9 --- /dev/null +++ b/query_engine/src/logical_optimizer/order_by_primary_key.rs @@ -0,0 +1,413 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::{convert::TryFrom, sync::Arc}; + +use arrow_deps::datafusion::{ + execution::context::ExecutionProps, + logical_plan::{ + plan::{Extension, Filter, Projection, Sort}, + DFSchemaRef, Expr, Limit, LogicalPlan, TableScan, + }, + optimizer::optimizer::OptimizerRule, +}; +use common_types::schema::Schema; +use log::info; + +use crate::df_planner_extension::table_scan_by_primary_key::TableScanByPrimaryKey; + +/// The optimizer rule applies to the example plan: +/// Limit: 1 +/// Sort: #test.id ASC NULLS FIRST, #test.t ASC NULLS FIRST +/// Projection: #test.tsid, #test.t, #test.id, #test.tag1, #test.tag2 +/// TableScan: test projection=None +pub struct OrderByPrimaryKeyRule; + +impl OrderByPrimaryKeyRule { + /// Optimize the plan if it is the pattern: + /// Limit: + /// Sort: + /// Project: + /// (Filter): (Filer node is allowed to be not exist) + /// TableScan + fn do_optimize( + &self, + plan: &LogicalPlan, + ) -> arrow_deps::datafusion::error::Result> { + if let LogicalPlan::Limit(Limit { + n, + input: sort_plan, + }) = plan + { + if let LogicalPlan::Sort(Sort { + expr: sort_exprs, + input: projection_plan, + }) = sort_plan.as_ref() + { + if let LogicalPlan::Projection(Projection { + expr: projection_exprs, + input: scan_or_filter_plan, + schema: projection_schema, + alias, + }) = projection_plan.as_ref() + { + let (scan_plan, filter_predicate) = if let LogicalPlan::Filter(Filter { + predicate, + input: scan_plan, + }) = scan_or_filter_plan.as_ref() + { + (scan_plan, Some(predicate)) + } else { + (scan_or_filter_plan, None) + }; + + if let LogicalPlan::TableScan(TableScan { + table_name, source, .. 
+ }) = scan_plan.as_ref() + { + let schema = Schema::try_from(source.schema()).map_err(|e| { + let err_msg = format!( + "fail to convert arrow schema to schema, table:{}, err:{:?}", + table_name, e + ); + arrow_deps::datafusion::error::DataFusionError::Plan(err_msg) + })?; + if let Some(sort_in_asc_order) = + Self::detect_primary_key_order(&schema, sort_exprs.as_slice()) + { + let new_plan = Self::rewrite_plan(RewriteContext { + projection: projection_exprs.clone(), + filter_predicate: filter_predicate.cloned(), + schema: projection_schema.clone(), + alias: alias.clone(), + scan_plan: scan_plan.clone(), + sort_exprs: sort_exprs.clone(), + sort_in_asc_order, + limit: *n, + }); + return Ok(Some(new_plan)); + } + } + } + } + } + + Ok(None) + } + + /// Check: + /// - Whether `timestamp` is the first column in the primary key. + /// - Whether `sort_exprs` is equals the any prefix of primary key. + /// - Whether `sort_exprs` is in the same order. + /// + /// Returns: Some(sort_order) if the two rules above are true. + fn detect_primary_key_order(schema: &Schema, sort_exprs: &[Expr]) -> Option { + if schema.timestamp_index() != 0 { + return None; + } + + let key_cols = schema.key_columns(); + if sort_exprs.len() > key_cols.len() { + return None; + } + let sub_key_cols = &key_cols[..sort_exprs.len()]; + + let mut in_asc_order = None; + for (sort_expr, key_col) in sort_exprs.iter().zip(sub_key_cols.iter()) { + if let Expr::Sort { expr, asc, .. } = sort_expr { + if let Some(in_asc_order) = in_asc_order.as_mut() { + if in_asc_order != asc { + return None; + } + } + in_asc_order = Some(*asc); + + if let Expr::Column(col) = expr.as_ref() { + if col.name == key_col.name { + continue; + } + } + } + return None; + } + + in_asc_order + } + + // TODO(xikai): The topmost limit and sort plan node of the rewritten plan is + // not necessary now because now the rewrite requires the timestamp key is + // the first column in the primary key and that means the output of + // TableScanByPrimaryKey is in the correct order. And topmost two + // plan nodes is used to optimize the normal cases where the timestamp key is + // any column. 
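+ // For example, with a primary key of (t, key) where `t` is the timestamp
+ // column (the shape built by `build_optimized_schema` in the tests below):
+ // - `ORDER BY t ASC, key ASC LIMIT n` and the prefix `ORDER BY t ASC LIMIT n`
+ //   are detected and rewritten;
+ // - `ORDER BY key ASC` (not a prefix of the primary key) and
+ //   `ORDER BY t ASC, key DESC` (mixed sort directions) leave the plan untouched.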
+ /// Rewrite the plan: + /// Limit: + /// Sort: + /// Project: + /// Filter: + /// TableScan + /// + /// Rewritten plan: + /// Limit: + /// Sort: + /// Limit: + /// Project: + /// Filter: + /// TableScanByPrimaryKey + fn rewrite_plan(rewrite_ctx: RewriteContext) -> LogicalPlan { + let order_by_primary_key_scan = Arc::new(LogicalPlan::Extension(Extension { + node: Arc::new(TableScanByPrimaryKey::new_from_scan_plan( + rewrite_ctx.sort_in_asc_order, + rewrite_ctx.scan_plan, + )), + })); + + let filter_plan = if let Some(predicate) = rewrite_ctx.filter_predicate { + Arc::new(LogicalPlan::Filter(Filter { + predicate, + input: order_by_primary_key_scan, + })) + } else { + order_by_primary_key_scan + }; + + let new_project_plan = Arc::new(LogicalPlan::Projection(Projection { + expr: rewrite_ctx.projection, + input: filter_plan, + schema: rewrite_ctx.schema, + alias: rewrite_ctx.alias, + })); + + let new_limit_plan = Arc::new(LogicalPlan::Limit(Limit { + n: rewrite_ctx.limit, + input: new_project_plan, + })); + + let new_sort_plan = Arc::new(LogicalPlan::Sort(Sort { + expr: rewrite_ctx.sort_exprs, + input: new_limit_plan, + })); + LogicalPlan::Limit(Limit { + n: rewrite_ctx.limit, + input: new_sort_plan, + }) + } +} + +impl OptimizerRule for OrderByPrimaryKeyRule { + fn optimize( + &self, + plan: &LogicalPlan, + _execution_props: &ExecutionProps, + ) -> arrow_deps::datafusion::error::Result { + match self.do_optimize(plan)? { + Some(new_plan) => { + info!( + "optimize plan by OrderByPrimaryKeyRule, original plan:\n{:?}\n optimized plan:\n{:?}", + plan, new_plan + ); + Ok(new_plan) + } + None => Ok(plan.clone()), + } + } + + fn name(&self) -> &str { + "order_by_primary_key" + } +} + +struct RewriteContext { + projection: Vec, + filter_predicate: Option, + schema: DFSchemaRef, + alias: Option, + scan_plan: Arc, + sort_exprs: Vec, + sort_in_asc_order: bool, + limit: usize, +} + +#[cfg(test)] +mod tests { + use arrow_deps::datafusion::{logical_plan::Column, scalar::ScalarValue}; + use common_types::{column_schema, datum::DatumKind, schema}; + + use super::*; + use crate::logical_optimizer::tests::LogicalPlanNodeBuilder; + + const TEST_TABLE_NAME: &str = "order_by_primary_key_test_table"; + + fn build_no_optimized_schema() -> Schema { + schema::Builder::new() + .auto_increment_column_id(true) + .add_key_column( + column_schema::Builder::new("key".to_string(), DatumKind::Varbinary) + .build() + .expect("Build column schema"), + ) + .unwrap() + .add_key_column( + column_schema::Builder::new("t".to_string(), DatumKind::Timestamp) + .build() + .expect("Build column schema"), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("field".to_string(), DatumKind::Double) + .build() + .expect("Build column schema"), + ) + .unwrap() + .build() + .expect("Build schema") + } + + fn build_optimized_schema() -> Schema { + schema::Builder::new() + .auto_increment_column_id(true) + .add_key_column( + column_schema::Builder::new("t".to_string(), DatumKind::Timestamp) + .build() + .expect("Build column schema"), + ) + .unwrap() + .add_key_column( + column_schema::Builder::new("key".to_string(), DatumKind::Varbinary) + .build() + .expect("Build column schema"), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("field".to_string(), DatumKind::Double) + .build() + .expect("Build column schema"), + ) + .unwrap() + .build() + .expect("Build schema") + } + + fn build_sort_expr(sort_col: &str, asc: bool) -> Expr { + let col_expr = Expr::Column(Column::from(sort_col)); + Expr::Sort { + expr: 
Box::new(col_expr), + asc, + nulls_first: false, + } + } + + fn build_primary_key_sort_exprs(schema: &Schema, asc: bool) -> Vec { + schema + .key_columns() + .iter() + .map(|col| build_sort_expr(&col.name, asc)) + .collect() + } + + fn check_optimization_works( + schema: Schema, + sort_exprs: Vec, + filter_expr: Option, + asc: bool, + ) { + let builder = LogicalPlanNodeBuilder::new(TEST_TABLE_NAME.to_string(), schema); + + let plan = { + let mut builder = builder.clone().table_scan(); + if let Some(filter) = &filter_expr { + builder = builder.filter(filter.clone()); + } + builder + .projection(vec![]) + .sort(sort_exprs.clone()) + .limit(10) + .take_plan() + }; + + let rule = OrderByPrimaryKeyRule; + let optimized_plan = rule + .do_optimize(&*plan) + .expect("Optimize plan") + .expect("Succeed to optimize plan"); + let expected_plan = { + let mut builder = builder.table_scan().table_scan_in_primary_key_order(asc); + if let Some(filter) = filter_expr { + builder = builder.filter(filter); + } + builder + .projection(vec![]) + .limit(10) + .sort(sort_exprs) + .limit(10) + .take_plan() + }; + + crate::logical_optimizer::tests::assert_logical_plan_eq( + &optimized_plan, + expected_plan.as_ref(), + ); + } + + #[test] + fn test_optimize_applied_with_no_filter() { + let schema = build_optimized_schema(); + let sort_in_asc_order = true; + let sort_exprs = build_primary_key_sort_exprs(&schema, sort_in_asc_order); + check_optimization_works(schema, sort_exprs, None, sort_in_asc_order); + } + + #[test] + fn test_optimize_applied_with_prefix_sort_exprs() { + let schema = build_optimized_schema(); + let sort_in_asc_order = true; + let sort_exprs = build_primary_key_sort_exprs(&schema, sort_in_asc_order); + let prefix_sort_exprs = sort_exprs[..1].to_vec(); + check_optimization_works(schema, prefix_sort_exprs, None, sort_in_asc_order); + } + + #[test] + fn test_optimize_applied_with_filter() { + let schema = build_optimized_schema(); + let filter_expr = Expr::Literal(ScalarValue::Int8(None)); + let sort_in_asc_order = false; + let sort_exprs = build_primary_key_sort_exprs(&schema, sort_in_asc_order); + + check_optimization_works(schema, sort_exprs, Some(filter_expr), sort_in_asc_order); + } + + #[test] + fn test_optimize_fail_with_wrong_schema() { + let plan = { + let schema = build_no_optimized_schema(); + let sort_exprs = build_primary_key_sort_exprs(&schema, true); + let builder = LogicalPlanNodeBuilder::new(TEST_TABLE_NAME.to_string(), schema); + builder + .table_scan() + .projection(vec![]) + .sort(sort_exprs) + .limit(10) + .take_plan() + }; + + let rule = OrderByPrimaryKeyRule; + let optimized_plan = rule.do_optimize(&*plan).expect("Optimize plan"); + assert!(optimized_plan.is_none()); + } + + #[test] + fn test_optimize_with_wrong_plan() { + let plan = { + let schema = build_optimized_schema(); + let builder = LogicalPlanNodeBuilder::new(TEST_TABLE_NAME.to_string(), schema); + builder + .table_scan() + .projection(vec![]) + .limit(10) + .take_plan() + }; + + let rule = OrderByPrimaryKeyRule; + let optimized_plan = rule.do_optimize(&*plan).expect("Optimize plan"); + assert!(optimized_plan.is_none()); + } +} diff --git a/query_engine/src/logical_optimizer/tests.rs b/query_engine/src/logical_optimizer/tests.rs new file mode 100644 index 0000000000..7febd2283e --- /dev/null +++ b/query_engine/src/logical_optimizer/tests.rs @@ -0,0 +1,159 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
test utils for logical optimizer + +use std::{any::Any, sync::Arc}; + +use arrow_deps::{ + arrow::datatypes::SchemaRef, + datafusion::{ + datasource::TableProvider, + logical_plan::{ + plan::{Extension, Filter, Projection, Sort}, + DFSchemaRef, Expr, Limit, LogicalPlan, TableScan, ToDFSchema, + }, + physical_plan::ExecutionPlan, + }, +}; +use async_trait::async_trait; +use common_types::schema::Schema; + +use crate::df_planner_extension::table_scan_by_primary_key::TableScanByPrimaryKey; + +#[derive(Clone, Debug)] +#[must_use] +pub struct LogicalPlanNodeBuilder { + pub schema: Schema, + pub table_name: String, + pub plan: Option>, +} + +pub struct MockTableProvider { + schema: Schema, +} + +#[async_trait] +impl TableProvider for MockTableProvider { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + self.schema.to_arrow_schema_ref() + } + + async fn scan( + &self, + _projection: &Option>, + _filters: &[Expr], + _limit: Option, + ) -> arrow_deps::datafusion::error::Result> { + unimplemented!("not support") + } +} + +impl LogicalPlanNodeBuilder { + pub fn new(table_name: String, schema: Schema) -> Self { + Self { + schema, + table_name, + plan: None, + } + } + + // caller should ensure the sub plan exists. + pub fn take_plan(&mut self) -> Arc { + self.plan.take().unwrap() + } + + pub fn df_schema_ref(&self) -> DFSchemaRef { + self.schema + .to_arrow_schema_ref() + .to_dfschema_ref() + .expect("Build dfschema") + } + + pub fn filter(mut self, predicate: Expr) -> Self { + let plan = LogicalPlan::Filter(Filter { + predicate, + input: self.take_plan(), + }); + + self.plan = Some(Arc::new(plan)); + + self + } + + pub fn projection(mut self, proj_exprs: Vec) -> Self { + let plan = LogicalPlan::Projection(Projection { + expr: proj_exprs, + input: self.take_plan(), + schema: self.df_schema_ref(), + alias: None, + }); + + self.plan = Some(Arc::new(plan)); + + self + } + + pub fn limit(mut self, n: usize) -> Self { + let plan = LogicalPlan::Limit(Limit { + n, + input: self.take_plan(), + }); + + self.plan = Some(Arc::new(plan)); + + self + } + + pub fn sort(mut self, sort_exprs: Vec) -> Self { + let plan = LogicalPlan::Sort(Sort { + expr: sort_exprs, + input: self.take_plan(), + }); + + self.plan = Some(Arc::new(plan)); + + self + } + + pub fn table_scan(mut self) -> Self { + let provider = MockTableProvider { + schema: self.schema.clone(), + }; + let projected_schema = self.df_schema_ref(); + + let plan = LogicalPlan::TableScan(TableScan { + table_name: self.table_name.clone(), + source: Arc::new(provider), + projection: None, + projected_schema, + filters: vec![], + limit: None, + }); + + self.plan = Some(Arc::new(plan)); + + self + } + + pub fn table_scan_in_primary_key_order(mut self, asc: bool) -> Self { + let sub_plan = self.take_plan(); + let node = TableScanByPrimaryKey::new_from_scan_plan(asc, sub_plan); + let plan = LogicalPlan::Extension(Extension { + node: Arc::new(node), + }); + self.plan = Some(Arc::new(plan)); + + self + } +} + +/// Check whether the logical plans are equal. 
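+/// Equality means the two plans have the same pretty-printed (`{:#?}`) debug
+/// representation, which is how the optimizer tests compare plans.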
+pub fn assert_logical_plan_eq(left: &LogicalPlan, right: &LogicalPlan) { + let left_plan_str = format!("{:#?}", left); + let right_plan_str = format!("{:#?}", right); + assert_eq!(left_plan_str, right_plan_str) +} diff --git a/query_engine/src/logical_optimizer/type_conversion.rs b/query_engine/src/logical_optimizer/type_conversion.rs new file mode 100644 index 0000000000..ef6aaf6d12 --- /dev/null +++ b/query_engine/src/logical_optimizer/type_conversion.rs @@ -0,0 +1,506 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::{mem, sync::Arc}; + +use arrow_deps::{ + arrow::{compute, compute::kernels::cast_utils::string_to_timestamp_nanos}, + datafusion::{ + arrow::datatypes::DataType, + error::{DataFusionError, Result}, + execution::context::ExecutionProps, + logical_plan::{ + plan::Filter, DFSchemaRef, Expr, ExprRewriter, LogicalPlan, Operator, TableScan, + }, + optimizer::{optimizer::OptimizerRule, utils}, + scalar::ScalarValue, + }, +}; +use log::debug; + +/// Optimizer that cast literal value to target column's type +/// +/// Example transformations that are applied: +/// * `expr > '5'` to `expr > 5` when `expr` is of numeric type +/// * `expr > '2021-12-02 15:00:34'` to `expr > 1638428434000(ms)` when `expr` +/// is of timestamp type +/// * `expr > 10` to `expr > '10'` when `expr` is of string type +/// * `expr = 'true'` to `expr = true` when `expr` is of boolean type +pub struct TypeConversion; + +impl OptimizerRule for TypeConversion { + fn optimize( + &self, + plan: &LogicalPlan, + execution_props: &ExecutionProps, + ) -> Result { + let mut rewriter = TypeRewriter { + schemas: plan.all_schemas(), + }; + + match plan { + LogicalPlan::Filter(Filter { predicate, input }) => Ok(LogicalPlan::Filter(Filter { + predicate: predicate.clone().rewrite(&mut rewriter)?, + input: Arc::new(self.optimize(input, execution_props)?), + })), + LogicalPlan::TableScan(TableScan { + table_name, + source, + projection, + projected_schema, + filters, + limit, + }) => { + let rewrite_filters = filters + .clone() + .into_iter() + .map(|e| e.rewrite(&mut rewriter)) + .collect::>>()?; + Ok(LogicalPlan::TableScan(TableScan { + table_name: table_name.clone(), + source: source.clone(), + projection: projection.clone(), + projected_schema: projected_schema.clone(), + filters: rewrite_filters, + limit: *limit, + })) + } + LogicalPlan::Projection { .. } + | LogicalPlan::Window { .. } + | LogicalPlan::Aggregate { .. } + | LogicalPlan::Repartition { .. } + | LogicalPlan::CreateExternalTable { .. } + | LogicalPlan::Extension { .. } + | LogicalPlan::Sort { .. } + | LogicalPlan::Explain { .. } + | LogicalPlan::Limit { .. } + | LogicalPlan::Union { .. } + | LogicalPlan::Join { .. } + | LogicalPlan::CrossJoin { .. } + | LogicalPlan::CreateMemoryTable { .. } + | LogicalPlan::DropTable { .. } + | LogicalPlan::Values { .. } + | LogicalPlan::Analyze { .. } => { + let inputs = plan.inputs(); + let new_inputs = inputs + .iter() + .map(|plan| self.optimize(plan, execution_props)) + .collect::>>()?; + + let expr = plan + .expressions() + .into_iter() + .map(|e| e.rewrite(&mut rewriter)) + .collect::>>()?; + + utils::from_plan(plan, &expr, &new_inputs) + } + LogicalPlan::EmptyRelation { .. 
} => Ok(plan.clone()), + } + } + + fn name(&self) -> &str { + "type_conversion" + } +} + +struct TypeRewriter<'a> { + /// input schemas + schemas: Vec<&'a DFSchemaRef>, +} + +impl<'a> TypeRewriter<'a> { + fn column_data_type(&self, expr: &Expr) -> Option { + if let Expr::Column(_) = expr { + for schema in &self.schemas { + if let Ok(v) = expr.get_type(schema) { + return Some(v); + } + } + } + + None + } + + fn convert_type<'b>(&self, mut left: &'b Expr, mut right: &'b Expr) -> Result<(Expr, Expr)> { + let left_type = self.column_data_type(left); + let right_type = self.column_data_type(right); + + let mut reverse = false; + let left_type = match (&left_type, &right_type) { + (Some(v), None) => v, + (None, Some(v)) => { + reverse = true; + mem::swap(&mut left, &mut right); + v + } + _ => return Ok((left.clone(), right.clone())), + }; + + match (left, right) { + (Expr::Column(col), Expr::Literal(value)) => { + let casted_right = Self::cast_scalar_value(value, left_type)?; + debug!( + "TypeRewriter convert type, origin_left:{:?}, type:{}, right:{:?}, casted_right:{:?}", + col, left_type, value, casted_right + ); + if casted_right.is_null() { + return Err(DataFusionError::Plan(format!( + "column:{:?} value:{:?} is invalid", + col, value + ))); + } + if reverse { + Ok((Expr::Literal(casted_right), left.clone())) + } else { + Ok((left.clone(), Expr::Literal(casted_right))) + } + } + _ => Ok((left.clone(), right.clone())), + } + } + + fn cast_scalar_value(value: &ScalarValue, data_type: &DataType) -> Result { + if let DataType::Timestamp(_, _) = data_type { + if let ScalarValue::Utf8(Some(v)) = value { + return string_to_timestamp_ms(v); + } + } + + if let DataType::Boolean = data_type { + if let ScalarValue::Utf8(Some(v)) = value { + return match v.to_lowercase().as_str() { + "true" => Ok(ScalarValue::Boolean(Some(true))), + "false" => Ok(ScalarValue::Boolean(Some(false))), + _ => Ok(ScalarValue::Boolean(None)), + }; + } + } + + let array = value.to_array(); + ScalarValue::try_from_array( + &compute::cast(&array, data_type).map_err(DataFusionError::ArrowError)?, + // index: Converts a value in `array` at `index` into a ScalarValue + 0, + ) + } +} + +impl<'a> ExprRewriter for TypeRewriter<'a> { + fn mutate(&mut self, expr: Expr) -> Result { + let new_expr = match expr { + Expr::BinaryExpr { left, op, right } => match op { + Operator::Eq + | Operator::NotEq + | Operator::Lt + | Operator::LtEq + | Operator::Gt + | Operator::GtEq => { + let (left, right) = self.convert_type(&left, &right)?; + Expr::BinaryExpr { + left: Box::new(left), + op, + right: Box::new(right), + } + } + _ => Expr::BinaryExpr { left, op, right }, + }, + Expr::Between { + expr, + negated, + low, + high, + } => { + let (expr, low) = self.convert_type(&expr, &low)?; + let (expr, high) = self.convert_type(&expr, &high)?; + Expr::Between { + expr: Box::new(expr), + negated, + low: Box::new(low), + high: Box::new(high), + } + } + Expr::InList { + expr, + list, + negated, + } => { + let mut list_expr = Vec::with_capacity(list.len()); + for e in list { + let (_, expr_conversion) = self.convert_type(&expr, &e)?; + list_expr.push(expr_conversion); + } + Expr::InList { + expr, + list: list_expr, + negated, + } + } + Expr::Literal(value) => match value { + ScalarValue::TimestampSecond(Some(i), _) => { + timestamp_to_timestamp_ms_expr(TimestampType::Second, i) + } + ScalarValue::TimestampMicrosecond(Some(i), _) => { + timestamp_to_timestamp_ms_expr(TimestampType::Microsecond, i) + } + ScalarValue::TimestampNanosecond(Some(i), _) => { + 
timestamp_to_timestamp_ms_expr(TimestampType::Nanosecond, i) + } + _ => Expr::Literal(value), + }, + expr => { + // no rewrite possible + expr + } + }; + Ok(new_expr) + } +} + +fn string_to_timestamp_ms(string: &str) -> Result { + Ok(ScalarValue::TimestampMillisecond( + Some( + string_to_timestamp_nanos(string) + .map(|t| t / 1_000_000) + .map_err(DataFusionError::from)?, + ), + None, + )) +} + +#[allow(dead_code)] +enum TimestampType { + Second, + Millisecond, + Microsecond, + Nanosecond, +} + +fn timestamp_to_timestamp_ms_expr(typ: TimestampType, timestamp: i64) -> Expr { + let timestamp = match typ { + TimestampType::Second => timestamp * 1_000, + TimestampType::Millisecond => timestamp, + TimestampType::Microsecond => timestamp / 1_000, + TimestampType::Nanosecond => timestamp / 1_000 / 1_000, + }; + + Expr::Literal(ScalarValue::TimestampMillisecond(Some(timestamp), None)) +} + +#[cfg(test)] +mod tests { + use arrow_deps::{ + arrow::datatypes::TimeUnit, + datafusion::{ + logical_plan::{DFField, DFSchema}, + prelude::col, + }, + }; + + use super::*; + + fn expr_test_schema() -> DFSchemaRef { + Arc::new( + DFSchema::new(vec![ + DFField::new(None, "c1", DataType::Utf8, true), + DFField::new(None, "c2", DataType::Int64, true), + DFField::new(None, "c3", DataType::Float64, true), + DFField::new(None, "c4", DataType::Float32, true), + DFField::new(None, "c5", DataType::Boolean, true), + DFField::new( + None, + "c6", + DataType::Timestamp(TimeUnit::Millisecond, None), + false, + ), + ]) + .unwrap(), + ) + } + + #[test] + fn test_type_conversion_int64() { + let int_value = 100; + let int_str = int_value.to_string(); + let not_int_str = "100ss".to_string(); + let schema = expr_test_schema(); + let mut rewriter = TypeRewriter { + schemas: vec![&schema], + }; + + // Int64 c2 > "100" success + let exp = col("c2").gt(Expr::Literal(ScalarValue::Utf8(Some(int_str.clone())))); + let rewrite_exp = exp.rewrite(&mut rewriter).unwrap(); + assert_eq!( + rewrite_exp, + col("c2").gt(Expr::Literal(ScalarValue::Int64(Some(int_value)),)) + ); + + // Int64 "100" > c2 success + let exp = Expr::Literal(ScalarValue::Utf8(Some(int_str))).gt(col("c2")); + let rewrite_exp = exp.rewrite(&mut rewriter).unwrap(); + assert_eq!( + rewrite_exp, + Expr::Literal(ScalarValue::Int64(Some(int_value))).gt(col("c2")) + ); + + // Int64 c2 > "100ss" fail + let exp = col("c2").gt(Expr::Literal(ScalarValue::Utf8(Some(not_int_str)))); + assert!(exp.rewrite(&mut rewriter).is_err()); + } + + #[test] + fn test_type_conversion_float() { + let double_value = 100.1; + let double_str = double_value.to_string(); + let not_int_str = "100ss".to_string(); + let schema = expr_test_schema(); + let mut rewriter = TypeRewriter { + schemas: vec![&schema], + }; + + // Float64 c3 > "100" success + let exp = col("c3").gt(Expr::Literal(ScalarValue::Utf8(Some(double_str.clone())))); + let rewrite_exp = exp.rewrite(&mut rewriter).unwrap(); + assert_eq!( + rewrite_exp, + col("c3").gt(Expr::Literal(ScalarValue::Float64(Some(double_value)),)) + ); + + // Float64 c3 > "100ss" fail + let exp = col("c3").gt(Expr::Literal(ScalarValue::Utf8(Some(not_int_str.clone())))); + assert!(exp.rewrite(&mut rewriter).is_err()); + + // Float32 c4 > "100" success + let exp = col("c4").gt(Expr::Literal(ScalarValue::Utf8(Some(double_str)))); + let rewrite_exp = exp.rewrite(&mut rewriter).unwrap(); + assert_eq!( + rewrite_exp, + col("c4").gt(Expr::Literal(ScalarValue::Float32(Some( + double_value as f32 + )),)) + ); + + // Float32 c4 > "100ss" fail + let exp = 
col("c4").gt(Expr::Literal(ScalarValue::Utf8(Some(not_int_str)))); + assert!(exp.rewrite(&mut rewriter).is_err()); + } + + #[test] + fn test_type_conversion_boolean() { + let bool_value = true; + let bool_str = bool_value.to_string(); + let not_int_str = "100ss".to_string(); + let schema = expr_test_schema(); + let mut rewriter = TypeRewriter { + schemas: vec![&schema], + }; + + // Boolean c5 > "100ss" fail + let exp = col("c5").gt(Expr::Literal(ScalarValue::Utf8(Some(not_int_str)))); + assert!(exp.rewrite(&mut rewriter).is_err()); + + // Boolean c5 > "true" success + let exp = col("c5").gt(Expr::Literal(ScalarValue::Utf8(Some(bool_str)))); + let rewrite_exp = exp.rewrite(&mut rewriter).unwrap(); + assert_eq!( + rewrite_exp, + col("c5").gt(Expr::Literal(ScalarValue::Boolean(Some(bool_value)),)) + ); + + // Boolean c5 > true success + let exp = col("c5").gt(Expr::Literal(ScalarValue::Boolean(Some(bool_value)))); + let rewrite_exp = exp.rewrite(&mut rewriter).unwrap(); + assert_eq!( + rewrite_exp, + col("c5").gt(Expr::Literal(ScalarValue::Boolean(Some(bool_value)),)) + ); + } + + #[test] + fn test_type_conversion_timestamp() { + let date_string = "2021-09-07 16:00:00".to_string(); + let schema = expr_test_schema(); + let mut rewriter = TypeRewriter { + schemas: vec![&schema], + }; + + // Timestamp c6 > "2021-09-07 16:00:00" + let exp = col("c6").gt(Expr::Literal(ScalarValue::Utf8(Some(date_string.clone())))); + let rewrite_exp = exp.rewrite(&mut rewriter).unwrap(); + assert_eq!( + rewrite_exp, + col("c6").gt(Expr::Literal(ScalarValue::TimestampMillisecond( + Some( + string_to_timestamp_nanos(&date_string) + .map(|t| t / 1_000_000) + .unwrap(), + ), + None + ),)) + ); + + // "2021-09-07 16:00:00" > Timestamp c6 + let exp = Expr::Literal(ScalarValue::Utf8(Some(date_string.clone()))).gt(col("c6")); + let rewrite_exp = exp.rewrite(&mut rewriter).unwrap(); + assert_eq!( + rewrite_exp, + Expr::Literal(ScalarValue::TimestampMillisecond( + Some( + string_to_timestamp_nanos(&date_string) + .map(|t| t / 1_000_000) + .unwrap(), + ), + None + ),) + .gt(col("c6")) + ); + + // Timestamp c6 > 1642141472 + let timestamp_int = 1642141472; + let exp = col("c6").gt(Expr::Literal(ScalarValue::TimestampSecond( + Some(timestamp_int), + None, + ))); + let rewrite_exp = exp.rewrite(&mut rewriter).unwrap(); + assert_eq!( + rewrite_exp, + col("c6").gt(Expr::Literal(ScalarValue::TimestampMillisecond( + Some(timestamp_int * 1000), + None + ))) + ); + + // Timestamp c6 between "2021-09-07 16:00:00" and "2021-09-07 17:00:00" + let date_string2 = "2021-09-07 17:00:00".to_string(); + let exp = Expr::Between { + expr: Box::new(col("c6")), + negated: false, + low: Box::new(Expr::Literal(ScalarValue::Utf8(Some(date_string.clone())))), + high: Box::new(Expr::Literal(ScalarValue::Utf8(Some(date_string2.clone())))), + }; + let rewrite_exp = exp.rewrite(&mut rewriter).unwrap(); + assert_eq!( + rewrite_exp, + Expr::Between { + expr: Box::new(col("c6")), + negated: false, + low: Box::new(Expr::Literal(ScalarValue::TimestampMillisecond( + Some( + string_to_timestamp_nanos(&date_string) + .map(|t| t / 1_000_000) + .unwrap(), + ), + None + ),)), + high: Box::new(Expr::Literal(ScalarValue::TimestampMillisecond( + Some( + string_to_timestamp_nanos(&date_string2) + .map(|t| t / 1_000_000) + .unwrap(), + ), + None + ),)) + } + ); + } +} diff --git a/query_engine/src/physical_optimizer/coalesce_batches.rs b/query_engine/src/physical_optimizer/coalesce_batches.rs new file mode 100644 index 0000000000..36645aa633 --- /dev/null +++ 
b/query_engine/src/physical_optimizer/coalesce_batches.rs @@ -0,0 +1,70 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::sync::Arc; + +use arrow_deps::datafusion::{ + physical_optimizer::{coalesce_batches::CoalesceBatches, optimizer::PhysicalOptimizerRule}, + physical_plan::{limit::GlobalLimitExec, ExecutionPlan}, + prelude::ExecutionConfig, +}; + +use crate::physical_optimizer::{Adapter, OptimizeRuleRef}; + +pub struct CoalesceBatchesAdapter { + original_rule: CoalesceBatches, +} + +impl Default for CoalesceBatchesAdapter { + fn default() -> Self { + Self { + original_rule: CoalesceBatches::new(), + } + } +} + +impl Adapter for CoalesceBatchesAdapter { + fn may_adapt(original_rule: OptimizeRuleRef) -> OptimizeRuleRef { + if original_rule.name() == CoalesceBatches::new().name() { + Arc::new(Self::default()) + } else { + original_rule + } + } +} + +impl CoalesceBatchesAdapter { + /// Detect the plan contains any limit plan with a small limit(smaller than + /// `batch_size`). + fn detect_small_limit_plan(plan: &dyn ExecutionPlan, batch_size: usize) -> bool { + if let Some(limit_plan) = plan.as_any().downcast_ref::() { + return limit_plan.limit() < batch_size; + } + + for child_plan in plan.children() { + if Self::detect_small_limit_plan(&*child_plan, batch_size) { + return true; + } + } + + // No small limit plan is found. + false + } +} + +impl PhysicalOptimizerRule for CoalesceBatchesAdapter { + fn optimize( + &self, + plan: Arc, + config: &ExecutionConfig, + ) -> arrow_deps::datafusion::error::Result> { + if Self::detect_small_limit_plan(&*plan, config.runtime.batch_size) { + Ok(plan) + } else { + self.original_rule.optimize(plan, config) + } + } + + fn name(&self) -> &str { + "custom_coalesce_batches" + } +} diff --git a/query_engine/src/physical_optimizer/mod.rs b/query_engine/src/physical_optimizer/mod.rs new file mode 100644 index 0000000000..98571d2d6f --- /dev/null +++ b/query_engine/src/physical_optimizer/mod.rs @@ -0,0 +1,87 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Physical query optimizer + +use std::sync::Arc; + +use arrow_deps::datafusion::{ + error::DataFusionError, physical_optimizer::optimizer::PhysicalOptimizerRule, +}; +use async_trait::async_trait; +use snafu::{Backtrace, ResultExt, Snafu}; +use sql::plan::QueryPlan; + +use crate::{ + context::ContextRef, + physical_optimizer::{ + coalesce_batches::CoalesceBatchesAdapter, repartition::RepartitionAdapter, + }, + physical_plan::{DataFusionPhysicalPlan, PhysicalPlanPtr}, +}; + +pub mod coalesce_batches; +pub mod repartition; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "DataFusion Failed to optimize physical plan, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + // TODO(yingwen): Should we carry plan in this context? 
+ DataFusionOptimize { + source: DataFusionError, + backtrace: Backtrace, + }, +} + +define_result!(Error); + +/// Physical query optimizer that converts a logical plan to a +/// physical plan suitable for execution +#[async_trait] +pub trait PhysicalOptimizer { + /// Create a physical plan from a logical plan + async fn optimize(&mut self, logical_plan: QueryPlan) -> Result; +} + +pub struct PhysicalOptimizerImpl { + ctx: ContextRef, +} + +impl PhysicalOptimizerImpl { + pub fn with_context(ctx: ContextRef) -> Self { + Self { ctx } + } +} + +#[async_trait] +impl PhysicalOptimizer for PhysicalOptimizerImpl { + async fn optimize(&mut self, logical_plan: QueryPlan) -> Result { + let exec_ctx = self.ctx.df_exec_ctx(); + let exec_plan = exec_ctx + .create_physical_plan(&logical_plan.df_plan) + .await + .context(DataFusionOptimize)?; + let physical_plan = DataFusionPhysicalPlan::with_plan(exec_ctx.clone(), exec_plan); + + Ok(Box::new(physical_plan)) + } +} + +pub type OptimizeRuleRef = Arc; + +/// The default optimize rules of the datafusion is not all suitable for our +/// cases so the adapters may change the default rules(normally just decide +/// whether to apply the rule according to the specific plan). +pub trait Adapter { + /// May change the original rule into the custom one. + fn may_adapt(original_rule: OptimizeRuleRef) -> OptimizeRuleRef; +} + +pub fn may_adapt_optimize_rule( + original_rule: Arc, +) -> Arc { + CoalesceBatchesAdapter::may_adapt(RepartitionAdapter::may_adapt(original_rule)) +} diff --git a/query_engine/src/physical_optimizer/repartition.rs b/query_engine/src/physical_optimizer/repartition.rs new file mode 100644 index 0000000000..e45d2c939b --- /dev/null +++ b/query_engine/src/physical_optimizer/repartition.rs @@ -0,0 +1,59 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Adapter for the original datafusion repartiton optimization rule. + +use std::sync::Arc; + +use arrow_deps::datafusion::{ + physical_optimizer::{optimizer::PhysicalOptimizerRule, repartition::Repartition}, + physical_plan::ExecutionPlan, + prelude::ExecutionConfig, +}; +use log::debug; + +use crate::physical_optimizer::{Adapter, OptimizeRuleRef}; + +pub struct RepartitionAdapter { + original_rule: Repartition, +} + +impl Default for RepartitionAdapter { + fn default() -> Self { + Self { + original_rule: Repartition::new(), + } + } +} + +impl Adapter for RepartitionAdapter { + fn may_adapt(original_rule: OptimizeRuleRef) -> OptimizeRuleRef { + if original_rule.name() == Repartition::new().name() { + Arc::new(Self::default()) + } else { + original_rule + } + } +} + +impl PhysicalOptimizerRule for RepartitionAdapter { + fn optimize( + &self, + plan: Arc, + config: &ExecutionConfig, + ) -> arrow_deps::datafusion::error::Result> { + // the underlying plan maybe requires the order of the output. + if plan.output_partitioning().partition_count() == 1 { + debug!( + "RepartitionAdapter avoid repartion optimization for plan:{:?}", + plan + ); + Ok(plan) + } else { + self.original_rule.optimize(plan, config) + } + } + + fn name(&self) -> &str { + "custom-repartition" + } +} diff --git a/query_engine/src/physical_plan.rs b/query_engine/src/physical_plan.rs new file mode 100644 index 0000000000..28e344b839 --- /dev/null +++ b/query_engine/src/physical_plan.rs @@ -0,0 +1,101 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Physical execution plan + +use std::{ + fmt::{Debug, Formatter}, + sync::Arc, +}; + +use arrow_deps::datafusion::{ + error::DataFusionError, + physical_plan::{ + coalesce_partitions::CoalescePartitionsExec, display::DisplayableExecutionPlan, + ExecutionPlan, + }, + prelude::ExecutionContext, +}; +use async_trait::async_trait; +use snafu::{Backtrace, ResultExt, Snafu}; +use table_engine::stream::{FromDfStream, SendableRecordBatchStream}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "DataFusion Failed to execute plan, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + DataFusionExec { + partition_count: usize, + source: DataFusionError, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to convert datafusion stream, err:{}", source))] + ConvertStream { source: table_engine::stream::Error }, +} + +define_result!(Error); + +#[async_trait] +pub trait PhysicalPlan: std::fmt::Debug { + /// execute this plan and returns the result + async fn execute(&self) -> Result; + + /// Convert internal metrics to string. + fn metrics_to_string(&self) -> String; +} + +pub type PhysicalPlanPtr = Box; + +pub struct DataFusionPhysicalPlan { + ctx: ExecutionContext, + plan: Arc, +} + +impl DataFusionPhysicalPlan { + pub fn with_plan(ctx: ExecutionContext, plan: Arc) -> Self { + Self { ctx, plan } + } +} + +impl Debug for DataFusionPhysicalPlan { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("DataFusionPhysicalPlan") + .field("plan", &self.plan) + .finish() + } +} + +#[async_trait] +impl PhysicalPlan for DataFusionPhysicalPlan { + async fn execute(&self) -> Result { + let runtime = self.ctx.state.lock().unwrap().runtime_env.clone(); + let partition_count = self.plan.output_partitioning().partition_count(); + let df_stream = if partition_count <= 1 { + self.plan + .execute(0, runtime) + .await + .context(DataFusionExec { partition_count })? + } else { + // merge into a single partition + let plan = CoalescePartitionsExec::new(self.plan.clone()); + // MergeExec must produce a single partition + assert_eq!(1, plan.output_partitioning().partition_count()); + plan.execute(0, runtime) + .await + .context(DataFusionExec { partition_count })? + }; + + let stream = FromDfStream::new(df_stream).context(ConvertStream)?; + + Ok(Box::pin(stream)) + } + + fn metrics_to_string(&self) -> String { + DisplayableExecutionPlan::with_metrics(&*self.plan) + .indent() + .to_string() + } +} diff --git a/rust-toolchain b/rust-toolchain new file mode 100644 index 0000000000..58d0130e05 --- /dev/null +++ b/rust-toolchain @@ -0,0 +1 @@ +nightly-2022-01-06 diff --git a/rustfmt.toml b/rustfmt.toml new file mode 100644 index 0000000000..61594ccda0 --- /dev/null +++ b/rustfmt.toml @@ -0,0 +1,14 @@ +# https://github.com/rust-lang/rustfmt/blob/master/Configurations.md + +# Break comments to fit on the line +wrap_comments = true +# Merge multiple imports into a single nested import. +imports_granularity = "Crate" +# Format code snippet included in doc comments. +format_code_in_doc_comments = true +# Reorder impl items. type and const are put first, then macros and methods. 
+reorder_impl_items = true +# Discard existing import groups, and create three groups for std, external crates, crates +group_imports = "StdExternalCrate" + +license_template_path = "etc/license.template" \ No newline at end of file diff --git a/server/Cargo.toml b/server/Cargo.toml new file mode 100644 index 0000000000..5f7b349704 --- /dev/null +++ b/server/Cargo.toml @@ -0,0 +1,44 @@ +[package] +name = "server" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +analytic_engine = { path = "../analytic_engine" } +arrow_deps = { path = "../arrow_deps" } +async-trait = "0.1.41" +avro-rs = "0.13" +catalog = { path = "../catalog" } +ceresdbproto = { git = "https://github.com/CeresDB/ceresdbproto.git"} +common_types = { path = "../common_types" } +common_util = { path = "../common_util" } +futures = "0.3" +grpcio = { path = "../grpcio" } +http = "0.2" +interpreters = { path = "../interpreters" } +lazy_static = "1.4.0" +log = "0.4" +logger = { path = "../components/logger" } +meta_client = { path = "../meta_client" } +profile = { path = "../components/profile" } +protobuf = "2.20" +query_engine = { path = "../query_engine" } +prometheus = "0.12" +prometheus-static-metric = "0.5" +serde = "1.0" +serde_derive = "1.0" +serde_json = "1.0.60" +snafu = { version ="0.6.10", features = ["backtraces"]} +sql = { path = "../sql" } +system_catalog = { path = "../system_catalog" } +table_engine = { path = "../table_engine" } +tokio = { version = "1.0", features = ["full"] } +twox-hash = "1.6" +udf = { path = "../udf" } +warp = "0.3" + +[dev-dependencies] +sql = { path = "../sql" , features=["test"]} diff --git a/server/src/avro_util.rs b/server/src/avro_util.rs new file mode 100644 index 0000000000..69ab049ca3 --- /dev/null +++ b/server/src/avro_util.rs @@ -0,0 +1,166 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Avro utility + +use std::collections::HashMap; + +use avro_rs::{ + schema::{Name, RecordField, RecordFieldOrder}, + types::{Record, Value}, +}; +use common_types::{ + bytes::ByteVec, + column::ColumnBlock, + datum::{Datum, DatumKind}, + record_batch::RecordBatch, + schema::RecordSchema, +}; +use common_util::define_result; +use snafu::{Backtrace, ResultExt, Snafu}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "Failed to write avro record, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + WriteAvroRecord { + source: avro_rs::Error, + backtrace: Backtrace, + }, +} + +define_result!(Error); + +/// Create [avro_rs::Schema] with given `name` from [RecordSchema] +pub fn to_avro_schema(name: &str, schema: &RecordSchema) -> avro_rs::Schema { + let columns = schema.columns(); + let mut lookup = HashMap::with_capacity(columns.len()); + let mut avro_fields = Vec::with_capacity(columns.len()); + + for (pos, column) in columns.iter().enumerate() { + // Create avro record field + let default = if column.is_nullable { + Some(serde_json::value::Value::Null) + } else { + None + }; + + let field_schema = if column.is_nullable { + // We want to declare a schema which may be either a null or non-null value, + // for example: ["null", "string"]. + // + // However, `avro_rs` does not provide an accessible API to build a `Union`. + // We can't find a better way to do this than using JSON. 
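+ // For instance, a nullable Double column yields the JSON `["null", "double"]`,
+ // which `parse_str` turns back into an avro union schema.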
+ let field_schema_str = format!( + r#"["null", {}]"#, + data_type_to_schema(&column.data_type).canonical_form() + ); + avro_rs::Schema::parse_str(&field_schema_str).unwrap() + } else { + data_type_to_schema(&column.data_type) + }; + + let record_field = RecordField { + name: column.name.clone(), + doc: None, + default, + schema: field_schema, + order: RecordFieldOrder::Ignore, + position: pos, + }; + + avro_fields.push(record_field); + lookup.insert(column.name.clone(), pos); + } + + avro_rs::Schema::Record { + name: Name::new(name), + doc: None, + fields: avro_fields, + lookup, + } +} + +fn data_type_to_schema(data_type: &DatumKind) -> avro_rs::Schema { + match data_type { + DatumKind::Null => avro_rs::Schema::Null, + DatumKind::Timestamp => avro_rs::Schema::TimestampMillis, + DatumKind::Double => avro_rs::Schema::Double, + DatumKind::Float => avro_rs::Schema::Float, + DatumKind::Varbinary => avro_rs::Schema::Bytes, + DatumKind::String => avro_rs::Schema::String, + DatumKind::UInt32 | DatumKind::Int64 | DatumKind::UInt64 => avro_rs::Schema::Long, + DatumKind::UInt16 + | DatumKind::UInt8 + | DatumKind::Int32 + | DatumKind::Int16 + | DatumKind::Int8 => avro_rs::Schema::Int, + DatumKind::Boolean => avro_rs::Schema::Boolean, + } +} + +/// Convert record batch to avro format +pub fn record_batch_to_avro( + record_batch: &RecordBatch, + schema: &avro_rs::Schema, + rows: &mut Vec, +) -> Result<()> { + let record_batch_schema = record_batch.schema(); + assert_eq!( + record_batch_schema.num_columns(), + record_batch.num_columns() + ); + + rows.reserve(record_batch.num_rows()); + + let column_schemas = record_batch_schema.columns(); + for row_idx in 0..record_batch.num_rows() { + let mut record = Record::new(schema).unwrap(); + for (col_idx, column_schema) in column_schemas.iter().enumerate() { + let column = record_batch.column(col_idx); + let value = column_to_value(column, row_idx, column_schema.is_nullable); + + record.put(&column_schema.name, value); + } + + let row_bytes = avro_rs::to_avro_datum(schema, record).context(WriteAvroRecord)?; + + rows.push(row_bytes); + } + + Ok(()) +} + +/// Panic if row_idx is out of bound. +fn column_to_value(array: &ColumnBlock, row_idx: usize, is_nullable: bool) -> Value { + let datum = array.datum(row_idx); + match datum { + Datum::Null => may_union(Value::Null, is_nullable), + Datum::Timestamp(v) => may_union(Value::TimestampMillis(v.as_i64()), is_nullable), + Datum::Double(v) => may_union(Value::Double(v), is_nullable), + Datum::Float(v) => may_union(Value::Float(v), is_nullable), + Datum::Varbinary(v) => may_union(Value::Bytes(v.to_vec()), is_nullable), + Datum::String(v) => may_union(Value::String(v.to_string()), is_nullable), + // TODO(yingwen): Should we return error if overflow? Avro does not support uint64. 
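+ // Note: `v as i64` reinterprets the bits, so a value above `i64::MAX`
+ // silently becomes negative instead of returning an error.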
+ Datum::UInt64(v) => may_union(Value::Long(v as i64), is_nullable), + Datum::Int64(v) => may_union(Value::Long(v), is_nullable), + Datum::UInt32(v) => may_union(Value::Long(i64::from(v)), is_nullable), + Datum::UInt16(v) => may_union(Value::Int(i32::from(v)), is_nullable), + Datum::UInt8(v) => may_union(Value::Int(i32::from(v)), is_nullable), + Datum::Int32(v) => may_union(Value::Int(v), is_nullable), + Datum::Int16(v) => may_union(Value::Int(i32::from(v)), is_nullable), + Datum::Int8(v) => may_union(Value::Int(i32::from(v)), is_nullable), + Datum::Boolean(v) => may_union(Value::Boolean(v), is_nullable), + } +} + +#[inline] +fn may_union(val: Value, is_nullable: bool) -> Value { + if is_nullable { + Value::Union(Box::new(val)) + } else { + val + } +} diff --git a/server/src/config.rs b/server/src/config.rs new file mode 100644 index 0000000000..3a62758a0d --- /dev/null +++ b/server/src/config.rs @@ -0,0 +1,88 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Server configs + +use analytic_engine; +use meta_client::MetaClientConfig; +use serde_derive::Deserialize; + +use crate::router::RuleList; + +#[derive(Debug, Deserialize)] +#[serde(default)] +pub struct RuntimeConfig { + // Runtime for reading data + pub read_thread_num: usize, + // Runtime for writing data + pub write_thread_num: usize, + // Runtime for background tasks + pub background_thread_num: usize, +} + +// TODO(yingwen): Split config into several sub configs. +#[derive(Debug, Deserialize)] +#[serde(default)] +pub struct Config { + /// The address to listen. + pub bind_addr: String, + pub http_port: u16, + pub grpc_port: u16, + pub grpc_server_cq_count: usize, + + // Engine related configs: + pub runtime: RuntimeConfig, + + // Log related configs: + pub log_level: String, + pub enable_async_log: bool, + pub async_log_channel_len: i32, + + // Tracing related configs: + pub tracing_log_dir: String, + pub tracing_log_name: String, + pub tracing_level: String, + + // Meta client related configs: + pub meta_client: MetaClientConfig, + // Config of router. + pub route_rules: RuleList, + + // Analytic engine configs: + pub analytic: analytic_engine::Config, +} + +impl Default for RuntimeConfig { + fn default() -> Self { + Self { + read_thread_num: 8, + write_thread_num: 8, + background_thread_num: 8, + } + } +} + +impl Default for Config { + fn default() -> Self { + let grpc_port = 8831; + Self { + bind_addr: String::from("127.0.0.1"), + http_port: 5000, + grpc_port, + grpc_server_cq_count: 20, + runtime: RuntimeConfig::default(), + log_level: "debug".to_string(), + enable_async_log: true, + async_log_channel_len: 102400, + tracing_log_dir: String::from("/tmp/ceresdbx"), + tracing_log_name: String::from("tracing"), + tracing_level: String::from("info"), + meta_client: MetaClientConfig { + node: String::from("127.0.0.1"), + port: grpc_port, + ..Default::default() + }, + route_rules: RuleList::default(), + analytic: analytic_engine::Config::default(), + } + } +} diff --git a/server/src/consts.rs b/server/src/consts.rs new file mode 100644 index 0000000000..bbaa5c1c98 --- /dev/null +++ b/server/src/consts.rs @@ -0,0 +1,8 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Common constants + +/// Header of catalog name +pub const CATALOG_HEADER: &str = "x-ceresdbx-catalog"; +/// Header of tenant name +pub const TENANT_HEADER: &str = "x-ceresdbx-access-tenant"; diff --git a/server/src/context.rs b/server/src/context.rs new file mode 100644 index 0000000000..119c3ec984 --- /dev/null +++ b/server/src/context.rs @@ -0,0 +1,81 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Server context + +use std::sync::Arc; + +use common_util::runtime::Runtime; +use snafu::{ensure, Backtrace, OptionExt, Snafu}; + +#[allow(clippy::enum_variant_names)] +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Missing catalog.\nBacktrace:\n{}", backtrace))] + MissingCatalog { backtrace: Backtrace }, + + #[snafu(display("Missing tenant.\nBacktrace:\n{}", backtrace))] + MissingTenant { backtrace: Backtrace }, + + #[snafu(display("Missing runtime.\nBacktrace:\n{}", backtrace))] + MissingRuntime { backtrace: Backtrace }, +} + +define_result!(Error); + +/// Server request context +/// +/// Context for request, may contains +/// 1. Request context and options +/// 2. Info from http headers +pub struct RequestContext { + /// Catalog of the request + pub catalog: String, + /// Tenant of request + pub tenant: String, + /// Runtime of this request + pub runtime: Arc, +} + +impl RequestContext { + pub fn builder() -> Builder { + Builder::default() + } +} + +#[derive(Default)] +pub struct Builder { + catalog: String, + tenant: String, + runtime: Option>, +} + +impl Builder { + pub fn catalog(mut self, catalog: String) -> Self { + self.catalog = catalog; + self + } + + pub fn tenant(mut self, tenant: String) -> Self { + self.tenant = tenant; + self + } + + pub fn runtime(mut self, runtime: Arc) -> Self { + self.runtime = Some(runtime); + self + } + + pub fn build(self) -> Result { + ensure!(!self.catalog.is_empty(), MissingCatalog); + // We use tenant as schema, so we use default schema if tenant is not specific + ensure!(!self.tenant.is_empty(), MissingTenant); + + let runtime = self.runtime.context(MissingRuntime)?; + + Ok(RequestContext { + catalog: self.catalog, + tenant: self.tenant, + runtime, + }) + } +} diff --git a/server/src/error.rs b/server/src/error.rs new file mode 100644 index 0000000000..47006fde7e --- /dev/null +++ b/server/src/error.rs @@ -0,0 +1,67 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Server error + +use common_util::define_result; +use snafu::Snafu; + +/// Server status code +#[derive(Debug, Clone, Copy)] +pub enum StatusCode { + Ok = 200, + InvalidArgument = 400, + NotFound = 404, + TooManyRequests = 429, + InternalError = 500, +} + +impl StatusCode { + pub fn as_u32(&self) -> u32 { + *self as u32 + } +} + +define_result!(ServerError); + +#[derive(Snafu, Debug)] +#[snafu(visibility(pub(crate)))] +pub enum ServerError { + #[snafu(display("Rpc error, code:{}, message:{}", code.as_u32(), msg))] + ErrNoCause { code: StatusCode, msg: String }, + + #[snafu(display("Rpc error, code:{}, message:{}, cause:{}", code.as_u32(), msg, source))] + ErrWithCause { + code: StatusCode, + msg: String, + source: Box, + }, +} + +impl ServerError { + pub fn code(&self) -> StatusCode { + match *self { + ServerError::ErrNoCause { code, .. } => code, + ServerError::ErrWithCause { code, .. } => code, + } + } + + /// Get the error message returned to the user. + pub fn error_message(&self) -> String { + match self { + ServerError::ErrNoCause { msg, .. } => msg.clone(), + + ServerError::ErrWithCause { msg, source, .. 
} => { + let err_string = source.to_string(); + let first_line = first_line_in_error(&err_string); + format!("{}. Caused by: {}", msg, first_line) + } + } + } +} + +/// Returns first line in error message, now we use this hack to exclude +/// backtrace from error message that returned to user. +// TODO(yingwen): Consider a better way to get the error message. +pub(crate) fn first_line_in_error(err_string: &str) -> &str { + err_string.split('\n').next().unwrap_or(err_string) +} diff --git a/server/src/grpc/metrics.rs b/server/src/grpc/metrics.rs new file mode 100644 index 0000000000..aec9f7acdc --- /dev/null +++ b/server/src/grpc/metrics.rs @@ -0,0 +1,42 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +// Grpc server metrics + +use lazy_static::lazy_static; +use prometheus::{exponential_buckets, register_histogram_vec, HistogramVec}; +use prometheus_static_metric::{auto_flush_from, make_auto_flush_static_metric}; + +// Register auto flush static metrics. +make_auto_flush_static_metric! { + pub label_enum GrpcTypeKind { + handle_route, + handle_write, + handle_query, + handle_stream_write, + handle_stream_query, + } + + pub struct GrpcHandlerDurationHistogramVec: LocalHistogram { + "type" => GrpcTypeKind, + } +} + +// Register global metrics. +lazy_static! { + pub static ref GRPC_HANDLER_DURATION_HISTOGRAM_VEC_GLOBAL: HistogramVec = + register_histogram_vec!( + "grpc_handler_duration", + "Bucketed histogram of grpc server handler", + &["type"], + exponential_buckets(0.0005, 2.0, 20).unwrap() + ) + .unwrap(); +} + +// Register thread local metrics with default flush interval (1s). +lazy_static! { + pub static ref GRPC_HANDLER_DURATION_HISTOGRAM_VEC: GrpcHandlerDurationHistogramVec = auto_flush_from!( + GRPC_HANDLER_DURATION_HISTOGRAM_VEC_GLOBAL, + GrpcHandlerDurationHistogramVec + ); +} diff --git a/server/src/grpc/mod.rs b/server/src/grpc/mod.rs new file mode 100644 index 0000000000..521400ab72 --- /dev/null +++ b/server/src/grpc/mod.rs @@ -0,0 +1,1034 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Grpc services + +use std::{ + collections::{BTreeMap, HashMap}, + sync::Arc, + time::Instant, +}; + +use async_trait::async_trait; +use catalog::{consts as catalogConst, manager::Manager as CatalogManager}; +use ceresdbproto::{ + common::ResponseHeader, + prometheus::{PrometheusQueryRequest, PrometheusQueryResponse}, + storage::{ + QueryRequest, QueryResponse, RouteRequest, RouteResponse, Value_oneof_value, WriteMetric, + WriteRequest, WriteResponse, + }, + storage_grpc::{self, StorageService}, +}; +use common_types::{ + column_schema::{self, ColumnSchema}, + datum::DatumKind, + schema::{Builder as SchemaBuilder, Error as SchemaError, Schema, TSID_COLUMN}, +}; +use common_util::{define_result, time::InstantExt}; +use futures::{stream::StreamExt, FutureExt, SinkExt, TryFutureExt}; +use grpcio::{ + ClientStreamingSink, Environment, Metadata, RequestStream, RpcContext, Server, ServerBuilder, + ServerStreamingSink, UnarySink, WriteFlags, +}; +use log::{error, info}; +use meta_client::{ + ClusterViewRef, FailGetCatalog, FailOnChangeView, MetaClient, MetaClientConfig, MetaWatcher, + SchemaConfig, +}; +use query_engine::executor::Executor as QueryExecutor; +use snafu::{ensure, Backtrace, OptionExt, ResultExt, Snafu}; +use sql::plan::CreateTablePlan; +use table_engine::engine::EngineRuntimes; +use tokio::sync::oneshot; + +use crate::{ + consts, + error::{ErrNoCause, ErrWithCause, Result as ServerResult, ServerError, StatusCode}, + grpc::metrics::GRPC_HANDLER_DURATION_HISTOGRAM_VEC, + instance::InstanceRef, + router::{Router, RouterRef, RuleBasedRouter, RuleList}, +}; + +mod metrics; +mod prom_query; +mod query; +mod route; +mod write; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "Failed to build rpc server, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + BuildRpcServer { + source: grpcio::Error, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to build meta client, err:{}", source))] + BuildMetaClient { source: meta_client::Error }, + + #[snafu(display("Failed to start meta client, err:{}", source))] + StartMetaClient { source: meta_client::Error }, + + #[snafu(display("Missing meta client config.\nBacktrace:\n{}", backtrace))] + MissingMetaClientConfig { backtrace: Backtrace }, + + #[snafu(display("Missing grpc environment.\nBacktrace:\n{}", backtrace))] + MissingEnv { backtrace: Backtrace }, + + #[snafu(display("Missing runtimes.\nBacktrace:\n{}", backtrace))] + MissingRuntimes { backtrace: Backtrace }, + + #[snafu(display("Missing instance.\nBacktrace:\n{}", backtrace))] + MissingInstance { backtrace: Backtrace }, + + #[snafu(display("Catalog name is not utf8.\nBacktrace:\n{}", backtrace))] + ParseCatalogName { + source: std::string::FromUtf8Error, + backtrace: Backtrace, + }, + + #[snafu(display("Schema name is not utf8.\nBacktrace:\n{}", backtrace))] + ParseSchemaName { + source: std::string::FromUtf8Error, + backtrace: Backtrace, + }, + + #[snafu(display("Fail to build table schema for metric: {}, err:{}", metric, source))] + BuildTableSchema { metric: String, source: SchemaError }, + + #[snafu(display( + "Fail to build column schema from column: {}, err:{}", + column_name, + source + ))] + BuildColumnSchema { + column_name: String, + source: column_schema::Error, + }, + #[snafu(display("Invalid column: {} schema, err:{}", column_name, source))] + InvalidColumnSchema { + column_name: String, + source: column_schema::Error, + }, + + #[snafu(display("Invalid argument: {}", msg))] + InvalidArgument { msg: String }, + + #[snafu(display( + "Failed to send 
response to grpc sink, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + GrpcSink { + source: grpcio::Error, + backtrace: Backtrace, + }, +} + +const STREAM_QUERY_CHANNEL_LEN: usize = 20; + +define_result!(Error); + +/// Rpc request header +#[derive(Debug, Default)] +pub struct RequestHeader { + metas: HashMap>, +} + +impl From<&Metadata> for RequestHeader { + fn from(meta: &Metadata) -> Self { + let metas = meta + .iter() + .map(|(k, v)| (k.to_string(), v.to_vec())) + .collect(); + + Self { metas } + } +} + +impl RequestHeader { + pub fn get(&self, key: &str) -> Option<&[u8]> { + self.metas.get(key).map(|v| v.as_slice()) + } +} + +pub struct HandlerContext<'a, C, Q> { + #[allow(dead_code)] + header: RequestHeader, + router: RouterRef, + instance: InstanceRef, + catalog: String, + schema: String, + schema_config: Option<&'a SchemaConfig>, +} + +impl<'a, C: CatalogManager, Q> HandlerContext<'a, C, Q> { + fn new( + header: RequestHeader, + router: Arc, + instance: InstanceRef, + cluster_view: &'a ClusterViewRef, + ) -> Result { + let default_catalog = instance.catalog_manager.default_catalog_name(); + let default_schema = instance.catalog_manager.default_schema_name(); + + let catalog = header + .get(consts::CATALOG_HEADER) + .map(|v| String::from_utf8(v.to_vec())) + .transpose() + .context(ParseCatalogName)? + .unwrap_or_else(|| default_catalog.to_string()); + + let schema = header + .get(consts::TENANT_HEADER) + .map(|v| String::from_utf8(v.to_vec())) + .transpose() + .context(ParseSchemaName)? + .unwrap_or_else(|| default_schema.to_string()); + + let schema_config = cluster_view.schema_configs.get(&schema); + + Ok(Self { + header, + router, + instance, + catalog, + schema, + schema_config, + }) + } + + #[inline] + fn catalog(&self) -> &str { + &self.catalog + } + + #[inline] + fn tenant(&self) -> &str { + &self.schema + } +} + +/// Rpc services manages all grpc services of the server. 
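+///
+/// Wraps the underlying grpcio `Server` together with the meta client: `start`
+/// first starts the meta client and then the grpc server (logging the bound
+/// addresses), while `shutdown` only stops the grpc server.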
+pub struct RpcServices { + /// The grpc server + rpc_server: Server, + /// Meta client + meta_client: Arc, +} + +impl RpcServices { + /// Start the rpc services + pub async fn start(&mut self) -> Result<()> { + self.meta_client.start().await.context(StartMetaClient)?; + + self.rpc_server.start(); + for (host, port) in self.rpc_server.bind_addrs() { + info!("Grpc server listening on {}:{}", host, port); + } + + Ok(()) + } + + pub fn shutdown(&mut self) { + self.rpc_server.shutdown(); + } +} + +pub struct Builder { + bind_addr: String, + port: u16, + meta_client_config: Option, + env: Option>, + runtimes: Option>, + instance: Option>, + route_rules: RuleList, +} + +impl Builder { + pub fn new() -> Self { + Self { + bind_addr: String::from("0.0.0.0"), + port: 38081, + meta_client_config: None, + env: None, + runtimes: None, + instance: None, + route_rules: RuleList::default(), + } + } + + pub fn bind_addr(mut self, addr: String) -> Self { + self.bind_addr = addr; + self + } + + pub fn port(mut self, port: u16) -> Self { + self.port = port; + self + } + + pub fn meta_client_config(mut self, config: MetaClientConfig) -> Self { + self.meta_client_config = Some(config); + self + } + + pub fn env(mut self, env: Arc) -> Self { + self.env = Some(env); + self + } + + pub fn runtimes(mut self, runtimes: Arc) -> Self { + self.runtimes = Some(runtimes); + self + } + + pub fn instance(mut self, instance: InstanceRef) -> Self { + self.instance = Some(instance); + self + } + + pub fn route_rules(mut self, route_rules: RuleList) -> Self { + self.route_rules = route_rules; + self + } +} + +impl Builder { + pub fn build(self) -> Result { + let meta_client_config = self.meta_client_config.context(MissingMetaClientConfig)?; + let runtimes = self.runtimes.context(MissingRuntimes)?; + let instance = self.instance.context(MissingInstance)?; + + let watcher = Box::new(SchemaWatcher { + catalog_manager: instance.catalog_manager.clone(), + }); + + let meta_client = meta_client::build_meta_client( + meta_client_config, + runtimes.bg_runtime.clone(), + Some(watcher), + ) + .context(BuildMetaClient)?; + let router = Arc::new(RuleBasedRouter::new(meta_client.clone(), self.route_rules)); + let storage_service = StorageServiceImpl { + router, + instance, + runtimes, + meta_client: meta_client.clone(), + }; + let rpc_service = storage_grpc::create_storage_service(storage_service); + + let env = self.env.context(MissingEnv)?; + + let rpc_server = ServerBuilder::new(env) + .register_service(rpc_service) + .bind(self.bind_addr, self.port) + .build() + .context(BuildRpcServer)?; + + Ok(RpcServices { + rpc_server, + meta_client, + }) + } +} + +struct SchemaWatcher { + catalog_manager: C, +} + +#[async_trait] +impl MetaWatcher for SchemaWatcher { + async fn on_change(&self, view: ClusterViewRef) -> meta_client::Result<()> { + for schema in view.schema_shards.keys() { + let default_catalog = catalogConst::DEFAULT_CATALOG; + if let Some(catalog) = self + .catalog_manager + .catalog_by_name(default_catalog) + .map_err(|e| Box::new(e) as _) + .context(FailGetCatalog { + catalog: default_catalog, + })? 
+ { + catalog + .create_schema(schema) + .await + .map_err(|e| Box::new(e) as _) + .context(FailOnChangeView { + schema, + catalog: default_catalog, + })?; + } + } + Ok(()) + } +} + +fn build_err_header(err: ServerError) -> ResponseHeader { + let mut header = ResponseHeader::new(); + header.set_code(err.code().as_u32()); + header.set_error(err.error_message()); + + header +} + +fn build_ok_header() -> ResponseHeader { + let mut header = ResponseHeader::new(); + header.set_code(StatusCode::Ok.as_u32()); + + header +} + +struct StorageServiceImpl { + router: Arc, + instance: InstanceRef, + runtimes: Arc, + meta_client: Arc, +} + +impl Clone for StorageServiceImpl { + fn clone(&self) -> Self { + Self { + router: self.router.clone(), + instance: self.instance.clone(), + runtimes: self.runtimes.clone(), + meta_client: self.meta_client.clone(), + } + } +} + +macro_rules! handle_request { + ($mod_name: ident, $handle_fn: ident, $req_ty: ident, $resp_ty: ident) => { + fn $mod_name(&mut self, ctx: RpcContext<'_>, req: $req_ty, sink: UnarySink<$resp_ty>) { + let begin_instant = Instant::now(); + + let router = self.router.clone(); + let header = RequestHeader::from(ctx.request_headers()); + let instance = self.instance.clone(); + let (tx, rx) = oneshot::channel(); + + // The future spawned by tokio cannot be executed by other executor/runtime, so + + let runtime = match stringify!($mod_name) { + "query" => &self.runtimes.read_runtime, + "write" => &self.runtimes.write_runtime, + _ => &self.runtimes.bg_runtime, + }; + + let cluster_view = self.meta_client.get_cluster_view(); + // we need to pass the result via channel + runtime.spawn( + async move { + let handler_ctx = HandlerContext::new(header, router, instance, &cluster_view) + .map_err(|e| Box::new(e) as _) + .context(ErrWithCause { + code: StatusCode::InvalidArgument, + msg: "Invalid header", + })?; + $mod_name::$handle_fn(&handler_ctx, req).await.map_err(|e| { + error!( + "Failed to handle request, mod:{}, handler:{}, err:{}", + stringify!($mod_name), + stringify!($handle_fn), + e + ); + e + }) + } + .then(|resp_result| async move { + if tx.send(resp_result).is_err() { + error!( + "Failed to send handler result, mod:{}, handler:{}", + stringify!($mod_name), + stringify!($handle_fn), + ) + } + }), + ); + + let task = async move { + let resp_result = match rx.await { + Ok(resp_result) => resp_result, + Err(_e) => ErrNoCause { + code: StatusCode::InternalError, + msg: "Result channel disconnected", + } + .fail(), + }; + + let resp = match resp_result { + Ok(resp) => resp, + Err(e) => { + let mut resp = $resp_ty::new(); + resp.set_header(build_err_header(e)); + resp + } + }; + let ret = sink.success(resp).await.context(GrpcSink); + + GRPC_HANDLER_DURATION_HISTOGRAM_VEC + .$handle_fn + .observe(begin_instant.saturating_elapsed().as_secs_f64()); + + ret?; + + Result::Ok(()) + } + .map_err(move |e| { + error!( + "Failed to reply grpc resp, mod:{}, handler:{}, err:{:?}", + stringify!($mod_name), + stringify!($handle_fn), + e + ) + }) + .map(|_| ()); + + ctx.spawn(task); + } + }; +} + +impl StorageService + for StorageServiceImpl +{ + handle_request!(route, handle_route, RouteRequest, RouteResponse); + + handle_request!(write, handle_write, WriteRequest, WriteResponse); + + handle_request!(query, handle_query, QueryRequest, QueryResponse); + + handle_request!( + prom_query, + handle_query, + PrometheusQueryRequest, + PrometheusQueryResponse + ); + + fn stream_write( + &mut self, + ctx: RpcContext<'_>, + mut stream_req: RequestStream, + sink: 
ClientStreamingSink, + ) { + let begin_instant = Instant::now(); + let router = self.router.clone(); + let header = RequestHeader::from(ctx.request_headers()); + let instance = self.instance.clone(); + let cluster_view = self.meta_client.get_cluster_view(); + + let (tx, rx) = oneshot::channel(); + self.runtimes.write_runtime.spawn(async move { + let handler_ctx = HandlerContext::new(header, router, instance, &cluster_view) + .map_err(|e| Box::new(e) as _) + .context(ErrWithCause { + code: StatusCode::InvalidArgument, + msg: "Invalid header", + })?; + let mut total_success = 0; + let mut resp = WriteResponse::new(); + let mut has_err = false; + while let Some(req) = stream_req.next().await { + let write_result = write::handle_write( + &handler_ctx, + req.map_err(|e| Box::new(e) as _).context(ErrWithCause { + code: StatusCode::InternalError, + msg: "Failed to fetch request", + })?, + ) + .await + .map_err(|e| { + error!("Failed to handle request, mod:stream_write, handler:handle_stream_write, err:{}", e); + e + }); + + match write_result { + Ok(write_resp) => total_success += write_resp.success, + Err(e) => { + resp.set_header(build_err_header(e)); + has_err = true; + break; + } + } + } + if !has_err { + resp.set_header(build_ok_header()); + resp.set_success(total_success as u32); + } + + ServerResult::Ok(resp) + }.then(|resp_result| async move { + if tx.send(resp_result).is_err() { + error!("Failed to send handler result, mod:stream_write, handler:handle_stream_write"); + } + }), + ); + + let task = async move { + let resp_result = match rx.await { + Ok(resp_result) => resp_result, + Err(_e) => ErrNoCause { + code: StatusCode::InternalError, + msg: "Result channel disconnected", + } + .fail(), + }; + + let resp = match resp_result { + Ok(resp) => resp, + Err(e) => { + let mut resp = WriteResponse::new(); + resp.set_header(build_err_header(e)); + resp + } + }; + sink.success(resp).await.context(GrpcSink)?; + + GRPC_HANDLER_DURATION_HISTOGRAM_VEC + .handle_stream_write + .observe(begin_instant.saturating_elapsed().as_secs_f64()); + + Result::Ok(()) + } + .map_err(move |e| { + error!( + "Failed to reply grpc resp, mod:stream_write, handler:handle_stream_write, err:{}", + e + ) + }) + .map(|_| ()); + + ctx.spawn(task); + } + + fn stream_query( + &mut self, + ctx: RpcContext<'_>, + req: QueryRequest, + mut sink: ServerStreamingSink, + ) { + let begin_instant = Instant::now(); + let router = self.router.clone(); + let header = RequestHeader::from(ctx.request_headers()); + let instance = self.instance.clone(); + let cluster_view = self.meta_client.get_cluster_view(); + let (tx, mut rx) = tokio::sync::mpsc::channel(STREAM_QUERY_CHANNEL_LEN); + self.runtimes.read_runtime.spawn(async move { + let handler_ctx = HandlerContext::new(header, router, instance, &cluster_view) + .map_err(|e| Box::new(e) as _) + .context(ErrWithCause { + code: StatusCode::InvalidArgument, + msg: "Invalid header", + })?; + let output = query::fetch_query_output(&handler_ctx, &req) + .await + .map_err(|e| { + error!("Failed to handle request, mod:stream_query, handler:handle_stream_query, err:{}", e); + e + })?; + if let Some(batch) = query::get_record_batch(&output) { + for i in 0..batch.len() { + let resp = query::convert_records(&batch[i..i + 1]); + if tx.send(resp).await.is_err() { + error!("Failed to send handler result, mod:stream_query, handler:handle_stream_query"); + break; + } + } + } else { + let mut resp = QueryResponse::new(); + resp.set_header(build_ok_header()); + + if 
tx.send(ServerResult::Ok(resp)).await.is_err() { + error!("Failed to send handler result, mod:stream_query, handler:handle_stream_query"); + } + } + ServerResult::Ok(()) + }); + + let mut has_err = false; + let task = async move { + while let Some(result) = rx.recv().await { + let resp = match result { + Ok(resp) => resp, + Err(e) => { + has_err = true; + let mut resp = QueryResponse::new(); + resp.set_header(build_err_header(e)); + resp + } + }; + sink.send((resp, WriteFlags::default())) + .await + .context(GrpcSink)?; + if has_err { + break; + } + } + sink.flush().await.context(GrpcSink)?; + sink.close().await.context(GrpcSink)?; + GRPC_HANDLER_DURATION_HISTOGRAM_VEC + .handle_stream_query + .observe(begin_instant.saturating_elapsed().as_secs_f64()); + Result::Ok(()) + } + .map_err(move |e| { + error!( + "Failed to reply grpc resp, mod:stream_query, handler:handle_stream_query, err:{}", + e + ); + }) + .map(|_| ()); + + ctx.spawn(task); + } +} + +/// Create CreateTablePlan from a write metric. +// The caller must ENSURE that the HandlerContext's schema_config is not None. +pub fn write_metric_to_create_table_plan< + C: CatalogManager + 'static, + Q: QueryExecutor + 'static, +>( + ctx: &HandlerContext, + write_metric: &WriteMetric, +) -> Result { + let schema_config = ctx.schema_config.unwrap(); + Ok(CreateTablePlan { + engine: schema_config.default_engine_type.clone(), + if_not_exists: true, + table: write_metric.get_metric().to_string(), + table_schema: build_schema_from_metric(schema_config, write_metric)?, + options: HashMap::default(), + }) +} + +fn build_column_schema( + column_name: &str, + data_type: DatumKind, + is_tag: bool, +) -> Result { + let builder = column_schema::Builder::new(column_name.to_string(), data_type) + .is_nullable(true) + .is_tag(is_tag); + + builder.build().context(BuildColumnSchema { column_name }) +} + +fn build_schema_from_metric(schema_config: &SchemaConfig, metric: &WriteMetric) -> Result { + let field_names = metric.get_field_names(); + let tag_names = metric.get_tag_names(); + let table_name = metric.get_metric(); + + let mut schema_builder = + SchemaBuilder::with_capacity(field_names.len()).auto_increment_column_id(true); + + let write_entries = metric.get_entries(); + + ensure!( + !write_entries.is_empty(), + InvalidArgument { + msg: format!("Emtpy write entires to write table:{}", table_name,), + } + ); + + let mut name_column_map: BTreeMap<_, ColumnSchema> = BTreeMap::new(); + for write_entry in write_entries { + // parse tags + for tag in write_entry.get_tags() { + let name_index = tag.name_index as usize; + ensure!( + name_index < tag_names.len(), + InvalidArgument { + msg: format!( + "tag index {} is not found in tag_names:{:?}, table:{}", + name_index, tag_names, table_name, + ), + } + ); + + let tag_name = &tag_names[name_index]; + + let tag_value = tag + .get_value() + .value + .as_ref() + .with_context(|| InvalidArgument { + msg: format!("Tag value is needed, tag_name:{} ", tag_name), + })?; + + let data_type = try_get_data_type_from_value(tag_value)?; + + if let Some(column_schema) = name_column_map.get(tag_name) { + ensure_data_type_compatible(table_name, tag_name, true, data_type, column_schema)?; + } + let column_schema = build_column_schema(tag_name, data_type, true)?; + name_column_map.insert(tag_name, column_schema); + } + + // parse fields + for field_group in write_entry.get_field_groups().iter() { + for field in field_group.get_fields() { + if (field.name_index as usize) < field_names.len() { + let field_name = 
&field_names[field.name_index as usize]; + let field_value = + field + .get_value() + .value + .as_ref() + .with_context(|| InvalidArgument { + msg: format!( + "Field: {} value is needed, table:{}", + field_name, table_name + ), + })?; + + let data_type = try_get_data_type_from_value(field_value)?; + + if let Some(column_schema) = name_column_map.get(field_name) { + ensure_data_type_compatible( + table_name, + field_name, + false, + data_type, + column_schema, + )?; + } + + let column_schema = build_column_schema(field_name, data_type, false)?; + name_column_map.insert(field_name, column_schema); + } + } + } + } + + // Timestamp column will be the last column + let timestamp_column_schema = column_schema::Builder::new( + schema_config.default_timestamp_column_name.clone(), + DatumKind::Timestamp, + ) + .is_nullable(false) + .build() + .context(InvalidColumnSchema { + column_name: TSID_COLUMN, + })?; + + // Use (timestamp, tsid) as primary key. + let tsid_column_schema = + column_schema::Builder::new(TSID_COLUMN.to_string(), DatumKind::UInt64) + .is_nullable(false) + .build() + .context(InvalidColumnSchema { + column_name: TSID_COLUMN, + })?; + + schema_builder = schema_builder + .enable_tsid_primary_key(true) + .add_key_column(timestamp_column_schema) + .with_context(|| BuildTableSchema { metric: table_name })? + .add_key_column(tsid_column_schema) + .with_context(|| BuildTableSchema { metric: table_name })?; + + for col in name_column_map.into_values() { + schema_builder = schema_builder + .add_normal_column(col) + .with_context(|| BuildTableSchema { metric: table_name })?; + } + + schema_builder.build().with_context(|| BuildTableSchema { + metric: metric.get_metric(), + }) +} + +fn ensure_data_type_compatible( + table_name: &str, + column_name: &str, + is_tag: bool, + data_type: DatumKind, + column_schema: &ColumnSchema, +) -> Result<()> { + ensure!( + column_schema.is_tag == is_tag, + InvalidArgument { + msg: format!( + "Duplicated column: {} in fields and tags for table: {}", + column_name, table_name, + ), + } + ); + ensure!( + column_schema.data_type == data_type, + InvalidArgument { + msg: format!( + "Column: {} in table: {} data type is not same, expected: {}, actual: {}", + column_name, table_name, column_schema.data_type, data_type, + ), + } + ); + Ok(()) +} + +fn try_get_data_type_from_value(value: &Value_oneof_value) -> Result { + match value { + Value_oneof_value::float64_value(_) => Ok(DatumKind::Double), + Value_oneof_value::string_value(_) => Ok(DatumKind::String), + Value_oneof_value::int64_value(_) => Ok(DatumKind::Int64), + Value_oneof_value::float32_value(_) => Ok(DatumKind::Float), + Value_oneof_value::int32_value(_) => Ok(DatumKind::Int32), + Value_oneof_value::int16_value(_) => Ok(DatumKind::Int16), + Value_oneof_value::int8_value(_) => Ok(DatumKind::Int8), + Value_oneof_value::bool_value(_) => Ok(DatumKind::Boolean), + Value_oneof_value::uint64_value(_) => Ok(DatumKind::UInt64), + Value_oneof_value::uint32_value(_) => Ok(DatumKind::UInt32), + Value_oneof_value::uint16_value(_) => Ok(DatumKind::UInt16), + Value_oneof_value::uint8_value(_) => Ok(DatumKind::UInt8), + Value_oneof_value::timestamp_value(_) => Ok(DatumKind::Timestamp), + Value_oneof_value::varbinary_value(_) => Ok(DatumKind::Varbinary), + } +} + +#[cfg(test)] +mod tests { + use ceresdbproto::storage::{Field, FieldGroup, Tag, Value, WriteEntry, WriteMetric}; + use common_types::datum::DatumKind; + use meta_client::SchemaConfig; + + use super::*; + + const TAG1: &str = "host"; + const TAG2: &str = "idc"; + 
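+    // Names of the fields and of the metric used by the synthetic write
+    // built in `generate_write_metric` below.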
const FIELD1: &str = "cpu"; + const FIELD2: &str = "memory"; + const FIELD3: &str = "log"; + const FIELD4: &str = "ping_ok"; + const METRIC: &str = "pod_system_metric"; + const TIMESTAMP_COLUMN: &str = "custom_timestamp"; + + fn generate_write_metric() -> WriteMetric { + let mut write_metric = WriteMetric::default(); + write_metric.set_metric(METRIC.to_string()); + + let tag_names = vec![TAG1.to_string(), TAG2.to_string()]; + let field_names = vec![ + FIELD1.to_string(), + FIELD2.to_string(), + FIELD3.to_string(), + FIELD4.to_string(), + ]; + + write_metric.set_field_names(field_names.into()); + write_metric.set_tag_names(tag_names.into()); + + //tags + let mut tag1 = Tag::new(); + tag1.set_name_index(0); + let mut tag_val1 = Value::new(); + tag_val1.set_string_value("test.host".to_string()); + tag1.set_value(tag_val1); + let mut tag2 = Tag::new(); + tag2.set_name_index(1); + let mut tag_val2 = Value::new(); + tag_val2.set_string_value("test.idc".to_string()); + tag2.set_value(tag_val2); + let tags = vec![tag1, tag2]; + + //fields + let mut field1 = Field::new(); + field1.set_name_index(0); + let mut field_val1 = Value::new(); + field_val1.set_float64_value(100.0); + field1.set_value(field_val1); + let mut field2 = Field::new(); + field2.set_name_index(1); + let mut field_val2 = Value::new(); + field_val2.set_float64_value(1024.0); + field2.set_value(field_val2); + let mut field3 = Field::new(); + field3.set_name_index(2); + let mut field_val3 = Value::new(); + field_val3.set_string_value("test log".to_string()); + field3.set_value(field_val3); + let mut field4 = Field::new(); + field4.set_name_index(3); + let mut field_val4 = Value::new(); + field_val4.set_bool_value(true); + field4.set_value(field_val4); + + let mut field_group1 = FieldGroup::new(); + field_group1.set_timestamp(1000); + field_group1.set_fields(vec![field1.clone(), field4].into()); + + let mut field_group2 = FieldGroup::new(); + field_group2.set_timestamp(2000); + field_group2.set_fields(vec![field1.clone(), field2.clone()].into()); + + let mut field_group3 = FieldGroup::new(); + field_group3.set_timestamp(3000); + field_group3.set_fields(vec![field3].into()); + + let mut write_entry = WriteEntry::new(); + write_entry.set_tags(tags.into()); + write_entry.set_field_groups(vec![field_group1, field_group2, field_group3].into()); + + write_metric.set_entries(vec![write_entry].into()); + + write_metric + } + + #[test] + fn test_build_schema_from_metric() { + let schema_config = SchemaConfig { + auto_create_tables: true, + default_timestamp_column_name: TIMESTAMP_COLUMN.to_string(), + ..SchemaConfig::default() + }; + let write_metric = generate_write_metric(); + + let schema = build_schema_from_metric(&schema_config, &write_metric); + assert!(schema.is_ok()); + + let schema = schema.unwrap(); + + assert_eq!(8, schema.num_columns()); + assert_eq!(2, schema.num_key_columns()); + assert_eq!(TIMESTAMP_COLUMN, schema.timestamp_name()); + let tsid = schema.tsid_column(); + assert!(tsid.is_some()); + + let key_columns = schema.key_columns(); + assert_eq!(2, key_columns.len()); + assert_eq!(TIMESTAMP_COLUMN, key_columns[0].name); + assert_eq!("tsid", key_columns[1].name); + + let columns = schema.normal_columns(); + assert_eq!(6, columns.len()); + + // sorted by column names because of btree + assert_eq!(FIELD1, columns[0].name); + assert!(!columns[0].is_tag); + assert_eq!(DatumKind::Double, columns[0].data_type); + assert_eq!(TAG1, columns[1].name); + assert!(columns[1].is_tag); + assert_eq!(DatumKind::String, columns[1].data_type); + 
assert_eq!(TAG2, columns[2].name); + assert!(columns[2].is_tag); + assert_eq!(DatumKind::String, columns[2].data_type); + assert_eq!(FIELD3, columns[3].name); + assert!(!columns[3].is_tag); + assert_eq!(DatumKind::String, columns[3].data_type); + assert_eq!(FIELD2, columns[4].name); + assert!(!columns[4].is_tag); + assert_eq!(DatumKind::Double, columns[4].data_type); + assert_eq!(FIELD4, columns[5].name); + assert!(!columns[5].is_tag); + assert_eq!(DatumKind::Boolean, columns[5].data_type); + + for column in columns { + assert!(column.is_nullable); + } + } +} diff --git a/server/src/grpc/prom_query.rs b/server/src/grpc/prom_query.rs new file mode 100644 index 0000000000..44916788ff --- /dev/null +++ b/server/src/grpc/prom_query.rs @@ -0,0 +1,467 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::{ + collections::{BTreeMap, HashMap}, + sync::Arc, +}; + +use catalog::manager::Manager as CatalogManager; +use ceresdbproto::{ + common::ResponseHeader, + prometheus::{Label, PrometheusQueryRequest, PrometheusQueryResponse, Sample, TimeSeries}, +}; +use common_types::{ + datum::DatumKind, + record_batch::RecordBatch, + request_id::RequestId, + schema::{RecordSchema, TSID_COLUMN}, +}; +use interpreters::{context::Context as InterpreterContext, factory::Factory, interpreter::Output}; +use log::debug; +use query_engine::executor::{Executor as QueryExecutor, RecordBatchVec}; +use snafu::{ensure, OptionExt, ResultExt}; +use sql::{ + frontend::{Context as SqlContext, Frontend}, + promql::ColumnNames, + provider::CatalogMetaProvider, +}; + +use crate::{ + error::{ErrNoCause, ErrWithCause, Result, ServerError, StatusCode}, + grpc::HandlerContext, +}; + +pub async fn handle_query( + ctx: &HandlerContext<'_, C, Q>, + req: PrometheusQueryRequest, +) -> Result +where + C: CatalogManager + 'static, + Q: QueryExecutor + 'static, +{ + let request_id = RequestId::next_id(); + + debug!( + "Grpc handle query begin, catalog:{}, tenant:{}, request_id:{}, request:{:?}", + ctx.catalog(), + ctx.tenant(), + request_id, + req, + ); + + let instance = &ctx.instance; + // We use tenant as schema + // TODO(yingwen): Privilege check, cannot access data of other tenant + // TODO(yingwen): Maybe move MetaProvider to instance + let provider = CatalogMetaProvider { + manager: &instance.catalog_manager, + default_catalog: ctx.catalog(), + default_schema: ctx.tenant(), + function_registry: &*instance.function_registry, + }; + let frontend = Frontend::new(provider); + + let mut sql_ctx = SqlContext::new(request_id); + let expr = frontend + .parse_promql(&mut sql_ctx, req) + .map_err(|e| Box::new(e) as _) + .context(ErrWithCause { + code: StatusCode::InvalidArgument, + msg: "Invalid request", + })?; + + let (plan, column_name) = frontend + .promql_expr_to_plan(&mut sql_ctx, expr) + .map_err(|e| { + // TODO(chenxiang): improve error match + let code = if e.to_string().contains("Table not found") { + StatusCode::NotFound + } else { + StatusCode::InternalError + }; + ServerError::ErrWithCause { + code, + msg: "Failed to create plan".to_string(), + source: Box::new(e), + } + })?; + + if ctx.instance.limiter.should_limit(&plan) { + ErrNoCause { + code: StatusCode::TooManyRequests, + msg: "Query limited by reject list", + } + .fail()?; + } + + // Execute in interpreter + let interpreter_ctx = InterpreterContext::builder(request_id) + // Use current ctx's catalog and tenant as default catalog and tenant + .default_catalog_and_schema(ctx.catalog().to_string(), ctx.tenant().to_string()) + .build(); + let 
interpreter_factory = Factory::new( + instance.query_executor.clone(), + instance.catalog_manager.clone(), + instance.table_engine.clone(), + ); + let interpreter = interpreter_factory.create(interpreter_ctx, plan); + + let output = interpreter + .execute() + .await + .map_err(|e| Box::new(e) as _) + .with_context(|| ErrWithCause { + code: StatusCode::InternalError, + msg: "Failed to execute interpreter", + })?; + + let resp = convert_output(output, column_name) + .map_err(|e| Box::new(e) as _) + .with_context(|| ErrWithCause { + code: StatusCode::InternalError, + msg: "Failed to convert output", + })?; + + Ok(resp) +} + +fn convert_output( + output: Output, + column_name: Arc, +) -> Result { + match output { + Output::Records(records) => convert_records(records, column_name), + _ => unreachable!(), + } +} + +fn convert_records( + records: RecordBatchVec, + column_name: Arc, +) -> Result { + if records.is_empty() { + return Ok(empty_ok_resp()); + } + + let mut resp = empty_ok_resp(); + let mut tsid_to_tags = HashMap::new(); + let mut tsid_to_samples = HashMap::new(); + + // TODO(chenxiang): benchmark iterator by columns + for record_batch in records { + let converter = RecordConverter::try_new(&column_name, record_batch.schema())?; + + for (tsid, samples) in converter.convert_to_samples(record_batch, &mut tsid_to_tags) { + tsid_to_samples + .entry(tsid) + .or_insert_with(Vec::new) + .extend(samples) + } + } + + let series_set = tsid_to_samples + .into_iter() + .map(|(tsid, samples)| { + let tags = tsid_to_tags + .get(&tsid) + .expect("ensured in convert_to_samples"); + let mut timeseries = TimeSeries::new(); + timeseries.set_labels( + tags.iter() + .map(|(k, v)| { + let mut label = Label::new(); + label.set_name(k.clone()); + label.set_value(v.clone()); + label + }) + .collect::>() + .into(), + ); + timeseries.set_samples(samples.into()); + timeseries + }) + .collect::>(); + + resp.set_timeseries(series_set.into()); + Ok(resp) +} + +fn empty_ok_resp() -> PrometheusQueryResponse { + let mut header = ResponseHeader::new(); + header.code = StatusCode::Ok.as_u32(); + + let mut resp = PrometheusQueryResponse::new(); + resp.set_header(header); + + resp +} + +/// RecordConverter convert RecordBatch to time series format required by PromQL +struct RecordConverter { + tsid_idx: usize, + timestamp_idx: usize, + tags_idx: BTreeMap, // tag_key -> column_index + field_idx: usize, +} + +impl RecordConverter { + fn try_new(column_name: &ColumnNames, record_schema: &RecordSchema) -> Result { + let tsid_idx = record_schema + .index_of(TSID_COLUMN) + .with_context(|| ErrNoCause { + code: StatusCode::InvalidArgument, + msg: "Failed to find Tsid column".to_string(), + })?; + let timestamp_idx = record_schema + .index_of(&column_name.timestamp) + .with_context(|| ErrNoCause { + code: StatusCode::InvalidArgument, + msg: "Failed to find Timestamp column".to_string(), + })?; + ensure!( + record_schema.column(timestamp_idx).data_type == DatumKind::Timestamp, + ErrNoCause { + code: StatusCode::InvalidArgument, + msg: "Timestamp column should be timestamp type" + } + ); + let field_idx = record_schema + .index_of(&column_name.field) + .with_context(|| ErrNoCause { + code: StatusCode::InvalidArgument, + msg: format!("Failed to find {} column", column_name.field), + })?; + let field_type = record_schema.column(field_idx).data_type; + ensure!( + field_type.is_f64_castable(), + ErrNoCause { + code: StatusCode::InvalidArgument, + msg: format!( + "Field type must be f64-compatibile type, current:{}", + field_type + ) + } 
+ ); + + let tags_idx: BTreeMap<_, _> = column_name + .tag_keys + .iter() + .filter_map(|tag_key| { + record_schema + .index_of(tag_key) + .map(|idx| (tag_key.to_string(), idx)) + }) + .collect(); + + Ok(Self { + tsid_idx, + timestamp_idx, + tags_idx, + field_idx, + }) + } + + fn convert_to_samples( + &self, + record_batch: RecordBatch, + tsid_to_tags: &mut HashMap>, + ) -> HashMap> { + let mut tsid_to_samples = HashMap::new(); + + let tsid_cols = record_batch.column(self.tsid_idx); + let timestamp_cols = record_batch.column(self.timestamp_idx); + let field_cols = record_batch.column(self.field_idx); + for row_idx in 0..record_batch.num_rows() { + let timestamp = timestamp_cols + .datum(row_idx) + .as_timestamp() + .expect("checked in try_new") + .as_i64(); + let field = field_cols + .datum(row_idx) + .as_f64() + .expect("checked in try_new"); + let tsid = tsid_cols + .datum(row_idx) + .as_u64() + .expect("checked in try_new"); + + tsid_to_tags.entry(tsid).or_insert_with(|| { + self.tags_idx + .iter() + .filter_map(|(tag_key, col_idx)| { + // TODO(chenxiang): avoid clone? + record_batch + .column(*col_idx) + .datum(row_idx) + .as_str() + .and_then(|tag_value| { + // filter empty tag value out, since Prometheus don't allow it. + if tag_value.is_empty() { + None + } else { + Some((tag_key.clone(), tag_value.to_string())) + } + }) + }) + .collect::>() + }); + + let samples = tsid_to_samples.entry(tsid).or_insert_with(Vec::new); + let mut sample = Sample::new(); + sample.set_value(field); + sample.set_timestamp(timestamp); + samples.push(sample); + } + + tsid_to_samples + } +} + +#[cfg(test)] +mod tests { + + use common_types::{ + column::{ColumnBlock, ColumnBlockBuilder}, + column_schema, + datum::Datum, + row::Row, + schema, + string::StringBytes, + time::Timestamp, + }; + + use super::*; + + fn build_schema() -> schema::Schema { + schema::Builder::new() + .auto_increment_column_id(true) + .enable_tsid_primary_key(true) + .add_key_column( + column_schema::Builder::new("timestamp".to_string(), DatumKind::Timestamp) + .build() + .unwrap(), + ) + .unwrap() + .add_key_column( + column_schema::Builder::new(TSID_COLUMN.to_string(), DatumKind::UInt64) + .build() + .unwrap(), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("field1".to_string(), DatumKind::Double) + .build() + .unwrap(), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("tag1".to_string(), DatumKind::String) + .is_tag(true) + .build() + .unwrap(), + ) + .unwrap() + .build() + .unwrap() + } + + fn build_column_block() -> Vec { + let build_row = |ts: i64, tsid: u64, field1: f64, field2: &str| -> Row { + let datums = vec![ + Datum::Timestamp(Timestamp::new(ts)), + Datum::UInt64(tsid), + Datum::Double(field1), + Datum::String(StringBytes::from(field2)), + ]; + + Row::from_datums(datums) + }; + + let rows = vec![ + build_row(1000001, 1, 10.0, "v5"), + build_row(1000002, 1, 11.0, "v5"), + build_row(1000000, 2, 10.0, "v4"), + build_row(1000000, 3, 10.0, "v3"), + ]; + + let mut builder = ColumnBlockBuilder::with_capacity(&DatumKind::Timestamp, 2); + for row in &rows { + builder.append(row[0].clone()).unwrap(); + } + let timestamp_block = builder.build(); + + let mut builder = ColumnBlockBuilder::with_capacity(&DatumKind::UInt64, 2); + for row in &rows { + builder.append(row[1].clone()).unwrap(); + } + let tsid_block = builder.build(); + + let mut builder = ColumnBlockBuilder::with_capacity(&DatumKind::Double, 2); + for row in &rows { + builder.append(row[2].clone()).unwrap(); + } + let field_block = 
builder.build(); + + let mut builder = ColumnBlockBuilder::with_capacity(&DatumKind::String, 2); + for row in &rows { + builder.append(row[3].clone()).unwrap(); + } + let tag_block = builder.build(); + + vec![timestamp_block, tsid_block, field_block, tag_block] + } + + fn make_sample(timestamp: i64, value: f64) -> Sample { + let mut sample = Sample::new(); + sample.set_value(value); + sample.set_timestamp(timestamp); + sample + } + + fn make_tags(tags: Vec<(String, String)>) -> BTreeMap { + tags.into_iter().collect::>() + } + + #[test] + fn test_record_convert() { + let schema = build_schema(); + let record_schema = schema.to_record_schema(); + let column_blocks = build_column_block(); + let record_batch = RecordBatch::new(record_schema, column_blocks).unwrap(); + + let column_name = ColumnNames { + timestamp: "timestamp".to_string(), + tag_keys: vec!["tag1".to_string()], + field: "field1".to_string(), + }; + let converter = RecordConverter::try_new(&column_name, &schema.to_record_schema()).unwrap(); + let mut tsid_to_tags = HashMap::new(); + let tsid_to_samples = converter.convert_to_samples(record_batch, &mut tsid_to_tags); + + assert_eq!( + tsid_to_samples.get(&1).unwrap().clone(), + vec![make_sample(1000001, 10.0), make_sample(1000002, 11.0)] + ); + assert_eq!( + tsid_to_samples.get(&2).unwrap().clone(), + vec![make_sample(1000000, 10.0)] + ); + assert_eq!( + tsid_to_samples.get(&3).unwrap().clone(), + vec![make_sample(1000000, 10.0)] + ); + assert_eq!( + tsid_to_tags.get(&1).unwrap().clone(), + make_tags(vec![("tag1".to_string(), "v5".to_string())]) + ); + assert_eq!( + tsid_to_tags.get(&2).unwrap().clone(), + make_tags(vec![("tag1".to_string(), "v4".to_string())]) + ); + assert_eq!( + tsid_to_tags.get(&3).unwrap().clone(), + make_tags(vec![("tag1".to_string(), "v3".to_string())]) + ); + } +} diff --git a/server/src/grpc/query.rs b/server/src/grpc/query.rs new file mode 100644 index 0000000000..9c36a196c4 --- /dev/null +++ b/server/src/grpc/query.rs @@ -0,0 +1,224 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Query handler + +use std::time::Instant; + +use catalog::manager::Manager as CatalogManager; +use ceresdbproto::{ + common::ResponseHeader, + storage::{QueryRequest, QueryResponse, QueryResponse_SchemaType}, +}; +use common_types::{record_batch::RecordBatch, request_id::RequestId}; +use common_util::time::InstantExt; +use interpreters::{context::Context as InterpreterContext, factory::Factory, interpreter::Output}; +use log::info; +use query_engine::executor::{Executor as QueryExecutor, RecordBatchVec}; +use snafu::{ensure, ResultExt}; +use sql::{ + frontend::{Context as SqlContext, Frontend}, + provider::CatalogMetaProvider, +}; + +use crate::{ + avro_util, + error::{ErrNoCause, ErrWithCause, Result, StatusCode}, + grpc::HandlerContext, +}; + +/// Schema name of the record +const RECORD_NAME: &str = "Result"; + +fn empty_ok_resp() -> QueryResponse { + let mut header = ResponseHeader::new(); + header.code = StatusCode::Ok.as_u32(); + + let mut resp = QueryResponse::new(); + resp.set_header(header); + + resp +} + +pub async fn handle_query( + ctx: &HandlerContext<'_, C, Q>, + req: QueryRequest, +) -> Result { + let output_result = fetch_query_output(ctx, &req).await?; + if let Some(output) = output_result { + convert_output(&output) + .map_err(|e| Box::new(e) as _) + .with_context(|| ErrWithCause { + code: StatusCode::InternalError, + msg: format!("Failed to convert output, query:{}", &req.ql), + }) + } else { + Ok(empty_ok_resp()) + } +} + +pub async fn fetch_query_output( + ctx: &HandlerContext<'_, C, Q>, + req: &QueryRequest, +) -> Result> { + let request_id = RequestId::next_id(); + let begin_instant = Instant::now(); + + info!( + "Grpc handle query begin, catalog:{}, tenant:{}, request_id:{}, request:{:?}", + ctx.catalog(), + ctx.tenant(), + request_id, + req, + ); + + let instance = &ctx.instance; + // We use tenant as schema + // TODO(yingwen): Privilege check, cannot access data of other tenant + // TODO(yingwen): Maybe move MetaProvider to instance + let provider = CatalogMetaProvider { + manager: &instance.catalog_manager, + default_catalog: ctx.catalog(), + default_schema: ctx.tenant(), + function_registry: &*instance.function_registry, + }; + let frontend = Frontend::new(provider); + + let mut sql_ctx = SqlContext::new(request_id); + // Parse sql, frontend error of invalid sql already contains sql + // TODO(yingwen): Maybe move sql from frontend error to outer error + let mut stmts = frontend + .parse_sql(&mut sql_ctx, &req.ql) + .map_err(|e| Box::new(e) as _) + .context(ErrWithCause { + code: StatusCode::InvalidArgument, + msg: "Failed to parse sql", + })?; + + if stmts.is_empty() { + return Ok(None); + } + + // TODO(yingwen): For simplicity, we only support executing one statement now + // TODO(yingwen): INSERT/UPDATE/DELETE can be batched + ensure!( + stmts.len() == 1, + ErrNoCause { + code: StatusCode::InvalidArgument, + msg: format!( + "Only support execute one statement now, current num:{}, query:{}", + stmts.len(), + req.ql + ), + } + ); + + // Create logical plan + // Note: Remember to store sql in error when creating logical plan + let plan = frontend + // TODO(yingwen): Check error, some error may indicate that the sql is invalid. 
Now we + // return internal server error in those cases + .statement_to_plan(&mut sql_ctx, stmts.remove(0)) + .map_err(|e| Box::new(e) as _) + .with_context(|| ErrWithCause { + code: StatusCode::InternalError, + msg: format!("Failed to create plan, query:{}", req.ql), + })?; + + if ctx.instance.limiter.should_limit(&plan) { + ErrNoCause { + code: StatusCode::TooManyRequests, + msg: "Query limited by reject list", + } + .fail()?; + } + + // Execute in interpreter + let interpreter_ctx = InterpreterContext::builder(request_id) + // Use current ctx's catalog and tenant as default catalog and tenant + .default_catalog_and_schema(ctx.catalog().to_string(), ctx.tenant().to_string()) + .build(); + let interpreter_factory = Factory::new( + instance.query_executor.clone(), + instance.catalog_manager.clone(), + instance.table_engine.clone(), + ); + let interpreter = interpreter_factory.create(interpreter_ctx, plan); + + let output = interpreter + .execute() + .await + .map_err(|e| Box::new(e) as _) + .with_context(|| ErrWithCause { + code: StatusCode::InternalError, + msg: format!("Failed to execute interpreter, query:{}", req.ql), + })?; + + info!( + "Grpc handle query success, catalog:{}, tenant:{}, request_id:{}, cost:{}, request:{:?}", + ctx.catalog(), + ctx.tenant(), + request_id, + begin_instant.saturating_elapsed().as_millis(), + req, + ); + + Ok(Some(output)) +} + +fn convert_output(output: &Output) -> Result { + match output { + Output::Records(records) => convert_records(records), + _ => unreachable!(), + } +} + +pub fn get_record_batch(op: &Option) -> Option<&RecordBatchVec> { + if let Some(output) = op { + match output { + Output::Records(records) => Some(records), + _ => unreachable!(), + } + } else { + None + } +} + +/// REQUIRE: records have same schema +pub fn convert_records(records: &[RecordBatch]) -> Result { + if records.is_empty() { + return Ok(empty_ok_resp()); + } + + let mut resp = empty_ok_resp(); + let mut avro_schema_opt = None; + + let total_row = records.iter().map(|v| v.num_rows()).sum(); + let mut rows = Vec::with_capacity(total_row); + for record_batch in records { + let avro_schema = match avro_schema_opt.as_ref() { + Some(schema) => schema, + None => { + let avro_schema = avro_util::to_avro_schema(RECORD_NAME, record_batch.schema()); + + // We only set schema_json once, so all record batches need to have same schema + resp.schema_type = QueryResponse_SchemaType::AVRO; + resp.schema_content = avro_schema.canonical_form(); + + avro_schema_opt = Some(avro_schema); + + avro_schema_opt.as_ref().unwrap() + } + }; + + avro_util::record_batch_to_avro(record_batch, avro_schema, &mut rows) + .map_err(|e| Box::new(e) as _) + .context(ErrWithCause { + code: StatusCode::InternalError, + msg: "Failed to convert record batch", + })?; + } + + resp.set_rows(rows.into()); + + Ok(resp) +} diff --git a/server/src/grpc/route.rs b/server/src/grpc/route.rs new file mode 100644 index 0000000000..ec0f354637 --- /dev/null +++ b/server/src/grpc/route.rs @@ -0,0 +1,35 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Route handler + +use std::sync::Arc; + +use catalog::manager::Manager; +use ceresdbproto::storage::{RouteRequest, RouteResponse}; + +use crate::{ + error::Result, + grpc::{self, HandlerContext}, + router::Router, +}; + +pub async fn handle_route( + ctx: &HandlerContext<'_, C, Q>, + req: RouteRequest, +) -> Result { + handle_route_sync(ctx.router.clone(), req, ctx.tenant()) +} + +fn handle_route_sync( + router: Arc, + req: RouteRequest, + schema: &str, +) -> Result { + let route_vec = router.route(schema, req)?; + + let mut resp = RouteResponse::new(); + resp.set_header(grpc::build_ok_header()); + resp.set_routes(route_vec.into()); + + Ok(resp) +} diff --git a/server/src/grpc/write.rs b/server/src/grpc/write.rs new file mode 100644 index 0000000000..55f1880d57 --- /dev/null +++ b/server/src/grpc/write.rs @@ -0,0 +1,586 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Write handler + +use std::collections::HashMap; + +use catalog::manager::Manager as CatalogManager; +use ceresdbproto::storage::{ + Value_oneof_value, WriteEntry, WriteMetric, WriteRequest, WriteResponse, +}; +use common_types::{ + bytes::Bytes, + datum::{Datum, DatumKind}, + request_id::RequestId, + row::{Row, RowGroupBuilder}, + schema::Schema, + time::Timestamp, +}; +use interpreters::{context::Context as InterpreterContext, factory::Factory, interpreter::Output}; +use log::debug; +use query_engine::executor::Executor as QueryExecutor; +use snafu::{ensure, OptionExt, ResultExt}; +use sql::plan::{InsertPlan, Plan}; +use table_engine::table::TableRef; + +use crate::{ + error::{ErrNoCause, ErrWithCause, Result, StatusCode}, + grpc::{self, HandlerContext}, +}; + +pub(crate) async fn handle_write( + ctx: &HandlerContext<'_, C, Q>, + req: WriteRequest, +) -> Result { + let request_id = RequestId::next_id(); + + debug!( + "Grpc handle write begin, catalog:{}, tenant:{}, request_id:{}, first_table:{:?}, num_tables:{}", + ctx.catalog(), + ctx.tenant(), + request_id, + req.get_metrics() + .first() + .map(|m| (m.get_metric(), m.get_tag_names(), m.get_field_names())), + req.get_metrics().len(), + ); + + let instance = &ctx.instance; + let plan_vec = write_request_to_insert_plan(ctx, req, request_id).await?; + + let mut success = 0; + for insert_plan in plan_vec { + debug!( + "Grpc handle write table begin, table:{}, row_num:{}", + insert_plan.table.name(), + insert_plan.rows.num_rows() + ); + let plan = Plan::Insert(insert_plan); + + if ctx.instance.limiter.should_limit(&plan) { + ErrNoCause { + code: StatusCode::TooManyRequests, + msg: "Insert limited by reject list", + } + .fail()?; + } + + let interpreter_ctx = InterpreterContext::builder(request_id) + // Use current ctx's catalog and tenant as default catalog and tenant + .default_catalog_and_schema(ctx.catalog().to_string(), ctx.tenant().to_string()) + .build(); + let interpreter_factory = Factory::new( + instance.query_executor.clone(), + instance.catalog_manager.clone(), + instance.table_engine.clone(), + ); + let interpreter = interpreter_factory.create(interpreter_ctx, plan); + + let row_num = match interpreter + .execute() + .await + .map_err(|e| Box::new(e) as _) + .context(ErrWithCause { + code: StatusCode::InternalError, + msg: "Failed to execute interpreter", + })? 
{ + Output::AffectedRows(n) => n, + _ => unreachable!(), + }; + + success += row_num; + } + + let mut resp = WriteResponse::new(); + resp.set_header(grpc::build_ok_header()); + resp.set_success(success as u32); + + debug!( + "Grpc handle write finished, catalog:{}, tenant:{}, resp:{:?}", + ctx.catalog(), + ctx.tenant(), + resp + ); + + Ok(resp) +} + +async fn write_request_to_insert_plan( + ctx: &HandlerContext<'_, C, Q>, + mut write_request: WriteRequest, + request_id: RequestId, +) -> Result> { + let mut plan_vec = Vec::with_capacity(write_request.get_metrics().len()); + + for write_metric in write_request.take_metrics() { + let table_name = write_metric.get_metric(); + let mut table = try_get_table(ctx, table_name)?; + + if table.is_none() { + if let Some(config) = ctx.schema_config { + if config.auto_create_tables { + create_table(ctx, &write_metric, request_id).await?; + // try to get table again + table = try_get_table(ctx, table_name)?; + } + } + } + + match table { + Some(table) => { + let plan = write_metric_to_insert_plan(table, write_metric)?; + plan_vec.push(plan); + } + None => { + return ErrNoCause { + code: StatusCode::InvalidArgument, + msg: format!("Table not found, table:{}", write_metric.get_metric()), + } + .fail(); + } + } + } + + Ok(plan_vec) +} + +fn try_get_table( + ctx: &HandlerContext<'_, C, Q>, + table_name: &str, +) -> Result> { + ctx.instance + .catalog_manager + .catalog_by_name(ctx.catalog()) + .map_err(|e| Box::new(e) as _) + .with_context(|| ErrWithCause { + code: StatusCode::InternalError, + msg: format!("Failed to find catalog, catalog_name:{}", ctx.catalog()), + })? + .with_context(|| ErrNoCause { + code: StatusCode::InvalidArgument, + msg: format!("Catalog not found, catalog_name:{}", ctx.catalog()), + })? + .schema_by_name(ctx.tenant()) + .map_err(|e| Box::new(e) as _) + .with_context(|| ErrWithCause { + code: StatusCode::InternalError, + msg: format!("Failed to find tenant, tenant_name:{}", ctx.tenant()), + })? + .with_context(|| ErrNoCause { + code: StatusCode::InvalidArgument, + msg: format!("Tenant not found, tenant_name:{}", ctx.tenant()), + })? 
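+        // Catalog and schema were resolved above; finally look up the table by
+        // name. `Ok(None)` means the table does not exist yet, and the caller
+        // may auto-create it when the schema config allows that.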
+ .table_by_name(table_name) + .map_err(|e| Box::new(e) as _) + .with_context(|| ErrWithCause { + code: StatusCode::InternalError, + msg: format!("Failed to find table, table:{}", table_name), + }) +} + +async fn create_table( + ctx: &HandlerContext<'_, C, Q>, + write_metric: &WriteMetric, + request_id: RequestId, +) -> Result<()> { + let create_table_plan = grpc::write_metric_to_create_table_plan(ctx, write_metric) + .map_err(|e| Box::new(e) as _) + .with_context(|| ErrWithCause { + code: StatusCode::InternalError, + msg: format!( + "Failed to build creating table plan from metric, table:{}", + write_metric.get_metric() + ), + })?; + + debug!( + "Grpc handle create table begin, table:{}, schema: {:?}", + create_table_plan.table, create_table_plan.table_schema, + ); + let plan = Plan::Create(create_table_plan); + + let instance = &ctx.instance; + + if instance.limiter.should_limit(&plan) { + ErrNoCause { + code: StatusCode::TooManyRequests, + msg: "Create table limited by reject list", + } + .fail()?; + } + + let interpreter_ctx = InterpreterContext::builder(request_id) + // Use current ctx's catalog and tenant as default catalog and tenant + .default_catalog_and_schema(ctx.catalog().to_string(), ctx.tenant().to_string()) + .build(); + let interpreter_factory = Factory::new( + instance.query_executor.clone(), + instance.catalog_manager.clone(), + instance.table_engine.clone(), + ); + let interpreter = interpreter_factory.create(interpreter_ctx, plan); + + let _ = match interpreter + .execute() + .await + .map_err(|e| Box::new(e) as _) + .context(ErrWithCause { + code: StatusCode::InternalError, + msg: "Failed to execute interpreter", + })? { + Output::AffectedRows(n) => n, + _ => unreachable!(), + }; + + Ok(()) +} + +fn write_metric_to_insert_plan( + table: TableRef, + mut write_metric: WriteMetric, +) -> Result { + let schema = table.schema(); + + let mut rows_total = Vec::new(); + for write_entry in write_metric.take_entries() { + let mut rows = write_entry_to_rows( + write_metric.get_metric(), + &schema, + write_metric.get_tag_names(), + write_metric.get_field_names(), + write_entry, + )?; + rows_total.append(&mut rows); + } + // The row group builder will checks nullable. + let row_group = RowGroupBuilder::with_rows(schema, rows_total) + .map_err(|e| Box::new(e) as _) + .context(ErrWithCause { + code: StatusCode::InternalError, + msg: format!("Failed to build row group, table:{}", table.name()), + })? + .build(); + Ok(InsertPlan { + table, + rows: row_group, + }) +} + +fn write_entry_to_rows( + table_name: &str, + schema: &Schema, + tag_names: &[String], + field_names: &[String], + mut write_entry: WriteEntry, +) -> Result> { + // Init all columns by null. + let mut rows = vec![ + Row::from_datums(vec![Datum::Null; schema.num_columns()]); + write_entry.get_field_groups().len() + ]; + + // Fill tsid by default value. + if let Some(tsid_idx) = schema.index_of_tsid() { + let kind = &schema.tsid_column().unwrap().data_type; + let default_datum = Datum::empty(kind); + for row in &mut rows { + row[tsid_idx] = default_datum.clone(); + } + } + + // Fill tags. 
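+    // All field groups of this entry share the same tag set, so every tag
+    // value is written into the matching column of each row built above.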
+ for mut tag in write_entry.take_tags() { + let name_index = tag.name_index as usize; + ensure!( + name_index < tag_names.len(), + ErrNoCause { + code: StatusCode::InvalidArgument, + msg: format!( + "tag index {} is not found in tag_names:{:?}, table:{}", + name_index, tag_names, table_name, + ), + } + ); + + let tag_name = &tag_names[name_index]; + let tag_index_in_schema = schema.index_of(tag_name).with_context(|| ErrNoCause { + code: StatusCode::InvalidArgument, + msg: format!( + "Can't find tag in schema, table:{}, tag_name:{}", + table_name, tag_name + ), + })?; + + let column_schema = schema.column(tag_index_in_schema); + ensure!( + column_schema.is_tag, + ErrNoCause { + code: StatusCode::InvalidArgument, + msg: format!( + "column {} is a field rather than a tag, table:{}", + tag_name, table_name + ), + } + ); + + let tag_value = tag.take_value().value.with_context(|| ErrNoCause { + code: StatusCode::InvalidArgument, + msg: format!( + "Tag value is needed, table:{}, tag_name:{}", + table_name, tag_name + ), + })?; + for row in &mut rows { + row[tag_index_in_schema] = convert_proto_value_to_datum( + table_name, + tag_name, + tag_value.clone(), + column_schema.data_type, + )?; + } + } + + // Fill fields. + let mut field_name_index: HashMap = HashMap::new(); + for (i, mut field_group) in write_entry.take_field_groups().into_iter().enumerate() { + // timestamp + let timestamp_index_in_schema = schema.timestamp_index(); + rows[i][timestamp_index_in_schema] = + Datum::Timestamp(Timestamp::new(field_group.get_timestamp())); + + for mut field in field_group.take_fields() { + if (field.name_index as usize) < field_names.len() { + let field_name = &field_names[field.name_index as usize]; + let index_in_schema = if field_name_index.contains_key(field_name) { + field_name_index.get(field_name).unwrap().to_owned() + } else { + let index_in_schema = + schema.index_of(field_name).with_context(|| ErrNoCause { + code: StatusCode::InvalidArgument, + msg: format!( + "Can't find field in schema, table:{}, field_name:{}", + table_name, field_name + ), + })?; + field_name_index.insert(field_name.to_string(), index_in_schema); + index_in_schema + }; + let column_schema = schema.column(index_in_schema); + ensure!( + !column_schema.is_tag, + ErrNoCause { + code: StatusCode::InvalidArgument, + msg: format!( + "Column {} is a tag rather than a field, table:{}", + field_name, table_name + ) + } + ); + let field_value = field.take_value().value.with_context(|| ErrNoCause { + code: StatusCode::InvalidArgument, + msg: format!("Field is needed, table:{}", table_name), + })?; + + rows[i][index_in_schema] = convert_proto_value_to_datum( + table_name, + field_name, + field_value, + column_schema.data_type, + )?; + } + } + } + + Ok(rows) +} + +/// Convert the `Value_oneof_value` defined in protos into the datum. 
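+///
+/// The conversion succeeds only when the proto value variant matches the
+/// column's `DatumKind`; any mismatch is rejected as an `InvalidArgument`
+/// error carrying the table and column name.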
+fn convert_proto_value_to_datum( + table_name: &str, + name: &str, + value: Value_oneof_value, + data_type: DatumKind, +) -> Result { + match (value, data_type) { + (Value_oneof_value::float64_value(v), DatumKind::Double) => Ok(Datum::Double(v)), + (Value_oneof_value::string_value(v), DatumKind::String) => Ok(Datum::String(v.into())), + (Value_oneof_value::int64_value(v), DatumKind::Int64) => Ok(Datum::Int64(v)), + (Value_oneof_value::float32_value(v), DatumKind::Float) => Ok(Datum::Float(v)), + (Value_oneof_value::int32_value(v), DatumKind::Int32) => Ok(Datum::Int32(v)), + (Value_oneof_value::int16_value(v), DatumKind::Int16) => Ok(Datum::Int16(v as i16)), + (Value_oneof_value::int8_value(v), DatumKind::Int8) => Ok(Datum::Int8(v as i8)), + (Value_oneof_value::bool_value(v), DatumKind::Boolean) => Ok(Datum::Boolean(v)), + (Value_oneof_value::uint64_value(v), DatumKind::UInt64) => Ok(Datum::UInt64(v)), + (Value_oneof_value::uint32_value(v), DatumKind::UInt32) => Ok(Datum::UInt32(v)), + (Value_oneof_value::uint16_value(v), DatumKind::UInt16) => Ok(Datum::UInt16(v as u16)), + (Value_oneof_value::uint8_value(v), DatumKind::UInt8) => Ok(Datum::UInt8(v as u8)), + (Value_oneof_value::timestamp_value(v), DatumKind::Timestamp) => Ok(Datum::Timestamp(Timestamp::new(v))), + (Value_oneof_value::varbinary_value(v), DatumKind::Varbinary) => Ok(Datum::Varbinary(Bytes::from(v))), + (v, _) => ErrNoCause { + code: StatusCode::InvalidArgument, + msg: format!( + "Value type is not same, table:{}, value_name:{}, schema_type:{:?}, actual_value:{:?}", + table_name, + name, + data_type, + v + ), + } + .fail(), + } +} + +#[cfg(test)] +mod test { + use ceresdbproto::storage::{Field, FieldGroup, Tag, Value}; + use common_types::{ + column_schema::{self, ColumnSchema}, + schema::Builder, + }; + use system_catalog::sys_catalog_table::TIMESTAMP_COLUMN_NAME; + + use super::*; + + const TAG_K: &str = "tagk"; + const TAG_V: &str = "tagv"; + const TAG_K1: &str = "tagk1"; + const TAG_V1: &str = "tagv1"; + const FIELD_NAME: &str = "field"; + const FIELD_NAME1: &str = "field1"; + const FIELD_VALUE_STRING: &str = "stringValue"; + + // tag_names field_names write_entry + fn generate_write_entry() -> (Schema, Vec, Vec, WriteEntry) { + let tag_names = vec![TAG_K.to_string(), TAG_K1.to_string()]; + let field_names = vec![FIELD_NAME.to_string(), FIELD_NAME1.to_string()]; + + let mut tag = Tag::new(); + tag.set_name_index(0); + let mut tag_val = Value::new(); + tag_val.set_string_value(TAG_V.to_string()); + tag.set_value(tag_val); + + let mut tag1 = Tag::new(); + tag1.set_name_index(1); + let mut tag_val1 = Value::new(); + tag_val1.set_string_value(TAG_V1.to_string()); + tag1.set_value(tag_val1); + let tags = vec![tag, tag1]; + + let mut field = Field::new(); + field.set_name_index(0); + let mut field_val = Value::new(); + field_val.set_float64_value(100.0); + field.set_value(field_val); + let mut field1 = Field::new(); + field1.set_name_index(1); + let mut field_val1 = Value::new(); + field_val1.set_string_value(FIELD_VALUE_STRING.to_string()); + field1.set_value(field_val1); + let mut field_group = FieldGroup::new(); + field_group.set_timestamp(1000); + field_group.set_fields(vec![field].into()); + + let mut field_group1 = FieldGroup::new(); + field_group1.set_timestamp(2000); + field_group1.set_fields(vec![field1.clone()].into()); + + let mut field_group2 = FieldGroup::new(); + field_group2.set_timestamp(3000); + field_group2.set_fields(vec![field1].into()); + + let mut write_entry = WriteEntry::new(); + + 
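+        // Attach the tags and the three field groups built above to the entry.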
write_entry.set_tags(tags.into()); + + write_entry.set_field_groups(vec![field_group, field_group1, field_group2].into()); + + let schema_builder = Builder::new(); + let schema = schema_builder + .auto_increment_column_id(true) + .add_key_column(ColumnSchema { + id: column_schema::COLUMN_ID_UNINIT, + name: TIMESTAMP_COLUMN_NAME.to_string(), + data_type: DatumKind::Timestamp, + is_nullable: false, + is_tag: false, + comment: String::new(), + }) + .unwrap() + .add_key_column(ColumnSchema { + id: column_schema::COLUMN_ID_UNINIT, + name: TAG_K.to_string(), + data_type: DatumKind::String, + is_nullable: false, + is_tag: true, + comment: String::new(), + }) + .unwrap() + .add_normal_column(ColumnSchema { + id: column_schema::COLUMN_ID_UNINIT, + name: TAG_K1.to_string(), + data_type: DatumKind::String, + is_nullable: false, + is_tag: true, + comment: String::new(), + }) + .unwrap() + .add_normal_column(ColumnSchema { + id: column_schema::COLUMN_ID_UNINIT, + name: FIELD_NAME.to_string(), + data_type: DatumKind::Double, + is_nullable: true, + is_tag: false, + comment: String::new(), + }) + .unwrap() + .add_normal_column(ColumnSchema { + id: column_schema::COLUMN_ID_UNINIT, + name: FIELD_NAME1.to_string(), + data_type: DatumKind::String, + is_nullable: true, + is_tag: false, + comment: String::new(), + }) + .unwrap() + .build() + .unwrap(); + (schema, tag_names, field_names, write_entry) + } + + #[test] + fn test_write_entry_to_row_group() { + let (schema, tag_names, field_names, write_entry) = generate_write_entry(); + let rows = + write_entry_to_rows("test_table", &schema, &tag_names, &field_names, write_entry) + .unwrap(); + let row0 = vec![ + Datum::Timestamp(Timestamp::new(1000)), + Datum::String(TAG_V.into()), + Datum::String(TAG_V1.into()), + Datum::Double(100.0), + Datum::Null, + ]; + let row1 = vec![ + Datum::Timestamp(Timestamp::new(2000)), + Datum::String(TAG_V.into()), + Datum::String(TAG_V1.into()), + Datum::Null, + Datum::String(FIELD_VALUE_STRING.into()), + ]; + let row2 = vec![ + Datum::Timestamp(Timestamp::new(3000)), + Datum::String(TAG_V.into()), + Datum::String(TAG_V1.into()), + Datum::Null, + Datum::String(FIELD_VALUE_STRING.into()), + ]; + + let expect_rows = vec![ + Row::from_datums(row0), + Row::from_datums(row1), + Row::from_datums(row2), + ]; + assert_eq!(rows, expect_rows); + } +} diff --git a/server/src/handlers/admin.rs b/server/src/handlers/admin.rs new file mode 100644 index 0000000000..1779e917c6 --- /dev/null +++ b/server/src/handlers/admin.rs @@ -0,0 +1,71 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
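+
+//! Admin request handlers, currently for managing the read/write reject lists.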
+ +use std::collections::BTreeSet; + +use crate::handlers::prelude::*; + +#[derive(Debug, Deserialize)] +pub enum Operation { + Add, + Set, + Remove, +} + +#[derive(Debug, Deserialize)] +pub struct RejectRequest { + operation: Operation, + write_reject_list: Vec, + read_reject_list: Vec, +} + +#[derive(Serialize)] +pub struct RejectResponse { + write_reject_list: BTreeSet, + read_reject_list: BTreeSet, +} + +pub async fn handle_reject( + _ctx: RequestContext, + instance: InstanceRef, + request: RejectRequest, +) -> Result { + match request.operation { + Operation::Add => { + instance + .limiter + .add_write_reject_list(request.write_reject_list); + instance + .limiter + .add_read_reject_list(request.read_reject_list); + } + Operation::Set => { + instance + .limiter + .set_write_reject_list(request.write_reject_list); + instance + .limiter + .set_read_reject_list(request.read_reject_list); + } + Operation::Remove => { + instance + .limiter + .remove_write_reject_list(request.write_reject_list); + instance + .limiter + .remove_read_reject_list(request.read_reject_list); + } + } + + Ok(RejectResponse { + write_reject_list: instance + .limiter + .get_write_reject_list() + .into_iter() + .collect::>(), + read_reject_list: instance + .limiter + .get_read_reject_list() + .into_iter() + .collect::>(), + }) +} diff --git a/server/src/handlers/error.rs b/server/src/handlers/error.rs new file mode 100644 index 0000000000..0d781f2560 --- /dev/null +++ b/server/src/handlers/error.rs @@ -0,0 +1,52 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Error of handlers + +use snafu::{Backtrace, Snafu}; + +// TODO(yingwen): Avoid printing huge sql string +// TODO(yingwen): Maybe add an error type to sql sub mod +#[derive(Debug, Snafu)] +#[snafu(visibility(pub(crate)))] +pub enum Error { + #[snafu(display("Failed to parse sql, err:{}", source))] + ParseSql { source: sql::frontend::Error }, + + #[snafu(display("Failed to create plan, query:{}, err:{}", query, source))] + CreatePlan { + query: String, + source: sql::frontend::Error, + }, + + #[snafu(display( + "Only support execute one statement now, current num:{}, query:{}.\nBacktrace:\n{}", + len, + query, + backtrace, + ))] + TooMuchStmt { + len: usize, + query: String, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to execute interpreter, query:{}, err:{}", query, source))] + InterpreterExec { + query: String, + source: interpreters::interpreter::Error, + }, + + #[snafu(display( + "Failed to convert arrow to string, query:{}, err:{}.\nBacktrace:\n{}", + query, + source, + backtrace + ))] + ArrowToString { + query: String, + source: arrow_deps::arrow::error::ArrowError, + backtrace: Backtrace, + }, +} + +define_result!(Error); diff --git a/server/src/handlers/mod.rs b/server/src/handlers/mod.rs new file mode 100644 index 0000000000..e695b3b610 --- /dev/null +++ b/server/src/handlers/mod.rs @@ -0,0 +1,21 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Request handlers + +pub mod admin; +pub mod error; +pub mod sql; + +mod prelude { + pub use catalog::manager::Manager as CatalogManager; + pub use query_engine::executor::Executor as QueryExecutor; + pub use serde_derive::{Deserialize, Serialize}; + pub use snafu::ResultExt; + pub use warp::Filter; + + pub use crate::{ + context::RequestContext, + handlers::error::{Error, Result}, + instance::InstanceRef, + }; +} diff --git a/server/src/handlers/sql.rs b/server/src/handlers/sql.rs new file mode 100644 index 0000000000..1fa96b1d54 --- /dev/null +++ b/server/src/handlers/sql.rs @@ -0,0 +1,148 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! SQL request handler + +use std::collections::HashMap; + +use arrow_deps::arrow::error::Result as ArrowResult; +use common_types::{datum::Datum, request_id::RequestId}; +use interpreters::{context::Context as InterpreterContext, factory::Factory, interpreter::Output}; +use log::info; +use query_engine::executor::RecordBatchVec; +use serde_derive::Serialize; +use snafu::ensure; +use sql::{ + frontend::{Context as SqlContext, Frontend}, + provider::CatalogMetaProvider, +}; + +use crate::handlers::{ + error::{ArrowToString, CreatePlan, InterpreterExec, ParseSql, TooMuchStmt}, + prelude::*, +}; + +#[derive(Debug, Deserialize)] +pub struct Request { + query: String, +} + +// TODO(yingwen): Improve serialize performance +#[derive(Serialize)] +#[serde(rename_all = "snake_case")] +pub enum Response { + AffectedRows(usize), + Rows(Vec>), +} + +pub async fn handle_sql( + ctx: RequestContext, + instance: InstanceRef, + request: Request, +) -> Result { + let request_id = RequestId::next_id(); + + info!( + "sql handler try to process request, request_id:{}, request:{:?}", + request_id, request + ); + + // We use tenant as schema + // TODO(yingwen): Privilege check, cannot access data of other tenant + // TODO(yingwen): Maybe move MetaProvider to instance + let provider = CatalogMetaProvider { + manager: &instance.catalog_manager, + default_catalog: &ctx.catalog, + default_schema: &ctx.tenant, + function_registry: &*instance.function_registry, + }; + let frontend = Frontend::new(provider); + + let mut sql_ctx = SqlContext::new(request_id); + // Parse sql, frontend error of invalid sql already contains sql + // TODO(yingwen): Maybe move sql from frontend error to outer error + let mut stmts = frontend + .parse_sql(&mut sql_ctx, &request.query) + .context(ParseSql)?; + + if stmts.is_empty() { + return Ok(Response::AffectedRows(0)); + } + + // TODO(yingwen): For simplicity, we only support executing one statement now + // TODO(yingwen): INSERT/UPDATE/DELETE can be batched + ensure!( + stmts.len() == 1, + TooMuchStmt { + len: stmts.len(), + query: request.query, + } + ); + + // Create logical plan + // Note: Remember to store sql in error when creating logical plan + let plan = frontend + .statement_to_plan(&mut sql_ctx, stmts.remove(0)) + .context(CreatePlan { + query: &request.query, + })?; + + // Execute in interpreter + let interpreter_ctx = InterpreterContext::builder(request_id) + // Use current ctx's catalog and tenant as default catalog and tenant + .default_catalog_and_schema(ctx.catalog, ctx.tenant) + .build(); + let interpreter_factory = Factory::new( + instance.query_executor.clone(), + instance.catalog_manager.clone(), + instance.table_engine.clone(), + ); + let interpreter = interpreter_factory.create(interpreter_ctx, plan); + + let output = interpreter.execute().await.context(InterpreterExec { + query: &request.query, + })?; + 
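// At this point `output` is either Output::AffectedRows(n) (from DML/DDL) or
// Output::Records(record_batches) (from queries); convert_output below maps
// each record batch row into a column-name -> Datum map so the whole result
// can be serialized to JSON.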
+ // Convert output to json + let resp = convert_output(output).context(ArrowToString { + query: &request.query, + })?; + + info!( + "sql handler finished processing request, request:{:?}", + request + ); + + Ok(resp) +} + +fn convert_output(output: Output) -> ArrowResult { + match output { + Output::AffectedRows(n) => Ok(Response::AffectedRows(n)), + Output::Records(records) => convert_records(records), + } +} + +fn convert_records(records: RecordBatchVec) -> ArrowResult { + let total_rows = records.iter().map(|v| v.num_rows()).sum(); + let mut resp = Vec::with_capacity(total_rows); + for record_batch in records { + let num_cols = record_batch.num_columns(); + let num_rows = record_batch.num_rows(); + let schema = record_batch.schema(); + + for row_idx in 0..num_rows { + let mut row = HashMap::with_capacity(num_cols); + for col_idx in 0..num_cols { + let column = record_batch.column(col_idx); + let column = column.datum(row_idx); + + let column_name = schema.column(col_idx).name.clone(); + row.insert(column_name, column); + } + + resp.push(row); + } + } + + Ok(Response::Rows(resp)) +} diff --git a/server/src/http.rs b/server/src/http.rs new file mode 100644 index 0000000000..7318d60433 --- /dev/null +++ b/server/src/http.rs @@ -0,0 +1,341 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Http service + +use std::{convert::Infallible, net::IpAddr, sync::Arc}; + +use catalog::manager::Manager as CatalogManager; +use log::error; +use profile::Profiler; +use query_engine::executor::Executor as QueryExecutor; +use serde_derive::Serialize; +use snafu::{Backtrace, OptionExt, ResultExt, Snafu}; +use table_engine::engine::EngineRuntimes; +use tokio::sync::oneshot::{self, Sender}; +use warp::{ + header, + http::StatusCode, + reject, + reply::{self, Reply}, + Filter, +}; + +use crate::{consts, context::RequestContext, error, handlers, instance::InstanceRef, metrics}; + +#[derive(Debug)] +pub struct Config { + pub ip: String, + pub port: u16, +} + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to create request context, err:{}", source))] + CreateContext { source: crate::context::Error }, + + #[snafu(display("Failed to handle request, err:{}", source))] + HandleRequest { + source: crate::handlers::error::Error, + }, + + #[snafu(display("Missing runtimes to build service.\nBacktrace:\n{}", backtrace))] + MissingRuntimes { backtrace: Backtrace }, + + #[snafu(display("Missing instance to build service.\nBacktrace:\n{}", backtrace))] + MissingInstance { backtrace: Backtrace }, + + #[snafu(display( + "Fail to do heap profiling, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + ProfileHeap { + source: profile::Error, + backtrace: Backtrace, + }, + + #[snafu(display("Fail to join async task, err:{}.", source))] + JoinAsyncTask { source: common_util::runtime::Error }, + + #[snafu(display( + "Failed to parse ip addr, ip:{}, err:{}.\nBacktrace:\n{}", + ip, + source, + backtrace + ))] + ParseIpAddr { + ip: String, + source: std::net::AddrParseError, + backtrace: Backtrace, + }, +} + +define_result!(Error); + +impl reject::Reject for Error {} + +/// Http service +/// +/// Note that the service does not owns the runtime +pub struct Service { + runtimes: Arc, + instance: InstanceRef, + profiler: Arc, + tx: Sender<()>, +} + +impl Service { + // TODO(yingwen): Maybe log error or return error + pub fn stop(self) { + let _ = self.tx.send(()); + } +} + +// TODO(yingwen): How to support non json response? 
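For reference, the `/sql` endpoint serializes the handler's `Response` enum with serde's default externally tagged representation and snake_case variant names, so a write reports `{"affected_rows": N}` and a query reports `{"rows": [...]}`. A minimal sketch of that expectation (illustrative only, assuming `serde_json` is available in tests):

#[test]
fn sql_response_json_shape() {
    // Hypothetical check of the wire format produced by handlers::sql::Response.
    let affected = crate::handlers::sql::Response::AffectedRows(3);
    assert_eq!(
        serde_json::to_string(&affected).unwrap(),
        r#"{"affected_rows":3}"#
    );
}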
+impl Service { + fn routes(&self) -> impl Filter + Clone { + self.home() + .or(self.metrics()) + .or(self.sql()) + .or(self.heap_profile()) + .or(self.admin_reject()) + } + + fn home(&self) -> impl Filter + Clone { + warp::path::end().and(warp::get()).map(|| { + use std::collections::HashMap; + let mut resp = HashMap::new(); + resp.insert("status", "ok"); + reply::json(&resp) + }) + } + + // TODO(yingwen): Avoid boilerplate code if there are more handlers + fn sql(&self) -> impl Filter + Clone { + warp::path!("sql") + .and(warp::post()) + // TODO(yingwen): content length limit + .and(warp::body::json()) + .and(self.with_context()) + .and(self.with_instance()) + .and_then(|req, ctx, instance| async { + // TODO(yingwen): Wrap common logic such as metrics, trace and error log + let result = handlers::sql::handle_sql(ctx, instance, req) + .await + .map_err(|e| { + // TODO(yingwen): Maybe truncate and print the sql + error!("Http service failed to handle sql, err:{}", e); + e + }) + .context(HandleRequest); + match result { + Ok(res) => Ok(reply::json(&res)), + Err(e) => Err(reject::custom(e)), + } + }) + } + + fn metrics(&self) -> impl Filter + Clone { + warp::path!("metrics").and(warp::get()).map(metrics::dump) + } + + fn heap_profile( + &self, + ) -> impl Filter + Clone { + warp::path!("debug" / "heap_profile" / ..) + .and(warp::path::param::()) + .and(warp::get()) + .and(self.with_context()) + .and(self.with_profiler()) + .and_then( + |duration_sec: u64, ctx: RequestContext, profiler: Arc| async move { + let handle = ctx.runtime.spawn_blocking(move || { + profiler.dump_mem_prof(duration_sec).context(ProfileHeap) + }); + let result = handle.await.context(JoinAsyncTask); + match result { + Ok(Ok(prof_data)) => Ok(prof_data.into_response()), + Ok(Err(e)) => Err(reject::custom(e)), + Err(e) => Err(reject::custom(e)), + } + }, + ) + } + + fn with_context( + &self, + ) -> impl Filter + Clone { + let default_catalog = self + .instance + .catalog_manager + .default_catalog_name() + .to_string(); + let default_schema = self + .instance + .catalog_manager + .default_schema_name() + .to_string(); + // TODO(boyan): use read/write runtime by sql type.
+ let runtime = self.runtimes.bg_runtime.clone(); + + header::optional::(consts::CATALOG_HEADER) + .and(header::optional::(consts::TENANT_HEADER)) + .and_then(move |catalog: Option<_>, tenant: Option<_>| { + // Clone the captured variables + let default_catalog = default_catalog.clone(); + let default_schema = default_schema.clone(); + let runtime = runtime.clone(); + async { + RequestContext::builder() + .catalog(catalog.unwrap_or(default_catalog)) + .tenant(tenant.unwrap_or(default_schema)) + .runtime(runtime) + .build() + .context(CreateContext) + .map_err(reject::custom) + } + }) + } + + fn with_profiler(&self) -> impl Filter,), Error = Infallible> + Clone { + let profiler = self.profiler.clone(); + warp::any().map(move || profiler.clone()) + } + + fn with_instance( + &self, + ) -> impl Filter,), Error = Infallible> + Clone { + let instance = self.instance.clone(); + warp::any().map(move || instance.clone()) + } + + fn admin_reject( + &self, + ) -> impl Filter + Clone { + warp::path!("reject") + .and(warp::post()) + .and(warp::body::json()) + .and(self.with_context()) + .and(self.with_instance()) + .and_then(|req, ctx, instance| async { + let result = handlers::admin::handle_reject(ctx, instance, req) + .await + .map_err(|e| { + error!("Http service failed to handle admin reject, err:{}", e); + e + }) + .context(HandleRequest); + + match result { + Ok(res) => Ok(reply::json(&res)), + Err(e) => Err(reject::custom(e)), + } + }) + } +} + +/// Service builder +pub struct Builder { + config: Config, + runtimes: Option>, + instance: Option>, +} + +impl Builder { + pub fn new(config: Config) -> Self { + Self { + config, + runtimes: None, + instance: None, + } + } + + pub fn runtimes(mut self, runtimes: Arc) -> Self { + self.runtimes = Some(runtimes); + self + } + + pub fn instance(mut self, instance: InstanceRef) -> Self { + self.instance = Some(instance); + self + } +} + +impl Builder { + /// Build and start the service + pub fn build(self) -> Result> { + let runtimes = self.runtimes.context(MissingRuntimes)?; + let instance = self.instance.context(MissingInstance)?; + let (tx, rx) = oneshot::channel(); + + let service = Service { + runtimes: runtimes.clone(), + instance, + profiler: Arc::new(Profiler::default()), + tx, + }; + + let ip_addr: IpAddr = self + .config + .ip + .parse() + .context(ParseIpAddr { ip: self.config.ip })?; + + // Register filters to warp and rejection handler + let routes = service.routes().recover(handle_rejection); + let (_addr, server) = + warp::serve(routes).bind_with_graceful_shutdown((ip_addr, self.config.port), async { + rx.await.ok(); + }); + // Run the service + runtimes.bg_runtime.spawn(server); + + Ok(service) + } +} + +#[derive(Debug, Serialize)] +struct ErrorResponse { + code: u16, + message: String, +} + +fn error_to_status_code(err: &Error) -> StatusCode { + match err { + Error::CreateContext { .. } => StatusCode::BAD_REQUEST, + // TODO(yingwen): Map handle request error to more accurate status code + Error::HandleRequest { .. } + | Error::MissingRuntimes { .. } + | Error::MissingInstance { .. } + | Error::ParseIpAddr { .. } + | Error::ProfileHeap { .. } + | Error::JoinAsyncTask { .. 
} => StatusCode::INTERNAL_SERVER_ERROR, + } +} + +async fn handle_rejection( + rejection: warp::Rejection, +) -> std::result::Result { + let code; + let message; + + if rejection.is_not_found() { + code = StatusCode::NOT_FOUND; + message = String::from("NOT_FOUND"); + } else if let Some(err) = rejection.find() { + code = error_to_status_code(err); + let err_string = err.to_string(); + message = error::first_line_in_error(&err_string).to_string(); + } else { + error!("handle error: {:?}", rejection); + code = StatusCode::INTERNAL_SERVER_ERROR; + message = format!("UNKNOWN_ERROR: {:?}", rejection); + } + + let json = reply::json(&ErrorResponse { + code: code.as_u16(), + message, + }); + + Ok(reply::with_status(json, code)) +} diff --git a/server/src/instance.rs b/server/src/instance.rs new file mode 100644 index 0000000000..64d3ada775 --- /dev/null +++ b/server/src/instance.rs @@ -0,0 +1,26 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Instance contains shared states of service + +use std::sync::Arc; + +use table_engine::engine::TableEngineRef; +use udf::registry::FunctionRegistryRef; + +use crate::limiter::Limiter; + +/// A cluster instance. Usually there is only one instance per cluster +/// +/// C: catalog::manager::Manager +/// Q: query_engine::executor::Executor +pub struct Instance { + pub catalog_manager: C, + pub query_executor: Q, + pub table_engine: TableEngineRef, + // User defined functions registry. + pub function_registry: FunctionRegistryRef, + pub limiter: Limiter, +} + +/// A reference counted instance pointer +pub type InstanceRef = Arc>; diff --git a/server/src/lib.rs b/server/src/lib.rs new file mode 100644 index 0000000000..122735a07f --- /dev/null +++ b/server/src/lib.rs @@ -0,0 +1,25 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Rpc server + +// TODO(yingwen): +// Borrow some ideas from tikv: https://github.com/tikv/tikv/blob/dc8ce2cf6a8904cb3dad556f71b11bac3531689b/src/server/service/kv.rs#L51 + +#[macro_use] +extern crate common_util; + +mod avro_util; +pub mod config; +mod consts; +mod context; +mod error; +mod grpc; +mod handlers; +mod http; +mod instance; +pub mod limiter; +pub mod logger; +mod metrics; +mod router; +pub mod server; +pub mod table_engine; diff --git a/server/src/limiter.rs b/server/src/limiter.rs new file mode 100644 index 0000000000..f594b2028b --- /dev/null +++ b/server/src/limiter.rs @@ -0,0 +1,194 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
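The limiter defined below keeps per-table read and write reject lists behind RwLocks; request paths are expected to consult it before executing a plan. A minimal sketch of such a call site (a hypothetical helper, not part of the patch, with error handling reduced to a string):

use sql::plan::Plan;

// Hypothetical guard: refuse to run a plan that touches a rejected table.
fn check_plan_allowed(limiter: &Limiter, plan: &Plan) -> Result<(), String> {
    if limiter.should_limit(plan) {
        return Err("rejected: the target table is on the reject list".to_string());
    }
    Ok(())
}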
+ +use std::{collections::HashSet, sync::RwLock}; + +use arrow_deps::datafusion::catalog::TableReference; +use sql::plan::Plan; + +pub struct Limiter { + write_reject_list: RwLock>, + read_reject_list: RwLock>, +} + +impl Default for Limiter { + fn default() -> Self { + Self { + write_reject_list: RwLock::new(HashSet::new()), + read_reject_list: RwLock::new(HashSet::new()), + } + } +} + +impl Limiter { + pub fn should_limit(&self, plan: &Plan) -> bool { + match plan { + Plan::Query(query) => { + let read_reject_list = self.read_reject_list.read().unwrap().clone(); + for table in read_reject_list { + if query + .tables + .get(TableReference::from(table.as_str())) + .is_some() + { + return true; + } + } + false + } + Plan::Insert(insert) => self + .write_reject_list + .read() + .unwrap() + .contains(insert.table.name()), + _ => false, + } + } + + pub fn add_write_reject_list(&self, reject_list: Vec) { + self.write_reject_list + .write() + .unwrap() + .extend(reject_list.into_iter()) + } + + pub fn add_read_reject_list(&self, reject_list: Vec) { + self.read_reject_list + .write() + .unwrap() + .extend(reject_list.into_iter()) + } + + pub fn set_write_reject_list(&self, reject_list: Vec) { + *self.write_reject_list.write().unwrap() = reject_list.into_iter().collect(); + } + + pub fn set_read_reject_list(&self, reject_list: Vec) { + *self.read_reject_list.write().unwrap() = reject_list.into_iter().collect(); + } + + pub fn get_write_reject_list(&self) -> HashSet { + self.write_reject_list.write().unwrap().clone() + } + + pub fn get_read_reject_list(&self) -> HashSet { + self.read_reject_list.write().unwrap().clone() + } + + pub fn remove_write_reject_list(&self, reject_list: Vec) { + let mut write_reject_list = self.write_reject_list.write().unwrap(); + for value in reject_list { + write_reject_list.remove(&value); + } + } + + pub fn remove_read_reject_list(&self, reject_list: Vec) { + let mut read_reject_list = self.read_reject_list.write().unwrap(); + for value in reject_list { + read_reject_list.remove(&value); + } + } +} + +#[cfg(test)] +mod tests { + use common_types::request_id::RequestId; + use sql::{parser::Parser, plan::Plan, planner::Planner, tests::MockMetaProvider}; + + use crate::limiter::Limiter; + + fn sql_to_plan(meta_provider: &MockMetaProvider, sql: &str) -> Plan { + let planner = Planner::new(meta_provider, RequestId::next_id(), 1); + let mut statements = Parser::parse_sql(sql).unwrap(); + assert_eq!(statements.len(), 1); + planner.statement_to_plan(statements.remove(0)).unwrap() + } + + fn prepare() -> (MockMetaProvider, Limiter) { + let mock = MockMetaProvider::default(); + + let reject_list = vec!["test_table".to_string()]; + let limiter = Limiter::default(); + limiter.set_read_reject_list(reject_list.clone()); + limiter.set_write_reject_list(reject_list); + (mock, limiter) + } + + #[test] + fn test_limiter() { + let (mock, limiter) = prepare(); + let query = "select * from test_table"; + let query_plan = sql_to_plan(&mock, query); + assert!(limiter.should_limit(&query_plan)); + + let insert="INSERT INTO test_table(key1, key2, field1,field2) VALUES('tagk', 1638428434000,100, 'hello3')"; + let insert_plan = sql_to_plan(&mock, insert); + assert!(limiter.should_limit(&insert_plan)); + } + + #[test] + fn test_limiter_remove() { + let (mock, limiter) = prepare(); + let test_data = vec!["test_table".to_string()]; + + let query = "select * from test_table"; + let query_plan = sql_to_plan(&mock, query); + assert!(limiter.should_limit(&query_plan)); + + let insert="INSERT INTO 
test_table(key1, key2, field1,field2) VALUES('tagk', 1638428434000,100, 'hello3')"; + let insert_plan = sql_to_plan(&mock, insert); + assert!(limiter.should_limit(&insert_plan)); + + limiter.remove_write_reject_list(test_data.clone()); + limiter.remove_read_reject_list(test_data); + assert!(!limiter.should_limit(&query_plan)); + assert!(!limiter.should_limit(&insert_plan)); + } + + #[test] + fn test_limiter_add() { + let (mock, limiter) = prepare(); + let test_data = vec!["test_table2".to_string()]; + + let query = "select * from test_table2"; + let query_plan = sql_to_plan(&mock, query); + assert!(!limiter.should_limit(&query_plan)); + + let insert="INSERT INTO test_table2(key1, key2, field1,field2) VALUES('tagk', 1638428434000,100, 'hello3')"; + let insert_plan = sql_to_plan(&mock, insert); + assert!(!limiter.should_limit(&insert_plan)); + + limiter.add_write_reject_list(test_data.clone()); + limiter.add_read_reject_list(test_data); + assert!(limiter.should_limit(&query_plan)); + assert!(limiter.should_limit(&insert_plan)); + } + + #[test] + fn test_limiter_set() { + let (mock, limiter) = prepare(); + let test_data = vec!["test_table2".to_string()]; + + let query = "select * from test_table"; + let query_plan = sql_to_plan(&mock, query); + assert!(limiter.should_limit(&query_plan)); + + let query2 = "select * from test_table2"; + let query_plan2 = sql_to_plan(&mock, query2); + assert!(!limiter.should_limit(&query_plan2)); + + let insert="INSERT INTO test_table(key1, key2, field1,field2) VALUES('tagk', 1638428434000,100, 'hello3')"; + let insert_plan = sql_to_plan(&mock, insert); + assert!(limiter.should_limit(&insert_plan)); + + let insert2="INSERT INTO test_table2(key1, key2, field1,field2) VALUES('tagk', 1638428434000,100, 'hello3')"; + let insert_plan2 = sql_to_plan(&mock, insert2); + assert!(!limiter.should_limit(&insert_plan2)); + + limiter.set_read_reject_list(test_data.clone()); + limiter.set_write_reject_list(test_data); + assert!(!limiter.should_limit(&query_plan)); + assert!(!limiter.should_limit(&insert_plan)); + assert!(limiter.should_limit(&query_plan2)); + assert!(limiter.should_limit(&insert_plan2)); + } +} diff --git a/server/src/logger.rs b/server/src/logger.rs new file mode 100644 index 0000000000..a05ecd44ec --- /dev/null +++ b/server/src/logger.rs @@ -0,0 +1,32 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::str::FromStr; + +use log::SetLoggerError; +use logger::{Level, LogDispatcher, RuntimeLevel}; + +use crate::config::Config; + +pub fn init_log(config: &Config) -> Result { + let level = match Level::from_str(&config.log_level) { + Ok(v) => v, + Err(e) => { + panic!( + "Parse log level failed, level: {}, err: {:?}", + &config.log_level, e + ); + } + }; + + let term_drain = logger::term_drainer(); + let drain = LogDispatcher::new(term_drain); + + // Use async and init stdlog + logger::init_log( + drain, + level, + config.enable_async_log, + config.async_log_channel_len, + true, + ) +} diff --git a/server/src/metrics.rs b/server/src/metrics.rs new file mode 100644 index 0000000000..89dd08fdbd --- /dev/null +++ b/server/src/metrics.rs @@ -0,0 +1,19 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Metrics util for server. + +use log::warn; +use prometheus::{Encoder, TextEncoder}; + +/// Gather and dump prometheus to string. 
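///
/// Each metric family is encoded independently: if encoding one family fails,
/// a warning is logged and the remaining families are still included in the
/// output string.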
+pub fn dump() -> String { + let mut buffer = vec![]; + let encoder = TextEncoder::new(); + let metric_families = prometheus::gather(); + for mf in metric_families { + if let Err(e) = encoder.encode(&[mf], &mut buffer) { + warn!("prometheus encoding error, err:{}", e); + } + } + String::from_utf8(buffer).unwrap() +} diff --git a/server/src/router.rs b/server/src/router.rs new file mode 100644 index 0000000000..aa687c714b --- /dev/null +++ b/server/src/router.rs @@ -0,0 +1,196 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::{ + collections::HashMap, + hash::{Hash, Hasher}, + sync::Arc, +}; + +use ceresdbproto::storage::{Endpoint, Route, RouteRequest}; +use log::info; +use meta_client::{MetaClient, ShardId}; +use serde_derive::Deserialize; +use twox_hash::XxHash64; + +use crate::error::{ErrNoCause, Result, StatusCode}; + +/// Hash seed to build hasher. Modify the seed will result in different route +/// result! +const HASH_SEED: u64 = 0; + +pub type RouterRef = Arc; + +pub trait Router { + fn route(&self, schema: &str, req: RouteRequest) -> Result>; +} + +#[derive(Debug, Deserialize)] +pub struct PrefixRule { + /// Schema name of the prefix. + pub schema: String, + /// Prefix of the table name. + pub prefix: String, + /// The shard of matched tables. + pub shard: ShardId, +} + +#[derive(Debug, Deserialize)] +pub struct HashRule { + /// Schema name of the prefix. + pub schema: String, + /// The shard list for hash rule. + pub shards: Vec, +} + +#[derive(Debug, Default, Deserialize)] +pub struct RuleList { + pub prefix_rules: Vec, + pub hash_rules: Vec, +} + +impl RuleList { + pub fn split_by_schema(self) -> SchemaRules { + let mut schema_rules = HashMap::new(); + + for rule in self.prefix_rules { + let rule_list = match schema_rules.get_mut(&rule.schema) { + Some(v) => v, + None => schema_rules + .entry(rule.schema.clone()) + .or_insert_with(RuleList::default), + }; + + rule_list.prefix_rules.push(rule); + } + + for rule in self.hash_rules { + let rule_list = match schema_rules.get_mut(&rule.schema) { + Some(v) => v, + None => schema_rules + .entry(rule.schema.clone()) + .or_insert_with(RuleList::default), + }; + + rule_list.hash_rules.push(rule); + } + + schema_rules + } +} + +// Schema -> Rule list of the schema. 
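// As an illustration (not part of the patch), a rule list like the following
// TOML would deserialize into the RuleList defined above, assuming the TOML
// config format used elsewhere in this repository:
//
//   [[prefix_rules]]
//   schema = "public"
//   prefix = "cpu"
//   shard = 0
//
//   [[hash_rules]]
//   schema = "public"
//   shards = [0, 1, 2, 3]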
+type SchemaRules = HashMap; + +pub struct RuleBasedRouter { + meta_client: Arc, + schema_rules: SchemaRules, +} + +impl RuleBasedRouter { + pub fn new(meta_client: Arc, rules: RuleList) -> Self { + let schema_rules = rules.split_by_schema(); + + info!("RuleBasedRouter init with rules, rules:{:?}", schema_rules); + + Self { + meta_client, + schema_rules, + } + } + + fn maybe_route_by_rule(metric: &str, rule_list: &RuleList) -> Option { + for prefix_rule in &rule_list.prefix_rules { + if metric.starts_with(&prefix_rule.prefix) { + return Some(prefix_rule.shard); + } + } + + if let Some(hash_rule) = rule_list.hash_rules.get(0) { + let total_shards = hash_rule.shards.len(); + let hash_value = hash_metric(metric); + let index = hash_value as usize % total_shards; + + return Some(hash_rule.shards[index]); + } + + None + } + + #[inline] + fn route_by_hash(metric: &str, total_shards: usize) -> ShardId { + let hash_value = hash_metric(metric); + (hash_value as usize % total_shards) as ShardId + } + + fn route_metric( + metric: &str, + rule_list_opt: Option<&RuleList>, + total_shards: usize, + ) -> ShardId { + if let Some(rule_list) = rule_list_opt { + if let Some(shard_id) = Self::maybe_route_by_rule(metric, rule_list) { + return shard_id; + } + } + + // Fallback to hash route rule. + Self::route_by_hash(metric, total_shards) + } +} + +impl Router for RuleBasedRouter { + fn route(&self, schema: &str, req: RouteRequest) -> Result> { + let cluster_view = self.meta_client.get_cluster_view(); + if let Some(shard_view_map) = cluster_view.schema_shards.get(schema) { + if shard_view_map.is_empty() { + return ErrNoCause { + code: StatusCode::NotFound, + msg: "shards from meta is empty", + } + .fail(); + } + + // Get rule list of this schema. + let rule_list_opt = self.schema_rules.get(schema); + + // TODO(yingwen): Better way to get total shard number + let total_shards = shard_view_map.len(); + let mut route_vec = Vec::with_capacity(req.metrics.len()); + for metric in req.metrics { + let mut route = Route::new(); + route.set_metric(metric); + + let shard_id = Self::route_metric(route.get_metric(), rule_list_opt, total_shards); + + let mut endpoint = Endpoint::new(); + if let Some(shard_view) = shard_view_map.get(&shard_id) { + let node = &shard_view.node; + endpoint.set_ip(node.addr.clone()); + endpoint.set_port(node.port); + } else { + return ErrNoCause { + code: StatusCode::NotFound, + msg: format!( + "Shard not found, metric:{}, shard_id:{}", + route.get_metric(), + shard_id + ), + } + .fail(); + } + + route.set_endpoint(endpoint); + route_vec.push(route); + } + return Ok(route_vec); + } + + Ok(Vec::new()) + } +} + +fn hash_metric(metric: &str) -> u64 { + let mut hasher = XxHash64::with_seed(HASH_SEED); + metric.hash(&mut hasher); + hasher.finish() +} diff --git a/server/src/server.rs b/server/src/server.rs new file mode 100644 index 0000000000..90e5a999b9 --- /dev/null +++ b/server/src/server.rs @@ -0,0 +1,180 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Server + +use std::sync::Arc; + +use catalog::manager::Manager as CatalogManager; +use grpcio::Environment; +use query_engine::executor::Executor as QueryExecutor; +use snafu::{Backtrace, OptionExt, ResultExt, Snafu}; +use table_engine::engine::{EngineRuntimes, TableEngineRef}; +use udf::registry::FunctionRegistryRef; + +use crate::{ + config::Config, + grpc::{self, RpcServices}, + http::{self, Service}, + instance::{Instance, InstanceRef}, + limiter::Limiter, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Missing runtimes.\nBacktrace:\n{}", backtrace))] + MissingRuntimes { backtrace: Backtrace }, + + #[snafu(display("Missing catalog manager.\nBacktrace:\n{}", backtrace))] + MissingCatalogManager { backtrace: Backtrace }, + + #[snafu(display("Missing query executor.\nBacktrace:\n{}", backtrace))] + MissingQueryExecutor { backtrace: Backtrace }, + + #[snafu(display("Missing table engine.\nBacktrace:\n{}", backtrace))] + MissingTableEngine { backtrace: Backtrace }, + + #[snafu(display("Missing function registry.\nBacktrace:\n{}", backtrace))] + MissingFunctionRegistry { backtrace: Backtrace }, + + #[snafu(display("Missing limiter.\nBacktrace:\n{}", backtrace))] + MissingLimiter { backtrace: Backtrace }, + + #[snafu(display("Failed to start http service, err:{}", source))] + StartHttpService { source: crate::http::Error }, + + #[snafu(display("Failed to register system catalog, err:{}", source))] + RegisterSystemCatalog { source: catalog::manager::Error }, + + #[snafu(display("Failed to build grpc service, err:{}", source))] + BuildGrpcService { source: crate::grpc::Error }, + + #[snafu(display("Failed to start grpc service, err:{}", source))] + StartGrpcService { source: crate::grpc::Error }, +} + +define_result!(Error); + +// TODO(yingwen): Consider a config manager +/// Server +pub struct Server { + http_service: Service, + rpc_services: RpcServices, +} + +impl Server { + pub fn stop(mut self) { + self.rpc_services.shutdown(); + self.http_service.stop(); + } + + pub async fn start(&mut self) -> Result<()> { + self.rpc_services.start().await.context(StartGrpcService) + } +} + +#[must_use] +pub struct Builder { + config: Config, + runtimes: Option>, + catalog_manager: Option, + query_executor: Option, + table_engine: Option, + function_registry: Option, + limiter: Limiter, +} + +impl Builder { + pub fn new(config: Config) -> Self { + Self { + config, + runtimes: None, + catalog_manager: None, + query_executor: None, + table_engine: None, + function_registry: None, + limiter: Limiter::default(), + } + } + + pub fn runtimes(mut self, runtimes: Arc) -> Self { + self.runtimes = Some(runtimes); + self + } + + pub fn catalog_manager(mut self, val: C) -> Self { + self.catalog_manager = Some(val); + self + } + + pub fn query_executor(mut self, val: Q) -> Self { + self.query_executor = Some(val); + self + } + + pub fn table_engine(mut self, val: TableEngineRef) -> Self { + self.table_engine = Some(val); + self + } + + pub fn function_registry(mut self, val: FunctionRegistryRef) -> Self { + self.function_registry = Some(val); + self + } + + pub fn limiter(mut self, val: Limiter) -> Self { + self.limiter = val; + self + } + + /// Build and run the server + pub fn build(self) -> Result> { + // Build runtimes + let runtimes = self.runtimes.context(MissingRuntimes)?; + + // Build instance + let catalog_manager = self.catalog_manager.context(MissingCatalogManager)?; + let query_executor = self.query_executor.context(MissingQueryExecutor)?; + let table_engine = 
self.table_engine.context(MissingTableEngine)?; + let function_registry = self.function_registry.context(MissingFunctionRegistry)?; + let instance = Instance { + catalog_manager, + query_executor, + table_engine, + function_registry, + limiter: self.limiter, + }; + let instance = InstanceRef::new(instance); + + // Create http config + let http_config = http::Config { + ip: self.config.bind_addr.clone(), + port: self.config.http_port, + }; + + // Start http service + let http_service = http::Builder::new(http_config) + .runtimes(runtimes.clone()) + .instance(instance.clone()) + .build() + .context(StartHttpService)?; + + let meta_client_config = self.config.meta_client; + let env = Arc::new(Environment::new(self.config.grpc_server_cq_count)); + let rpc_services = grpc::Builder::new() + .bind_addr(self.config.bind_addr) + .port(self.config.grpc_port) + .meta_client_config(meta_client_config) + .env(env) + .runtimes(runtimes) + .instance(instance) + .route_rules(self.config.route_rules) + .build() + .context(BuildGrpcService)?; + + let server = Server { + http_service, + rpc_services, + }; + Ok(server) + } +} diff --git a/server/src/table_engine.rs b/server/src/table_engine.rs new file mode 100644 index 0000000000..7f7083b91c --- /dev/null +++ b/server/src/table_engine.rs @@ -0,0 +1,97 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Table engine implementation + +use std::sync::Arc; + +use analytic_engine::AnalyticTableEngine; +use async_trait::async_trait; +use table_engine::{ + engine::{ + CreateTableRequest, DropTableRequest, OpenTableRequest, Result, TableEngine, + UnknownEngineType, + }, + memory::MemoryTable, + table::TableRef, + ANALYTIC_ENGINE_TYPE, MEMORY_ENGINE_TYPE, +}; + +/// Memory table engine implementation +// Mainly for test purpose now +pub struct MemoryTableEngine; + +#[async_trait] +impl TableEngine for MemoryTableEngine { + fn engine_type(&self) -> &str { + MEMORY_ENGINE_TYPE + } + + async fn close(&self) -> Result<()> { + Ok(()) + } + + async fn create_table(&self, request: CreateTableRequest) -> Result { + Ok(Arc::new(MemoryTable::new( + request.table_name, + request.table_id, + request.table_schema, + MEMORY_ENGINE_TYPE.to_string(), + ))) + } + + async fn drop_table(&self, _request: DropTableRequest) -> Result { + Ok(true) + } + + async fn open_table(&self, _request: OpenTableRequest) -> Result> { + Ok(None) + } +} + +/// Route [CreateTableRequest] to the correct engine by its engine type +pub struct TableEngineProxy { + /// Memory table engine + pub memory: MemoryTableEngine, + /// Analytic table engine + pub analytic: AnalyticTableEngine, +} + +#[async_trait] +impl TableEngine for TableEngineProxy { + fn engine_type(&self) -> &str { + "TableEngineProxy" + } + + async fn close(&self) -> Result<()> { + self.memory.close().await?; + self.analytic.close().await?; + + Ok(()) + } + + async fn create_table(&self, request: CreateTableRequest) -> Result { + // TODO(yingwen): Use a map + match request.engine.as_str() { + MEMORY_ENGINE_TYPE => self.memory.create_table(request).await, + ANALYTIC_ENGINE_TYPE => self.analytic.create_table(request).await, + engine_type => UnknownEngineType { engine_type }.fail(), + } + } + + async fn drop_table(&self, request: DropTableRequest) -> Result { + match request.engine.as_str() { + MEMORY_ENGINE_TYPE => self.memory.drop_table(request).await, + ANALYTIC_ENGINE_TYPE => self.analytic.drop_table(request).await, + engine_type => UnknownEngineType { engine_type }.fail(), + } + } + + /// Open table, return error 
if table not exists + async fn open_table(&self, request: OpenTableRequest) -> Result> { + match request.engine.as_str() { + MEMORY_ENGINE_TYPE => self.memory.open_table(request).await, + ANALYTIC_ENGINE_TYPE => self.analytic.open_table(request).await, + engine_type => UnknownEngineType { engine_type }.fail(), + } + } +} diff --git a/sql/Cargo.toml b/sql/Cargo.toml new file mode 100644 index 0000000000..3056272218 --- /dev/null +++ b/sql/Cargo.toml @@ -0,0 +1,29 @@ +[package] +name = "sql" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[features] +test = [] + +[dependencies] +# In alphabetical order +arrow_deps = { path = "../arrow_deps" } +catalog = { path = "../catalog" } +common_types = { path = "../common_types"} +common_util = { path = "../common_util" } +log = "0.4" +paste = "1.0" +snafu = { version ="0.6.10", features = ["backtraces"]} +sqlparser = "0.13.0" +table_engine = { path = "../table_engine" } +udf = { path = "../udf" } +ceresdbproto = { git = "https://github.com/CeresDB/ceresdbproto.git"} +regex = "1" + +[dev-dependencies] +common_types = { path = "../common_types", features = ["test"] } +tokio = { version = "1.0", features = ["full"] } diff --git a/sql/src/ast.rs b/sql/src/ast.rs new file mode 100644 index 0000000000..e68e486e43 --- /dev/null +++ b/sql/src/ast.rs @@ -0,0 +1,80 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! SQL statement + +use sqlparser::ast::{ + ColumnDef, ObjectName, SqlOption, Statement as SqlStatement, TableConstraint, +}; + +/// Statement representations +#[derive(Debug, PartialEq)] +pub enum Statement { + /// ANSI SQL AST node + Standard(Box), + // Other extensions + /// CREATE TABLE + Create(CreateTable), + /// Drop TABLE + Drop(DropTable), + Describe(DescribeTable), + AlterModifySetting(AlterModifySetting), + AlterAddColumn(AlterAddColumn), + /// SHOW CREATE TABLE + ShowCreate(ShowCreate), + Exists(ExistsTable), +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +pub enum ShowCreateObject { + Table, +} + +#[derive(Debug, PartialEq)] +pub struct CreateTable { + /// Create if not exists + pub if_not_exists: bool, + /// Table name + pub name: ObjectName, + pub columns: Vec, + pub engine: String, + pub constraints: Vec, + /// Table options in `WITH`. + pub options: Vec, +} + +#[derive(Debug, PartialEq)] +pub struct DropTable { + /// Table name + pub name: ObjectName, + pub if_exists: bool, + pub engine: String, +} + +#[derive(Debug, PartialEq)] +pub struct DescribeTable { + pub table_name: ObjectName, +} + +#[derive(Debug, PartialEq)] +pub struct AlterModifySetting { + pub table_name: ObjectName, + pub options: Vec, +} + +#[derive(Debug, PartialEq)] +pub struct AlterAddColumn { + pub table_name: ObjectName, + pub columns: Vec, +} + +#[derive(Debug, PartialEq)] +pub struct ShowCreate { + pub obj_type: ShowCreateObject, + pub obj_name: ObjectName, +} + +#[derive(Debug, PartialEq)] +pub struct ExistsTable { + pub table_name: ObjectName, +} diff --git a/sql/src/container.rs b/sql/src/container.rs new file mode 100644 index 0000000000..eac30eb737 --- /dev/null +++ b/sql/src/container.rs @@ -0,0 +1,175 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Table container + +use std::{collections::HashMap, sync::Arc}; + +pub use arrow_deps::datafusion::catalog::{ResolvedTableReference, TableReference}; +use table_engine::provider::TableProviderAdapter; + +// Rust has poor support of using tuple as map key, so we use a 3 level +// map to store catalog -> schema -> table mapping +type CatalogMap = HashMap; +type SchemaMap = HashMap; +type TableMap = HashMap>; + +/// Container to hold table adapters +/// +/// Optimized for default catalog and schema +#[derive(Default)] +pub struct TableContainer { + default_catalog: String, + default_schema: String, + default_tables: HashMap>, + other_tables: CatalogMap, +} + +impl TableContainer { + pub fn new(default_catalog: String, default_schema: String) -> Self { + Self { + default_catalog, + default_schema, + default_tables: HashMap::new(), + other_tables: CatalogMap::new(), + } + } + + /// Catalog num + pub fn num_catalogs(&self) -> usize { + if self.other_tables.is_empty() { + 1 + } else { + self.other_tables.len() + 1 + } + } + + pub fn get(&self, name: TableReference) -> Option> { + match name { + TableReference::Bare { table } => self.get_default(table), + TableReference::Partial { schema, table } => { + if schema == self.default_schema { + self.get_default(table) + } else { + self.get_other(&self.default_catalog, schema, table) + } + } + TableReference::Full { + catalog, + schema, + table, + } => { + if catalog == self.default_catalog && schema == self.default_schema { + self.get_default(table) + } else { + self.get_other(catalog, schema, table) + } + } + } + } + + fn get_default(&self, table: &str) -> Option> { + self.default_tables.get(table).cloned() + } + + fn get_other( + &self, + catalog: &str, + schema: &str, + table: &str, + ) -> Option> { + self.other_tables + .get(catalog) + .and_then(|schemas| schemas.get(schema)) + .and_then(|tables| tables.get(table)) + .cloned() + } + + pub fn insert(&mut self, name: TableReference, table_adapter: Arc) { + match name { + TableReference::Bare { table } => self.insert_default(table, table_adapter), + TableReference::Partial { schema, table } => { + if schema == self.default_schema { + self.insert_default(table, table_adapter) + } else { + self.insert_other( + self.default_catalog.clone(), + schema.to_string(), + table.to_string(), + table_adapter, + ) + } + } + TableReference::Full { + catalog, + schema, + table, + } => { + if catalog == self.default_catalog && schema == self.default_schema { + self.insert_default(table, table_adapter) + } else { + self.insert_other( + catalog.to_string(), + schema.to_string(), + table.to_string(), + table_adapter, + ) + } + } + } + } + + fn insert_default(&mut self, table: &str, table_adapter: Arc) { + self.default_tables.insert(table.to_string(), table_adapter); + } + + fn insert_other( + &mut self, + catalog: String, + schema: String, + table: String, + table_adapter: Arc, + ) { + self.other_tables + .entry(catalog) + .or_insert_with(HashMap::new) + .entry(schema) + .or_insert_with(HashMap::new) + .insert(table, table_adapter); + } + + /// Visit all tables + /// + /// If f returns error, stop iteration and return the error + pub fn visit(&self, mut f: F) -> Result<(), E> + where + F: FnMut(ResolvedTableReference, &Arc) -> Result<(), E>, + { + // Visit default tables first + for (table, adapter) in &self.default_tables { + // default_catalog/default_schema can be empty string, but that's + // ok since we have table under them + let table_ref = ResolvedTableReference { + catalog: &self.default_catalog, + schema: 
&self.default_schema, + table, + }; + f(table_ref, adapter)?; + } + + // Visit other tables + for (catalog, schemas) in &self.other_tables { + for (schema, tables) in schemas { + for (table, adapter) in tables { + let table_ref = ResolvedTableReference { + catalog, + schema, + table, + }; + f(table_ref, adapter)?; + } + } + } + + Ok(()) + } +} diff --git a/sql/src/frontend.rs b/sql/src/frontend.rs new file mode 100644 index 0000000000..f45e6def4d --- /dev/null +++ b/sql/src/frontend.rs @@ -0,0 +1,108 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Frontend + +use std::{convert::TryInto, sync::Arc}; + +use ceresdbproto::prometheus::PrometheusQueryRequest; +use common_types::request_id::RequestId; +use snafu::{ResultExt, Snafu}; +use table_engine::table; + +use crate::{ + ast::Statement, + parser::Parser, + plan::Plan, + planner::Planner, + promql::{ColumnNames, Expr}, + provider::MetaProvider, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + // Invalid sql is quite common, so we don't provide a backtrace now. + #[snafu(display("Invalid sql, sql:{}, err:{}", sql, source))] + InvalidSql { + sql: String, + source: sqlparser::parser::ParserError, + }, + + // TODO(yingwen): Should we store stmt here? + #[snafu(display("Failed to create plan, err:{}", source))] + CreatePlan { source: crate::planner::Error }, + + #[snafu(display("Invalid prom request, err:{}", source))] + InvalidPromRequest { source: crate::promql::Error }, +} + +define_result!(Error); + +pub type StatementVec = Vec; + +/// Context used by Frontend +/// +/// We can collect metrics and trace info in it instead of using global +/// metrics or trace collector. +pub struct Context { + /// Id of the query request. + pub request_id: RequestId, + /// Parallelism to read table. + pub read_parallelism: usize, +} + +impl Context { + pub fn new(request_id: RequestId) -> Self { + Self { + request_id, + read_parallelism: table::DEFAULT_READ_PARALLELISM, + } + } +} + +/// SQL frontend implementation +/// +/// Though the parser supports multiple statements in a single SQL string, +/// this frontend only supports planning one statement at a time for now +#[derive(Debug)] +pub struct Frontend