diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000..7f220c5e8a --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +target +.DS_Store +.idea/ +.vscode diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000000..6958411537 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,4363 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "addr2line" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e61f2b7f93d2c7d2b08263acaa4a363b3e276806c68af6134c44f523bf1aacd" +dependencies = [ + "gimli", +] + +[[package]] +name = "adler" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" + +[[package]] +name = "adler32" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aae1277d39aeec15cb388266ecc24b11c80469deae6067e17a1a7aa9e5c1f234" + +[[package]] +name = "ahash" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8fd72866655d1904d6b0997d0b07ba561047d070fbe29de039031c641b61217" +dependencies = [ + "const-random", +] + +[[package]] +name = "ahash" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43bb833f0bf979d8475d38fbf09ed3b8a55e1885fe93ad3f93239fc6a4f17b98" +dependencies = [ + "getrandom 0.2.3", + "once_cell", + "version_check", +] + +[[package]] +name = "aho-corasick" +version = "0.7.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f" +dependencies = [ + "memchr", +] + +[[package]] +name = "alloc-no-stdlib" +version = "2.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35ef4730490ad1c4eae5c4325b2a95f521d023e5c885853ff7aca0a6a1631db3" + +[[package]] 
+name = "alloc-stdlib" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "697ed7edc0f1711de49ce108c541623a0af97c6c60b2f6e2b65229847ac843c2" +dependencies = [ + "alloc-no-stdlib", +] + +[[package]] +name = "analytic_engine" +version = "0.1.0" +dependencies = [ + "arc-swap 1.4.0", + "arena", + "arrow_deps", + "async-trait", + "base64", + "common_types", + "common_util", + "env_logger", + "futures", + "lazy_static", + "log", + "object_store", + "parquet 0.1.0", + "prometheus 0.12.0", + "proto", + "protobuf", + "serde", + "serde_derive", + "skiplist", + "smallvec", + "snafu", + "table_engine", + "tempfile", + "tokio", + "wal", +] + +[[package]] +name = "ansi_term" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee49baf6cb617b853aa8d93bf420db2383fab46d314482ca2803b40d5fde979b" +dependencies = [ + "winapi", +] + +[[package]] +name = "ansi_term" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2" +dependencies = [ + "winapi", +] + +[[package]] +name = "anyhow" +version = "1.0.43" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28ae2b3dec75a406790005a200b1bd89785afc02517a00ca99ecfe093ee9e6cf" + +[[package]] +name = "arc-swap" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc4662175ead9cd84451d5c35070517777949a2ed84551764129cedb88384841" + +[[package]] +name = "arc-swap" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6df5aef5c5830360ce5218cecb8f018af3438af5686ae945094affc86fdec63" + +[[package]] +name = "arena" +version = "0.1.0" +dependencies = [ + "parking_lot", +] + +[[package]] +name = "arrayref" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"a4c527152e37cf757a3f78aae5a06fbeefdb07ccc535c980a3208ee3060dd544" + +[[package]] +name = "arrayvec" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8da52d66c7071e2e3fa2a1e5c6d088fec47b593032b254f5e980de8ea54454d6" + +[[package]] +name = "arrow" +version = "7.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "66ec0a5964feebf378e2fc6db9530e712657b8edf72aa17b1b277b0f52a48e2d" +dependencies = [ + "bitflags", + "chrono", + "comfy-table", + "csv", + "flatbuffers", + "half", + "hex", + "indexmap", + "lazy_static", + "lexical-core", + "multiversion", + "num", + "rand 0.8.4", + "regex", + "serde", + "serde_derive", + "serde_json", +] + +[[package]] +name = "arrow-format" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f7da2d9660bfaebbdb0a44a33b3bd1dcb5a952fafa02c0dfc6a51ea471fef2a" +dependencies = [ + "flatbuffers", +] + +[[package]] +name = "arrow2" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d873e2775c3d87a4e8d77aa544cbd43f34a0779d5164c59e7c6a1dd0678eb395" +dependencies = [ + "arrow-format", + "base64", + "chrono", + "futures", + "hash_hasher", + "num-traits", + "parquet2", + "simdutf8", +] + +[[package]] +name = "arrow_deps" +version = "0.1.0" +dependencies = [ + "arrow", + "datafusion", + "parquet 7.0.0", + "uncover", +] + +[[package]] +name = "async-stream" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "171374e7e3b2504e0e5236e3b59260560f9fe94bfe9ac39ba5e4e929c5590625" +dependencies = [ + "async-stream-impl", + "futures-core", +] + +[[package]] +name = "async-stream-impl" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "648ed8c8d2ce5409ccd57453d9d1b214b342a0d69376a6feda1fd6cae3299308" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "async-trait" +version 
= "0.1.51" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44318e776df68115a881de9a8fd1b9e53368d7a4a5ce4cc48517da3393233a5e" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "atty" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" +dependencies = [ + "hermit-abi", + "libc", + "winapi", +] + +[[package]] +name = "autocfg" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a" + +[[package]] +name = "avro-rs" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ece550dd6710221de9bcdc1697424d8eee4fc4ca7e017479ea9d50c348465e37" +dependencies = [ + "byteorder", + "digest 0.9.0", + "lazy_static", + "libflate", + "num-bigint 0.2.6", + "rand 0.7.3", + "serde", + "serde_json", + "strum 0.18.0", + "strum_macros 0.18.0", + "thiserror", + "typed-builder", + "uuid", + "zerocopy", +] + +[[package]] +name = "backtrace" +version = "0.3.61" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7a905d892734eea339e896738c14b9afce22b5318f64b951e70bf3844419b01" +dependencies = [ + "addr2line", + "cc", + "cfg-if 1.0.0", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", +] + +[[package]] +name = "base64" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "904dfeac50f3cdaba28fc6f57fdcddb75f49ed61346676a78c4ffe55877802fd" + +[[package]] +name = "benchmarks" +version = "0.1.0" +dependencies = [ + "analytic_engine", + "arena", + "arrow2", + "arrow_deps", + "clap", + "common_types", + "common_util", + "criterion", + "env_logger", + "futures", + "log", + "object_store", + "parquet 0.1.0", + "serde", + "serde_derive", + "table_engine", + "tokio", +] + +[[package]] +name = "bindgen" 
+version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd4865004a46a0aafb2a0a5eb19d3c9fc46ee5f063a6cfc605c69ac9ecf5263d" +dependencies = [ + "bitflags", + "cexpr", + "clang-sys", + "lazy_static", + "lazycell", + "peeking_take_while", + "proc-macro2", + "quote", + "regex", + "rustc-hash", + "shlex", +] + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bitpacking" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8c7d2ac73c167c06af4a5f37e6e59d84148d57ccbe4480b76f0273eefea82d7" +dependencies = [ + "crunchy", +] + +[[package]] +name = "blake2" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a4e37d16930f5459780f5621038b6382b9bb37c19016f39fb6b5808d831f174" +dependencies = [ + "crypto-mac", + "digest 0.9.0", + "opaque-debug", +] + +[[package]] +name = "blake3" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "882e99e4a0cb2ae6cb6e442102e8e6b7131718d94110e64c3e6a34ea9b106f37" +dependencies = [ + "arrayref", + "arrayvec", + "cc", + "cfg-if 1.0.0", + "constant_time_eq", + "digest 0.10.1", +] + +[[package]] +name = "block-buffer" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4152116fd6e9dadb291ae18fc1ec3575ed6d84c29642d97890f4b4a3417297e4" +dependencies = [ + "generic-array", +] + +[[package]] +name = "block-buffer" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1d36a02058e76b040de25a4464ba1c80935655595b661505c8b39b664828b95" +dependencies = [ + "generic-array", +] + +[[package]] +name = "boringssl-src" +version = "0.3.0+688fc5c" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"f901accdf830d2ea2f4e27f923a5e1125cd8b1a39ab578b9db1a42d578a6922b" +dependencies = [ + "cmake", +] + +[[package]] +name = "brotli" +version = "3.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "71cb90ade945043d3d53597b2fc359bb063db8ade2bcffe7997351d0756e9d50" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", + "brotli-decompressor", +] + +[[package]] +name = "brotli-decompressor" +version = "2.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59ad2d4653bf5ca36ae797b1f4bb4dbddb60ce49ca4aed8a2ce4829f60425b80" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", +] + +[[package]] +name = "bstr" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90682c8d613ad3373e66de8c6411e0ae2ab2571e879d2efbf73558cc66f21279" +dependencies = [ + "lazy_static", + "memchr", + "regex-automata", + "serde", +] + +[[package]] +name = "buf_redux" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b953a6887648bb07a535631f2bc00fbdb2a2216f135552cb3f534ed136b9c07f" +dependencies = [ + "memchr", + "safemem", +] + +[[package]] +name = "bumpalo" +version = "3.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c59e7af012c713f529e7a3ee57ce9b31ddd858d4b512923602f74608b009631" + +[[package]] +name = "bytecount" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72feb31ffc86498dacdbd0fcebb56138e7177a8cc5cea4516031d15ae85a742e" + +[[package]] +name = "byteorder" +version = "1.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" + +[[package]] +name = "bytes" +version = "0.1.0" +dependencies = [ + "bytes 1.1.0", + "snafu", +] + +[[package]] +name = "bytes" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"c4872d67bab6358e59559027aa3b9157c53d9358c51423c17554809a8858e0f8" + +[[package]] +name = "bzip2-sys" +version = "0.1.11+1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + +[[package]] +name = "cast" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c24dab4283a142afa2fdca129b80ad2c6284e073930f964c3a1293c225ee39a" +dependencies = [ + "rustc_version", +] + +[[package]] +name = "catalog" +version = "0.1.0" +dependencies = [ + "async-trait", + "common_types", + "common_util", + "snafu", + "table_engine", +] + +[[package]] +name = "catalog_impls" +version = "0.1.0" +dependencies = [ + "analytic_engine", + "async-trait", + "catalog", + "common_types", + "common_util", + "log", + "server", + "snafu", + "system_catalog", + "table_engine", + "tokio", +] + +[[package]] +name = "cc" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e70cc2f62c6ce1868963827bd677764c62d07c3d9a3e1fb1177ee1a9ab199eb2" +dependencies = [ + "jobserver", +] + +[[package]] +name = "ceresdbproto" +version = "0.1.0" +source = "git+https://github.com/CeresDB/ceresdbproto.git#dc8eb387ca66347c2ea9d5b00924ae63e7360be3" +dependencies = [ + "futures", + "grpcio 0.9.1", + "protobuf", + "protobuf-builder", +] + +[[package]] +name = "ceresdbx" +version = "0.1.0" +dependencies = [ + "analytic_engine", + "catalog", + "catalog_impls", + "clap", + "common_util", + "log", + "logger", + "query_engine", + "server", + "signal-hook", + "table_engine", + "tracing_util", + "udf", + "vergen", +] + +[[package]] +name = "cexpr" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4aedb84272dbe89af497cf81375129abda4fc0a9e7c5d317498c15cc30c0d27" +dependencies = [ + "nom", +] + +[[package]] +name = "cfg-if" +version = "0.1.10" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "chrono" +version = "0.4.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "670ad68c9088c2a963aaa298cb369688cf3f9465ce5e2d4ca10e6e0098a1ce73" +dependencies = [ + "libc", + "num-integer", + "num-traits", + "time", + "winapi", +] + +[[package]] +name = "clang-sys" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa66045b9cb23c2e9c1520732030608b02ee07e5cfaa5a521ec15ded7fa24c90" +dependencies = [ + "glob", + "libc", + "libloading", +] + +[[package]] +name = "clap" +version = "2.33.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37e58ac78573c40708d45522f0d80fa2f01cc4f9b4e2bf749807255454312002" +dependencies = [ + "ansi_term 0.11.0", + "atty", + "bitflags", + "strsim", + "textwrap", + "unicode-width", + "vec_map", +] + +[[package]] +name = "cmake" +version = "0.1.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb6210b637171dfba4cda12e579ac6dc73f5165ad56133e5d72ef3131f320855" +dependencies = [ + "cc", +] + +[[package]] +name = "comfy-table" +version = "5.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c42350b81f044f576ff88ac750419f914abb46a03831bb1747134344ee7a4e64" +dependencies = [ + "strum 0.22.0", + "strum_macros 0.22.0", + "unicode-width", +] + +[[package]] +name = "common_types" +version = "0.1.0" +dependencies = [ + "arrow_deps", + "byteorder", + "bytes 0.1.0", + "chrono", + "murmur3", + "paste 1.0.5", + "proto", + "serde", + "serde_derive", + "snafu", + "sqlparser", +] + +[[package]] +name = "common_util" +version = "0.1.0" 
+dependencies = [ + "backtrace", + "chrono", + "common_types", + "crossbeam-utils 0.8.5", + "env_logger", + "gag", + "lazy_static", + "libc", + "log", + "logger", + "nix", + "pin-project-lite", + "prometheus 0.12.0", + "proto", + "serde", + "serde_derive", + "slog", + "slog-global 0.1.0 (git+https://github.com/breezewish/slog-global.git?rev=0e23a5baff302a9d7bccd85f8f31e43339c2f2c1)", + "snafu", + "tempfile", + "time", + "tokio", + "tokio-test", + "toml", +] + +[[package]] +name = "const-random" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f590d95d011aa80b063ffe3253422ed5aa462af4e9867d43ce8337562bac77c4" +dependencies = [ + "const-random-macro", + "proc-macro-hack", +] + +[[package]] +name = "const-random-macro" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "615f6e27d000a2bffbc7f2f6a8669179378fa27ee4d0a509e985dfc0a7defb40" +dependencies = [ + "getrandom 0.2.3", + "lazy_static", + "proc-macro-hack", + "tiny-keccak", +] + +[[package]] +name = "constant_time_eq" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "245097e9a4535ee1e3e3931fcfcd55a796a44c643e8596ff6566d68f09b87bbc" + +[[package]] +name = "core-foundation" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a89e2ae426ea83155dccf10c0fa6b1463ef6d5fcb44cee0b224a408fa640a62" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "core-foundation-sys" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea221b5284a47e40033bf9b66f35f984ec0ea2931eb03505246cd27a963f981b" + +[[package]] +name = "cpufeatures" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95059428f66df56b63431fdb4e1947ed2190586af5c5a8a8b71122bdf5a7f469" +dependencies = [ + "libc", +] + +[[package]] +name = "crc32fast" +version = "1.2.1" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81156fece84ab6a9f2afdb109ce3ae577e42b1228441eded99bd77f627953b1a" +dependencies = [ + "cfg-if 1.0.0", +] + +[[package]] +name = "criterion" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1604dafd25fba2fe2d5895a9da139f8dc9b319a5fe5354ca137cbbce4e178d10" +dependencies = [ + "atty", + "cast", + "clap", + "criterion-plot", + "csv", + "itertools", + "lazy_static", + "num-traits", + "oorandom", + "plotters", + "rayon", + "regex", + "serde", + "serde_cbor", + "serde_derive", + "serde_json", + "tinytemplate", + "walkdir", +] + +[[package]] +name = "criterion-plot" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d00996de9f2f7559f7f4dc286073197f83e92256a59ed395f9aac01fe717da57" +dependencies = [ + "cast", + "itertools", +] + +[[package]] +name = "crossbeam-channel" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06ed27e177f16d65f0f0c22a213e17c696ace5dd64b14258b52f9417ccb52db4" +dependencies = [ + "cfg-if 1.0.0", + "crossbeam-utils 0.8.5", +] + +[[package]] +name = "crossbeam-deque" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c20ff29ded3204c5106278a81a38f4b482636ed4fa1e6cfbeef193291beb29ed" +dependencies = [ + "crossbeam-epoch 0.8.2", + "crossbeam-utils 0.7.2", + "maybe-uninit", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6455c0ca19f0d2fbf751b908d5c55c1f5cbc65e03c4225427254b46890bdde1e" +dependencies = [ + "cfg-if 1.0.0", + "crossbeam-epoch 0.9.5", + "crossbeam-utils 0.8.5", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "058ed274caafc1f60c4997b5fc07bf7dc7cca454af7c6e81edffe5f33f70dace" +dependencies = [ + 
"autocfg", + "cfg-if 0.1.10", + "crossbeam-utils 0.7.2", + "lazy_static", + "maybe-uninit", + "memoffset 0.5.6", + "scopeguard", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ec02e091aa634e2c3ada4a392989e7c3116673ef0ac5b72232439094d73b7fd" +dependencies = [ + "cfg-if 1.0.0", + "crossbeam-utils 0.8.5", + "lazy_static", + "memoffset 0.6.4", + "scopeguard", +] + +[[package]] +name = "crossbeam-utils" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3c7c73a2d1e9fc0886a08b93e98eb643461230d5f1925e4036204d5f2e261a8" +dependencies = [ + "autocfg", + "cfg-if 0.1.10", + "lazy_static", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d82cfc11ce7f2c3faef78d8a684447b40d503d9681acebed6cb728d45940c4db" +dependencies = [ + "cfg-if 1.0.0", + "lazy_static", +] + +[[package]] +name = "crunchy" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" + +[[package]] +name = "crypto-common" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "683d6b536309245c849479fba3da410962a43ed8e51c26b729208ec0ac2798d0" +dependencies = [ + "generic-array", +] + +[[package]] +name = "crypto-mac" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b584a330336237c1eecd3e94266efb216c56ed91225d634cb2991c5f3fd1aeab" +dependencies = [ + "generic-array", + "subtle", +] + +[[package]] +name = "csv" +version = "1.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1" +dependencies = [ + "bstr", + "csv-core", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "csv-core" +version = 
"0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90" +dependencies = [ + "memchr", +] + +[[package]] +name = "dashmap" +version = "3.11.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f260e2fc850179ef410018660006951c1b55b79e8087e87111a2c388994b9b5" +dependencies = [ + "ahash 0.3.8", + "cfg-if 0.1.10", + "num_cpus", +] + +[[package]] +name = "datafusion" +version = "6.0.0" +source = "git+https://github.com/apache/arrow-datafusion.git?rev=444c153863520072ea22d4f8c498dee39437516d#444c153863520072ea22d4f8c498dee39437516d" +dependencies = [ + "ahash 0.7.4", + "arrow", + "async-trait", + "blake2", + "blake3", + "chrono", + "futures", + "hashbrown", + "lazy_static", + "log", + "md-5", + "num_cpus", + "ordered-float 2.10.0", + "parquet 7.0.0", + "paste 1.0.5", + "pin-project-lite", + "rand 0.8.4", + "regex", + "sha2", + "smallvec", + "sqlparser", + "tempfile", + "tokio", + "tokio-stream", + "unicode-segmentation", +] + +[[package]] +name = "digest" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3dd60d1080a57a05ab032377049e0591415d2b31afd7028356dbf3cc6dcb066" +dependencies = [ + "generic-array", +] + +[[package]] +name = "digest" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b697d66081d42af4fba142d56918a3cb21dc8eb63372c6b85d14f44fb9c5979b" +dependencies = [ + "block-buffer 0.10.0", + "crypto-common", + "generic-array", + "subtle", +] + +[[package]] +name = "dirs-next" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b98cf8ebf19c3d1b223e151f99a4f9f0690dca41414773390fc824184ac833e1" +dependencies = [ + "cfg-if 1.0.0", + "dirs-sys-next", +] + +[[package]] +name = "dirs-sys-next" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"4ebda144c4fe02d1f7ea1a7d9641b6fc6b580adcfa024ae48797ecdeb6825b4d" +dependencies = [ + "libc", + "redox_users", + "winapi", +] + +[[package]] +name = "doc-comment" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" + +[[package]] +name = "either" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" + +[[package]] +name = "encoding_rs" +version = "0.8.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "80df024fbc5ac80f87dfef0d9f5209a252f2a497f7f42944cff24d8253cac065" +dependencies = [ + "cfg-if 1.0.0", +] + +[[package]] +name = "enum-iterator" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4eeac5c5edb79e4e39fe8439ef35207780a11f69c52cbe424ce3dfad4cb78de6" +dependencies = [ + "enum-iterator-derive", +] + +[[package]] +name = "enum-iterator-derive" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c134c37760b27a871ba422106eedbb8247da973a09e82558bf26d619c882b159" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "env_logger" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aafcde04e90a5226a6443b7aabdb016ba2f8307c847d524724bd9b346dd1a2d3" +dependencies = [ + "atty", + "humantime", + "log", + "regex", + "termcolor", +] + +[[package]] +name = "fail" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3be3c61c59fdc91f5dbc3ea31ee8623122ce80057058be560654c5d410d181a6" +dependencies = [ + "lazy_static", + "log", + "rand 0.7.3", +] + +[[package]] +name = "failure" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"d32e9bd16cc02eae7db7ef620b392808b89f6a5e16bb3497d159c6b92a0f4f86" +dependencies = [ + "backtrace", + "failure_derive", +] + +[[package]] +name = "failure_derive" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa4da3c766cd7a0db8242e326e9e4e081edd567072893ed320008189715366a4" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + +[[package]] +name = "fallible-streaming-iterator" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a" + +[[package]] +name = "filedescriptor" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed3d8a5e20435ff00469e51a0d82049bae66504b5c429920dadf9bb54d47b3f" +dependencies = [ + "libc", + "thiserror", + "winapi", +] + +[[package]] +name = "flatbuffers" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef4c5738bcd7fad10315029c50026f83c9da5e4a21f8ed66826f43e0e2bde5f6" +dependencies = [ + "bitflags", + "smallvec", + "thiserror", +] + +[[package]] +name = "flate2" +version = "1.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd3aec53de10fe96d7d8c565eb17f2c687bb5518a2ec453b5b1252964526abe0" +dependencies = [ + "cfg-if 1.0.0", + "crc32fast", + "libc", + "miniz_oxide", +] + +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = "foreign-types" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +dependencies = [ + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-shared" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" + +[[package]] +name = "form_urlencoded" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fc25a87fa4fd2094bffb06925852034d90a17f0d1e05197d4956d3555752191" +dependencies = [ + "matches", + "percent-encoding", +] + +[[package]] +name = "fs_extra" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2022715d62ab30faffd124d40b76f4134a550a87792276512b18d63272333394" + +[[package]] +name = "futures" +version = "0.3.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1adc00f486adfc9ce99f77d717836f0c5aa84965eb0b4f051f4e83f7cab53f8b" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-channel" +version = "0.3.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74ed2411805f6e4e3d9bc904c95d5d423b89b3b25dc0250aa74729de20629ff9" +dependencies = [ + "futures-core", + "futures-sink", +] + +[[package]] +name = "futures-core" +version = "0.3.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af51b1b4a7fdff033703db39de8802c673eb91855f2e0d47dcf3bf2c0ef01f99" + +[[package]] +name = "futures-executor" +version = "0.3.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4d0d535a57b87e1ae31437b892713aee90cd2d7b0ee48727cd11fc72ef54761c" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-io" +version = "0.3.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b0e06c393068f3a6ef246c75cdca793d6a46347e75286933e5e75fd2fd11582" + +[[package]] +name = "futures-macro" +version = "0.3.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"c54913bae956fb8df7f4dc6fc90362aa72e69148e3f39041fbe8742d21e0ac57" +dependencies = [ + "autocfg", + "proc-macro-hack", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "futures-sink" +version = "0.3.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0f30aaa67363d119812743aa5f33c201a7a66329f97d1a887022971feea4b53" + +[[package]] +name = "futures-task" +version = "0.3.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbe54a98670017f3be909561f6ad13e810d9a51f3f061b902062ca3da80799f2" + +[[package]] +name = "futures-util" +version = "0.3.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67eb846bfd58e44a8481a00049e82c43e0ccb5d61f8dc071057cb19249dd4d78" +dependencies = [ + "autocfg", + "futures-channel", + "futures-core", + "futures-io", + "futures-macro", + "futures-sink", + "futures-task", + "memchr", + "pin-project-lite", + "pin-utils", + "proc-macro-hack", + "proc-macro-nested", + "slab", +] + +[[package]] +name = "gag" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a713bee13966e9fbffdf7193af71d54a6b35a0bb34997cd6c9519ebeb5005972" +dependencies = [ + "filedescriptor", + "tempfile", +] + +[[package]] +name = "generic-array" +version = "0.14.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "501466ecc8a30d1d3b7fc9229b122b2ce8ed6e9d9223f1138d4babb253e51817" +dependencies = [ + "typenum", + "version_check", +] + +[[package]] +name = "getrandom" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce" +dependencies = [ + "cfg-if 1.0.0", + "libc", + "wasi 0.9.0+wasi-snapshot-preview1", +] + +[[package]] +name = "getrandom" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"7fcd999463524c52659517fe2cea98493cfe485d10565e7b0fb07dbba7ad2753" +dependencies = [ + "cfg-if 1.0.0", + "libc", + "wasi 0.10.2+wasi-snapshot-preview1", +] + +[[package]] +name = "getset" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24b328c01a4d71d2d8173daa93562a73ab0fe85616876f02500f53d82948c504" +dependencies = [ + "proc-macro-error", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "gimli" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0a01e0497841a3b2db4f8afa483cce65f7e96a3498bd6c541734792aeac8fe7" + +[[package]] +name = "git2" +version = "0.13.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "659cd14835e75b64d9dba5b660463506763cf0aa6cb640aeeb0e98d841093490" +dependencies = [ + "bitflags", + "libc", + "libgit2-sys", + "log", + "url", +] + +[[package]] +name = "glob" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574" + +[[package]] +name = "grpcio" +version = "0.1.0" +dependencies = [ + "grpcio 0.9.1", +] + +[[package]] +name = "grpcio" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d99e00eed7e0a04ee2705112e7cfdbe1a3cc771147f22f016a8cd2d002187b" +dependencies = [ + "futures", + "grpcio-sys", + "libc", + "log", + "parking_lot", + "protobuf", +] + +[[package]] +name = "grpcio-compiler" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1f1abac9f330ac9ee0950220c10eea84d66479cede4836f0b924407fecf093c" +dependencies = [ + "protobuf", +] + +[[package]] +name = "grpcio-sys" +version = "0.9.1+1.38.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9447d1a926beeef466606cc45717f80897998b548e7dc622873d453e1ecb4be4" +dependencies = [ + "bindgen", + "boringssl-src", + "cc", + "cmake", 
+ "libc", + "libz-sys", + "pkg-config", + "walkdir", +] + +[[package]] +name = "h2" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7f3675cfef6a30c8031cf9e6493ebdc3bb3272a3fea3923c4210d1830e6a472" +dependencies = [ + "bytes 1.1.0", + "fnv", + "futures-core", + "futures-sink", + "futures-util", + "http", + "indexmap", + "slab", + "tokio", + "tokio-util", + "tracing 0.1.26", +] + +[[package]] +name = "half" +version = "1.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" + +[[package]] +name = "hash_hasher" +version = "2.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74721d007512d0cb3338cd20f0654ac913920061a4c4d0d8708edb3f2a698c0c" + +[[package]] +name = "hashbrown" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e" +dependencies = [ + "ahash 0.7.4", +] + +[[package]] +name = "headers" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0b7591fb62902706ae8e7aaff416b1b0fa2c0fd0878b46dc13baa3712d8a855" +dependencies = [ + "base64", + "bitflags", + "bytes 1.1.0", + "headers-core", + "http", + "mime", + "sha-1", + "time", +] + +[[package]] +name = "headers-core" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7f66481bfee273957b1f20485a4ff3362987f85b2c236580d81b4eb7a326429" +dependencies = [ + "http", +] + +[[package]] +name = "heck" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c" +dependencies = [ + "unicode-segmentation", +] + +[[package]] +name = "hermit-abi" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" +dependencies = [ + "libc", +] + +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + +[[package]] +name = "http" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "527e8c9ac747e28542699a951517aa9a6945af506cd1f2e1b53a576c17b6cc11" +dependencies = [ + "bytes 1.1.0", + "fnv", + "itoa", +] + +[[package]] +name = "http-body" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "399c583b2979440c60be0821a6199eca73bc3c8dcd9d070d75ac726e2c6186e5" +dependencies = [ + "bytes 1.1.0", + "http", + "pin-project-lite", +] + +[[package]] +name = "httparse" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acd94fdbe1d4ff688b67b04eee2e17bd50995534a61539e45adfefb45e5e5503" + +[[package]] +name = "httpdate" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6456b8a6c8f33fee7d958fcd1b60d55b11940a79e63ae87013e6d22e26034440" + +[[package]] +name = "humantime" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df004cfca50ef23c36850aaaa59ad52cc70d0e90243c3c7737a4dd32dc7a3c4f" +dependencies = [ + "quick-error", +] + +[[package]] +name = "hyper" +version = "0.14.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13f67199e765030fa08fe0bd581af683f0d5bc04ea09c2b1102012c5fb90e7fd" +dependencies = [ + "bytes 1.1.0", + "futures-channel", + "futures-core", + "futures-util", + "h2", + "http", + "http-body", + "httparse", + "httpdate", + "itoa", + "pin-project-lite", + "socket2", + "tokio", + "tower-service", + "tracing 0.1.26", + "want", +] + +[[package]] +name = "hyper-tls" +version = "0.5.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905" +dependencies = [ + "bytes 1.1.0", + "hyper", + "native-tls", + "tokio", + "tokio-native-tls", +] + +[[package]] +name = "hyperloglog" +version = "1.0.0" +dependencies = [ + "bytecount", + "bytes 0.1.0", + "rand 0.8.4", + "siphasher", + "snafu", +] + +[[package]] +name = "idna" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "418a0a6fab821475f634efe3ccc45c013f742efe03d853e8d3355d5cb850ecf8" +dependencies = [ + "matches", + "unicode-bidi", + "unicode-normalization", +] + +[[package]] +name = "indexmap" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc633605454125dec4b66843673f01c7df2b89479b32e0ed634e43a91cff62a5" +dependencies = [ + "autocfg", + "hashbrown", +] + +[[package]] +name = "input_buffer" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f97967975f448f1a7ddb12b0bc41069d09ed6a1c161a92687e057325db35d413" +dependencies = [ + "bytes 1.1.0", +] + +[[package]] +name = "instant" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bee0328b1209d157ef001c94dd85b4f8f64139adb0eac2659f4b08382b2f474d" +dependencies = [ + "cfg-if 1.0.0", +] + +[[package]] +name = "integer-encoding" +version = "1.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48dc51180a9b377fd75814d0cc02199c20f8e99433d6762f650d39cdbbd3b56f" + +[[package]] +name = "integer-encoding" +version = "3.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90c11140ffea82edce8dcd74137ce9324ec24b3cf0175fc9d7e29164da9915b8" +dependencies = [ + "async-trait", + "futures-util", +] + +[[package]] +name = "interpreters" +version = "0.1.0" +dependencies = [ + "analytic_engine", + "arrow_deps", + "async-trait", + "catalog", + 
"catalog_impls", + "common_types", + "common_util", + "log", + "query_engine", + "snafu", + "sql", + "table_engine", + "tokio", + "udf", +] + +[[package]] +name = "ipnet" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68f2d64f2edebec4ce84ad108148e67e1064789bee435edc5b60ad398714a3a9" + +[[package]] +name = "itertools" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69ddb889f9d0d08a67338271fa9b62996bc788c7796a5c18cf057420aaed5eaf" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "0.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4" + +[[package]] +name = "jemalloc-ctl" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c502a5ff9dd2924f1ed32ba96e3b65735d837b4bfd978d3161b1702e66aca4b7" +dependencies = [ + "jemalloc-sys", + "libc", + "paste 0.1.18", +] + +[[package]] +name = "jemalloc-sys" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d3b9f3f5c9b31aa0f5ed3260385ac205db665baa41d49bb8338008ae94ede45" +dependencies = [ + "cc", + "fs_extra", + "libc", +] + +[[package]] +name = "jemallocator" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43ae63fcfc45e99ab3d1b29a46782ad679e98436c3169d15a167a1108a724b69" +dependencies = [ + "jemalloc-sys", + "libc", +] + +[[package]] +name = "jobserver" +version = "0.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af25a77299a7f711a01975c35a6a424eb6862092cc2d6c72c4ed6cbc56dfc1fa" +dependencies = [ + "libc", +] + +[[package]] +name = "js-sys" +version = "0.3.53" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4bf49d50e2961077d9c99f4b7997d770a1114f087c3c2e0069b36c13fc2979d" +dependencies = [ + 
"wasm-bindgen", +] + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "lazycell" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" + +[[package]] +name = "lexical-core" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a3926d8f156019890be4abe5fd3785e0cff1001e06f59c597641fd513a5a284" +dependencies = [ + "lexical-parse-float", + "lexical-parse-integer", + "lexical-util", + "lexical-write-float", + "lexical-write-integer", +] + +[[package]] +name = "lexical-parse-float" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4d066d004fa762d9da995ed21aa8845bb9f6e4265f540d716fb4b315197bf0e" +dependencies = [ + "lexical-parse-integer", + "lexical-util", + "static_assertions", +] + +[[package]] +name = "lexical-parse-integer" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2c92badda8cc0fc4f3d3cc1c30aaefafb830510c8781ce4e8669881f3ed53ac" +dependencies = [ + "lexical-util", + "static_assertions", +] + +[[package]] +name = "lexical-util" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ff669ccaae16ee33af90dc51125755efed17f1309626ba5c12052512b11e291" +dependencies = [ + "static_assertions", +] + +[[package]] +name = "lexical-write-float" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b5186948c7b297abaaa51560f2581dae625e5ce7dfc2d8fdc56345adb6dc576" +dependencies = [ + "lexical-util", + "lexical-write-integer", + "static_assertions", +] + +[[package]] +name = "lexical-write-integer" +version = "0.8.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "ece956492e0e40fd95ef8658a34d53a3b8c2015762fdcaaff2167b28de1f56ef" +dependencies = [ + "lexical-util", + "static_assertions", +] + +[[package]] +name = "libc" +version = "0.2.101" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3cb00336871be5ed2c8ed44b60ae9959dc5b9f08539422ed43f09e34ecaeba21" + +[[package]] +name = "libflate" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16364af76ebb39b5869bb32c81fa93573267cd8c62bb3474e28d78fac3fb141e" +dependencies = [ + "adler32", + "crc32fast", + "libflate_lz77", +] + +[[package]] +name = "libflate_lz77" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39a734c0493409afcd49deee13c006a04e3586b9761a03543c6272c9c51f2f5a" +dependencies = [ + "rle-decode-fast", +] + +[[package]] +name = "libgit2-sys" +version = "0.12.22+1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89c53ac117c44f7042ad8d8f5681378dfbc6010e49ec2c0d1f11dfedc7a4a1c3" +dependencies = [ + "cc", + "libc", + "libz-sys", + "pkg-config", +] + +[[package]] +name = "libloading" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "afe203d669ec979b7128619bae5a63b7b42e9203c1b29146079ee05e2f604b52" +dependencies = [ + "cfg-if 1.0.0", + "winapi", +] + +[[package]] +name = "librocksdb_sys" +version = "0.1.0" +source = "git+https://github.com/tikv/rust-rocksdb.git?branch=tikv-5.2#23bd00d50c79b40b6a32c11446c86f0714fa7844" +dependencies = [ + "bindgen", + "bzip2-sys", + "cc", + "cmake", + "libc", + "libtitan_sys", + "libz-sys", + "lz4-sys", + "snappy-sys", + "zstd-sys", +] + +[[package]] +name = "libtitan_sys" +version = "0.0.1" +source = "git+https://github.com/tikv/rust-rocksdb.git?branch=tikv-5.2#23bd00d50c79b40b6a32c11446c86f0714fa7844" +dependencies = [ + "bzip2-sys", + "cc", + "cmake", + "libc", + 
"libz-sys", + "lz4-sys", + "snappy-sys", + "zstd-sys", +] + +[[package]] +name = "libz-sys" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de5435b8549c16d423ed0c03dbaafe57cf6c3344744f1242520d59c9d8ecec66" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] + +[[package]] +name = "lock_api" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712a4d093c9976e24e7dbca41db895dabcbac38eb5f4045393d17a95bdfb1109" +dependencies = [ + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710" +dependencies = [ + "cfg-if 1.0.0", +] + +[[package]] +name = "logger" +version = "0.1.0" +dependencies = [ + "chrono", + "grpcio 0.1.0", + "log", + "slog", + "slog-async", + "slog-global 0.1.0 (git+https://github.com/breeswish/slog-global.git?rev=0e23a5baff302a9d7bccd85f8f31e43339c2f2c1)", + "slog-term", + "slog_derive", +] + +[[package]] +name = "lru" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c748cfe47cb8da225c37595b3108bea1c198c84aaae8ea0ba76d01dda9fc803" +dependencies = [ + "hashbrown", +] + +[[package]] +name = "lz4" +version = "1.23.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aac20ed6991e01bf6a2e68cc73df2b389707403662a8ba89f68511fb340f724c" +dependencies = [ + "libc", + "lz4-sys", +] + +[[package]] +name = "lz4-sys" +version = "1.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dca79aa95d8b3226213ad454d328369853be3a1382d89532a854f4d69640acae" +dependencies = [ + "cc", + "libc", +] + +[[package]] +name = "matchers" +version = "0.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f099785f7595cc4b4553a174ce30dd7589ef93391ff414dbb67f62392b9e0ce1" +dependencies = [ + 
"regex-automata", +] + +[[package]] +name = "matches" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a3e378b66a060d48947b590737b30a1be76706c8dd7b8ba0f2fe3989c68a853f" + +[[package]] +name = "maybe-uninit" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60302e4db3a61da70c0cb7991976248362f30319e88850c487b9b95bbf059e00" + +[[package]] +name = "md-5" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b5a279bb9607f9f53c22d496eade00d138d1bdcccd07d74650387cf94942a15" +dependencies = [ + "block-buffer 0.9.0", + "digest 0.9.0", + "opaque-debug", +] + +[[package]] +name = "memchr" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "308cc39be01b73d0d18f82a0e7b2a3df85245f84af96fdddc5d202d27e47b86a" + +[[package]] +name = "memoffset" +version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "043175f069eda7b85febe4a74abbaeff828d9f8b448515d3151a14a3542811aa" +dependencies = [ + "autocfg", +] + +[[package]] +name = "memoffset" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59accc507f1338036a0477ef61afdae33cde60840f4dfe481319ce3ad116ddf9" +dependencies = [ + "autocfg", +] + +[[package]] +name = "meta_client" +version = "0.1.0" +dependencies = [ + "async-trait", + "catalog", + "ceresdbproto", + "common_types", + "common_util", + "futures", + "grpcio 0.1.0", + "log", + "rand 0.7.3", + "reqwest", + "serde", + "serde_derive", + "serde_json", + "snafu", + "table_engine", + "tokio", + "url", +] + +[[package]] +name = "mime" +version = "0.3.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d" + +[[package]] +name = "mime_guess" +version = "2.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "2684d4c2e97d99848d30b324b00c8fcc7e5c897b7cbb5819b09e7c90e8baf212" +dependencies = [ + "mime", + "unicase", +] + +[[package]] +name = "miniz_oxide" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a92518e98c078586bc6c934028adcca4c92a53d6a958196de835170a01d84e4b" +dependencies = [ + "adler", + "autocfg", +] + +[[package]] +name = "mio" +version = "0.7.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c2bdb6314ec10835cd3293dd268473a835c02b7b352e788be788b3c6ca6bb16" +dependencies = [ + "libc", + "log", + "miow", + "ntapi", + "winapi", +] + +[[package]] +name = "miow" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9f1c5b025cda876f66ef43a113f91ebc9f4ccef34843000e0adf6ebbab84e21" +dependencies = [ + "winapi", +] + +[[package]] +name = "multipart" +version = "0.17.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d050aeedc89243f5347c3e237e3e13dc76fbe4ae3742a57b94dc14f69acf76d4" +dependencies = [ + "buf_redux", + "httparse", + "log", + "mime", + "mime_guess", + "quick-error", + "rand 0.7.3", + "safemem", + "tempfile", + "twoway", +] + +[[package]] +name = "multiversion" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "025c962a3dd3cc5e0e520aa9c612201d127dcdf28616974961a649dca64f5373" +dependencies = [ + "multiversion-macros", +] + +[[package]] +name = "multiversion-macros" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8a3e2bde382ebf960c1f3e79689fa5941625fe9bf694a1cb64af3e85faff3af" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "murmur3" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a198f9589efc03f544388dfc4a19fe8af4323662b62f598b8dcfdac62c14771c" +dependencies = [ + "byteorder", +] + +[[package]] +name = 
"native-tls" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48ba9f7719b5a0f42f338907614285fb5fd70e53858141f69898a1fb7203b24d" +dependencies = [ + "lazy_static", + "libc", + "log", + "openssl", + "openssl-probe", + "openssl-sys", + "schannel", + "security-framework", + "security-framework-sys", + "tempfile", +] + +[[package]] +name = "nix" +version = "0.19.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2ccba0cfe4fdf15982d1674c69b1fd80bad427d293849982668dfe454bd61f2" +dependencies = [ + "bitflags", + "cc", + "cfg-if 1.0.0", + "libc", +] + +[[package]] +name = "nom" +version = "5.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffb4262d26ed83a1c0a33a38fe2bb15797329c85770da05e6b828ddb782627af" +dependencies = [ + "memchr", + "version_check", +] + +[[package]] +name = "ntapi" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f6bb902e437b6d86e03cce10a7e2af662292c5dfef23b65899ea3ac9354ad44" +dependencies = [ + "winapi", +] + +[[package]] +name = "num" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43db66d1170d347f9a065114077f7dccb00c1b9478c89384490a3425279a4606" +dependencies = [ + "num-bigint 0.4.1", + "num-complex", + "num-integer", + "num-iter", + "num-rational", + "num-traits", +] + +[[package]] +name = "num-bigint" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "090c7f9998ee0ff65aa5b723e4009f7b217707f1fb5ea551329cc4d6231fb304" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-bigint" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76e97c412795abf6c24ba30055a8f20642ea57ca12875220b854cfa501bf1e48" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-complex" +version 
= "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26873667bbbb7c5182d4a37c1add32cdf09f841af72da53318fdb81543c15085" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-integer" +version = "0.1.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2cc698a63b549a70bc047073d2949cce27cd1c7b0a4a862d08a8031bc2801db" +dependencies = [ + "autocfg", + "num-traits", +] + +[[package]] +name = "num-iter" +version = "0.1.42" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2021c8337a54d21aca0d59a92577a029af9431cb59b909b03252b9c164fad59" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-rational" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d41702bd167c2df5520b384281bc111a4b5efcf7fbc4c9c222c815b07e0a6a6a" +dependencies = [ + "autocfg", + "num-bigint 0.4.1", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-traits" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290" +dependencies = [ + "autocfg", +] + +[[package]] +name = "num_cpus" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05499f3756671c15885fee9034446956fff3f243d6077b91e5767df161f766b3" +dependencies = [ + "hermit-abi", + "libc", +] + +[[package]] +name = "object" +version = "0.26.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39f37e50073ccad23b6d09bcb5b263f4e76d3bb6038e4a3c08e52162ffa8abc2" +dependencies = [ + "memchr", +] + +[[package]] +name = "object_store" +version = "0.1.0" +dependencies = [ + "async-trait", + "bytes 1.1.0", + "common_util", + "futures", + "itertools", + "percent-encoding", + "snafu", + "tempfile", + "tokio", + "tokio-util", + "walkdir", +] + +[[package]] +name = "once_cell" +version 
= "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "692fcb63b64b1758029e0a96ee63e049ce8c5948587f2f7208df04625e5f6b56" + +[[package]] +name = "oorandom" +version = "11.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" + +[[package]] +name = "opaque-debug" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "624a8340c38c1b80fd549087862da4ba43e08858af025b236e509b6649fc13d5" + +[[package]] +name = "openssl" +version = "0.10.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d9facdb76fec0b73c406f125d44d86fdad818d66fef0531eec9233ca425ff4a" +dependencies = [ + "bitflags", + "cfg-if 1.0.0", + "foreign-types", + "libc", + "once_cell", + "openssl-sys", +] + +[[package]] +name = "openssl-probe" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28988d872ab76095a6e6ac88d99b54fd267702734fd7ffe610ca27f533ddb95a" + +[[package]] +name = "openssl-sys" +version = "0.9.66" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1996d2d305e561b70d1ee0c53f1542833f4e1ac6ce9a6708b6ff2738ca67dc82" +dependencies = [ + "autocfg", + "cc", + "libc", + "pkg-config", + "vcpkg", +] + +[[package]] +name = "ordered-float" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3305af35278dd29f46fcdd139e0b1fbfae2153f0e5928b39b035542dd31e37b7" +dependencies = [ + "num-traits", +] + +[[package]] +name = "ordered-float" +version = "2.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7940cf2ca942593318d07fcf2596cdca60a85c9e7fab408a5e21a4f9dcd40d87" +dependencies = [ + "num-traits", +] + +[[package]] +name = "parking_lot" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"7d17b78036a60663b797adeaee46f5c9dfebb86948d1255007a1d6be0271ff99" +dependencies = [ + "instant", + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d76e8e1493bcac0d2766c42737f34458f1c8c50c0d23bcb24ea953affb273216" +dependencies = [ + "cfg-if 1.0.0", + "instant", + "libc", + "redox_syscall", + "smallvec", + "winapi", +] + +[[package]] +name = "parquet" +version = "0.1.0" +dependencies = [ + "arrow_deps", + "lru", + "parquet-format", + "thrift", +] + +[[package]] +name = "parquet" +version = "7.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c718575b34e488fa78d4f0286356abb8466573cb17ae8faa96ffd871ca6e8c6" +dependencies = [ + "arrow", + "base64", + "brotli", + "byteorder", + "chrono", + "flate2", + "lz4", + "num-bigint 0.4.1", + "parquet-format", + "rand 0.8.4", + "snap", + "thrift", + "zstd", +] + +[[package]] +name = "parquet-format" +version = "4.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f0c06cdcd5460967c485f9c40a821746f5955ad81990533c7fae95dbd9bc0b5" +dependencies = [ + "thrift", +] + +[[package]] +name = "parquet-format-async-temp" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03abc2f9c83fe9ceec83f47c76cc071bfd56caba33794340330f35623ab1f544" +dependencies = [ + "async-trait", + "byteorder", + "futures", + "integer-encoding 3.0.2", + "ordered-float 1.1.1", +] + +[[package]] +name = "parquet2" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db82df54cdd88931d29b850190915b9069bb93fba8e1aefc0d59d8ca81603d6d" +dependencies = [ + "async-stream", + "bitpacking", + "futures", + "parquet-format-async-temp", + "streaming-decompression", +] + +[[package]] +name = "paste" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"45ca20c77d80be666aef2b45486da86238fabe33e38306bd3118fe4af33fa880" +dependencies = [ + "paste-impl", + "proc-macro-hack", +] + +[[package]] +name = "paste" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acbf547ad0c65e31259204bd90935776d1c693cec2f4ff7abb7a1bbbd40dfe58" + +[[package]] +name = "paste-impl" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d95a7db200b97ef370c8e6de0088252f7e0dfff7d047a28528e47456c0fc98b6" +dependencies = [ + "proc-macro-hack", +] + +[[package]] +name = "peeking_take_while" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" + +[[package]] +name = "percent-encoding" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e" + +[[package]] +name = "pin-project" +version = "0.4.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "918192b5c59119d51e0cd221f4d49dde9112824ba717369e903c97d076083d0f" +dependencies = [ + "pin-project-internal 0.4.28", +] + +[[package]] +name = "pin-project" +version = "1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "576bc800220cc65dac09e99e97b08b358cfab6e17078de8dc5fee223bd2d0c08" +dependencies = [ + "pin-project-internal 1.0.8", +] + +[[package]] +name = "pin-project-internal" +version = "0.4.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3be26700300be6d9d23264c73211d8190e755b6b5ca7a1b28230025511b52a5e" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "pin-project-internal" +version = "1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e8fe8163d14ce7f0cdac2e040116f22eac817edabff0be91e8aff7e9accf389" +dependencies = [ + 
"proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "pin-project-lite" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d31d11c69a6b52a174b42bdc0c30e5e11670f90788b2c471c31c1d17d449443" + +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + +[[package]] +name = "pkg-config" +version = "0.3.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3831453b3449ceb48b6d9c7ad7c96d5ea673e9b470a1dc578c2ce6521230884c" + +[[package]] +name = "plotters" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a3fd9ec30b9749ce28cd91f255d569591cdf937fe280c312143e3c4bad6f2a" +dependencies = [ + "num-traits", + "plotters-backend", + "plotters-svg", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "plotters-backend" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d88417318da0eaf0fdcdb51a0ee6c3bed624333bff8f946733049380be67ac1c" + +[[package]] +name = "plotters-svg" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "521fa9638fa597e1dc53e9412a4f9cefb01187ee1f7413076f9e6749e2885ba9" +dependencies = [ + "plotters-backend", +] + +[[package]] +name = "ppv-lite86" +version = "0.2.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac74c624d6b2d21f425f752262f42188365d7b8ff1aff74c82e45136510a4857" + +[[package]] +name = "proc-macro-error" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" +dependencies = [ + "proc-macro-error-attr", + "proc-macro2", + "quote", + "syn", + "version_check", +] + +[[package]] +name = "proc-macro-error-attr" +version = "1.0.4" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" +dependencies = [ + "proc-macro2", + "quote", + "version_check", +] + +[[package]] +name = "proc-macro-hack" +version = "0.5.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5" + +[[package]] +name = "proc-macro-nested" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc881b2c22681370c6a780e47af9840ef841837bc98118431d4e1868bd0c1086" + +[[package]] +name = "proc-macro2" +version = "1.0.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c7ed8b8c7b886ea3ed7dde405212185f423ab44682667c8c6dd14aa1d9f6612" +dependencies = [ + "unicode-xid", +] + +[[package]] +name = "profile" +version = "0.1.0" +dependencies = [ + "jemalloc-ctl", + "jemalloc-sys", + "jemallocator", + "log", + "tempfile", +] + +[[package]] +name = "prometheus" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30d70cf4412832bcac9cffe27906f4a66e450d323525e977168c70d1b36120ae" +dependencies = [ + "cfg-if 0.1.10", + "fnv", + "lazy_static", + "parking_lot", + "regex", + "thiserror", +] + +[[package]] +name = "prometheus" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5986aa8d62380092d2f50f8b1cdba9cb9b6731ffd4b25b51fd126b6c3e05b99c" +dependencies = [ + "cfg-if 1.0.0", + "fnv", + "lazy_static", + "memchr", + "parking_lot", + "protobuf", + "thiserror", +] + +[[package]] +name = "prometheus-static-metric" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8f30cdb09c39930b8fa5e0f23cbb895ab3f766b187403a0ba0956fc1ef4f0e5" +dependencies = [ + "lazy_static", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "proto" +version = "0.1.0" +dependencies = [ + 
"protobuf", + "protobuf-builder", +] + +[[package]] +name = "protobuf" +version = "2.25.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23129d50f2c9355ced935fce8a08bd706ee2e7ce2b3b33bf61dace0e379ac63a" + +[[package]] +name = "protobuf-builder" +version = "0.1.0" +source = "git+https://github.com/CeresDB/protobuf-builder.git?rev=745cc8527d1c5eb48745f5ce74b2b5bdb75c3bf2#745cc8527d1c5eb48745f5ce74b2b5bdb75c3bf2" +dependencies = [ + "protobuf", + "protoc", + "protoc-bin-vendored", + "protoc-grpcio", +] + +[[package]] +name = "protobuf-codegen" +version = "2.25.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ba98ce0dadaa6de1e7f1b6d82a0a73b03e0c049169a167c919d906b0875026c" +dependencies = [ + "protobuf", +] + +[[package]] +name = "protoc" +version = "2.25.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ace5c4ea0e4b0381eb37837e070182b7ab491445e2d5ea2201d861f2b2f94f82" +dependencies = [ + "log", + "which", +] + +[[package]] +name = "protoc-bin-vendored" +version = "2.25.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2a56d817108caebed2cfb20931270a6d95dc6e36a0801999eacfbf35c21a5dd" + +[[package]] +name = "protoc-grpcio" +version = "3.0.0" +source = "git+https://github.com/CeresDB/protoc-grpcio.git?rev=fe9664cf003c908528f940d003a9c3e90e522658#fe9664cf003c908528f940d003a9c3e90e522658" +dependencies = [ + "failure", + "grpcio-compiler", + "protobuf", + "protobuf-codegen", + "protoc", + "tempfile", +] + +[[package]] +name = "query_engine" +version = "0.1.0" +dependencies = [ + "arrow_deps", + "async-trait", + "common_types", + "common_util", + "futures", + "log", + "snafu", + "sql", + "table_engine", + "udf", +] + +[[package]] +name = "quick-error" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" + +[[package]] +name = "quote" 
+version = "1.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3d0b9745dc2debf507c8422de05d7226cc1f0644216dfdfead988f9b1ab32a7" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rand" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" +dependencies = [ + "getrandom 0.1.16", + "libc", + "rand_chacha 0.2.2", + "rand_core 0.5.1", + "rand_hc 0.2.0", +] + +[[package]] +name = "rand" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e7573632e6454cf6b99d7aac4ccca54be06da05aca2ef7423d22d27d4d4bcd8" +dependencies = [ + "libc", + "rand_chacha 0.3.1", + "rand_core 0.6.3", + "rand_hc 0.3.1", +] + +[[package]] +name = "rand_chacha" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402" +dependencies = [ + "ppv-lite86", + "rand_core 0.5.1", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core 0.6.3", +] + +[[package]] +name = "rand_core" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" +dependencies = [ + "getrandom 0.1.16", +] + +[[package]] +name = "rand_core" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d34f1408f55294453790c48b2f1ebbb1c5b4b7563eb1f418bcfcfdbb06ebb4e7" +dependencies = [ + "getrandom 0.2.3", +] + +[[package]] +name = "rand_hc" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c" 
+dependencies = [ + "rand_core 0.5.1", +] + +[[package]] +name = "rand_hc" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d51e9f596de227fda2ea6c84607f5558e196eeaf43c986b724ba4fb8fdf497e7" +dependencies = [ + "rand_core 0.6.3", +] + +[[package]] +name = "rayon" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c06aca804d41dbc8ba42dfd964f0d01334eceb64314b9ecf7c5fad5188a06d90" +dependencies = [ + "autocfg", + "crossbeam-deque 0.8.1", + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d78120e2c850279833f1dd3582f730c4ab53ed95aeaaaa862a2a5c71b1656d8e" +dependencies = [ + "crossbeam-channel", + "crossbeam-deque 0.8.1", + "crossbeam-utils 0.8.5", + "lazy_static", + "num_cpus", +] + +[[package]] +name = "redox_syscall" +version = "0.2.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8383f39639269cde97d255a32bdb68c047337295414940c68bdd30c2e13203ff" +dependencies = [ + "bitflags", +] + +[[package]] +name = "redox_users" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "528532f3d801c87aec9def2add9ca802fe569e44a544afe633765267840abe64" +dependencies = [ + "getrandom 0.2.3", + "redox_syscall", +] + +[[package]] +name = "regex" +version = "1.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" +dependencies = [ + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.6.25" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b" + +[[package]] +name = "remove_dir_all" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7" +dependencies = [ + "winapi", +] + +[[package]] +name = "reqwest" +version = "0.11.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "246e9f61b9bb77df069a947682be06e31ac43ea37862e244a69f177694ea6d22" +dependencies = [ + "base64", + "bytes 1.1.0", + "encoding_rs", + "futures-core", + "futures-util", + "http", + "http-body", + "hyper", + "hyper-tls", + "ipnet", + "js-sys", + "lazy_static", + "log", + "mime", + "native-tls", + "percent-encoding", + "pin-project-lite", + "serde", + "serde_urlencoded", + "tokio", + "tokio-native-tls", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", + "winreg", +] + +[[package]] +name = "rle-decode-fast" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cabe4fa914dec5870285fa7f71f602645da47c486e68486d2b4ceb4a343e90ac" + +[[package]] +name = "rocksdb" +version = "0.3.0" +source = "git+https://github.com/tikv/rust-rocksdb.git?branch=tikv-5.2#23bd00d50c79b40b6a32c11446c86f0714fa7844" +dependencies = [ + "libc", + "librocksdb_sys", +] + +[[package]] +name = "rustc-demangle" +version = "0.1.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ef03e0a2b150c7a90d01faf6254c9c48a41e95fb2a8c2ac1c6f0d2b9aefc342" + +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + +[[package]] +name = "rustc_version" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" +dependencies = [ + "semver", +] + +[[package]] +name = "rustversion" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61b3909d758bb75c79f23d4736fac9433868679d3ad2ea7a61e3c25cfda9a088" + +[[package]] +name = "ryu" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "71d301d4193d031abdd79ff7e3dd721168a9572ef3fe51a1517aba235bd8f86e" + +[[package]] +name = "safemem" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef703b7cb59335eae2eb93ceb664c0eb7ea6bf567079d843e09420219668e072" + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "schannel" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f05ba609c234e60bee0d547fe94a4c7e9da733d1c962cf6e59efa4cd9c8bc75" +dependencies = [ + "lazy_static", + "winapi", +] + +[[package]] +name = "scoped-tls" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea6a9290e3c9cf0f18145ef7ffa62d68ee0bf5fcd651017e586dc7fd5da448c2" + +[[package]] +name = "scopeguard" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" + +[[package]] +name = "security-framework" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c1016a0b396a0e68d6c541a54370e0db49524aead4c9e6aa263d6855d978d78" +dependencies = [ + "bitflags", + "core-foundation", + "core-foundation-sys", + "libc", + "num", + "security-framework-sys", +] + +[[package]] +name = "security-framework-sys" +version = "2.4.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f6f179cd85a30f8652b3f8830f73861c76e87e70b939773e72daf38be3afc02" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "semver" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "568a8e6258aa33c13358f81fd834adb854c6f7c9468520910a9b1e8fac068012" + +[[package]] +name = "serde" +version = "1.0.130" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f12d06de37cf59146fbdecab66aa99f9fe4f78722e3607577a5375d66bd0c913" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_cbor" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2bef2ebfde456fb76bbcf9f59315333decc4fda0b2b44b420243c11e0f5ec1f5" +dependencies = [ + "half", + "serde", +] + +[[package]] +name = "serde_derive" +version = "1.0.130" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7bc1a1ab1961464eae040d96713baa5a724a8152c1222492465b54322ec508b" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.67" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7f9e390c27c3c0ce8bc5d725f6e4d30a29d26659494aa4b17535f7522c5c950" +dependencies = [ + "indexmap", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "serde_urlencoded" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edfa57a7f8d9c1d260a549e7224100f6c43d43f9103e06dd8b4095a9b2b43ce9" +dependencies = [ + "form_urlencoded", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "server" +version = "0.1.0" +dependencies = [ + "analytic_engine", + "arrow_deps", + "async-trait", + "avro-rs", + "catalog", + "ceresdbproto", + "common_types", + "common_util", + "futures", + "grpcio 0.1.0", + "http", + "interpreters", + "lazy_static", + "log", + "logger", + "meta_client", + 
"profile", + "prometheus 0.12.0", + "prometheus-static-metric", + "protobuf", + "query_engine", + "serde", + "serde_derive", + "serde_json", + "snafu", + "sql", + "system_catalog", + "table_engine", + "tokio", + "twox-hash", + "udf", + "warp", +] + +[[package]] +name = "sha-1" +version = "0.9.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "99cd6713db3cf16b6c84e06321e049a9b9f699826e16096d23bbcc44d15d51a6" +dependencies = [ + "block-buffer 0.9.0", + "cfg-if 1.0.0", + "cpufeatures", + "digest 0.9.0", + "opaque-debug", +] + +[[package]] +name = "sha2" +version = "0.9.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4d58a1e1bf39749807d89cf2d98ac2dfa0ff1cb3faa38fbb64dd88ac8013d800" +dependencies = [ + "block-buffer 0.9.0", + "cfg-if 1.0.0", + "cpufeatures", + "digest 0.9.0", + "opaque-debug", +] + +[[package]] +name = "sharded-slab" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "740223c51853f3145fe7c90360d2d4232f2b62e3449489c207eccde818979982" +dependencies = [ + "lazy_static", +] + +[[package]] +name = "shlex" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fdf1b9db47230893d76faad238fd6097fd6d6a9245cd7a4d90dbd639536bbd2" + +[[package]] +name = "signal-hook" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "470c5a6397076fae0094aaf06a08e6ba6f37acb77d3b1b91ea92b4d6c8650c39" +dependencies = [ + "libc", + "signal-hook-registry", +] + +[[package]] +name = "signal-hook-registry" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e51e73328dc4ac0c7ccbda3a494dfa03df1de2f46018127f60c693f2648455b0" +dependencies = [ + "libc", +] + +[[package]] +name = "simdutf8" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c970da16e7c682fa90a261cf0724dee241c9f7831635ecc4e988ae8f3b505559" 
+ +[[package]] +name = "siphasher" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "729a25c17d72b06c68cb47955d44fda88ad2d3e7d77e025663fdd69b93dd71a1" + +[[package]] +name = "skiplist" +version = "0.1.0" +dependencies = [ + "arena", + "bytes 1.1.0", + "criterion", + "rand 0.7.3", + "yatp", +] + +[[package]] +name = "slab" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c307a32c1c5c437f38c7fd45d753050587732ba8628319fbdf12a7e289ccc590" + +[[package]] +name = "slog" +version = "2.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8347046d4ebd943127157b94d63abb990fcf729dc4e9978927fdf4ac3c998d06" + +[[package]] +name = "slog-async" +version = "2.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "766c59b252e62a34651412870ff55d8c4e6d04df19b43eecb2703e417b097ffe" +dependencies = [ + "crossbeam-channel", + "slog", + "take_mut", + "thread_local", +] + +[[package]] +name = "slog-global" +version = "0.1.0" +source = "git+https://github.com/breeswish/slog-global.git?rev=0e23a5baff302a9d7bccd85f8f31e43339c2f2c1#0e23a5baff302a9d7bccd85f8f31e43339c2f2c1" +dependencies = [ + "arc-swap 0.3.11", + "lazy_static", + "log", + "slog", +] + +[[package]] +name = "slog-global" +version = "0.1.0" +source = "git+https://github.com/breezewish/slog-global.git?rev=0e23a5baff302a9d7bccd85f8f31e43339c2f2c1#0e23a5baff302a9d7bccd85f8f31e43339c2f2c1" +dependencies = [ + "arc-swap 0.3.11", + "lazy_static", + "log", + "slog", +] + +[[package]] +name = "slog-term" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95c1e7e5aab61ced6006149ea772770b84a0d16ce0f7885def313e4829946d76" +dependencies = [ + "atty", + "chrono", + "slog", + "term", + "thread_local", +] + +[[package]] +name = "slog_derive" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"a945ec7f7ce853e89ffa36be1e27dce9a43e82ff9093bf3461c30d5da74ed11b" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "smallvec" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe0f37c9e8f3c5a4a66ad655a93c74daac4ad00c441533bf5c6e7990bb42604e" + +[[package]] +name = "snafu" +version = "0.6.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eab12d3c261b2308b0d80c26fffb58d17eba81a4be97890101f416b478c79ca7" +dependencies = [ + "backtrace", + "doc-comment", + "futures-core", + "pin-project 0.4.28", + "snafu-derive", +] + +[[package]] +name = "snafu-derive" +version = "0.6.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1508efa03c362e23817f96cde18abed596a25219a8b2c66e8db33c03543d315b" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "snap" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45456094d1983e2ee2a18fdfebce3189fa451699d0502cb8e3b49dba5ba41451" + +[[package]] +name = "snappy-sys" +version = "0.1.0" +source = "git+https://github.com/busyjay/rust-snappy.git?branch=static-link#8c12738bad811397600455d6982aff754ea2ac44" +dependencies = [ + "cmake", + "libc", + "pkg-config", +] + +[[package]] +name = "socket2" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "765f090f0e423d2b55843402a07915add955e7d60657db13707a159727326cad" +dependencies = [ + "libc", + "winapi", +] + +[[package]] +name = "sql" +version = "0.1.0" +dependencies = [ + "arrow_deps", + "catalog", + "ceresdbproto", + "common_types", + "common_util", + "log", + "paste 1.0.5", + "regex", + "snafu", + "sqlparser", + "table_engine", + "tokio", + "udf", +] + +[[package]] +name = "sqlparser" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"b9907f54bd0f7b6ce72c2be1e570a614819ee08e3deb66d90480df341d8a12a8" +dependencies = [ + "log", +] + +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + +[[package]] +name = "streaming-decompression" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9bc687acd5dc742c4a7094f2927a8614a68e4743ef682e7a2f9f0f711656cc92" +dependencies = [ + "fallible-streaming-iterator", +] + +[[package]] +name = "strsim" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" + +[[package]] +name = "strum" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57bd81eb48f4c437cadc685403cad539345bf703d78e63707418431cecd4522b" + +[[package]] +name = "strum" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7ac893c7d471c8a21f31cfe213ec4f6d9afeed25537c772e08ef3f005f8729e" + +[[package]] +name = "strum_macros" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87c85aa3f8ea653bfd3ddf25f7ee357ee4d204731f6aa9ad04002306f6e2774c" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "strum_macros" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "339f799d8b549e3744c7ac7feb216383e4005d94bdb22561b3ab8f3b808ae9fb" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "subtle" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6bdef32e8150c2a081110b42772ffe7d7c9032b606bc226c8260fd97e0976601" + +[[package]] +name = "syn" +version = "1.0.75" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7f58f7e8eaa0009c5fec437aabf511bd9933e4b2d7407bd05273c01a8906ea7" +dependencies = [ + "proc-macro2", + "quote", + "unicode-xid", +] + +[[package]] +name = "synstructure" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "474aaa926faa1603c40b7885a9eaea29b444d1cb2850cb7c0e37bb1a4182f4fa" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "unicode-xid", +] + +[[package]] +name = "system_catalog" +version = "0.1.0" +dependencies = [ + "arrow_deps", + "async-trait", + "catalog", + "common_types", + "common_util", + "futures", + "log", + "proto", + "protobuf", + "snafu", + "table_engine", + "tokio", +] + +[[package]] +name = "table_engine" +version = "0.1.0" +dependencies = [ + "arrow_deps", + "async-trait", + "common_types", + "common_util", + "futures", + "log", + "proto", + "protobuf", + "serde", + "serde_derive", + "smallvec", + "snafu", + "tokio", +] + +[[package]] +name = "take_mut" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f764005d11ee5f36500a149ace24e00e3da98b0158b3e2d53a7495660d3f4d60" + +[[package]] +name = "tempfile" +version = "3.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dac1c663cfc93810f88aed9b8941d48cabf856a1b111c29a40439018d870eb22" +dependencies = [ + "cfg-if 1.0.0", + "libc", + "rand 0.8.4", + "redox_syscall", + "remove_dir_all", + "winapi", +] + +[[package]] +name = "term" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c59df8ac95d96ff9bede18eb7300b0fda5e5d8d90960e76f8e14ae765eedbf1f" +dependencies = [ + "dirs-next", + "rustversion", + "winapi", +] + +[[package]] +name = "termcolor" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dfed899f0eb03f32ee8c6a0aabdb8a7949659e3466561fc0adf54e26d88c5f4" +dependencies = [ + "winapi-util", +] + 
+[[package]] +name = "textwrap" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" +dependencies = [ + "unicode-width", +] + +[[package]] +name = "thiserror" +version = "1.0.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "283d5230e63df9608ac7d9691adc1dfb6e701225436eb64d0b9a7f0a5a04f6ec" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa3884228611f5cd3608e2d409bf7dce832e4eb3135e3f11addbd7e41bd68e71" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "thread_local" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8018d24e04c95ac8790716a5987d0fec4f8b27249ffa0f7d33f1369bdfb88cbd" +dependencies = [ + "once_cell", +] + +[[package]] +name = "threadpool" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d050e60b33d41c19108b32cea32164033a9013fe3b46cbd4457559bfbf77afaa" +dependencies = [ + "num_cpus", +] + +[[package]] +name = "thrift" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c6d965454947cc7266d22716ebfd07b18d84ebaf35eec558586bbb2a8cb6b5b" +dependencies = [ + "byteorder", + "integer-encoding 1.1.7", + "log", + "ordered-float 1.1.1", + "threadpool", +] + +[[package]] +name = "time" +version = "0.1.43" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca8a50ef2360fbd1eeb0ecd46795a87a19024eb4b53c5dc916ca1fd95fe62438" +dependencies = [ + "libc", + "winapi", +] + +[[package]] +name = "tiny-keccak" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237" +dependencies = [ + "crunchy", +] + 
+[[package]] +name = "tinytemplate" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +dependencies = [ + "serde", + "serde_json", +] + +[[package]] +name = "tinyvec" +version = "1.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "848a1e1181b9f6753b5e96a092749e29b11d19ede67dfbbd6c7dc7e0f49b5338" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" + +[[package]] +name = "tokio" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbbf1c778ec206785635ce8ad57fe52b3009ae9e0c9f574a728f3049d3e55838" +dependencies = [ + "bytes 1.1.0", + "libc", + "memchr", + "mio", + "num_cpus", + "once_cell", + "parking_lot", + "pin-project-lite", + "signal-hook-registry", + "tokio-macros", + "winapi", +] + +[[package]] +name = "tokio-macros" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b557f72f448c511a979e2564e55d74e6c4432fc96ff4f6241bc6bded342643b7" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tokio-native-tls" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7d995660bd2b7f8c1568414c1126076c13fbb725c40112dc0120b78eb9b717b" +dependencies = [ + "native-tls", + "tokio", +] + +[[package]] +name = "tokio-stream" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b2f3f698253f03119ac0102beaa64f67a67e08074d03a22d18784104543727f" +dependencies = [ + "futures-core", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tokio-test" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"53474327ae5e166530d17f2d956afcb4f8a004de581b3cae10f12006bc8163e3" +dependencies = [ + "async-stream", + "bytes 1.1.0", + "futures-core", + "tokio", + "tokio-stream", +] + +[[package]] +name = "tokio-tungstenite" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1a5f475f1b9d077ea1017ecbc60890fda8e54942d680ca0b1d2b47cfa2d861b" +dependencies = [ + "futures-util", + "log", + "pin-project 1.0.8", + "tokio", + "tungstenite", +] + +[[package]] +name = "tokio-util" +version = "0.6.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1caa0b0c8d94a049db56b5acf8cba99dc0623aab1b26d5b5f5e2d945846b3592" +dependencies = [ + "bytes 1.1.0", + "futures-core", + "futures-io", + "futures-sink", + "log", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "toml" +version = "0.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a31142970826733df8241ef35dc040ef98c679ab14d7c3e54d827099b3acecaa" +dependencies = [ + "serde", +] + +[[package]] +name = "tower-service" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "360dfd1d6d30e05fda32ace2c8c70e9c0a9da713275777f5a4dbb8a1893930c6" + +[[package]] +name = "trace_examples" +version = "0.1.0" +dependencies = [ + "tracing 0.1.0", + "tracing_util", +] + +[[package]] +name = "tracing" +version = "0.1.0" +dependencies = [ + "tracing 0.1.26", +] + +[[package]] +name = "tracing" +version = "0.1.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09adeb8c97449311ccd28a427f96fb563e7fd31aabf994189879d9da2394b89d" +dependencies = [ + "cfg-if 1.0.0", + "log", + "pin-project-lite", + "tracing-attributes", + "tracing-core", +] + +[[package]] +name = "tracing-appender" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9965507e507f12c8901432a33e31131222abac31edd90cabbcf85cf544b7127a" +dependencies = [ + "chrono", + 
"crossbeam-channel", + "tracing-subscriber", +] + +[[package]] +name = "tracing-attributes" +version = "0.1.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c42e6fa53307c8a17e4ccd4dc81cf5ec38db9209f59b222210375b54ee40d1e2" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tracing-core" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2ca517f43f0fb96e0c3072ed5c275fe5eece87e8cb52f4a77b69226d3b1c9df8" +dependencies = [ + "lazy_static", +] + +[[package]] +name = "tracing-log" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6923477a48e41c1951f1999ef8bb5a3023eb723ceadafe78ffb65dc366761e3" +dependencies = [ + "lazy_static", + "log", + "tracing-core", +] + +[[package]] +name = "tracing-serde" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb65ea441fbb84f9f6748fd496cf7f63ec9af5bca94dd86456978d055e8eb28b" +dependencies = [ + "serde", + "tracing-core", +] + +[[package]] +name = "tracing-subscriber" +version = "0.2.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9cbe87a2fa7e35900ce5de20220a582a9483a7063811defce79d7cbd59d4cfe" +dependencies = [ + "ansi_term 0.12.1", + "chrono", + "lazy_static", + "matchers", + "regex", + "serde", + "serde_json", + "sharded-slab", + "smallvec", + "thread_local", + "tracing 0.1.26", + "tracing-core", + "tracing-log", + "tracing-serde", +] + +[[package]] +name = "tracing_util" +version = "0.1.0" +dependencies = [ + "lazy_static", + "tracing 0.1.26", + "tracing-appender", + "tracing-subscriber", +] + +[[package]] +name = "try-lock" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59547bce71d9c38b83d9c0e92b6066c4253371f15005def0c30d9657f50c7642" + +[[package]] +name = "tungstenite" +version = "0.12.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ada8297e8d70872fa9a551d93250a9f407beb9f37ef86494eb20012a2ff7c24" +dependencies = [ + "base64", + "byteorder", + "bytes 1.1.0", + "http", + "httparse", + "input_buffer", + "log", + "rand 0.8.4", + "sha-1", + "url", + "utf-8", +] + +[[package]] +name = "twoway" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59b11b2b5241ba34be09c3cc85a36e56e48f9888862e19cedf23336d35316ed1" +dependencies = [ + "memchr", +] + +[[package]] +name = "twox-hash" +version = "1.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ee73e6e4924fe940354b8d4d98cad5231175d615cd855b758adc658c0aac6a0" +dependencies = [ + "cfg-if 1.0.0", + "rand 0.8.4", + "static_assertions", +] + +[[package]] +name = "typed-builder" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78cea224ddd4282dfc40d1edabbd0c020a12e946e3a48e2c2b8f6ff167ad29fe" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "typenum" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "879f6906492a7cd215bfa4cf595b600146ccfac0c79bcbd1f3000162af5e8b06" + +[[package]] +name = "udf" +version = "0.1.0" +dependencies = [ + "arrow_deps", + "base64", + "chrono", + "common_types", + "common_util", + "hyperloglog", + "smallvec", + "snafu", +] + +[[package]] +name = "uncover" +version = "0.1.1" +source = "git+https://github.com/matklad/uncover.git?rev=1d0770d997e29731b287e9e11e4ffbbea5f456da#1d0770d997e29731b287e9e11e4ffbbea5f456da" +dependencies = [ + "lazy_static", +] + +[[package]] +name = "unicase" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50f37be617794602aabbeee0be4f259dc1778fabe05e2d67ee8f79326d5cb4f6" +dependencies = [ + "version_check", +] + +[[package]] +name = "unicode-bidi" +version = "0.3.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "246f4c42e67e7a4e3c6106ff716a5d067d4132a642840b242e357e468a2a0085" + +[[package]] +name = "unicode-normalization" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d54590932941a9e9266f0832deed84ebe1bf2e4c9e4a3554d393d18f5e854bf9" +dependencies = [ + "tinyvec", +] + +[[package]] +name = "unicode-segmentation" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8895849a949e7845e06bd6dc1aa51731a103c42707010a5b591c0038fb73385b" + +[[package]] +name = "unicode-width" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9337591893a19b88d8d87f2cec1e73fad5cdfd10e5a6f349f498ad6ea2ffb1e3" + +[[package]] +name = "unicode-xid" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" + +[[package]] +name = "url" +version = "2.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a507c383b2d33b5fc35d1861e77e6b383d158b2da5e14fe51b83dfedf6fd578c" +dependencies = [ + "form_urlencoded", + "idna", + "matches", + "percent-encoding", +] + +[[package]] +name = "utf-8" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" + +[[package]] +name = "uuid" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc5cf98d8186244414c848017f0e2676b3fcb46807f6668a97dfe67359a3c4b7" +dependencies = [ + "getrandom 0.2.3", + "serde", +] + +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + +[[package]] +name = "vec_map" +version = "0.8.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" + +[[package]] +name = "vergen" +version = "5.1.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "265455aab08c55a1ab13f07c8d5e25c7d46900f4484dd7cbd682e77171f93f3c" +dependencies = [ + "anyhow", + "cfg-if 1.0.0", + "chrono", + "enum-iterator", + "getset", + "git2", + "rustversion", + "thiserror", +] + +[[package]] +name = "version_check" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fecdca9a5291cc2b8dcf7dc02453fee791a280f3743cb0905f8822ae463b3fe" + +[[package]] +name = "wal" +version = "0.1.0" +dependencies = [ + "async-trait", + "common_types", + "common_util", + "futures", + "log", + "rocksdb", + "snafu", + "tempfile", + "tokio", +] + +[[package]] +name = "walkdir" +version = "2.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "808cf2735cd4b6866113f648b791c6adc5714537bc222d9347bb203386ffda56" +dependencies = [ + "same-file", + "winapi", + "winapi-util", +] + +[[package]] +name = "want" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ce8a968cb1cd110d136ff8b819a556d6fb6d919363c61534f6860c7eb172ba0" +dependencies = [ + "log", + "try-lock", +] + +[[package]] +name = "warp" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "332d47745e9a0c38636dbd454729b147d16bd1ed08ae67b3ab281c4506771054" +dependencies = [ + "bytes 1.1.0", + "futures", + "headers", + "http", + "hyper", + "log", + "mime", + "mime_guess", + "multipart", + "percent-encoding", + "pin-project 1.0.8", + "scoped-tls", + "serde", + "serde_json", + "serde_urlencoded", + "tokio", + "tokio-stream", + "tokio-tungstenite", + "tokio-util", + "tower-service", + "tracing 0.1.26", +] + +[[package]] +name = "wasi" +version = "0.9.0+wasi-snapshot-preview1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" + +[[package]] +name = "wasi" +version = "0.10.2+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6" + +[[package]] +name = "wasm-bindgen" +version = "0.2.76" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ce9b1b516211d33767048e5d47fa2a381ed8b76fc48d2ce4aa39877f9f183e0" +dependencies = [ + "cfg-if 1.0.0", + "serde", + "serde_json", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.76" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfe8dc78e2326ba5f845f4b5bf548401604fa20b1dd1d365fb73b6c1d6364041" +dependencies = [ + "bumpalo", + "lazy_static", + "log", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95fded345a6559c2cfee778d562300c581f7d4ff3edb9b0d230d69800d213972" +dependencies = [ + "cfg-if 1.0.0", + "js-sys", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.76" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44468aa53335841d9d6b6c023eaab07c0cd4bddbcfdee3e2bb1e8d2cb8069fef" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.76" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0195807922713af1e67dc66132c7328206ed9766af3858164fb583eedc25fbad" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.76" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "acdb075a845574a1fa5f09fd77e43f7747599301ea3417a9fbffdeedfc1f4a29" + +[[package]] +name = "web-sys" +version = "0.3.53" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "224b2f6b67919060055ef1a67807367c2066ed520c3862cc013d26cf893a783c" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "which" +version = "4.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea187a8ef279bc014ec368c27a920da2024d2a711109bfbe3440585d5cf27ad9" +dependencies = [ + "either", + "lazy_static", + "libc", +] + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-util" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" +dependencies = [ + "winapi", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "winreg" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0120db82e8a1e0b9fb3345a539c478767c0048d842860994d96113d5b667bd69" +dependencies = [ + "winapi", +] + +[[package]] +name = "yatp" +version = "0.0.1" +source = 
"git+https://github.com/tikv/yatp.git?rev=4b71f8abd86890f0d1e95778c2b6bf5a9ee4c502#4b71f8abd86890f0d1e95778c2b6bf5a9ee4c502" +dependencies = [ + "crossbeam-deque 0.7.4", + "dashmap", + "fail", + "lazy_static", + "num_cpus", + "parking_lot_core", + "prometheus 0.10.0", + "rand 0.7.3", +] + +[[package]] +name = "zerocopy" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6580539ad917b7c026220c4b3f2c08d52ce54d6ce0dc491e66002e35388fab46" +dependencies = [ + "byteorder", + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d498dbd1fd7beb83c86709ae1c33ca50942889473473d287d56ce4770a18edfb" +dependencies = [ + "proc-macro2", + "syn", + "synstructure", +] + +[[package]] +name = "zstd" +version = "0.9.0+zstd.1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07749a5dc2cb6b36661290245e350f15ec3bbb304e493db54a1d354480522ccd" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "4.1.1+zstd.1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c91c90f2c593b003603e5e0493c837088df4469da25aafff8bce42ba48caf079" +dependencies = [ + "libc", + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "1.6.1+zstd.1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "615120c7a2431d16cf1cf979e7fc31ba7a5b5e5707b29c8a99e5dbf8a8392a33" +dependencies = [ + "cc", + "libc", +] diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000000..7ad1ca4a7f --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,74 @@ +[package] +name = "ceresdbx" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" +resolver = "2" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[workspace] +# In alphabetical order +members = [ + "analytic_engine", + "arrow_deps", + 
"benchmarks", + "catalog", + "catalog_impls", + "common_types", + "common_util", + "components/arena", + "components/bytes", + "components/logger", + "components/object_store", + "components/parquet", + "components/profile", + "components/rust-hyperloglog", + "components/skiplist", + "components/tracing", + "components/tracing_util", + "components/tracing_examples", + "grpcio", + "interpreters", + "meta_client", + "proto", + "query_engine", + "server", + "sql", + "system_catalog", + "table_engine", + "udf", + "wal", +] + +[[bin]] +name = "ceresdb-server" + +[dependencies] +# Workspace dependencies, in alphabetical order +analytic_engine = { path = "analytic_engine" } +catalog = { path = "catalog" } +catalog_impls = { path = "catalog_impls" } +clap = "2.0" +common_util = { path = "common_util" } +log = "0.4" +logger = { path = "components/logger" } +query_engine = { path = "query_engine" } +server = { path = "server" } +table_engine = { path = "table_engine" } +tracing_util = { path = "components/tracing_util" } +udf = { path = "udf" } + +# Crates.io dependencies, in alphabetical order +signal-hook = "0.3" + +[build-dependencies] +vergen = { version = "5", default-features = false, features = ["build", "git"] } + +[profile.release] +debug = true +opt-level = 2 +overflow-checks = true + +[profile.bench] +debug = true diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000..37cf72300f --- /dev/null +++ b/Dockerfile @@ -0,0 +1,39 @@ +ARG RUST_VERSION=1.59.0 +FROM rust:${RUST_VERSION}-slim-bullseye as build + +# cache mounts below may already exist and owned by root +USER root + +RUN apt update && apt install --yes gcc g++ libssl-dev pkg-config cmake && rm -rf /var/lib/apt/lists/* + +# Build ceresdb +COPY . 
/ceresdb +WORKDIR /ceresdb + +RUN make build + +FROM ubuntu:20.04 +# create admin user +ARG USER=admin +ARG PASS="1q2w3s" +RUN useradd -m -s /bin/bash $USER && echo "$USER:$PASS" | chpasswd + +COPY --from=build /ceresdb/target/release/ceresdb-server /usr/bin/ceresdb-server + +RUN apt update && apt install --yes curl gdb iotop cron + +ENV RUST_BACKTRACE 1 + +COPY ./docker/entrypoint.py /entrypoint.py +COPY ./docker/supervisor/supervisord.conf /etc/supervisor/supervisord.conf +COPY ./docker/supervisor/conf.d /etc/supervisor/conf.d +COPY ./configs/ceresdb.toml /usr/bin/ + +RUN mkdir -p /etc/ceresdb +RUN chmod +x /usr/bin/ceresdb-server + +COPY ./configs /etc/ceresdb + +COPY ./docker/tini /tini +RUN chmod +x /tini +ENTRYPOINT ["/tini", "--", "/entrypoint.py"] diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000..7a4a3ea242 --- /dev/null +++ b/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/Makefile b/Makefile new file mode 100644 index 0000000000..dfc71ea25a --- /dev/null +++ b/Makefile @@ -0,0 +1,64 @@ +SHELL = /bin/bash + +DIR=$(shell pwd) + +init: + echo "init" + echo "Git branch: $GITBRANCH" + +build: + ls -alh + cd $(DIR); cargo build --release + +build-asan: + ls -alh + export RUSTFLAGS=-Zsanitizer=address RUSTDOCFLAGS=-Zsanitizer=address + cd $(DIR); cargo build -Zbuild-std --target x86_64-unknown-linux-gnu --release + +build-arm64: + ls -alh + cd $(DIR); cargo build --release --no-default-features + +test: + cd $(DIR); cargo test --workspace -- --test-threads=4 + +# grcov needs build first, then run test +build-ut: + echo $(CARGO_INCREMENTAL) + echo $(RUSTFLAGS) + echo $(RUSTDOCFLAGS) + cd $(DIR); cargo build -j 4 --workspace + +test-ut: + echo $(CARGO_INCREMENTAL) + echo $(RUSTFLAGS) + echo $(RUSTDOCFLAGS) + cd $(DIR); cargo test -j 4 --workspace -- -Z unstable-options --format json | tee results.json; \ + cat results.json | cargo2junit > ${WORKSPACE}/testresult/TEST-all.xml + +fmt: + cd $(DIR); cargo fmt -- --check + +clippy: + cd $(DIR); cargo clippy --all-targets --all-features --workspace -- -D warnings + +# test with address sanitizer +asan-test: + export RUSTFLAGS=-Zsanitizer=address RUSTDOCFLAGS=-Zsanitizer=address + cd $(DIR); cargo test -Zbuild-std --target x86_64-unknown-linux-gnu --workspace + +# test with 
address sanitizer under release mode to workaround `attempt to create unaligned or null slice` +# error in parquet crate. +asan-test-release: + export RUSTFLAGS=-Zsanitizer=address RUSTDOCFLAGS=-Zsanitizer=address + cd $(DIR); cargo test -Zbuild-std --target x86_64-unknown-linux-gnu --release --workspace + +# test with memory sanitizer +mem-test: + export RUSTFLAGS=-Zsanitizer=memory RUSTDOCFLAGS=-Zsanitizer=memory + cd $(DIR); cargo test -Zbuild-std --target x86_64-unknown-linux-gnu --workspace + +# test with miri. +# only list packages will be tested. +miri: + cd $(DIR); cargo miri test --package arena diff --git a/README.md b/README.md new file mode 100644 index 0000000000..4e9d118178 --- /dev/null +++ b/README.md @@ -0,0 +1,90 @@ +# ceresdbx + +## Building +Install clang (for rocksdb) + +Install deps (required by rust-rocksdb) +```bash +brew install cmake +brew install lz4 +``` + +Build in debug mode +```bash +cargo build --bin ceresdb-server +``` + +Build in release mode +```bash +cargo build --release --bin ceresdb-server +``` + +## Usage +Run the server +```bash +./ceresdb-server +``` + +## RESTful API +```bash +curl -L -X POST 'http://localhost:5000/sql' \ +-H 'Content-Type: application/json' \ +-d '{ + "query": "your DDL sql" +}' +``` + +Describe a table +```bash +curl -L -X POST 'http://localhost:5000/sql' \ +-H 'Content-Type: application/json' \ +-d '{ + "query": "DESCRIBE TABLE mytest" +}' +``` + +Insert data +```bash +curl -L -X POST 'http://localhost:5000/sql' \ +-H 'Content-Type: application/json' \ +--data-raw '{ + "query": "INSERT INTO mytest(c1, c2, c3, c4, c5, c6) VALUES(1618310218001, 12.5, '\''hello world'\'', 3.14159265, true, 2147483650)" +}' +``` + +Query +```bash +curl -L -X POST 'http://localhost:5000/sql' \ +-H 'Content-Type: application/json' \ +-d '{ + "query": "SELECT c1, c2, c3, c4, c5, c6 FROM mytest LIMIT 3" +}' +``` + +Query from system tables +```bash +curl -L -X POST 'http://localhost:5000/sql' \ +-H 'Content-Type: 
application/json' \ +-d '{ + "query": "SELECT * FROM system.numbers LIMIT 3" +}' +``` + +## Support Data Type +| SQL | CeresDB | Arrow | +| --- | --- | --- | +| null | Null | Null | +| timestamp | Timestamp | Timestamp(TimeUnit::Millisecond, None) | +| double | Double | Float64 | +| float | Float | Float32 | +| string | String | String | +| Varbinary | Varbinary | Binary | +| uint64 | UInt64 | UInt64 | +| uint32 | UInt32 | UInt32 | +| uint16 | UInt16 | UInt16 | +| uint8 | UInt8 | UInt8 | +| int64/bigint | Int64 | Int64 | +| int32/int | Int32 | Int32 | +| int16/smallint | Int16 | Int16 | +| int8/tinyint | Int8 | Int8 | +| boolean | Boolean | Boolean | diff --git a/analytic_engine/Cargo.toml b/analytic_engine/Cargo.toml new file mode 100644 index 0000000000..3be6760574 --- /dev/null +++ b/analytic_engine/Cargo.toml @@ -0,0 +1,43 @@ +[package] +name = "analytic_engine" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[features] +test = ["tempfile"] + +[dependencies] +# In alphabetical order +arc-swap = "1.4.0" +arena = { path = "../components/arena" } +arrow_deps = { path = "../arrow_deps" } +async-trait = "0.1.41" +base64 = "0.13" +common_types = { path = "../common_types" } +common_util = { path = "../common_util"} +futures = "0.3" +lazy_static = "1.4.0" +log = "0.4" +object_store = { path = "../components/object_store" } +parquet = { path = "../components/parquet" } +prometheus = "0.12" +proto = { path = "../proto" } +protobuf = "2.20" +serde = "1.0" +serde_derive = "1.0" +skiplist = { path = "../components/skiplist" } +smallvec = "1.6" +snafu = { version = "0.6.10", features = ["backtraces"] } +table_engine = { path = "../table_engine" } +tokio = { version = "1.0", features = ["sync", "time"] } +wal = { path = "../wal" } +tempfile = { version = "3.1.0", optional = true } + +[dev-dependencies] +common_types = { path = "../common_types", 
features = ["test"] } +common_util = { path = "../common_util", features = ["test"] } +env_logger = "0.6" +tempfile = "3.1.0" diff --git a/analytic_engine/src/compaction/metrics.rs b/analytic_engine/src/compaction/metrics.rs new file mode 100644 index 0000000000..61d76453e3 --- /dev/null +++ b/analytic_engine/src/compaction/metrics.rs @@ -0,0 +1,15 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Metrics of compaction. + +use lazy_static::lazy_static; +use prometheus::{register_int_gauge, IntGauge}; + +lazy_static! { + // Counters: + pub static ref COMPACTION_PENDING_REQUEST_GAUGE: IntGauge = register_int_gauge!( + "compaction_pending_request_gauge", + "Pending request queue length of compaction" + ) + .unwrap(); +} diff --git a/analytic_engine/src/compaction/mod.rs b/analytic_engine/src/compaction/mod.rs new file mode 100644 index 0000000000..a76ce2324a --- /dev/null +++ b/analytic_engine/src/compaction/mod.rs @@ -0,0 +1,494 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Compaction. 
+ +use std::{collections::HashMap, sync::Arc}; + +use common_util::config::{ReadableSize, TimeUnit}; +use serde_derive::Deserialize; +use snafu::{ensure, Backtrace, GenerateBacktrace, ResultExt, Snafu}; +use tokio::sync::oneshot; + +use crate::{ + compaction::picker::{CommonCompactionPicker, CompactionPickerRef}, + instance::write_worker::CompactionNotifier, + sst::file::{FileHandle, Level}, + table::data::TableDataRef, + table_options::COMPACTION_STRATEGY, +}; + +mod metrics; +pub mod picker; +pub mod scheduler; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Unable to parse compaction strategy, value: {}", value))] + ParseStrategy { value: String, backtrace: Backtrace }, + #[snafu(display("Unable to parse float, key: {}, value: {}", key, value))] + ParseFloat { + key: String, + value: String, + source: std::num::ParseFloatError, + backtrace: Backtrace, + }, + #[snafu(display("Unable to parse int, key: {}, value: {}", key, value))] + ParseInt { + key: String, + value: String, + source: std::num::ParseIntError, + backtrace: Backtrace, + }, + #[snafu(display("Unable to parse readable size, key: {}, value: {}", key, value))] + ParseSize { + key: String, + value: String, + error: String, + backtrace: Backtrace, + }, + #[snafu(display("Unable to parse time unit, key: {}, value: {}", key, value))] + ParseTimeUnit { + key: String, + value: String, + error: String, + backtrace: Backtrace, + }, + #[snafu(display("Invalid compaction option value, err: {}", error))] + InvalidOption { error: String, backtrace: Backtrace }, +} + +#[derive(Debug, Clone, Copy, Deserialize, PartialEq)] +pub enum CompactionStrategy { + Default, + TimeWindow(TimeWindowCompactionOptions), + SizeTiered(SizeTieredCompactionOptions), +} + +#[derive(Debug, Clone, Copy, Deserialize, PartialEq)] +pub struct SizeTieredCompactionOptions { + pub bucket_low: f32, + pub bucket_high: f32, + pub min_sstable_size: ReadableSize, + pub min_threshold: usize, + pub max_threshold: usize, +} + 
+#[derive(Debug, Clone, Copy, Deserialize, PartialEq)] +pub struct TimeWindowCompactionOptions { + pub size_tiered: SizeTieredCompactionOptions, + // TODO(boyan) In fact right now we only supports TimeUnit::Milliseconds resolution. + pub timestamp_resolution: TimeUnit, +} + +impl protobuf::Clear for SizeTieredCompactionOptions { + fn clear(&mut self) { + *self = SizeTieredCompactionOptions::default() + } +} + +impl protobuf::Clear for TimeWindowCompactionOptions { + fn clear(&mut self) { + *self = TimeWindowCompactionOptions::default() + } +} + +impl Default for SizeTieredCompactionOptions { + fn default() -> Self { + Self { + bucket_low: 0.5, + bucket_high: 1.5, + min_sstable_size: ReadableSize::mb(50), + min_threshold: 4, + max_threshold: 16, + } + } +} + +impl Default for TimeWindowCompactionOptions { + fn default() -> Self { + Self { + size_tiered: SizeTieredCompactionOptions::default(), + timestamp_resolution: TimeUnit::Milliseconds, + } + } +} + +impl Default for CompactionStrategy { + fn default() -> Self { + CompactionStrategy::Default + } +} + +const BUCKET_LOW_KEY: &str = "compaction_bucket_low"; +const BUCKET_HIGH_KEY: &str = "compaction_bucket_high"; +const MIN_THRESHOLD_KEY: &str = "compaction_min_threshold"; +const MAX_THRESHOLD_KEY: &str = "compaction_max_threshold"; +const MIN_SSTABLE_SIZE_KEY: &str = "compaction_min_sstable_size"; +const TIMESTAMP_RESOLUTION_KEY: &str = "compaction_timestamp_resolution"; +const DEFAULT_STRATEGY: &str = "default"; +const STC_STRATEGY: &str = "size_tiered"; +const TWC_STRATEGY: &str = "time_window"; + +impl CompactionStrategy { + pub(crate) fn parse_from( + value: &str, + options: &HashMap, + ) -> Result { + match value.trim().to_lowercase().as_str() { + DEFAULT_STRATEGY => Ok(CompactionStrategy::Default), + STC_STRATEGY => Ok(CompactionStrategy::SizeTiered( + SizeTieredCompactionOptions::parse_from(options)?, + )), + TWC_STRATEGY => Ok(CompactionStrategy::TimeWindow( + 
TimeWindowCompactionOptions::parse_from(options)?, + )), + _ => ParseStrategy { + value: value.to_string(), + } + .fail(), + } + } + + pub(crate) fn fill_raw_map(&self, m: &mut HashMap) { + match self { + CompactionStrategy::Default => { + m.insert( + COMPACTION_STRATEGY.to_string(), + DEFAULT_STRATEGY.to_string(), + ); + } + CompactionStrategy::SizeTiered(opts) => { + m.insert(COMPACTION_STRATEGY.to_string(), STC_STRATEGY.to_string()); + opts.fill_raw_map(m); + } + CompactionStrategy::TimeWindow(opts) => { + m.insert(COMPACTION_STRATEGY.to_string(), TWC_STRATEGY.to_string()); + opts.fill_raw_map(m); + } + } + } +} + +impl SizeTieredCompactionOptions { + pub(crate) fn validate(&self) -> Result<(), Error> { + ensure!( + self.bucket_high > self.bucket_low, + InvalidOption { + error: format!( + "{} value({}) is less than or equal to the {} value({}) ", + BUCKET_HIGH_KEY, self.bucket_high, BUCKET_LOW_KEY, self.bucket_low + ), + } + ); + + Ok(()) + } + + fn fill_raw_map(&self, m: &mut HashMap) { + m.insert(BUCKET_LOW_KEY.to_string(), format!("{}", self.bucket_low)); + m.insert(BUCKET_HIGH_KEY.to_string(), format!("{}", self.bucket_high)); + m.insert( + MIN_SSTABLE_SIZE_KEY.to_string(), + format!("{}", self.min_sstable_size.0), + ); + m.insert( + MAX_THRESHOLD_KEY.to_string(), + format!("{}", self.max_threshold), + ); + m.insert( + MIN_THRESHOLD_KEY.to_string(), + format!("{}", self.min_threshold), + ); + } + + pub(crate) fn parse_from( + options: &HashMap, + ) -> Result { + let mut opts = SizeTieredCompactionOptions::default(); + if let Some(v) = options.get(BUCKET_LOW_KEY) { + opts.bucket_low = v.parse().context(ParseFloat { + key: BUCKET_HIGH_KEY, + value: v, + })?; + } + if let Some(v) = options.get(BUCKET_HIGH_KEY) { + opts.bucket_high = v.parse().context(ParseFloat { + key: BUCKET_HIGH_KEY, + value: v, + })?; + } + if let Some(v) = options.get(MIN_SSTABLE_SIZE_KEY) { + opts.min_sstable_size = v.parse::().map_err(|err| Error::ParseSize { + key: 
MIN_SSTABLE_SIZE_KEY.to_string(), + value: v.to_string(), + error: err, + backtrace: Backtrace::generate(), + })?; + } + if let Some(v) = options.get(MAX_THRESHOLD_KEY) { + opts.max_threshold = v.parse().context(ParseInt { + key: MAX_THRESHOLD_KEY, + value: v, + })?; + } + if let Some(v) = options.get(MIN_THRESHOLD_KEY) { + opts.min_threshold = v.parse().context(ParseInt { + key: MIN_THRESHOLD_KEY, + value: v, + })?; + } + + opts.validate()?; + + Ok(opts) + } +} + +impl TimeWindowCompactionOptions { + /// TODO(boyan) In fact right now we only supports TimeUnit::Milliseconds + /// resolution. + fn valid_timestamp_unit(unit: TimeUnit) -> bool { + matches!( + unit, + TimeUnit::Seconds + | TimeUnit::Milliseconds + | TimeUnit::Microseconds + | TimeUnit::Nanoseconds + ) + } + + fn fill_raw_map(&self, m: &mut HashMap) { + self.size_tiered.fill_raw_map(m); + + m.insert( + TIMESTAMP_RESOLUTION_KEY.to_string(), + format!("{}", self.timestamp_resolution), + ); + } + + pub(crate) fn validate(&self) -> Result<(), Error> { + if !Self::valid_timestamp_unit(self.timestamp_resolution) { + return InvalidOption { + error: format!( + "{:?} is not valid for {}) ", + self.timestamp_resolution, TIMESTAMP_RESOLUTION_KEY + ), + } + .fail(); + } + + Ok(()) + } + + pub(crate) fn parse_from( + options: &HashMap, + ) -> Result { + let mut opts = TimeWindowCompactionOptions { + size_tiered: SizeTieredCompactionOptions::parse_from(options)?, + ..Default::default() + }; + + if let Some(v) = options.get(TIMESTAMP_RESOLUTION_KEY) { + opts.timestamp_resolution = + v.parse::().map_err(|err| Error::ParseTimeUnit { + key: TIMESTAMP_RESOLUTION_KEY.to_string(), + value: v.to_string(), + error: err, + backtrace: Backtrace::generate(), + })?; + } + + opts.validate()?; + + Ok(opts) + } +} + +#[derive(Debug, Clone)] +pub struct CompactionInputFiles { + /// Level of the files to be compacted. + pub level: Level, + /// Files to be compacted. + pub files: Vec, + /// The output level of the merged file. 
+ pub output_level: Level, +} + +#[derive(Default, Clone)] +pub struct ExpiredFiles { + /// Level of the expired files. + pub level: Level, + /// Expired files. + pub files: Vec, +} + +#[derive(Default, Clone)] +pub struct CompactionTask { + pub compaction_inputs: Vec, + pub expired: Vec, +} + +impl CompactionTask { + pub fn mark_files_being_compacted(&self, being_compacted: bool) { + for input in &self.compaction_inputs { + for file in &input.files { + file.set_being_compacted(being_compacted); + } + } + for expired in &self.expired { + for file in &expired.files { + file.set_being_compacted(being_compacted); + } + } + } +} + +pub struct PickerManager { + default_picker: CompactionPickerRef, + time_window_picker: CompactionPickerRef, + size_tiered_picker: CompactionPickerRef, +} + +impl Default for PickerManager { + fn default() -> Self { + let size_tiered_picker = Arc::new(CommonCompactionPicker::new( + CompactionStrategy::SizeTiered(SizeTieredCompactionOptions::default()), + )); + let time_window_picker = Arc::new(CommonCompactionPicker::new( + CompactionStrategy::TimeWindow(TimeWindowCompactionOptions::default()), + )); + + Self { + default_picker: time_window_picker.clone(), + size_tiered_picker, + time_window_picker, + } + } +} + +impl PickerManager { + pub fn get_picker(&self, strategy: CompactionStrategy) -> CompactionPickerRef { + match strategy { + CompactionStrategy::Default => self.default_picker.clone(), + CompactionStrategy::SizeTiered(_) => self.size_tiered_picker.clone(), + CompactionStrategy::TimeWindow(_) => self.time_window_picker.clone(), + } + } +} + +#[derive(Debug, Snafu)] +pub enum WaitError { + #[snafu(display("The compaction is canceled"))] + Canceled, + + #[snafu(display("Failed to compact, err:{}", source))] + Compaction { + source: Arc, + }, +} + +pub type WaitResult = std::result::Result; + +pub struct WaiterNotifier { + waiter: Option>>, +} + +impl WaiterNotifier { + pub fn new(waiter: Option>>) -> Self { + Self { waiter } + } + + pub 
fn notify_wait_result(mut self, res: WaitResult<()>) { + // Ignore error if failed to send result. + if let Some(waiter) = self.waiter.take() { + let _ = waiter.send(res); + } + } +} + +impl Drop for WaiterNotifier { + fn drop(&mut self) { + if let Some(waiter) = self.waiter.take() { + // The compaction result hasn't been sent before the notifier dropped, we + // send a canceled error to waiter. + let _ = waiter.send(Canceled.fail()); + } + } +} + +/// Request to compact single table. +pub struct TableCompactionRequest { + pub table_data: TableDataRef, + pub compaction_notifier: CompactionNotifier, + pub waiter: Option>>, +} + +impl TableCompactionRequest { + pub fn no_waiter(table_data: TableDataRef, compaction_notifier: CompactionNotifier) -> Self { + TableCompactionRequest { + table_data, + compaction_notifier, + waiter: None, + } + } +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + + use super::*; + + #[test] + fn test_fill_raw_map_then_parse() { + let c = CompactionStrategy::Default; + let mut m = HashMap::new(); + c.fill_raw_map(&mut m); + assert_eq!(1, m.len()); + assert_eq!(m[COMPACTION_STRATEGY], "default"); + assert_eq!(c, CompactionStrategy::parse_from("default", &m).unwrap()); + + let opts = SizeTieredCompactionOptions { + bucket_low: 0.1, + min_sstable_size: ReadableSize(1024), + max_threshold: 10, + ..Default::default() + }; + + let c = CompactionStrategy::SizeTiered(opts); + let mut m = HashMap::new(); + c.fill_raw_map(&mut m); + assert_eq!(6, m.len()); + assert_eq!(m[COMPACTION_STRATEGY], "size_tiered"); + assert_eq!(m[BUCKET_LOW_KEY], "0.1"); + assert_eq!(m[BUCKET_HIGH_KEY], "1.5"); + assert_eq!(m[MIN_SSTABLE_SIZE_KEY], "1024"); + assert_eq!(m[MIN_THRESHOLD_KEY], "4"); + assert_eq!(m[MAX_THRESHOLD_KEY], "10"); + assert_eq!( + c, + CompactionStrategy::parse_from("size_tiered", &m).unwrap() + ); + + let twc_opts = TimeWindowCompactionOptions { + size_tiered: opts, + ..Default::default() + }; + let c = 
CompactionStrategy::TimeWindow(twc_opts); + let mut m = HashMap::new(); + c.fill_raw_map(&mut m); + + assert_eq!(7, m.len()); + assert_eq!(m[COMPACTION_STRATEGY], "time_window"); + assert_eq!(m[BUCKET_LOW_KEY], "0.1"); + assert_eq!(m[BUCKET_HIGH_KEY], "1.5"); + assert_eq!(m[MIN_SSTABLE_SIZE_KEY], "1024"); + assert_eq!(m[MIN_THRESHOLD_KEY], "4"); + assert_eq!(m[MAX_THRESHOLD_KEY], "10"); + assert_eq!(m[TIMESTAMP_RESOLUTION_KEY], "milliseconds"); + assert_eq!( + c, + CompactionStrategy::parse_from("time_window", &m).unwrap() + ); + } +} diff --git a/analytic_engine/src/compaction/picker.rs b/analytic_engine/src/compaction/picker.rs new file mode 100644 index 0000000000..5cc9f2afc9 --- /dev/null +++ b/analytic_engine/src/compaction/picker.rs @@ -0,0 +1,740 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Compaction picker. + +use std::{ + collections::{BTreeSet, HashMap}, + sync::Arc, + time::Duration, +}; + +use common_types::time::Timestamp; +use common_util::{config::TimeUnit, define_result}; +use log::{debug, info}; +use snafu::Snafu; + +use crate::{ + compaction::{ + CompactionInputFiles, CompactionStrategy, CompactionTask, SizeTieredCompactionOptions, + TimeWindowCompactionOptions, + }, + sst::{ + file::{FileHandle, Level}, + manager::LevelsController, + }, +}; + +#[derive(Debug, Snafu)] +pub enum Error {} + +define_result!(Error); + +#[derive(Clone)] +pub struct PickerContext { + pub segment_duration: Duration, + /// The ttl of the data in sst. 
+ pub ttl: Option, + pub strategy: CompactionStrategy, +} + +impl PickerContext { + fn size_tiered_opts(&self) -> SizeTieredCompactionOptions { + match self.strategy { + CompactionStrategy::SizeTiered(opts) => opts, + _ => SizeTieredCompactionOptions::default(), + } + } + + fn time_window_opts(&self) -> TimeWindowCompactionOptions { + match self.strategy { + CompactionStrategy::TimeWindow(opts) => opts, + _ => TimeWindowCompactionOptions::default(), + } + } +} + +pub trait CompactionPicker { + /// Pick candidate files for compaction. + /// + /// Note: files being compacted should be ignored. + fn pick_compaction( + &self, + ctx: PickerContext, + levels_controller: &LevelsController, + ) -> Result; +} + +pub type CompactionPickerRef = Arc; + +trait LevelPicker { + /// Pick candidate files for compaction at level + fn pick_candidates_at_level( + &self, + ctx: &PickerContext, + levels_controller: &LevelsController, + level: Level, + expire_time: Option, + ) -> Option>; +} + +type LevelPickerRef = Arc; + +pub struct CommonCompactionPicker { + level_picker: LevelPickerRef, +} + +impl CommonCompactionPicker { + pub fn new(strategy: CompactionStrategy) -> Self { + let level_picker: LevelPickerRef = match strategy { + CompactionStrategy::SizeTiered(_) | CompactionStrategy::Default => { + Arc::new(SizeTieredPicker::default()) + } + CompactionStrategy::TimeWindow(_) => Arc::new(TimeWindowPicker::default()), + }; + Self { level_picker } + } + + fn pick_compact_candidates( + &self, + ctx: &PickerContext, + levels_controller: &LevelsController, + expire_time: Option, + ) -> Option { + let num_levels = levels_controller.num_levels(); + //TODO(boyan) level compaction strategy + for level in 0..num_levels { + if let Some(files) = self.level_picker.pick_candidates_at_level( + ctx, + levels_controller, + level, + expire_time, + ) { + return Some(CompactionInputFiles { + level, + files, + // Now, we always output to the same level. 
+ output_level: level, + }); + } + } + + None + } +} + +impl CompactionPicker for CommonCompactionPicker { + fn pick_compaction( + &self, + ctx: PickerContext, + levels_controller: &LevelsController, + ) -> Result { + let expire_time = ctx.ttl.map(Timestamp::expire_time); + let mut compaction_task = CompactionTask { + expired: levels_controller.expired_ssts(expire_time), + ..Default::default() + }; + + if let Some(input_files) = + self.pick_compact_candidates(&ctx, levels_controller, expire_time) + { + info!( + "Compaction strategy: {:?} picker pick files to compact, input_files:{:?}", + ctx.strategy, input_files + ); + + compaction_task.compaction_inputs = vec![input_files]; + } + + Ok(compaction_task) + } +} + +#[inline] +fn find_uncompact_files( + levels_controller: &LevelsController, + level: Level, + expire_time: Option, +) -> Vec { + levels_controller + .iter_ssts_at_level(level) + // Only use files not being compacted and not expired. + .filter(|file| !file.being_compacted() && !file.time_range().is_expired(expire_time)) + .map(Clone::clone) + .collect() +} + +/// Size tiered compaction strategy +/// See https://github.com/jeffjirsa/twcs/blob/master/src/main/java/com/jeffjirsa/cassandra/db/compaction/SizeTieredCompactionStrategy.java +#[derive(Default)] +pub struct SizeTieredPicker {} + +/// Similar size files group +#[derive(Debug)] +struct Bucket { + pub avg_size: usize, + pub files: Vec, +} + +impl Bucket { + fn with_file(file: &FileHandle) -> Self { + Self { + avg_size: file.size() as usize, + files: vec![file.clone()], + } + } + + fn with_files(files: Vec) -> Self { + let total: usize = files.iter().map(|f| f.size() as usize).sum(); + Self { + avg_size: total / files.len(), + files, + } + } + + fn insert_file(&mut self, file: &FileHandle) { + let total_size = self.files.len() * self.avg_size + file.size() as usize; + self.avg_size = total_size / (self.files.len() + 1); + self.files.push(file.clone()); + } + + fn get_hotness_map(&self) -> HashMap { + 
self.files + .iter() + .map(|f| (f.clone(), Self::hotness(f))) + .collect() + } + + #[inline] + fn hotness(f: &FileHandle) -> f64 { + let row_num = match f.row_num() { + 0 => 1, //prevent NAN hotness + v => v, + }; + f.read_meter().h2_rate() / (row_num as f64) + } +} + +impl LevelPicker for SizeTieredPicker { + fn pick_candidates_at_level( + &self, + ctx: &PickerContext, + levels_controller: &LevelsController, + level: Level, + expire_time: Option, + ) -> Option> { + let files_by_segment = + Self::files_by_segment(levels_controller, level, ctx.segment_duration, expire_time); + if files_by_segment.is_empty() { + return None; + } + + let all_segments: BTreeSet<_> = files_by_segment.keys().collect(); + let opts = ctx.size_tiered_opts(); + + // Iterate the segment in reverse order, so newest segment is examined first. + for (idx, segment_key) in all_segments.iter().rev().enumerate() { + // segment_key should always exist. + if let Some(segment) = files_by_segment.get(segment_key) { + let buckets = Self::get_buckets( + segment.to_vec(), + opts.bucket_high, + opts.bucket_low, + opts.min_sstable_size.as_bytes() as f32, + ); + + let files = + Self::most_interesting_bucket(buckets, opts.min_threshold, opts.max_threshold); + + if files.is_some() { + info!( + "Compact segment, idx: {}, size:{}, segment_key:{:?}, files:{:?}", + idx, + segment.len(), + segment_key, + segment + ); + return files; + } + debug!( + "No compaction necessary for segment, size:{}, segment_key:{:?}, idx:{}", + segment.len(), + segment_key, + idx + ); + } + } + + None + } +} + +impl SizeTieredPicker { + /// Group files of similar size into buckets. 
+ fn get_buckets( + mut files: Vec, + bucket_high: f32, + bucket_low: f32, + min_sst_size: f32, + ) -> Vec { + // sort by file length + files.sort_unstable_by_key(FileHandle::size); + + let mut buckets: Vec = Vec::new(); + 'outer: for sst in &files { + let size = sst.size() as f32; + // look for a bucket containing similar-sized files: + // group in the same bucket if it's w/in 50% of the average for this bucket, + // or this file and the bucket are all considered "small" (less than + // `min_sst_size`) + for bucket in buckets.iter_mut() { + let old_avg_size = bucket.avg_size as f32; + if (size > (old_avg_size * bucket_low) && size < (old_avg_size * bucket_high)) + || (size < min_sst_size && old_avg_size < min_sst_size) + { + // find a similar file, insert it into bucket + bucket.insert_file(sst); + continue 'outer; + } + } + + // no similar bucket found + // put it in a new bucket + buckets.push(Bucket::with_file(sst)); + } + + debug!("Group files of similar size into buckets: {:?}", buckets); + + buckets + } + + fn most_interesting_bucket( + buckets: Vec, + min_threshold: usize, + max_threshold: usize, + ) -> Option> { + let mut pruned_bucket_and_hotness = Vec::with_capacity(buckets.len()); + // skip buckets containing less than min_threshold sstables, + // and limit other buckets to max_threshold sstables + for bucket in buckets { + let (bucket, hotness) = Self::trim_to_threshold_with_hotness(bucket, max_threshold); + if bucket.files.len() >= min_threshold { + pruned_bucket_and_hotness.push((bucket, hotness)); + } + } + + if pruned_bucket_and_hotness.is_empty() { + return None; + } + + // Find the hotest bucket + if let Some((bucket, hotness)) = + pruned_bucket_and_hotness + .into_iter() + .max_by(|(b1, h1), (b2, h2)| { + let c = h1.partial_cmp(h2).unwrap(); + if !c.is_eq() { + return c; + } + //TODO(boyan), compacting smallest sstables first? 
+ b1.avg_size.cmp(&b2.avg_size) + }) + { + debug!( + "Find the hotest bucket, hotness: {}, bucket: {:?}", + hotness, bucket + ); + Some(bucket.files) + } else { + None + } + } + + fn files_by_segment( + levels_controller: &LevelsController, + level: Level, + segment_duration: Duration, + expire_time: Option, + ) -> HashMap> { + let mut files_by_segment = HashMap::new(); + let uncompact_files = find_uncompact_files(levels_controller, level, expire_time); + for file in uncompact_files { + // We use the end time of the range to calculate segment. + let segment = file + .time_range() + .exclusive_end() + .truncate_by(segment_duration); + let files = files_by_segment.entry(segment).or_insert_with(Vec::new); + files.push(file); + } + + files_by_segment + } + + fn trim_to_threshold_with_hotness(bucket: Bucket, max_threshold: usize) -> (Bucket, f64) { + let hotness_snapshot = bucket.get_hotness_map(); + + // Sort by sstable hotness (descending). + let mut sorted_files = bucket.files.to_vec(); + sorted_files.sort_unstable_by(|f1, f2| { + hotness_snapshot[f1] + .partial_cmp(&hotness_snapshot[f2]) + .unwrap() + .reverse() + }); + + // and then trim the coldest sstables off the end to meet the max_threshold + let len = sorted_files.len(); + let pruned_bucket: Vec = sorted_files + .into_iter() + .take(std::cmp::min(max_threshold, len)) + .collect(); + + // bucket hotness is the sum of the hotness of all sstable members + let bucket_hotness = pruned_bucket.iter().map(Bucket::hotness).sum(); + + (Bucket::with_files(pruned_bucket), bucket_hotness) + } +} + +/// Time window compaction strategy +/// See https://github.com/jeffjirsa/twcs/blob/master/src/main/java/com/jeffjirsa/cassandra/db/compaction/TimeWindowCompactionStrategy.java +#[derive(Default)] +pub struct TimeWindowPicker {} + +impl TimeWindowPicker { + fn get_window_bounds_in_millis(window: &Duration, ts: i64) -> (i64, i64) { + let ts_secs = ts / 1000; + + let size = window.as_secs() as i64; + + let lower = ts_secs - 
(ts_secs % size); + let upper = lower + size - 1; + + (lower * 1000, upper * 1000) + } + + #[inline] + fn resolve_timetamp(ts: i64, timestamp_resolution: TimeUnit) -> i64 { + match timestamp_resolution { + TimeUnit::Microseconds => ts / 1000, + TimeUnit::Nanoseconds => ts / 1000000, + TimeUnit::Seconds => ts * 1000, + TimeUnit::Milliseconds => ts, + // the option is validated before, so it won't reach here + _ => unreachable!(), + } + } + + /// Group files of similar timestamp into buckets. + fn get_buckets( + files: &[FileHandle], + window: &Duration, + timestamp_resolution: TimeUnit, + ) -> (HashMap>, i64) { + let mut max_ts = 0i64; + let mut buckets: HashMap> = HashMap::new(); + for f in files { + let ts = f.time_range_ref().exclusive_end().as_i64(); + + let ts = Self::resolve_timetamp(ts, timestamp_resolution); + + let (left, _) = Self::get_window_bounds_in_millis(window, ts); + + let bucket_files = buckets.entry(left).or_insert_with(Vec::new); + + bucket_files.push(f.clone()); + + if left > max_ts { + max_ts = left; + } + } + + debug!( + "Group files of similar timestamp into buckets: {:?}", + buckets + ); + (buckets, max_ts) + } + + fn newest_bucket( + buckets: HashMap>, + size_tiered_opts: SizeTieredCompactionOptions, + now: i64, + ) -> Option> { + // If the current bucket has at least minThreshold SSTables, choose that one. + // For any other bucket, at least 2 SSTables is enough. + // In any case, limit to max_threshold SSTables. 
+ + let all_keys: BTreeSet<_> = buckets.keys().collect(); + + for key in all_keys.into_iter().rev() { + if let Some(bucket) = buckets.get(key) { + debug!("Key {}, now {}", key, now); + + if bucket.len() >= size_tiered_opts.min_threshold && *key >= now { + // If we're in the newest bucket, we'll use STCS to prioritize sstables + let buckets = SizeTieredPicker::get_buckets( + bucket.to_vec(), + size_tiered_opts.bucket_high, + size_tiered_opts.bucket_low, + size_tiered_opts.min_sstable_size.as_bytes() as f32, + ); + let files = SizeTieredPicker::most_interesting_bucket( + buckets, + size_tiered_opts.min_threshold, + size_tiered_opts.max_threshold, + ); + + if files.is_some() { + return files; + } + } else if bucket.len() >= 2 && *key < now { + debug!("Bucket size {} >= 2 and not in current bucket, compacting what's here: {:?}", bucket.len(), bucket); + return Some(Self::trim_to_threshold( + bucket, + size_tiered_opts.max_threshold, + )); + } else { + debug!( + "No compaction necessary for bucket size {} , key {}, now {}", + bucket.len(), + key, + now + ); + } + } + } + + None + } + + fn trim_to_threshold(files: &[FileHandle], max_threshold: usize) -> Vec { + // Sort by sstable file size + let mut sorted_files = files.to_vec(); + sorted_files.sort_unstable_by_key(FileHandle::size); + + // Trim the largest sstables off the end to meet the maxThreshold + let len = sorted_files.len(); + sorted_files + .into_iter() + .take(std::cmp::min(max_threshold, len)) + .collect() + } + + /// Get current window timestamp, the caller MUST ensure the level has ssts, + /// panic otherwise. 
+ fn get_current_window( + levels_controller: &LevelsController, + level: Level, + window: &Duration, + timestamp_resolution: TimeUnit, + ) -> i64 { + // always find the latest sst here + let now = levels_controller + .latest_sst(level) + .unwrap() + .time_range() + .exclusive_end() + .as_i64(); + let now = Self::resolve_timetamp(now, timestamp_resolution); + Self::get_window_bounds_in_millis(window, now).0 + } +} + +impl LevelPicker for TimeWindowPicker { + fn pick_candidates_at_level( + &self, + ctx: &PickerContext, + levels_controller: &LevelsController, + level: Level, + expire_time: Option, + ) -> Option> { + let uncompact_files = find_uncompact_files(levels_controller, level, expire_time); + + if uncompact_files.is_empty() { + return None; + } + + let opts = ctx.time_window_opts(); + + debug!("TWCS compaction options: {:?}", opts); + + let (buckets, ts) = Self::get_buckets( + &uncompact_files, + &ctx.segment_duration, + opts.timestamp_resolution, + ); + + let now = Self::get_current_window( + levels_controller, + level, + &ctx.segment_duration, + opts.timestamp_resolution, + ); + debug!("now {}, max_ts: {}", now, ts); + assert!(now >= ts); + + Self::newest_bucket(buckets, opts.size_tiered, now) + } +} + +#[cfg(test)] +mod tests { + use std::time::Duration; + + use common_types::{ + bytes::Bytes, + tests::build_schema, + time::{TimeRange, Timestamp}, + }; + + use crate::{ + compaction::{picker::PickerContext, CompactionStrategy, PickerManager}, + sst::{ + file::SstMetaData, + manager::{tests::LevelsControllerMockBuilder, LevelsController}, + }, + }; + + fn build_sst_meta_data(time_range: TimeRange, size: u64) -> SstMetaData { + SstMetaData { + min_key: Bytes::from_static(b"100"), + max_key: Bytes::from_static(b"200"), + time_range, + max_sequence: 200, + schema: build_schema(), + size, + row_num: 2, + } + } + + // testcase 0: file buckets: old bucket:[0,1] newest bucket:[2], expired:[3] + fn build_old_bucket_case(now: i64) -> LevelsController { + let builder = 
LevelsControllerMockBuilder::default(); + let sst_meta_vec = vec![ + build_sst_meta_data( + TimeRange::new_unchecked(Timestamp::new(now - 14000), Timestamp::new(now - 13000)), + 2, + ), + build_sst_meta_data( + TimeRange::new_unchecked(Timestamp::new(now - 14000), Timestamp::new(now - 13000)), + 2, + ), + build_sst_meta_data( + TimeRange::new_unchecked(Timestamp::new(now - 4000), Timestamp::new(now - 3000)), + 2, + ), + build_sst_meta_data( + TimeRange::new_unchecked(Timestamp::new(100), Timestamp::new(200)), + 2, + ), + ]; + builder.add_sst(sst_meta_vec).build() + } + + // testcase 1: file buckets: old bucket:[0,1] newest bucket:[2,3,4,5] + // default min_threshold=4 + fn build_newest_bucket_case(now: i64) -> LevelsController { + let builder = LevelsControllerMockBuilder::default(); + let sst_meta_vec = vec![ + build_sst_meta_data( + TimeRange::new_unchecked(Timestamp::new(now - 14000), Timestamp::new(now - 13000)), + 2, + ), + build_sst_meta_data( + TimeRange::new_unchecked(Timestamp::new(now - 14000), Timestamp::new(now - 13000)), + 2, + ), + build_sst_meta_data( + TimeRange::new_unchecked(Timestamp::new(now - 4000), Timestamp::new(now - 3000)), + 2, + ), + build_sst_meta_data( + TimeRange::new_unchecked(Timestamp::new(now - 4000), Timestamp::new(now - 3000)), + 2, + ), + build_sst_meta_data( + TimeRange::new_unchecked(Timestamp::new(now - 4000), Timestamp::new(now - 3000)), + 2, + ), + build_sst_meta_data( + TimeRange::new_unchecked(Timestamp::new(now - 4000), Timestamp::new(now - 3000)), + 2, + ), + ]; + builder.add_sst(sst_meta_vec).build() + } + + // testcase 2: file buckets: old bucket:[0] newest bucket:[1,2,3] + // default min_threshold=4 + fn build_newest_bucket_no_match_case(now: i64) -> LevelsController { + let builder = LevelsControllerMockBuilder::default(); + let sst_meta_vec = vec![ + build_sst_meta_data( + TimeRange::new_unchecked(Timestamp::new(now - 14000), Timestamp::new(now - 13000)), + 2, + ), + build_sst_meta_data( + 
TimeRange::new_unchecked(Timestamp::new(now - 4000), Timestamp::new(now - 3000)), + 2, + ), + build_sst_meta_data( + TimeRange::new_unchecked(Timestamp::new(now - 4000), Timestamp::new(now - 3000)), + 2, + ), + build_sst_meta_data( + TimeRange::new_unchecked(Timestamp::new(now - 4000), Timestamp::new(now - 3000)), + 2, + ), + ]; + builder.add_sst(sst_meta_vec).build() + } + + #[test] + fn test_time_window_picker() { + let picker_manager = PickerManager::default(); + let twp = picker_manager.get_picker(CompactionStrategy::Default); + let mut ctx = PickerContext { + segment_duration: Duration::from_millis(1000), + ttl: Some(Duration::from_secs(100000)), + strategy: CompactionStrategy::Default, + }; + let now = Timestamp::now(); + { + let lc = build_old_bucket_case(now.as_i64()); + let task = twp.pick_compaction(ctx.clone(), &lc).unwrap(); + assert_eq!(task.compaction_inputs[0].files.len(), 2); + assert_eq!(task.compaction_inputs[0].files[0].id(), 0); + assert_eq!(task.compaction_inputs[0].files[1].id(), 1); + assert_eq!(task.expired[0].files.len(), 1); + assert_eq!(task.expired[0].files[0].id(), 3); + } + + { + let lc = build_newest_bucket_case(now.as_i64()); + let task = twp.pick_compaction(ctx.clone(), &lc).unwrap(); + assert_eq!(task.compaction_inputs[0].files.len(), 4); + assert_eq!(task.compaction_inputs[0].files[0].id(), 2); + assert_eq!(task.compaction_inputs[0].files[1].id(), 3); + assert_eq!(task.compaction_inputs[0].files[2].id(), 4); + assert_eq!(task.compaction_inputs[0].files[3].id(), 5); + } + + { + let lc = build_newest_bucket_no_match_case(now.as_i64()); + let task = twp.pick_compaction(ctx.clone(), &lc).unwrap(); + assert_eq!(task.compaction_inputs.len(), 0); + } + + // If ttl is None, then no file is expired. 
+ ctx.ttl = None; + { + let lc = build_old_bucket_case(now.as_i64()); + let task = twp.pick_compaction(ctx, &lc).unwrap(); + assert_eq!(task.compaction_inputs[0].files.len(), 2); + assert_eq!(task.compaction_inputs[0].files[0].id(), 0); + assert_eq!(task.compaction_inputs[0].files[1].id(), 1); + assert!(task.expired[0].files.is_empty()); + } + } +} diff --git a/analytic_engine/src/compaction/scheduler.rs b/analytic_engine/src/compaction/scheduler.rs new file mode 100644 index 0000000000..d06925d6d2 --- /dev/null +++ b/analytic_engine/src/compaction/scheduler.rs @@ -0,0 +1,595 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +// Compaction scheduler. + +use std::{ + collections::{HashMap, VecDeque}, + hash::Hash, + sync::{ + atomic::{AtomicBool, AtomicUsize, Ordering}, + Arc, RwLock, + }, + time::Duration, +}; + +use async_trait::async_trait; +use common_types::{request_id::RequestId, time::Timestamp}; +use common_util::{ + config::ReadableDuration, + define_result, + runtime::{JoinHandle, Runtime}, +}; +use log::{debug, error, info, warn}; +use object_store::ObjectStore; +use serde_derive::Deserialize; +use snafu::{ResultExt, Snafu}; +use table_engine::table::TableId; +use tokio::{ + sync::{ + mpsc::{self, Receiver, Sender}, + Mutex, + }, + time, +}; + +use crate::{ + compaction::{ + metrics::COMPACTION_PENDING_REQUEST_GAUGE, picker::PickerContext, CompactionTask, + PickerManager, TableCompactionRequest, WaitError, WaiterNotifier, + }, + instance::SpaceStore, + meta::Manifest, + sst::factory::Factory, + table::data::TableDataRef, + TableOptions, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to join compaction schedule worker, err:{}", source))] + JoinWorker { source: common_util::runtime::Error }, +} + +define_result!(Error); + +#[derive(Debug, Clone, Deserialize)] +#[serde(default)] +pub struct SchedulerConfig { + pub schedule_channel_len: usize, + pub schedule_interval: ReadableDuration, + pub 
max_ongoing_tasks: usize, +} + +// TODO(boyan), a better default value? +const MAX_GOING_COMPACTION_TASKS: usize = 8; +const MAX_PENDING_COMPACTION_TASKS: usize = 1024; + +impl Default for SchedulerConfig { + fn default() -> Self { + Self { + schedule_channel_len: 16, + // 30 minutes schedule interval. + schedule_interval: ReadableDuration(Duration::from_secs(60 * 30)), + max_ongoing_tasks: MAX_GOING_COMPACTION_TASKS, + } + } +} + +enum ScheduleTask { + Request(TableCompactionRequest), + Schedule, + Exit, +} + +#[async_trait] +pub trait CompactionScheduler { + /// Stop the scheduler. + async fn stop_scheduler(&self) -> Result<()>; + + /// Schedule a compaction job to background workers. + async fn schedule_table_compaction(&self, request: TableCompactionRequest); +} + +// A FIFO queue that remove duplicate values by key. +struct RequestQueue { + keys: VecDeque, + values: HashMap, +} + +impl Default for RequestQueue { + fn default() -> Self { + Self { + keys: VecDeque::default(), + values: HashMap::default(), + } + } +} + +impl RequestQueue { + fn push_back(&mut self, key: K, value: V) -> bool { + if self.values.insert(key.clone(), value).is_none() { + self.keys.push_back(key); + return true; + } + false + } + + fn pop_front(&mut self) -> Option { + if let Some(key) = self.keys.pop_front() { + return self.values.remove(&key); + } + None + } + + #[inline] + fn len(&self) -> usize { + self.values.len() + } + + #[inline] + fn is_empty(&self) -> bool { + self.values.is_empty() + } +} + +type RequestBuf = RwLock>; + +struct OngoingTaskLimit { + ongoing_tasks: AtomicUsize, + /// Buffer to hold pending requests + request_buf: RequestBuf, +} + +impl OngoingTaskLimit { + #[inline] + fn start_task(&self) { + self.ongoing_tasks.fetch_add(1, Ordering::SeqCst); + } + + #[inline] + fn finish_task(&self) { + self.ongoing_tasks.fetch_sub(1, Ordering::SeqCst); + } + + #[inline] + fn add_request(&self, request: TableCompactionRequest) { + let mut dropped = 0; + + { + let mut req_buf 
= self.request_buf.write().unwrap(); + + // Remove older requests + if req_buf.len() >= MAX_PENDING_COMPACTION_TASKS { + while req_buf.len() >= MAX_PENDING_COMPACTION_TASKS { + req_buf.pop_front(); + dropped += 1; + } + COMPACTION_PENDING_REQUEST_GAUGE.sub(dropped) + } + + if req_buf.push_back(request.table_data.id, request) { + COMPACTION_PENDING_REQUEST_GAUGE.add(1) + } + } + + if dropped > 0 { + warn!( + "Too many compaction pending tasks, limit: {}, dropped {} older tasks.", + MAX_PENDING_COMPACTION_TASKS, dropped, + ); + } + } + + fn drain_requests(&self, max_num: usize) -> Vec { + let mut result = Vec::with_capacity(max_num); + let mut req_buf = self.request_buf.write().unwrap(); + + while result.len() < max_num { + if let Some(req) = req_buf.pop_front() { + result.push(req); + } else { + break; + } + } + COMPACTION_PENDING_REQUEST_GAUGE.sub(result.len() as i64); + + result + } + + #[inline] + fn has_pending_requests(&self) -> bool { + !self.request_buf.read().unwrap().is_empty() + } + + #[inline] + fn request_buf_len(&self) -> usize { + self.request_buf.read().unwrap().len() + } + + #[inline] + fn ongoing_tasks(&self) -> usize { + self.ongoing_tasks.load(Ordering::SeqCst) + } +} + +pub type CompactionSchedulerRef = Arc; + +pub struct SchedulerImpl { + sender: Sender, + running: Arc, + handle: Mutex>, +} + +impl SchedulerImpl { + pub fn new< + Wal: Send + Sync + 'static, + Meta: Manifest + Send + Sync + 'static, + Store: ObjectStore + Send + Sync + 'static, + Fa: Factory + Send + Sync + 'static, + >( + space_store: Arc>, + runtime: Arc, + config: SchedulerConfig, + ) -> Self { + let (tx, rx) = mpsc::channel(config.schedule_channel_len); + let running = Arc::new(AtomicBool::new(true)); + + let mut worker = ScheduleWorker { + sender: tx.clone(), + receiver: rx, + space_store, + runtime: runtime.clone(), + schedule_interval: config.schedule_interval.0, + picker_manager: PickerManager::default(), + tables_buf: Vec::new(), + max_ongoing_tasks: 
config.max_ongoing_tasks, + limit: Arc::new(OngoingTaskLimit { + ongoing_tasks: AtomicUsize::new(0), + request_buf: RwLock::new(RequestQueue::default()), + }), + running: running.clone(), + }; + + let handle = runtime.spawn(async move { + worker.schedule_loop().await; + }); + + Self { + sender: tx, + running, + handle: Mutex::new(handle), + } + } +} + +#[async_trait] +impl CompactionScheduler for SchedulerImpl { + async fn stop_scheduler(&self) -> Result<()> { + self.running.store(false, Ordering::Relaxed); + // Wake up the receiver, if the channel is full, the worker should be busy and + // check the running flag later. + let _ = self.sender.try_send(ScheduleTask::Exit); + + let mut handle = self.handle.lock().await; + (&mut *handle).await.context(JoinWorker)?; + + Ok(()) + } + + async fn schedule_table_compaction(&self, request: TableCompactionRequest) { + let send_res = self.sender.send(ScheduleTask::Request(request)).await; + + if let Err(e) = send_res { + error!("Compaction scheduler failed to send request, err:{}", e); + } + } +} + +struct OngoingTask { + limit: Arc, + sender: Sender, +} + +impl OngoingTask { + async fn schedule_worker_if_need(&self) { + if self.limit.has_pending_requests() { + if let Err(e) = self.sender.send(ScheduleTask::Schedule).await { + error!("Fail to schedule worker, err:{}", e); + } + } + } +} + +struct ScheduleWorker { + sender: Sender, + receiver: Receiver, + space_store: Arc>, + runtime: Arc, + schedule_interval: Duration, + picker_manager: PickerManager, + /// Buffer to hold all tables. 
+ tables_buf: Vec, + max_ongoing_tasks: usize, + limit: Arc, + running: Arc, +} + +#[inline] +async fn schedule_table_compaction(sender: Sender, request: TableCompactionRequest) { + if let Err(e) = sender.send(ScheduleTask::Request(request)).await { + error!("Fail to send table compaction request, err:{}", e); + } +} + +impl< + Wal: Send + Sync + 'static, + Meta: Manifest + Send + Sync + 'static, + Store: ObjectStore + Send + Sync + 'static, + Fa: Factory + Send + Sync + 'static, + > ScheduleWorker +{ + async fn schedule_loop(&mut self) { + while self.running.load(Ordering::Relaxed) { + // TODO(yingwen): Maybe add a random offset to the interval. + match time::timeout(self.schedule_interval, self.receiver.recv()).await { + Ok(Some(schedule_task)) => { + self.handle_schedule_task(schedule_task).await; + } + Ok(None) => { + // The channel is disconnected. + info!("Channel disconnected, compaction schedule worker exit"); + break; + } + Err(_) => { + // Timeout. + info!("Periodical compaction schedule start"); + + self.full_ttl_purge(); + + info!("Periodical compaction schedule end"); + } + } + } + + info!("Compaction schedule loop exit"); + } + + // This function is called seqentially, so we can mark files in compaction + // without racy. 
+ async fn handle_schedule_task(&self, schedule_task: ScheduleTask) { + let ongoing = self.limit.ongoing_tasks(); + match schedule_task { + ScheduleTask::Request(compact_req) => { + debug!("Ongoing compaction tasks:{}", ongoing); + if ongoing >= self.max_ongoing_tasks { + self.limit.add_request(compact_req); + warn!( + "Too many compaction ongoing tasks:{}, max:{}, buf_len:{}", + ongoing, + self.max_ongoing_tasks, + self.limit.request_buf_len() + ); + } else { + self.do_table_compaction_request(compact_req).await; + } + } + ScheduleTask::Schedule => { + if self.max_ongoing_tasks > ongoing { + let pending = self.limit.drain_requests(self.max_ongoing_tasks - ongoing); + let len = pending.len(); + for compact_req in pending { + self.do_table_compaction_request(compact_req).await; + } + debug!("Scheduled {} pending compaction tasks.", len); + } + } + ScheduleTask::Exit => (), + }; + } + + async fn do_table_compaction_request(&self, compact_req: TableCompactionRequest) { + let table_data = compact_req.table_data; + let compaction_notifier = compact_req.compaction_notifier; + let waiter_notifier = WaiterNotifier::new(compact_req.waiter); + + let table_options = table_data.table_options(); + let compaction_strategy = table_options.compaction_strategy; + let picker = self.picker_manager.get_picker(compaction_strategy); + let picker_ctx = match new_picker_context(&*table_options) { + Some(v) => v, + None => { + warn!("No valid context can be created, compaction request will be ignored, table_id:{}, table_name:{}", + table_data.id, table_data.name); + return; + } + }; + let version = table_data.current_version(); + + // Pick compaction task. 
+ let compaction_task = version.pick_for_compaction(picker_ctx, &picker); + let compaction_task = match compaction_task { + Ok(v) => v, + Err(e) => { + error!( + "Compaction scheduler failed to pick compaction, table:{}, table_id:{}, err:{}", + table_data.name, table_data.id, e + ); + // Now the error of picking compaction is considered not fatal and not sent to + // compaction notifier. + return; + } + }; + + // Mark files are in compaction. + compaction_task.mark_files_being_compacted(true); + + let keep_scheduling_compaction = !compaction_task.compaction_inputs.is_empty(); + + let runtime = self.runtime.clone(); + let space_store = self.space_store.clone(); + self.limit.start_task(); + let task = OngoingTask { + sender: self.sender.clone(), + limit: self.limit.clone(), + }; + + let sender = self.sender.clone(); + let request_id = RequestId::next_id(); + // Do actual costly compact job in background. + self.runtime.spawn(async move { + let res = space_store + .compact_table(runtime, &table_data, request_id, &compaction_task) + .await; + + if let Err(e) = &res { + // Compaction is failed, we need to unset the compaction mark. + compaction_task.mark_files_being_compacted(false); + + error!( + "Failed to compact table, table_name:{}, table_id:{}, request_id:{}, err:{}", + table_data.name, table_data.id, request_id, e + ); + } + + task.limit.finish_task(); + task.schedule_worker_if_need().await; + + // Notify the background compact table result. 
+ match res { + Ok(()) => { + let new_compaction_notifier = compaction_notifier.clone(); + compaction_notifier.notify_ok(); + waiter_notifier.notify_wait_result(Ok(())); + + if keep_scheduling_compaction { + schedule_table_compaction( + sender, + TableCompactionRequest::no_waiter( + table_data.clone(), + new_compaction_notifier, + ), + ) + .await; + } + } + Err(e) => { + let e = Arc::new(e); + compaction_notifier.notify_err(e.clone()); + let wait_err = WaitError::Compaction { source: e }; + waiter_notifier.notify_wait_result(Err(wait_err)); + } + } + }); + } + + fn full_ttl_purge(&mut self) { + self.tables_buf.clear(); + self.space_store.list_all_tables(&mut self.tables_buf); + + let mut to_purge = Vec::new(); + + let now = Timestamp::now(); + for table_data in &self.tables_buf { + let expire_time = table_data + .table_options() + .ttl() + .map(|ttl| now.sub_duration_or_min(ttl.0)); + + let version = table_data.current_version(); + if !version.has_expired_sst(expire_time) { + debug!( + "Table has no expired sst, table:{}, table_id:{}, expire_time:{:?}", + table_data.name, table_data.id, expire_time + ); + + continue; + } + + // Create a compaction task that only purge expired files. + let compaction_task = CompactionTask { + expired: version.expired_ssts(expire_time), + ..Default::default() + }; + + // Marks being compacted. + compaction_task.mark_files_being_compacted(true); + + to_purge.push((table_data.clone(), compaction_task)); + } + + let runtime = self.runtime.clone(); + let space_store = self.space_store.clone(); + let request_id = RequestId::next_id(); + // Spawn a background job to purge ssts and avoid schedule thread blocked. 
+ self.runtime.spawn(async move { + for (table_data, compaction_task) in to_purge { + info!("Period purge expired files, table:{}, table_id:{}, request_id:{}", table_data.name, table_data.id, request_id); + + if let Err(e) = space_store + .compact_table(runtime.clone(), &table_data, request_id, &compaction_task) + .await + { + error!( + "Failed to purge expired files of table, table:{}, table_id:{}, request_id:{}, err:{}", + table_data.name, table_data.id, request_id, e + ); + + // Unset the compaction mark. + compaction_task.mark_files_being_compacted(false); + } + } + }); + } +} + +// If segment duration is None, then no compaction should be triggered, but we +// return a None context instead of panic here. +fn new_picker_context(table_opts: &TableOptions) -> Option { + table_opts + .segment_duration() + .map(|segment_duration| PickerContext { + segment_duration, + ttl: table_opts.ttl().map(|ttl| ttl.0), + strategy: table_opts.compaction_strategy, + }) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_request_queue() { + let mut q: RequestQueue = RequestQueue::default(); + assert!(q.is_empty()); + assert_eq!(0, q.len()); + + q.push_back(1, "task1".to_string()); + q.push_back(2, "task2".to_string()); + q.push_back(3, "task3".to_string()); + + assert_eq!(3, q.len()); + assert!(!q.is_empty()); + + assert_eq!("task1", q.pop_front().unwrap()); + assert_eq!("task2", q.pop_front().unwrap()); + assert_eq!("task3", q.pop_front().unwrap()); + assert!(q.pop_front().is_none()); + assert!(q.is_empty()); + + q.push_back(1, "task1".to_string()); + q.push_back(2, "task2".to_string()); + q.push_back(3, "task3".to_string()); + q.push_back(1, "task11".to_string()); + q.push_back(3, "task33".to_string()); + q.push_back(3, "task333".to_string()); + + assert_eq!(3, q.len()); + assert_eq!("task11", q.pop_front().unwrap()); + assert_eq!("task2", q.pop_front().unwrap()); + assert_eq!("task333", q.pop_front().unwrap()); + assert!(q.pop_front().is_none()); + 
assert!(q.is_empty()); + assert_eq!(0, q.len()); + } +} diff --git a/analytic_engine/src/context.rs b/analytic_engine/src/context.rs new file mode 100644 index 0000000000..60f2ef17c5 --- /dev/null +++ b/analytic_engine/src/context.rs @@ -0,0 +1,38 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Context for instance + +use std::{fmt, sync::Arc}; + +use parquet::{DataCacheRef, MetaCacheRef}; +use table_engine::engine::EngineRuntimes; + +use crate::Config; + +/// Common context for instance +pub struct CommonContext { + pub db_write_buffer_size: usize, + pub space_write_buffer_size: usize, +} + +/// Context for instance open +pub struct OpenContext { + /// Engine config + pub config: Config, + + /// Background job runtime + pub runtimes: Arc, + + /// Sst meta data cache. + pub meta_cache: Option, + /// Sst page cache. + pub data_cache: Option, +} + +impl fmt::Debug for OpenContext { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("OpenContext") + .field("config", &self.config) + .finish() + } +} diff --git a/analytic_engine/src/engine.rs b/analytic_engine/src/engine.rs new file mode 100644 index 0000000000..82e785186b --- /dev/null +++ b/analytic_engine/src/engine.rs @@ -0,0 +1,163 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Implements the TableEngine trait + +use std::sync::Arc; + +use async_trait::async_trait; +use log::info; +use object_store::ObjectStore; +use snafu::ResultExt; +use table_engine::{ + engine::{Close, CreateTableRequest, DropTableRequest, OpenTableRequest, Result, TableEngine}, + table::TableRef, + ANALYTIC_ENGINE_TYPE, +}; +use wal::manager::WalManager; + +use crate::{ + context::CommonContext, instance::InstanceRef, meta::Manifest, space::SpaceName, + sst::factory::Factory, table::TableImpl, +}; + +/// TableEngine implementation +pub struct TableEngineImpl { + /// Instance of the table engine + instance: InstanceRef, +} + +impl Clone for TableEngineImpl { + fn clone(&self) -> Self { + Self { + instance: self.instance.clone(), + } + } +} + +impl< + Wal: WalManager + Send + Sync + 'static, + Meta: Manifest + Send + Sync + 'static, + Store: ObjectStore, + Fa, + > TableEngineImpl +{ + pub fn new(instance: InstanceRef) -> Self { + Self { instance } + } +} + +impl TableEngineImpl { + pub fn instance(&self) -> InstanceRef { + self.instance.clone() + } +} + +impl Drop for TableEngineImpl { + fn drop(&mut self) { + info!("Table engine dropped"); + } +} + +#[async_trait] +impl< + Wal: WalManager + Send + Sync + 'static, + Meta: Manifest + Send + Sync + 'static, + Store: ObjectStore, + Fa: Factory + Send + Sync + 'static, + > TableEngine for TableEngineImpl +{ + fn engine_type(&self) -> &str { + ANALYTIC_ENGINE_TYPE + } + + async fn close(&self) -> Result<()> { + info!("Try to close table engine"); + + // Close the instance. 
+ self.instance + .close() + .await + .map_err(|e| Box::new(e) as _) + .context(Close)?; + + info!("Table engine closed"); + + Ok(()) + } + + async fn create_table(&self, request: CreateTableRequest) -> Result { + let space = build_space_name(&request.catalog_name, &request.schema_name); + + info!( + "Table engine impl create table, space:{}, request:{:?}", + space, request + ); + + let ctx = CommonContext { + db_write_buffer_size: self.instance.db_write_buffer_size, + space_write_buffer_size: self.instance.space_write_buffer_size, + }; + let space_table = self.instance.create_table(&ctx, &space, request).await?; + + let table_impl = Arc::new(TableImpl::new( + space_table, + self.instance.clone(), + ANALYTIC_ENGINE_TYPE.to_string(), + )); + + Ok(table_impl) + } + + async fn drop_table(&self, request: DropTableRequest) -> Result { + let space = build_space_name(&request.catalog_name, &request.schema_name); + + info!( + "Table engine impl drop table, space:{}, request:{:?}", + space, request + ); + + let ctx = CommonContext { + db_write_buffer_size: self.instance.db_write_buffer_size, + space_write_buffer_size: self.instance.space_write_buffer_size, + }; + let dropped = self.instance.drop_table(&ctx, &space, request).await?; + Ok(dropped) + } + + async fn open_table(&self, request: OpenTableRequest) -> Result> { + let space = build_space_name(&request.catalog_name, &request.schema_name); + + info!( + "Table engine impl open table, space:{}, request:{:?}", + space, request + ); + let ctx = CommonContext { + db_write_buffer_size: self.instance.db_write_buffer_size, + space_write_buffer_size: self.instance.space_write_buffer_size, + }; + let space_table = match self + .instance + .find_table(&ctx, &space, &request.table_name)? 
+ { + Some(v) => v, + None => return Ok(None), + }; + + let table_impl = Arc::new(TableImpl::new( + space_table, + self.instance.clone(), + ANALYTIC_ENGINE_TYPE.to_string(), + )); + + Ok(Some(table_impl)) + } +} + +/// Build the space name from catalog and schema +// TODO(yingwen): Should we store the => space mapping in the +// system catalog, then put it in the CreateTableRequest, avoid generating space +// name here +fn build_space_name(catalog: &str, schema: &str) -> SpaceName { + // FIXME(yingwen): Find out a better way to create space name + format!("{}/{}", catalog, schema) +} diff --git a/analytic_engine/src/instance/alter.rs b/analytic_engine/src/instance/alter.rs new file mode 100644 index 0000000000..e7ee9f6c42 --- /dev/null +++ b/analytic_engine/src/instance/alter.rs @@ -0,0 +1,289 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Alter schema logic of instance + +use std::{collections::HashMap, sync::Arc}; + +use common_types::schema::Version; +use common_util::define_result; +use log::info; +use object_store::ObjectStore; +use snafu::{ensure, Backtrace, ResultExt, Snafu}; +use table_engine::table::AlterSchemaRequest; +use tokio::sync::oneshot; +use wal::manager::WalManager; + +use crate::{ + instance::{ + flush_compaction::TableFlushOptions, + write_worker, + write_worker::{AlterOptionsCommand, AlterSchemaCommand, WorkerLocal}, + Instance, + }, + meta::{ + meta_update::{AlterOptionsMeta, AlterSchemaMeta, MetaUpdate}, + Manifest, + }, + space::SpaceAndTable, + sst::factory::Factory, + table::data::TableDataRef, + table_options, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to alter schema, source:{}", source,))] + AlterSchema { source: write_worker::Error }, + + #[snafu(display("Failed to alter options, source:{}", source,))] + AlterOptions { source: write_worker::Error }, + + #[snafu(display( + "Try to update schema to elder version, table:{}, current_version:{}, 
given_version:{}.\nBacktrace:\n{}", + table, + current_version, + given_version, + backtrace, + ))] + InvalidSchemaVersion { + table: String, + current_version: Version, + given_version: Version, + backtrace: Backtrace, + }, + + #[snafu(display( + "Invalid previous schema version, table:{}, current_version:{}, pre_version:{}.\nBacktrace:\n{}", + table, + current_version, + pre_version, + backtrace, + ))] + InvalidPreVersion { + table: String, + current_version: Version, + pre_version: Version, + backtrace: Backtrace, + }, + + #[snafu(display("Alter schema of a dropped table:{}", table))] + AlterDroppedTable { table: String }, + + #[snafu(display("Failed to flush table, table:{}, err:{}", table, source))] + FlushTable { + table: String, + source: crate::instance::flush_compaction::Error, + }, + + #[snafu(display("Failed to persist alter update, err:{}", source))] + PersistAlter { + source: Box, + }, + + #[snafu(display("Invalid options, table:{}, err:{}", table, source))] + InvalidOptions { + table: String, + source: Box, + }, +} + +define_result!(Error); + +impl< + Wal: WalManager + Send + Sync + 'static, + Meta: Manifest + Send + Sync + 'static, + Store: ObjectStore, + Fa: Factory + Send + Sync + 'static, + > Instance +{ + // Alter schema need to be handled by write worker. + pub async fn alter_schema_of_table( + &self, + space_table: &SpaceAndTable, + request: AlterSchemaRequest, + ) -> Result<()> { + info!( + "Instance alter schema, space_table:{:?}, request:{:?}", + space_table, request + ); + + // Create a oneshot channel to send/receive alter schema result. 
+ let (tx, rx) = oneshot::channel(); + let cmd = AlterSchemaCommand { + space_table: space_table.clone(), + request, + tx, + }; + + // Send alter schema request to write worker, actual work done in + // Self::process_alter_schema_command() + write_worker::process_command_in_write_worker( + cmd.into_command(), + space_table.table_data(), + rx, + ) + .await + .context(AlterSchema) + } + + /// Do the actual alter schema job, must be called by write worker in write + /// thread sequentially. + pub(crate) async fn process_alter_schema_command( + self: &Arc, + worker_local: &mut WorkerLocal, + space_table: &SpaceAndTable, + request: AlterSchemaRequest, + ) -> Result<()> { + let table_data = space_table.table_data(); + // Validate alter schema request. + self.validate_before_alter(table_data, &request)?; + + let opts = TableFlushOptions { + block_on_write_thread: true, + ..Default::default() + }; + // We are in write thread now and there is no write request being processed, but + // we need to trigger a flush to ensure all wal entries with old schema + // are flushed, so we won't need to handle them during replaying wal. + self.flush_table_in_worker(worker_local, table_data, opts) + .await + .context(FlushTable { + table: &table_data.name, + })?; + + // Now we can persist and update the schema, since this function is called by + // write worker, so there is no other concurrent writer altering the + // schema. + let meta_update = MetaUpdate::AlterSchema(AlterSchemaMeta { + space_id: space_table.space().id, + table_id: table_data.id, + schema: request.schema.clone(), + pre_schema_version: request.pre_schema_version, + }); + self.space_store + .manifest + .store_update(meta_update) + .await + .map_err(|e| Box::new(e) as _) + .context(PersistAlter)?; + + info!( + "Instance update table schema, new_schema:{:?}", + request.schema + ); + + // Update schema in memory. 
+ table_data.set_schema(request.schema); + + Ok(()) + } + + // Most validation should be done by catalog module, so we don't do too much + // duplicate check here, especially the schema compatibility. + fn validate_before_alter( + &self, + table_data: &TableDataRef, + request: &AlterSchemaRequest, + ) -> Result<()> { + ensure!( + !table_data.is_dropped(), + AlterDroppedTable { + table: &table_data.name, + } + ); + + let current_version = table_data.schema_version(); + ensure!( + current_version < request.schema.version(), + InvalidSchemaVersion { + table: &table_data.name, + current_version, + given_version: request.schema.version(), + } + ); + + ensure!( + current_version == request.pre_schema_version, + InvalidPreVersion { + table: &table_data.name, + current_version, + pre_version: request.pre_schema_version, + } + ); + + Ok(()) + } + + pub async fn alter_options_of_table( + &self, + space_table: &SpaceAndTable, + options: HashMap, + ) -> Result<()> { + info!( + "Instance alter options of table, space_table:{:?}, options:{:?}", + space_table, options + ); + + // Create a oneshot channel to send/receive alter options result. + let (tx, rx) = oneshot::channel(); + let cmd = AlterOptionsCommand { + space_table: space_table.clone(), + options, + tx, + }; + + // Send alter options request to write worker, actual works done in + // Self::process_alter_options_command() + write_worker::process_command_in_write_worker( + cmd.into_command(), + space_table.table_data(), + rx, + ) + .await + .context(AlterOptions) + } + + /// Do the actual alter options job, must called by write worker in write + /// thread sequentially. 
+ pub(crate) async fn process_alter_options_command( + self: &Arc, + worker_local: &mut WorkerLocal, + space_table: &SpaceAndTable, + options: HashMap, + ) -> Result<()> { + let table_data = space_table.table_data(); + let current_table_options = table_data.table_options(); + info!( + "Instance alter options, space:{:?}, tables:{:?}, old_table_opts:{:?}, options:{:?}", + space_table.space().name, + space_table.table_data().name, + current_table_options, + options + ); + let mut table_opts = + table_options::merge_table_options_for_alter(&options, &*current_table_options) + .map_err(|e| Box::new(e) as _) + .context(InvalidOptions { + table: &table_data.name, + })?; + table_opts.sanitize(); + + // Now we can persist and update the options, since this function is called by + // write worker, so there is no other concurrent writer altering the + // options. + let meta_update = MetaUpdate::AlterOptions(AlterOptionsMeta { + space_id: space_table.space().id, + table_id: table_data.id, + options: table_opts.clone(), + }); + self.space_store + .manifest + .store_update(meta_update) + .await + .map_err(|e| Box::new(e) as _) + .context(PersistAlter)?; + + table_data.set_table_options(worker_local, table_opts); + Ok(()) + } +} diff --git a/analytic_engine/src/instance/close.rs b/analytic_engine/src/instance/close.rs new file mode 100644 index 0000000000..6ae34f4eb5 --- /dev/null +++ b/analytic_engine/src/instance/close.rs @@ -0,0 +1,93 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Close table logic of instance + +use std::sync::Arc; + +use log::{info, warn}; +use object_store::ObjectStore; +use snafu::ResultExt; +use table_engine::engine::CloseTableRequest; +use tokio::sync::oneshot; +use wal::manager::WalManager; + +use crate::{ + instance::{ + engine::{FlushTable, OperateByWriteWorker, Result}, + flush_compaction::TableFlushOptions, + write_worker::{self, CloseTableCommand, WorkerLocal}, + Instance, + }, + meta::Manifest, + space::SpaceRef, + sst::factory::Factory, +}; + +impl Instance +where + Wal: WalManager + Send + Sync + 'static, + Meta: Manifest + Send + Sync + 'static, + Store: ObjectStore, + Fa: Factory + Send + Sync + 'static, +{ + /// Close table need to be handled by write worker. + pub async fn do_close_table(&self, space: SpaceRef, request: CloseTableRequest) -> Result<()> { + info!("Instance close table, request:{:?}", request); + + let table_data = match space.find_table_by_id(request.table_id) { + Some(v) => v, + None => return Ok(()), + }; + + let (tx, rx) = oneshot::channel::>(); + let cmd = CloseTableCommand { space, request, tx }; + write_worker::process_command_in_write_worker(cmd.into_command(), &table_data, rx) + .await + .context(OperateByWriteWorker { + space_id: table_data.space_id, + table: &table_data.name, + table_id: table_data.id, + }) + } + + /// Do the actual close table job, must be called by write worker in write + /// thread sequentially. + pub(crate) async fn process_close_table_command( + self: &Arc, + worker_local: &mut WorkerLocal, + space: SpaceRef, + request: CloseTableRequest, + ) -> Result<()> { + let table_data = match space.find_table_by_id(request.table_id) { + Some(v) => v, + None => { + warn!("try to close a closed table, request:{:?}", request); + return Ok(()); + } + }; + + let opts = TableFlushOptions { + block_on_write_thread: true, + // The table will be dropped, no need to trigger a compaction. 
+ compact_after_flush: false, + ..Default::default() + }; + self.flush_table_in_worker(worker_local, &table_data, opts) + .await + .context(FlushTable { + space_id: space.id, + table: &table_data.name, + table_id: table_data.id, + })?; + + // table has been closed so remove it from the space + let removed_table = space.remove_table(&request.table_name); + assert!(removed_table.is_some()); + + info!( + "table:{}-{} has been removed from the space_id:{}", + table_data.name, table_data.id, space.id + ); + Ok(()) + } +} diff --git a/analytic_engine/src/instance/create.rs b/analytic_engine/src/instance/create.rs new file mode 100644 index 0000000000..1597982f27 --- /dev/null +++ b/analytic_engine/src/instance/create.rs @@ -0,0 +1,131 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Create table logic of instance + +use std::sync::Arc; + +use log::info; +use object_store::ObjectStore; +use snafu::ResultExt; +use table_engine::engine::CreateTableRequest; +use tokio::sync::oneshot; +use wal::manager::WalManager; + +use crate::{ + instance::{ + engine::{CreateTableData, InvalidOptions, OperateByWriteWorker, Result, WriteManifest}, + write_worker::{self, CreateTableCommand, WorkerLocal}, + Instance, + }, + meta::{ + meta_update::{AddTableMeta, MetaUpdate}, + Manifest, + }, + space::SpaceRef, + sst::factory::Factory, + table::data::{TableData, TableDataRef}, + table_options, +}; + +impl Instance +where + Wal: WalManager + Send + Sync + 'static, + Meta: Manifest + Send + Sync + 'static, + Store: ObjectStore, + Fa: Factory + Send + Sync + 'static, +{ + /// Create table need to be handled by write worker. 
+ pub async fn do_create_table( + &self, + space: SpaceRef, + request: CreateTableRequest, + ) -> Result { + info!("Instance create table, request:{:?}", request); + + let mut table_opts = + table_options::merge_table_options_for_create(&request.options, &self.table_opts) + .map_err(|e| Box::new(e) as _) + .context(InvalidOptions { + space_id: space.id, + table: &request.table_name, + table_id: request.table_id, + })?; + // Sanitize options before creating table. + table_opts.sanitize(); + + if let Some(table_data) = space.find_table_by_id(request.table_id) { + return Ok(table_data); + } + + // Choose a write worker for this table + let write_handle = space.write_group.choose_worker(request.table_id); + let (table_name, table_id) = (request.table_name.clone(), request.table_id); + + let table_data = Arc::new( + TableData::new( + space.id, + request, + write_handle, + table_opts, + &self.file_purger, + space.mem_usage_collector.clone(), + ) + .context(CreateTableData { + space_id: space.id, + table: &table_name, + table_id, + })?, + ); + + let space_id = space.id; + let (tx, rx) = oneshot::channel(); + let cmd = CreateTableCommand { + space, + table_data: table_data.clone(), + tx, + }; + write_worker::process_command_in_write_worker(cmd.into_command(), &table_data, rx) + .await + .context(OperateByWriteWorker { + space_id, + table: table_name, + table_id: table_data.id, + }) + } + + /// Do the actual create table job, must be called by write worker in write + /// thread sequentially. + pub(crate) async fn process_create_table_command( + self: &Arc, + _worker_local: &mut WorkerLocal, + space: SpaceRef, + table_data: TableDataRef, + ) -> Result { + if let Some(table_data) = space.find_table_by_id(table_data.id) { + // Use the table data from the space instead of the table_data in params. 
+ return Ok(table_data); + }; + + // Store table info into meta + let update = MetaUpdate::AddTable(AddTableMeta { + space_id: space.id, + table_id: table_data.id, + table_name: table_data.name.clone(), + schema: table_data.schema(), + opts: table_data.table_options().as_ref().clone(), + }); + self.space_store + .manifest + .store_update(update) + .await + .map_err(|e| Box::new(e) as _) + .context(WriteManifest { + space_id: space.id, + table: &table_data.name, + table_id: table_data.id, + })?; + + space.insert_table(table_data.clone()); + Ok(table_data) + } +} diff --git a/analytic_engine/src/instance/drop.rs b/analytic_engine/src/instance/drop.rs new file mode 100644 index 0000000000..899d937524 --- /dev/null +++ b/analytic_engine/src/instance/drop.rs @@ -0,0 +1,152 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Drop table logic of instance + +use std::sync::Arc; + +use common_util::define_result; +use log::{info, warn}; +use object_store::ObjectStore; +use snafu::{ResultExt, Snafu}; +use table_engine::engine::DropTableRequest; +use tokio::sync::oneshot; +use wal::manager::WalManager; + +use crate::{ + instance::{ + flush_compaction::TableFlushOptions, + write_worker::{self, DropTableCommand, WorkerLocal}, + Instance, + }, + meta::{ + meta_update::{DropTableMeta, MetaUpdate}, + Manifest, + }, + space::SpaceAndTable, + sst::factory::Factory, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "Failed to drop table space:{}, table:{}, err:{}", + space, + table, + source, + ))] + DropTable { + space: String, + table: String, + source: write_worker::Error, + }, + + #[snafu(display("Flush before drop failed, table:{}, err:{}", table, source))] + FlushTable { + table: String, + source: crate::instance::flush_compaction::Error, + }, + + #[snafu(display("Failed to persist drop table update, err:{}", source))] + PersistDrop { + source: Box, + }, +} + +define_result!(Error); + +impl< + Wal: WalManager + Send + Sync + 
'static, + Meta: Manifest + Send + Sync + 'static, + Store: ObjectStore, + Fa: Factory + Send + Sync + 'static, + > Instance +{ + /// Drop table need to be handled by write worker. + pub async fn do_drop_table( + &self, + space_table: SpaceAndTable, + request: DropTableRequest, + ) -> Result<()> { + info!( + "Instance drop table, space_table:{:?}, request:{:?}", + space_table, request + ); + + // Create a oneshot channel to send/receive alter schema result. + let (tx, rx) = oneshot::channel(); + let cmd = DropTableCommand { + space_table: space_table.clone(), + request, + tx, + }; + + write_worker::process_command_in_write_worker( + cmd.into_command(), + space_table.table_data(), + rx, + ) + .await + .context(DropTable { + space: &space_table.space().name, + table: &space_table.table_data().name, + })?; + + Ok(()) + } + + /// Do the actual drop table job, must be called by write worker in write + /// thread sequentially. + pub(crate) async fn process_drop_table_command( + self: &Arc, + worker_local: &mut WorkerLocal, + space_table: &SpaceAndTable, + _request: DropTableRequest, + ) -> Result<()> { + let table_data = space_table.table_data(); + if table_data.is_dropped() { + warn!( + "Process drop table command tries to drop a dropped table, space_table:{:?}", + space_table + ); + return Ok(()); + } + + // Fixme(xikai): Trigger a force flush so that the data of the table in the wal + // is marked for deletable. However, the overhead of the flushing can + // be avoided. + let opts = TableFlushOptions { + block_on_write_thread: true, + // The table will be dropped, no need to trigger a compaction. 
+ compact_after_flush: false, + ..Default::default() + }; + self.flush_table_in_worker(worker_local, table_data, opts) + .await + .context(FlushTable { + table: &table_data.name, + })?; + + // Store the dropping information into meta + let update = MetaUpdate::DropTable(DropTableMeta { + space_id: space_table.space().id, + table_id: table_data.id, + table_name: table_data.name.clone(), + }); + self.space_store + .manifest + .store_update(update) + .await + .map_err(|e| Box::new(e) as _) + .context(PersistDrop)?; + + // Set the table dropped after finishing flushing and storing drop table meta + // information. + table_data.set_dropped(); + + // Clear the memory status after updating manifest and clearing wal so that + // the drop is retryable if fails to update and clear. + space_table.space().remove_table(&table_data.name); + + Ok(()) + } +} diff --git a/analytic_engine/src/instance/engine.rs b/analytic_engine/src/instance/engine.rs new file mode 100644 index 0000000000..a96895070e --- /dev/null +++ b/analytic_engine/src/instance/engine.rs @@ -0,0 +1,230 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Table engine logic of instance + +use std::sync::Arc; + +use common_util::define_result; +use log::info; +use object_store::ObjectStore; +use snafu::{ResultExt, Snafu}; +use table_engine::engine::{CreateTableRequest, DropTableRequest}; +use wal::manager::WalManager; + +use crate::{ + context::CommonContext, + instance::{write_worker::WriteGroup, Instance}, + meta::{ + meta_update::{AddSpaceMeta, MetaUpdate}, + Manifest, + }, + space::{Space, SpaceAndTable, SpaceNameRef, SpaceRef}, + sst::factory::Factory, + table_options, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Space failed to create table, err:{}", source))] + SpaceCreateTable { source: crate::space::Error }, + + #[snafu(display("Failed to drop table, err:{}", source))] + DoDropTable { + source: crate::instance::drop::Error, + }, + + #[snafu(display("Failed to store meta of space, space:{}, err:{}", space, source))] + SpaceWriteMeta { + space: String, + source: Box, + }, + #[snafu(display("Invalid options, table:{}, err:{}", table, source))] + InvalidOptions { + table: String, + source: Box, + }, +} + +define_result!(Error); + +impl From for table_engine::engine::Error { + fn from(err: Error) -> Self { + match err { + Error::SpaceCreateTable { source } => Self::from(source), + + // FIXME(xikai): should map drop table error to a more reasonable table engine error. + Error::DoDropTable { .. } => Self::Unexpected { + source: Box::new(err), + }, + + Error::SpaceWriteMeta { .. } => Self::WriteMeta { + source: Box::new(err), + }, + + Error::InvalidOptions { ref table, .. 
} => Self::InvalidArguments { + table: table.clone(), + source: Box::new(err), + }, + } + } +} + +impl< + Wal: WalManager + Send + Sync + 'static, + Meta: Manifest + Send + Sync + 'static, + Store: ObjectStore, + Fa: Factory + Send + Sync + 'static, + > Instance +{ + /// Find space by name, create if the space is not exists + pub async fn find_or_create_space( + self: &Arc, + _ctx: &CommonContext, + space_name: SpaceNameRef<'_>, + ) -> Result { + // Find space first + if let Some(space) = self.get_space_by_read_lock(space_name) { + return Ok(space); + } + + // Persist space data into meta, done with `meta_state` guarded + let mut meta_state = self.space_store.meta_state.lock().await; + // The space may already been created by other thread + if let Some(space) = self.get_space_by_read_lock(space_name) { + return Ok(space); + } + // Now we are the one responsible to create and persist the space info into meta + + let space_id = meta_state.alloc_space_id(); + // Create write group for the space + // TODO(yingwen): Expose options + let write_group_opts = self.write_group_options(space_id); + let write_group = WriteGroup::new(write_group_opts, self.clone()); + + // Create space + let space = Arc::new(Space::new( + space_id, + space_name.to_string(), + self.space_write_buffer_size, + write_group, + self.mem_usage_collector.clone(), + )); + + // Create a meta update and store it + let update = MetaUpdate::AddSpace(AddSpaceMeta { + space_id, + space_name: space_name.to_string(), + }); + info!("Instance create space, update:{:?}", update); + self.space_store + .manifest + .store_update(update) + .await + .map_err(|e| Box::new(e) as _) + .context(SpaceWriteMeta { space: space_name })?; + + let mut spaces = self.space_store.spaces.write().unwrap(); + spaces.insert(space_name.to_string(), space.clone()); + // Now we can release the meta state lock + + Ok(space) + } + + /// Find space by name + pub fn find_space( + &self, + _ctx: &CommonContext, + space: SpaceNameRef, + ) -> 
Result> { + let spaces = self.space_store.spaces.read().unwrap(); + Ok(spaces.get_by_name(space).cloned()) + } + + /// Create a table under given space + pub async fn create_table( + self: &Arc, + ctx: &CommonContext, + space: SpaceNameRef<'_>, + request: CreateTableRequest, + ) -> Result { + let mut table_opts = + table_options::merge_table_options_for_create(&request.options, &self.table_opts) + .map_err(|e| Box::new(e) as _) + .context(InvalidOptions { + table: &request.table_name, + })?; + // Sanitize options before creating table. + table_opts.sanitize(); + + info!( + "Instance create table, space:{}, request:{:?}, table_opts:{:?}", + space, request, table_opts + ); + + let space = self.find_or_create_space(ctx, space).await?; + + let table_data = space + .create_table( + request, + &self.space_store.manifest, + &table_opts, + &self.file_purger, + ) + .await + .context(SpaceCreateTable)?; + + Ok(SpaceAndTable::new(space, table_data)) + } + + /// Drop a table under given space + pub async fn drop_table( + self: &Arc, + ctx: &CommonContext, + space: SpaceNameRef<'_>, + request: DropTableRequest, + ) -> Result { + info!( + "Instance drop table, space:{}, request:{:?}", + space, request + ); + + let space = match self.find_space(ctx, space)? { + Some(v) => v, + None => return Ok(false), + }; + + // Checks whether the table is exists + let table = match space.find_table(&request.table_name) { + Some(v) => v, + None => return Ok(false), + }; + + let space_table = SpaceAndTable::new(space.clone(), table); + self.do_drop_table(space_table, request) + .await + .context(DoDropTable)?; + + Ok(true) + } + + /// Find the table under given space by its table name + /// + /// Return None if space or table is not found + pub fn find_table( + &self, + ctx: &CommonContext, + space: SpaceNameRef, + table: &str, + ) -> Result> { + let space = match self.find_space(ctx, space)? 
{ + Some(s) => s, + None => return Ok(None), + }; + + let space_table = space + .find_table(table) + .map(|table_data| SpaceAndTable::new(space, table_data)); + + Ok(space_table) + } +} diff --git a/analytic_engine/src/instance/flush_compaction.rs b/analytic_engine/src/instance/flush_compaction.rs new file mode 100644 index 0000000000..f6fd3debf5 --- /dev/null +++ b/analytic_engine/src/instance/flush_compaction.rs @@ -0,0 +1,1037 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +// Flush and compaction logic of instance + +use std::{cmp, collections::Bound, sync::Arc}; + +use common_types::{ + projected_schema::ProjectedSchema, + record_batch::{RecordBatchWithKey, RecordBatchWithKeyBuilder}, + request_id::RequestId, + row::RowViewOnBatch, + time::TimeRange, + SequenceNumber, +}; +use common_util::{config::ReadableDuration, define_result, runtime::Runtime}; +use futures::{ + channel::{mpsc, mpsc::channel}, + future::try_join_all, + stream, SinkExt, TryStreamExt, +}; +use log::{error, info}; +use object_store::{path::ObjectStorePath, ObjectStore}; +use snafu::{Backtrace, OptionExt, ResultExt, Snafu}; +use table_engine::{predicate::Predicate, table::Result as TableResult}; +use tokio::sync::oneshot; +use wal::manager::{RegionId, WalManager}; + +use crate::{ + compaction::{ + CompactionInputFiles, CompactionTask, ExpiredFiles, TableCompactionRequest, WaitError, + }, + instance::{ + write_worker::{self, CompactTableCommand, FlushTableCommand, WorkerLocal}, + Instance, SpaceStore, + }, + memtable::{ColumnarIterPtr, MemTableRef, ScanContext, ScanRequest}, + meta::{ + meta_update::{AlterOptionsMeta, MetaUpdate, VersionEditMeta}, + Manifest, + }, + row_iter::{ + self, + dedup::DedupIterator, + merge::{MergeBuilder, MergeConfig}, + IterOptions, + }, + space::SpaceAndTable, + sst::{ + builder::RecordBatchStream, + factory::{Factory, SstBuilderOptions, SstReaderOptions, SstType}, + file::{self, FileMeta, SstMetaData}, + }, + table::{ + 
data::{MemTableId, TableData, TableDataRef}, + version::{FlushableMemTables, MemTableState, SamplingMemTable}, + version_edit::{AddFile, DeleteFile, VersionEdit}, + }, +}; + +const DEFAULT_CHANNEL_SIZE: usize = 5; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to store version edit, err:{}", source))] + StoreVersionEdit { + source: Box, + }, + + #[snafu(display("Failed to purge wal, region_id:{}, sequence:{}", region_id, sequence))] + PurgeWal { + region_id: RegionId, + sequence: SequenceNumber, + source: wal::manager::Error, + }, + + #[snafu(display("Failed to build mem table iterator, source:{}", source))] + InvalidMemIter { + source: Box, + }, + + #[snafu(display( + "Sst type is not found, sst_type:{:?}.\nBacktrace:\n{}", + sst_type, + backtrace + ))] + InvalidSstType { + sst_type: SstType, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to build sst, file_path:{}, source:{}", path, source))] + FailBuildSst { + path: String, + source: Box, + }, + + #[snafu(display("Background flush failed, cannot schedule flush task, err:{}", source))] + BackgroundFlushFailed { + source: crate::instance::write_worker::Error, + }, + + #[snafu(display("Failed to send flush command, err:{}", source))] + SendFlushCmd { + source: crate::instance::write_worker::Error, + }, + + #[snafu(display("Failed to send compact command, err:{}", source))] + SendCompactCmd { + source: crate::instance::write_worker::Error, + }, + + #[snafu(display("Failed to build merge iterator, table:{}, err:{}", table, source))] + BuildMergeIterator { + table: String, + source: crate::row_iter::merge::Error, + }, + + #[snafu(display("Failed to do manual compaction, err:{}", source))] + ManualCompactFailed { + source: crate::compaction::WaitError, + }, + + #[snafu(display("Failed to split record batch, source:{}", source))] + SplitRecordBatch { + source: Box, + }, + + #[snafu(display("Failed to send to channel, source:{}", source))] + ChannelSend { source: mpsc::SendError }, + + 
#[snafu(display("Runtime join error, source:{}", source))] + RuntimeJoin { source: common_util::runtime::Error }, +} + +define_result!(Error); + +/// Options to flush single table. +#[derive(Debug)] +pub struct TableFlushOptions { + /// Flush result sender. + /// + /// Default is None. + pub res_sender: Option>>, + /// Schedule a compaction request after flush. + /// + /// Default is true. + pub compact_after_flush: bool, + /// Whether to block on write thread. + /// + /// Default is false. + pub block_on_write_thread: bool, +} + +impl Default for TableFlushOptions { + fn default() -> Self { + Self { + res_sender: None, + compact_after_flush: true, + block_on_write_thread: false, + } + } +} + +/// Request to flush single table. +pub struct TableFlushRequest { + /// Table to flush. + pub table_data: TableDataRef, + /// Max id of memtable to flush (inclusive). + pub max_memtable_id: MemTableId, +} + +impl< + Wal: WalManager + Send + Sync + 'static, + Meta: Manifest + Send + Sync + 'static, + Store: ObjectStore, + Fa: Factory + Send + Sync + 'static, + > Instance +{ + /// Flush this table. + pub async fn flush_table( + &self, + space_table: &SpaceAndTable, + flush_opts: TableFlushOptions, + ) -> Result<()> { + info!( + "Instance flush table, space_table:{:?}, flush_opts:{:?}", + space_table, flush_opts + ); + + // Create a oneshot channel to send/receive flush result. + let (tx, rx) = oneshot::channel(); + let cmd = FlushTableCommand { + space_table: space_table.clone(), + flush_opts, + tx, + }; + + // Actual work is done in flush_table_in_worker(). + write_worker::process_command_in_write_worker( + cmd.into_command(), + space_table.table_data(), + rx, + ) + .await + .context(SendFlushCmd) + } + + /// Compact the table manually. + pub async fn manual_compact_table(&self, space_table: &SpaceAndTable) -> Result<()> { + info!("Instance compact table, space_table:{:?}", space_table); + + // Create a oneshot channel to send/receive result from write worker. 
+ let (tx, rx) = oneshot::channel(); + let (compact_tx, compact_rx) = oneshot::channel(); + // Create a oneshot channel to send/receive compaction result. + let cmd = CompactTableCommand { + space_table: space_table.clone(), + waiter: Some(compact_tx), + tx, + }; + + // The write worker will call schedule_table_compaction(). + write_worker::process_command_in_write_worker( + cmd.into_command(), + space_table.table_data(), + rx, + ) + .await + .context(SendCompactCmd)?; + + // Now wait for compaction done, if the sender has been dropped, we convert it + // into Error::Canceled. + compact_rx + .await + .unwrap_or(Err(WaitError::Canceled)) + .context(ManualCompactFailed) + } + + /// Flush given table in write worker thread. + pub async fn flush_table_in_worker( + self: &Arc, + worker_local: &mut WorkerLocal, + table_data: &TableDataRef, + opts: TableFlushOptions, + ) -> Result<()> { + let flush_req = self.preprocess_flush(worker_local, table_data).await?; + + self.schedule_table_flush(worker_local, flush_req, opts) + .await + } + + async fn preprocess_flush( + &self, + worker_local: &mut WorkerLocal, + table_data: &TableDataRef, + ) -> Result { + let current_version = table_data.current_version(); + let last_sequence = table_data.last_sequence(); + // Switch all mutable memtables + if let Some(suggest_segment_duration) = + current_version.switch_memtables_or_suggest_duration(worker_local) + { + info!("Switch memtable and suggest segment duration, table:{}, table_id:{}, segment_duration:{:?}", table_data.name, table_data.id, suggest_segment_duration); + assert!(suggest_segment_duration.as_millis() > 0); + + let mut new_table_opts = (*table_data.table_options()).clone(); + new_table_opts.segment_duration = Some(ReadableDuration(suggest_segment_duration)); + + // Now persist the new options, the `worker_local` ensure there is no race + // condition. 
+ let meta_update = MetaUpdate::AlterOptions(AlterOptionsMeta { + space_id: table_data.space_id, + table_id: table_data.id, + options: new_table_opts.clone(), + }); + self.space_store + .manifest + .store_update(meta_update) + .await + .map_err(|e| Box::new(e) as _) + .context(StoreVersionEdit)?; + + table_data.set_table_options(worker_local, new_table_opts); + + // Now the segment duration is applied, we can stop sampling and freeze the + // sampling memtable. + current_version.freeze_sampling(worker_local); + } + + info!("Try to trigger memtable flush of table, table:{}, table_id:{}, max_memtable_id:{}, last_sequence:{}", + table_data.name, table_data.id, table_data.last_memtable_id(), last_sequence); + + // Try to flush all memtables of current table + Ok(TableFlushRequest { + table_data: table_data.clone(), + max_memtable_id: table_data.last_memtable_id(), + }) + } + + /// Schedule table flush request to background workers + async fn schedule_table_flush( + self: &Arc, + worker_local: &mut WorkerLocal, + flush_req: TableFlushRequest, + opts: TableFlushOptions, + ) -> Result<()> { + // TODO(yingwen): Store pending flush reqs and retry flush on recoverable error, + // or try to recover from background error + let table_data = flush_req.table_data.clone(); + let table = table_data.name.clone(); + + let instance = self.clone(); + let flush_job = async move { instance.flush_memtables_to_outputs(&flush_req).await }; + + let compact_req = TableCompactionRequest::no_waiter( + table_data.clone(), + worker_local.compaction_notifier(), + ); + let instance = self.clone(); + + if opts.compact_after_flush { + // Schedule compaction if flush completed successfully. 
+ let on_flush_success = async move { + instance.schedule_table_compaction(compact_req).await; + }; + + worker_local + .flush_sequentially( + table, + &table_data.metrics, + flush_job, + on_flush_success, + opts.block_on_write_thread, + opts.res_sender, + ) + .await + .context(BackgroundFlushFailed) + } else { + worker_local + .flush_sequentially( + table, + &table_data.metrics, + flush_job, + async {}, + opts.block_on_write_thread, + opts.res_sender, + ) + .await + .context(BackgroundFlushFailed) + } + } + + /// Caller should guarantee flush of single table is sequential + pub(crate) async fn flush_memtables_to_outputs( + &self, + flush_req: &TableFlushRequest, + ) -> Result<()> { + // TODO(yingwen): Record memtables num to flush as statistics + let TableFlushRequest { + table_data, + max_memtable_id, + } = flush_req; + + let current_version = table_data.current_version(); + let mut mems_to_flush = FlushableMemTables::default(); + + current_version.pick_memtables_to_flush(*max_memtable_id, &mut mems_to_flush); + + if mems_to_flush.is_empty() { + return Ok(()); + } + + let request_id = RequestId::next_id(); + + info!( + "Instance try to flush memtables, table:{}, table_id:{}, request_id:{}, mems_to_flush:{:?}", + table_data.name, table_data.id, request_id, mems_to_flush + ); + + let local_metrics = table_data.metrics.local_flush_metrics(); + // Start flush duration timer. + let _timer = local_metrics.flush_duration_histogram.start_timer(); + let mut files_to_level0 = Vec::with_capacity(mems_to_flush.memtables.len()); + let mut flushed_sequence = 0; + let mut sst_num = 0; + + if let Some(sampling_mem) = &mems_to_flush.sampling_mem { + if let Some(seq) = self + .flush_sampling_memtable( + &*table_data, + request_id, + sampling_mem, + &mut files_to_level0, + ) + .await? 
+ { + flushed_sequence = seq; + sst_num += files_to_level0.len(); + for add_file in &files_to_level0 { + local_metrics.observe_sst_size(add_file.file.meta.size); + } + } + } + + for mem in &mems_to_flush.memtables { + let file = self + .flush_memtable_to_output(&*table_data, request_id, mem) + .await?; + if let Some(file) = file { + let sst_size = file.meta.size; + files_to_level0.push(AddFile { level: 0, file }); + + // Set flushed sequence to max of the last_sequence of memtables. + flushed_sequence = cmp::max(flushed_sequence, mem.last_sequence()); + + sst_num += 1; + // Collect sst size metrics. + local_metrics.observe_sst_size(sst_size); + } + } + + // Collect sst num metrics. + local_metrics.observe_sst_num(sst_num); + + info!( + "Instance flush memtables to output, table:{}, table_id:{}, request_id:{}, mems_to_flush:{:?}, files_to_level0:{:?}, flushed_sequence:{}", + table_data.name, + table_data.id, + request_id, + mems_to_flush, + files_to_level0, + flushed_sequence + ); + + // Persist the flush result to manifest. + let edit_meta = VersionEditMeta { + space_id: table_data.space_id, + table_id: table_data.id, + flushed_sequence, + files_to_add: files_to_level0.clone(), + files_to_delete: Vec::new(), + }; + let meta_update = MetaUpdate::VersionEdit(edit_meta); + self.space_store + .manifest + .store_update(meta_update) + .await + .map_err(|e| Box::new(e) as _) + .context(StoreVersionEdit)?; + + // Apply to the table version. + let mems_to_remove = mems_to_flush.ids(); + let edit = VersionEdit { + flushed_sequence, + mems_to_remove, + files_to_add: files_to_level0, + files_to_delete: Vec::new(), + }; + table_data.current_version().apply_edit(edit); + + // Mark sequence <= flushed_sequence to be deleted. 
+ self.space_store + .wal_manager + .mark_delete_entries_up_to(table_data.wal_region_id(), flushed_sequence) + .await + .context(PurgeWal { + region_id: table_data.wal_region_id(), + sequence: flushed_sequence, + })?; + + info!( + "Instance flush memtables done, table:{}, table_id:{}, request_id:{}", + table_data.name, table_data.id, request_id + ); + + Ok(()) + } + + /// Flush rows in sampling memtable to multiple ssts according to segment + /// duration. + /// + /// Returns flushed sequence. + async fn flush_sampling_memtable( + &self, + table_data: &TableData, + request_id: RequestId, + sampling_mem: &SamplingMemTable, + files_to_level0: &mut Vec, + ) -> Result> { + let (min_key, max_key) = match (sampling_mem.mem.min_key(), sampling_mem.mem.max_key()) { + (Some(min_key), Some(max_key)) => (min_key, max_key), + _ => { + // the memtable is empty and nothing needs flushing. + return Ok(None); + } + }; + + let max_sequence = sampling_mem.mem.last_sequence(); + let time_ranges = sampling_mem.sampler.ranges(); + + info!("Flush sampling memtable, table_id:{:?}, table_name:{:?}, request_id:{}, sampling memtable time_ranges:{:?}", + table_data.id,table_data.name, request_id, time_ranges); + + let mut batch_record_senders = Vec::with_capacity(time_ranges.len()); + let mut sst_handlers = Vec::with_capacity(time_ranges.len()); + let mut file_ids = Vec::with_capacity(time_ranges.len()); + + let sst_builder_options = SstBuilderOptions { + sst_type: table_data.sst_type, + num_rows_per_row_group: table_data.table_options().num_rows_per_row_group, + compression: table_data.table_options().compression, + }; + + for time_range in &time_ranges { + let (batch_record_sender, batch_record_receiver) = + channel::>(DEFAULT_CHANNEL_SIZE); + let file_id = table_data.alloc_file_id(); + let mut sst_file_path = self.space_store.store.new_path(); + table_data.set_sst_file_path(file_id, &mut sst_file_path); + + // TODO: min_key max_key set in sst_builder build + let mut sst_meta = SstMetaData 
{ + min_key: min_key.clone(), + max_key: max_key.clone(), + time_range: *time_range, + max_sequence, + schema: table_data.schema(), + size: 0, + row_num: 0, + }; + + let store = self.space_store.clone(); + let sst_builder_options_clone = sst_builder_options.clone(); + let sst_type = table_data.sst_type; + + // spawn build sst + let handler = self.runtimes.bg_runtime.spawn(async move { + let mut builder = store + .sst_factory + .new_sst_builder( + &sst_builder_options_clone, + &sst_file_path, + store.store_ref(), + ) + .context(InvalidSstType { sst_type })?; + + let sst_info = builder + .build( + request_id, + &sst_meta, + Box::new(batch_record_receiver.map_err(|e| Box::new(e) as _)), + ) + .await + .map_err(|e| { + error!("Failed to build sst file, meta:{:?}, err:{}", sst_meta, e); + Box::new(e) as _ + }) + .with_context(|| FailBuildSst { + path: sst_file_path.display(), + })?; + + // update sst metadata by built info. + sst_meta.row_num = sst_info.row_num as u64; + sst_meta.size = sst_info.file_size as u64; + Ok(sst_meta) + }); + + batch_record_senders.push(batch_record_sender); + sst_handlers.push(handler); + file_ids.push(file_id); + } + + let iter = build_mem_table_iter(sampling_mem.mem.clone(), table_data)?; + + let timestamp_idx = table_data.schema().timestamp_index(); + + for data in iter { + for (idx, record_batch) in split_record_batch_with_time_ranges( + data.map_err(|e| Box::new(e) as _).context(InvalidMemIter)?, + &time_ranges, + timestamp_idx, + )? 
+ .into_iter() + .enumerate() + { + if !record_batch.is_empty() { + batch_record_senders[idx] + .send(Ok(record_batch)) + .await + .context(ChannelSend)?; + } + } + } + batch_record_senders.clear(); + + let ret = try_join_all(sst_handlers).await; + for (idx, sst_meta) in ret.context(RuntimeJoin)?.into_iter().enumerate() { + files_to_level0.push(AddFile { + level: 0, + file: FileMeta { + id: file_ids[idx], + meta: sst_meta?, + }, + }) + } + + Ok(Some(max_sequence)) + } + + async fn flush_memtable_to_output( + &self, + table_data: &TableData, + request_id: RequestId, + memtable_state: &MemTableState, + ) -> Result> { + let (min_key, max_key) = match (memtable_state.mem.min_key(), memtable_state.mem.max_key()) + { + (Some(min_key), Some(max_key)) => (min_key, max_key), + _ => { + // the memtable is empty and nothing needs flushing. + return Ok(None); + } + }; + let max_sequence = memtable_state.last_sequence(); + let mut sst_meta = SstMetaData { + min_key, + max_key, + time_range: memtable_state.time_range, + max_sequence, + schema: table_data.schema(), + size: 0, + row_num: 0, + }; + + // Alloc file id for next sst file + let file_id = table_data.alloc_file_id(); + let mut sst_file_path = self.space_store.store.new_path(); + table_data.set_sst_file_path(file_id, &mut sst_file_path); + + let sst_builder_options = SstBuilderOptions { + sst_type: table_data.sst_type, + num_rows_per_row_group: table_data.table_options().num_rows_per_row_group, + compression: table_data.table_options().compression, + }; + let mut builder = self + .space_store + .sst_factory + .new_sst_builder( + &sst_builder_options, + &sst_file_path, + self.space_store.store_ref(), + ) + .context(InvalidSstType { + sst_type: table_data.sst_type, + })?; + + let iter = build_mem_table_iter(memtable_state.mem.clone(), table_data)?; + + let record_batch_stream: RecordBatchStream = + Box::new(stream::iter(iter).map_err(|e| Box::new(e) as _)); + + let sst_info = builder + .build(request_id, &sst_meta, 
record_batch_stream) + .await + .map_err(|e| { + // TODO(yingwen): Maybe remove this log. + error!("Failed to build sst file, meta:{:?}, err:{}", sst_meta, e); + Box::new(e) as _ + }) + .with_context(|| FailBuildSst { + path: sst_file_path.display(), + })?; + + // update sst metadata by built info. + sst_meta.row_num = sst_info.row_num as u64; + sst_meta.size = sst_info.file_size as u64; + + Ok(Some(FileMeta { + id: file_id, + meta: sst_meta, + })) + } + + /// Schedule table compaction request to background workers and return + /// immediately. + pub async fn schedule_table_compaction(&self, compact_req: TableCompactionRequest) { + self.compaction_scheduler + .schedule_table_compaction(compact_req) + .await; + } +} + +impl SpaceStore { + pub(crate) async fn compact_table( + &self, + runtime: Arc, + table_data: &TableData, + request_id: RequestId, + task: &CompactionTask, + ) -> Result<()> { + let mut edit_meta = VersionEditMeta { + space_id: table_data.space_id, + table_id: table_data.id, + flushed_sequence: 0, + // Use the number of compaction inputs as the estimated number of files to add. + files_to_add: Vec::with_capacity(task.compaction_inputs.len()), + files_to_delete: Vec::new(), + }; + + if task.expired.is_empty() && task.compaction_inputs.is_empty() { + // Nothing to compact. + return Ok(()); + } + + for files in &task.expired { + self.delete_expired_files(table_data, request_id, files, &mut edit_meta); + } + + for input in &task.compaction_inputs { + self.compact_input_files( + runtime.clone(), + table_data, + request_id, + input, + &mut edit_meta, + ) + .await?; + } + + let meta_update = MetaUpdate::VersionEdit(edit_meta.clone()); + self.manifest + .store_update(meta_update) + .await + .map_err(|e| Box::new(e) as _) + .context(StoreVersionEdit)?; + + // Apply to the table version. 
+ let edit = edit_meta.into_version_edit(); + table_data.current_version().apply_edit(edit); + + Ok(()) + } + + pub(crate) async fn compact_input_files( + &self, + runtime: Arc, + table_data: &TableData, + request_id: RequestId, + input: &CompactionInputFiles, + edit_meta: &mut VersionEditMeta, + ) -> Result<()> { + if input.files.is_empty() { + return Ok(()); + } + + // metrics + let _timer = table_data + .metrics + .compaction_duration_histogram + .start_timer(); + table_data + .metrics + .compaction_observe_sst_num(input.files.len()); + let mut sst_size = 0; + let mut sst_row_num = 0; + for file in &input.files { + sst_size += file.size(); + sst_row_num += file.row_num(); + } + table_data + .metrics + .compaction_observe_input_sst_size(sst_size); + table_data + .metrics + .compaction_observe_input_sst_row_num(sst_row_num); + + info!( + "Instance try to compact table, table:{}, table_id:{}, request_id:{}, input_files:{:?}", + table_data.name, table_data.id, request_id, input.files, + ); + + // The schema may be modified during compaction, so we acquire it first and use + // the acquired schema as the compacted sst meta. 
+ let schema = table_data.schema(); + let table_options = table_data.table_options(); + + let iter_options = IterOptions::default(); + let merge_iter = { + let space_id = table_data.space_id; + let table_id = table_data.id; + let sequence = table_data.last_sequence(); + let projected_schema = ProjectedSchema::no_projection(schema.clone()); + let sst_reader_options = SstReaderOptions { + sst_type: table_data.sst_type, + read_batch_row_num: table_options.num_rows_per_row_group, + reverse: false, + projected_schema: projected_schema.clone(), + predicate: Arc::new(Predicate::new(TimeRange::min_to_max())), + meta_cache: self.meta_cache.clone(), + data_cache: self.data_cache.clone(), + runtime: runtime.clone(), + }; + let mut builder = MergeBuilder::new(MergeConfig { + request_id, + space_id, + table_id, + sequence, + projected_schema, + predicate: Arc::new(Predicate::empty()), + sst_factory: self.sst_factory.clone(), + sst_reader_options, + store: self.store_ref(), + merge_iter_options: iter_options.clone(), + need_dedup: table_options.need_dedup(), + reverse: false, + }); + // Add all ssts in compaction input to builder. + builder + .mut_ssts_of_level(input.level) + .extend_from_slice(&input.files); + let merge_iter = builder.build().await.context(BuildMergeIterator { + table: table_data.name.clone(), + })?; + merge_iter + }; + + let record_batch_stream = if table_options.need_dedup() { + row_iter::record_batch_with_key_iter_to_stream( + DedupIterator::new(request_id, merge_iter, iter_options), + &runtime, + ) + } else { + row_iter::record_batch_with_key_iter_to_stream(merge_iter, &runtime) + }; + + let mut sst_meta = file::merge_sst_meta(&input.files, schema); + + // Alloc file id for the merged sst. 
+ let file_id = table_data.alloc_file_id(); + let mut sst_file_path = self.store.new_path(); + table_data.set_sst_file_path(file_id, &mut sst_file_path); + + let sst_builder_options = SstBuilderOptions { + sst_type: table_data.sst_type, + num_rows_per_row_group: table_options.num_rows_per_row_group, + compression: table_options.compression, + }; + let mut sst_builder = self + .sst_factory + .new_sst_builder(&sst_builder_options, &sst_file_path, self.store_ref()) + .context(InvalidSstType { + sst_type: table_data.sst_type, + })?; + + let sst_info = sst_builder + .build(request_id, &sst_meta, record_batch_stream) + .await + .map_err(|e| Box::new(e) as _) + .with_context(|| FailBuildSst { + path: sst_file_path.display(), + })?; + + // update sst metadata by built info. + sst_meta.row_num = sst_info.row_num as u64; + sst_meta.size = sst_info.file_size as u64; + + table_data + .metrics + .compaction_observe_output_sst_size(sst_meta.size); + table_data + .metrics + .compaction_observe_output_sst_row_num(sst_meta.row_num); + + info!( + "Instance files compacted, table:{}, table_id:{}, request_id:{}, output_path:{}, input_files:{:?}, sst_meta:{:?}", + table_data.name, + table_data.id, + request_id, + sst_file_path.display(), + input.files, + sst_meta + ); + + // Store updates to edit_meta. + edit_meta.files_to_delete.reserve(input.files.len()); + // The compacted file can be deleted later. + for file in &input.files { + edit_meta.files_to_delete.push(DeleteFile { + level: input.level, + file_id: file.id(), + }); + } + // Add the newly created file to meta. 
+ edit_meta.files_to_add.push(AddFile { + level: input.output_level, + file: FileMeta { + id: file_id, + meta: sst_meta, + }, + }); + + Ok(()) + } + + pub(crate) fn delete_expired_files( + &self, + table_data: &TableData, + request_id: RequestId, + expired: &ExpiredFiles, + edit_meta: &mut VersionEditMeta, + ) { + if !expired.files.is_empty() { + info!( + "Instance try to delete expired files, table:{}, table_id:{}, request_id:{}, level:{}, files:{:?}", + table_data.name, table_data.id, request_id, expired.level, expired.files, + ); + } + + let files = &expired.files; + edit_meta.files_to_delete.reserve(files.len()); + for file in files { + edit_meta.files_to_delete.push(DeleteFile { + level: expired.level, + file_id: file.id(), + }); + } + } +} + +fn split_record_batch_with_time_ranges( + record_batch: RecordBatchWithKey, + time_ranges: &[TimeRange], + timestamp_idx: usize, +) -> Result> { + let mut builders: Vec = (0..time_ranges.len()) + .into_iter() + .map(|_| RecordBatchWithKeyBuilder::new(record_batch.schema_with_key().clone())) + .collect(); + + for row_idx in 0..record_batch.num_rows() { + let datum = record_batch.column(timestamp_idx).datum(row_idx); + let timestamp = datum.as_timestamp().unwrap(); + let mut idx = None; + for (i, time_range) in time_ranges.iter().enumerate() { + if time_range.contains(timestamp) { + idx = Some(i); + break; + } + } + + if let Some(idx) = idx { + let view = RowViewOnBatch { + record_batch: &record_batch, + row_idx, + }; + builders[idx] + .append_row_view(&view) + .map_err(|e| Box::new(e) as _) + .context(SplitRecordBatch)?; + } else { + panic!( + "Record timestamp is not in time_ranges, timestamp:{:?}, time_ranges:{:?}", + timestamp, time_ranges + ); + } + } + let mut ret = Vec::with_capacity(builders.len()); + for mut builder in builders { + ret.push( + builder + .build() + .map_err(|e| Box::new(e) as _) + .context(SplitRecordBatch)?, + ); + } + Ok(ret) +} + +fn build_mem_table_iter(memtable: MemTableRef, table_data: 
&TableData) -> Result { + let scan_ctx = ScanContext::default(); + let scan_req = ScanRequest { + start_user_key: Bound::Unbounded, + end_user_key: Bound::Unbounded, + sequence: common_types::MAX_SEQUENCE_NUMBER, + projected_schema: ProjectedSchema::no_projection(table_data.schema()), + need_dedup: table_data.dedup(), + reverse: false, + }; + memtable + .scan(scan_ctx, scan_req) + .map_err(|e| Box::new(e) as _) + .context(InvalidMemIter) +} + +#[cfg(test)] +mod tests { + use common_types::{ + tests::{ + build_record_batch_with_key_by_rows, build_row, build_row_opt, + check_record_batch_with_key_with_rows, + }, + time::TimeRange, + }; + + use crate::instance::flush_compaction::split_record_batch_with_time_ranges; + + #[test] + fn test_split_record_batch_with_time_ranges() { + let rows0 = vec![build_row(b"binary key", 20, 10.0, "string value")]; + let rows1 = vec![build_row(b"binary key1", 120, 11.0, "string value 1")]; + let rows2 = vec![ + build_row_opt(b"binary key2", 220, None, Some("string value 2")), + build_row_opt(b"binary key3", 250, Some(13.0), None), + ]; + + let rows = vec![rows0.clone(), rows1.clone(), rows2.clone()] + .into_iter() + .flatten() + .collect(); + let record_batch_with_key = build_record_batch_with_key_by_rows(rows); + let column_num = record_batch_with_key.num_columns(); + let time_ranges = vec![ + TimeRange::new_unchecked_for_test(0, 100), + TimeRange::new_unchecked_for_test(100, 200), + TimeRange::new_unchecked_for_test(200, 300), + ]; + + let timestamp_idx = 1; + let rets = + split_record_batch_with_time_ranges(record_batch_with_key, &time_ranges, timestamp_idx) + .unwrap(); + + check_record_batch_with_key_with_rows(&rets[0], rows0.len(), column_num, rows0); + check_record_batch_with_key_with_rows(&rets[1], rows1.len(), column_num, rows1); + check_record_batch_with_key_with_rows(&rets[2], rows2.len(), column_num, rows2); + } +} diff --git a/analytic_engine/src/instance/mem_collector.rs b/analytic_engine/src/instance/mem_collector.rs new 
file mode 100644 index 0000000000..c686974b34 --- /dev/null +++ b/analytic_engine/src/instance/mem_collector.rs @@ -0,0 +1,118 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::sync::atomic::{AtomicUsize, Ordering}; + +use arena::{Collector, CollectorRef}; + +/// Space memtable memory usage collector +pub struct MemUsageCollector { + /// Memory size allocated in bytes. + bytes_allocated: AtomicUsize, + /// Memory size used in bytes. + bytes_used: AtomicUsize, + parent: Option, +} + +impl Collector for MemUsageCollector { + fn on_alloc(&self, bytes: usize) { + self.bytes_allocated.fetch_add(bytes, Ordering::Relaxed); + + if let Some(c) = &self.parent { + c.on_alloc(bytes); + } + } + + fn on_used(&self, bytes: usize) { + self.bytes_used.fetch_add(bytes, Ordering::Relaxed); + + if let Some(c) = &self.parent { + c.on_used(bytes); + } + } + + fn on_free(&self, used: usize, allocated: usize) { + self.bytes_allocated.fetch_sub(allocated, Ordering::Relaxed); + self.bytes_used.fetch_sub(used, Ordering::Relaxed); + + if let Some(c) = &self.parent { + c.on_free(used, allocated); + } + } +} + +impl Default for MemUsageCollector { + fn default() -> Self { + Self { + bytes_allocated: AtomicUsize::new(0), + bytes_used: AtomicUsize::new(0), + parent: None, + } + } +} + +impl MemUsageCollector { + pub fn with_parent(collector: CollectorRef) -> Self { + Self { + bytes_allocated: AtomicUsize::new(0), + bytes_used: AtomicUsize::new(0), + parent: Some(collector), + } + } + + #[inline] + pub fn total_memory_allocated(&self) -> usize { + self.bytes_allocated.load(Ordering::Relaxed) + } +} + +#[cfg(test)] +mod tests { + use std::sync::{atomic::Ordering, Arc}; + + use super::*; + #[test] + fn test_collector() { + let collector = MemUsageCollector::default(); + + collector.on_alloc(1024); + collector.on_used(128); + assert_eq!(1024, collector.total_memory_allocated()); + assert_eq!(128, collector.bytes_used.load(Ordering::Relaxed)); + + 
collector.on_free(64, 512); + assert_eq!(512, collector.total_memory_allocated()); + assert_eq!(64, collector.bytes_used.load(Ordering::Relaxed)); + collector.on_free(64, 512); + assert_eq!(0, collector.total_memory_allocated()); + assert_eq!(0, collector.bytes_used.load(Ordering::Relaxed)); + } + + #[test] + fn test_collector_with_parent() { + let p = Arc::new(MemUsageCollector::default()); + let c1 = MemUsageCollector::with_parent(p.clone()); + let c2 = MemUsageCollector::with_parent(p.clone()); + + c1.on_alloc(1024); + c1.on_used(128); + c2.on_alloc(1024); + c2.on_used(128); + assert_eq!(1024, c1.total_memory_allocated()); + assert_eq!(128, c1.bytes_used.load(Ordering::Relaxed)); + assert_eq!(1024, c2.total_memory_allocated()); + assert_eq!(128, c2.bytes_used.load(Ordering::Relaxed)); + assert_eq!(2048, p.total_memory_allocated()); + assert_eq!(256, p.bytes_used.load(Ordering::Relaxed)); + + c1.on_free(64, 512); + assert_eq!(512, c1.total_memory_allocated()); + assert_eq!(64, c1.bytes_used.load(Ordering::Relaxed)); + assert_eq!(1536, p.total_memory_allocated()); + assert_eq!(192, p.bytes_used.load(Ordering::Relaxed)); + c2.on_free(64, 512); + assert_eq!(512, c2.total_memory_allocated()); + assert_eq!(64, c2.bytes_used.load(Ordering::Relaxed)); + assert_eq!(1024, p.total_memory_allocated()); + assert_eq!(128, p.bytes_used.load(Ordering::Relaxed)); + } +} diff --git a/analytic_engine/src/instance/mod.rs b/analytic_engine/src/instance/mod.rs new file mode 100644 index 0000000000..07bdcf350b --- /dev/null +++ b/analytic_engine/src/instance/mod.rs @@ -0,0 +1,271 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! A table engine instance +//! +//! The root mod only contains common functions of instance, other logics are +//! 
divided into the sub crates + +mod alter; +mod drop; +mod engine; +pub mod flush_compaction; +pub(crate) mod mem_collector; +pub mod open; +mod read; +mod write; +pub mod write_worker; + +use std::{ + collections::HashMap, + sync::{Arc, RwLock}, +}; + +use common_util::{define_result, runtime::Runtime}; +use log::info; +use mem_collector::MemUsageCollector; +use object_store::ObjectStore; +use parquet::{DataCacheRef, MetaCacheRef}; +use snafu::{ResultExt, Snafu}; +use table_engine::engine::EngineRuntimes; +use tokio::sync::Mutex; +use wal::manager::WalManager; + +use crate::{ + compaction::scheduler::CompactionSchedulerRef, + meta::Manifest, + space::{SpaceId, SpaceName, SpaceNameRef, SpaceRef}, + sst::file::FilePurger, + table::data::TableDataRef, + TableOptions, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to stop file purger, err:{}", source))] + StopFilePurger { source: crate::sst::file::Error }, + + #[snafu(display("Failed to stop compaction scheduler, err:{}", source))] + StopScheduler { + source: crate::compaction::scheduler::Error, + }, + + #[snafu(display("Failed to close space, name:{}, err:{}", name, source))] + CloseSpace { + name: String, + source: crate::space::Error, + }, +} + +define_result!(Error); + +/// Meta state +#[derive(Debug)] +struct MetaState { + /// Id of the last space + last_space_id: SpaceId, +} + +impl MetaState { + /// Create a new state + fn new() -> Self { + Self { last_space_id: 1 } + } + + /// Acquire next id for a new space + fn alloc_space_id(&mut self) -> SpaceId { + self.last_space_id += 1; + self.last_space_id + } +} + +impl Default for MetaState { + fn default() -> Self { + Self::new() + } +} + +/// Spaces states +#[derive(Default)] +struct Spaces { + /// Name to space + name_to_space: HashMap, + /// Id to space + id_to_space: HashMap, +} + +impl Spaces { + /// Insert space by name, and also insert id to space mapping + fn insert(&mut self, space_name: SpaceName, space: SpaceRef) { + let 
space_id = space.id; + self.name_to_space.insert(space_name, space.clone()); + self.id_to_space.insert(space_id, space); + } + + fn get_by_name(&self, name: SpaceNameRef) -> Option<&SpaceRef> { + self.name_to_space.get(name) + } + + /// List all tables of all spaces + fn list_all_tables(&self, tables: &mut Vec) { + let total_tables = self.id_to_space.values().map(|s| s.table_num()).sum(); + tables.reserve(total_tables); + for space in self.id_to_space.values() { + space.list_all_tables(tables); + } + } + + fn list_all_spaces(&self) -> Vec { + self.id_to_space.values().cloned().collect() + } +} + +pub struct SpaceStore { + /// All spaces of the engine. + spaces: RwLock, + /// Manifest (or meta) stores meta data of the engine instance. + manifest: Meta, + /// Wal of all tables + wal_manager: Wal, + /// Sst storage. + store: Arc, + /// Meta lock protects mutation to meta data of the instance. This lock + /// should be held when persisting mutation of the instance level meta data + /// to the manifest. + /// - add a space + /// - delete a space + /// + /// Mutation to space's meta, like add/delete a table, is protected by + /// space's lock instead of this lock. + meta_state: Mutex, + /// Sst factory. + sst_factory: Fa, + + meta_cache: Option, + data_cache: Option, +} + +impl Drop for SpaceStore { + fn drop(&mut self) { + info!("SpaceStore dropped"); + } +} + +impl SpaceStore { + async fn close(&self) -> Result<()> { + let spaces = self.spaces.read().unwrap().list_all_spaces(); + for space in spaces { + // Close all spaces. + space + .close() + .await + .context(CloseSpace { name: &space.name })?; + } + + Ok(()) + } +} + +impl SpaceStore { + fn store_ref(&self) -> &Store { + &*self.store + } + + /// List all tables of all spaces + pub fn list_all_tables(&self, tables: &mut Vec) { + let spaces = self.spaces.read().unwrap(); + spaces.list_all_tables(tables); + } + + /// Find the space which it's all memtables consumes maximum memory. 
+ #[inline] + fn find_maximum_memory_usage_space(&self) -> Option { + let spaces = self.spaces.read().unwrap().list_all_spaces(); + spaces.into_iter().max_by_key(|t| t.memtable_memory_usage()) + } +} + +/// Table engine instance +/// +/// Manages all spaces, also contains needed resources shared across all table +// TODO(yingwen): Track memory usage of all tables (or tables of space) +pub struct Instance { + /// Space storage + space_store: Arc>, + /// Runtime to execute async tasks. + runtimes: Arc, + /// Global table options, overwrite mutable options in each table's + /// TableOptions. + table_opts: TableOptions, + + // Write group options: + write_group_worker_num: usize, + write_group_command_channel_cap: usize, + // End of write group options. + compaction_scheduler: CompactionSchedulerRef, + file_purger: FilePurger, + + meta_cache: Option, + data_cache: Option, + /// Engine memtable memory usage collector + mem_usage_collector: Arc, + /// Engine write buffer size + pub(crate) db_write_buffer_size: usize, + /// Space write buffer size + pub(crate) space_write_buffer_size: usize, +} + +impl Instance { + /// Close the instance gracefully. 
+ pub async fn close(&self) -> Result<()> { + self.file_purger.stop().await.context(StopFilePurger)?; + + self.space_store.close().await?; + + self.compaction_scheduler + .stop_scheduler() + .await + .context(StopScheduler) + } +} + +// TODO(yingwen): Instance builder +impl + Instance +{ + /// Find space using read lock + fn get_space_by_read_lock(&self, space: SpaceNameRef) -> Option { + let spaces = self.space_store.spaces.read().unwrap(); + spaces.get_by_name(space).cloned() + } + + /// Returns options to create a write group for given space + fn write_group_options(&self, space_id: SpaceId) -> write_worker::Options { + write_worker::Options { + space_id, + worker_num: self.write_group_worker_num, + runtime: self.write_runtime().clone(), + command_channel_capacity: self.write_group_command_channel_cap, + } + } + + /// Returns true when engine instance's total memtable memory usage reaches + /// db_write_buffer_size limit. + #[inline] + fn should_flush_instance(&self) -> bool { + self.db_write_buffer_size > 0 + && self.mem_usage_collector.total_memory_allocated() >= self.db_write_buffer_size + } + + #[inline] + fn read_runtime(&self) -> &Arc { + &self.runtimes.read_runtime + } + + #[inline] + fn write_runtime(&self) -> &Arc { + &self.runtimes.write_runtime + } +} + +/// Instance reference +pub type InstanceRef = Arc>; diff --git a/analytic_engine/src/instance/open.rs b/analytic_engine/src/instance/open.rs new file mode 100644 index 0000000000..deb5a047b9 --- /dev/null +++ b/analytic_engine/src/instance/open.rs @@ -0,0 +1,415 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Open logic of instance + +use std::sync::{Arc, RwLock}; + +use common_types::schema::IndexInWriterSchema; +use common_util::define_result; +use log::{debug, error, info, trace}; +use object_store::ObjectStore; +use snafu::{ResultExt, Snafu}; +use tokio::sync::{oneshot, Mutex}; +use wal::{ + log_batch::LogEntry, + manager::{LogIterator, ReadBoundary, ReadContext, ReadRequest, WalManager}, +}; + +use crate::{ + compaction::scheduler::SchedulerImpl, + context::OpenContext, + instance::{ + mem_collector::MemUsageCollector, + write_worker, + write_worker::{RecoverTableCommand, WorkerLocal, WriteGroup}, + Instance, MetaState, SpaceStore, Spaces, + }, + meta::{meta_data::ManifestData, Manifest}, + payload::{ReadPayload, WalDecoder}, + space::{Space, SpaceId}, + sst::{factory::Factory, file::FilePurger}, + table::data::{TableData, TableDataRef}, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to read meta update, err:{}", source))] + ReadMetaUpdate { + source: Box, + }, + + #[snafu(display( + "Failed to recover table data, space_id:{}, table:{}, err:{}", + space_id, + table, + source + ))] + RecoverTableData { + space_id: SpaceId, + table: String, + source: crate::table::data::Error, + }, + + #[snafu(display("Failed to read wal, err:{}", source))] + ReadWal { source: wal::manager::Error }, + + #[snafu(display("Failed to apply log entry to memtable, err:{}", source))] + ApplyMemTable { + source: crate::instance::write::Error, + }, + + #[snafu(display("Failed to recover table, source:{}", source,))] + RecoverTable { source: write_worker::Error }, +} + +define_result!(Error); + +impl< + Wal: WalManager + Send + Sync + 'static, + Meta: Manifest + Send + Sync + 'static, + Store: ObjectStore, + Fa: Factory + Send + Sync + 'static, + > Instance +{ + /// Open a new instance + pub async fn open( + ctx: OpenContext, + manifest: Meta, + wal_manager: Wal, + store: Store, + sst_factory: Fa, + ) -> Result> { + let store = Arc::new(store); + let space_store = 
Arc::new(SpaceStore { + spaces: RwLock::new(Spaces::default()), + manifest, + wal_manager, + store: store.clone(), + meta_state: Mutex::new(MetaState::default()), + sst_factory, + meta_cache: ctx.meta_cache.clone(), + data_cache: ctx.data_cache.clone(), + }); + + let scheduler_config = ctx.config.compaction_config.clone(); + let bg_runtime = ctx.runtimes.bg_runtime.clone(); + let compaction_scheduler = Arc::new(SchedulerImpl::new( + space_store.clone(), + bg_runtime.clone(), + scheduler_config, + )); + + let file_purger = FilePurger::start(&*bg_runtime, store); + + let instance = Arc::new(Instance { + space_store, + runtimes: ctx.runtimes.clone(), + table_opts: ctx.config.table_opts.clone(), + write_group_worker_num: ctx.config.write_group_worker_num, + write_group_command_channel_cap: ctx.config.write_group_command_channel_cap, + compaction_scheduler, + file_purger, + meta_cache: ctx.meta_cache.clone(), + data_cache: ctx.data_cache.clone(), + mem_usage_collector: Arc::new(MemUsageCollector::default()), + db_write_buffer_size: ctx.config.db_write_buffer_size, + space_write_buffer_size: ctx.config.space_write_buffer_size, + }); + + instance.recover(ctx).await?; + + Ok(instance) + } + + /// Recover the instance + /// + /// Should only called by open() + async fn recover(self: &Arc, ctx: OpenContext) -> Result<()> { + // Recover meta data, such as all spaces and tables + self.recover_from_meta(&ctx).await?; + + // Recover from wal + self.recover_from_wal(&ctx).await?; + + Ok(()) + } + + /// Recover meta data from manifest + async fn recover_from_meta(self: &Arc, ctx: &OpenContext) -> Result<()> { + info!("Instance recover from meta begin"); + + // Load manifest, also create a new snapshot at startup. 
+ let manifest_data = self + .space_store + .manifest + .load_data(true) + .await + .map_err(|e| Box::new(e) as _) + .context(ReadMetaUpdate)?; + + self.apply_manifest_data(manifest_data, ctx).await?; + + info!("Instance recover from meta end"); + + Ok(()) + } + + /// Apply manifest data to instance + async fn apply_manifest_data( + self: &Arc, + manifest_data: ManifestData, + ctx: &OpenContext, + ) -> Result<()> { + // Apply all spaces. + for (space_id, space_meta_data) in manifest_data.spaces { + // Create write group for space. + let space_meta = space_meta_data.space_meta; + let write_group_opts = self.write_group_options(space_id); + let write_group = WriteGroup::new(write_group_opts, self.clone()); + + // Add this space to instance. + let space = Arc::new(Space::new( + space_id, + space_meta.space_name.clone(), + ctx.config.space_write_buffer_size, + write_group, + self.mem_usage_collector.clone(), + )); + { + let mut spaces = self.space_store.spaces.write().unwrap(); + spaces.insert(space_meta.space_name, space.clone()); + } + + // Add all tables to the space. + for (table_id, table_meta_data) in space_meta_data.tables { + let table_meta = table_meta_data.table_meta; + let table_name = table_meta.table_name.clone(); + // Choose write worker for this table + let write_handle = space.write_group.choose_worker(table_id); + + debug!("Instance apply add table, meta :{:?}", table_meta); + + let table_data = Arc::new( + TableData::recover_from_add( + table_meta, + write_handle, + &self.file_purger, + space.mem_usage_collector.clone(), + ) + .context(RecoverTableData { + space_id, + table: &table_name, + })?, + ); + // Apply version meta to the table. + let version_meta = table_meta_data.version_meta; + let max_file_id = version_meta.max_file_id_to_add(); + table_data.current_version().apply_meta(version_meta); + // In recovery case, we need to maintain last file id of the table manually. 
+ if table_data.last_file_id() < max_file_id { + table_data.set_last_file_id(max_file_id); + } + // Add table to space. + space.insert_table(table_data); + } + } + + // Update meta state. + let mut meta_state = self.space_store.meta_state.lock().await; + meta_state.last_space_id = manifest_data.last_space_id; + + Ok(()) + } + + /// Recover all table data from wal + async fn recover_from_wal(&self, ctx: &OpenContext) -> Result<()> { + // replay_batch_size == 0 causes infinite loop. + assert!(ctx.config.replay_batch_size > 0); + + info!("Instance recover from wal begin, ctx:{:?}", ctx); + + // For each table, recover data of that table + let tables = { + let mut tables = Vec::new(); + self.space_store.list_all_tables(&mut tables); + tables + }; + + let replay_batch_size = ctx.config.max_replay_tables_per_batch; + let mut replaying_rxs = Vec::with_capacity(replay_batch_size); + let mut replaying_tables = Vec::with_capacity(replay_batch_size); + + for table_data in tables { + // Create a oneshot channel to send/recieve recover result + let (tx, rx) = oneshot::channel(); + let cmd = RecoverTableCommand { + table_data: table_data.clone(), + tx, + replay_batch_size: ctx.config.replay_batch_size, + }; + + // Send recover request to write worker, actual works done in + // Self::recover_table_from_wal() + write_worker::send_command_to_write_worker(cmd.into_command(), &table_data).await; + + replaying_rxs.push(rx); + replaying_tables.push(table_data.clone()); + + if replaying_rxs.len() >= replay_batch_size { + // Wait batch done + write_worker::join_all(&replaying_tables, replaying_rxs) + .await + .context(RecoverTable)?; + + replaying_rxs = Vec::with_capacity(replay_batch_size); + replaying_tables.clear(); + } + } + + // Don't forget to wait the last batch done. 
+ if !replaying_rxs.is_empty() { + write_worker::join_all(&replaying_tables, replaying_rxs) + .await + .context(RecoverTable)?; + } + + info!("Instance recover from wal end"); + + Ok(()) + } + + /// Recover table data from wal + /// + /// Called by write worker + pub(crate) async fn recover_table_from_wal( + &self, + worker_local: &WorkerLocal, + table: TableDataRef, + replay_batch_size: usize, + read_ctx: &ReadContext, + log_entry_buf: &mut Vec>, + ) -> Result<()> { + let decoder = WalDecoder::default(); + + let read_req = ReadRequest { + region_id: table.wal_region_id(), + start: ReadBoundary::Min, + end: ReadBoundary::Max, + }; + + // Read all wal of current table + let mut log_iter = self + .space_store + .wal_manager + .read(read_ctx, &read_req) + .context(ReadWal)?; + + loop { + // fetch entries to log_entry_buf + let no_more_data = { + log_entry_buf.clear(); + + for _ in 0..replay_batch_size { + if let Some(log_entry) = log_iter.next_log_entry(&decoder).context(ReadWal)? { + log_entry_buf.push(log_entry); + } else { + break; + } + } + + log_entry_buf.len() < replay_batch_size + }; + + // Replay all log entries of current table + self.replay_table_log_entries(worker_local, &*table, log_entry_buf) + .await?; + + // No more entries. 
+ if no_more_data { + break; + } + } + + Ok(()) + } + + /// Replay all log entries into memtable + async fn replay_table_log_entries( + &self, + worker_local: &WorkerLocal, + table_data: &TableData, + log_entries: &mut [LogEntry], + ) -> Result<()> { + if log_entries.is_empty() { + // No data in wal + return Ok(()); + } + + let last_sequence = log_entries.last().unwrap().sequence; + + info!( + "Instance replay table log entries begin, table:{}, table_id:{:?}, sequence:{}", + table_data.name, table_data.id, last_sequence + ); + + // TODO(yingwen): Maybe we need to trigger flush if memtable is full during + // recovery Replay entries + for log_entry in log_entries { + let (sequence, payload) = (log_entry.sequence, &mut log_entry.payload); + + // Apply to memtable + match payload { + ReadPayload::Write { row_group } => { + trace!( + "Instance replay row_group, table:{}, row_group:{:?}", + table_data.name, + row_group + ); + + let table_schema_version = table_data.schema_version(); + if table_schema_version != row_group.schema().version() { + // Data with old schema should already been flushed, but we avoid panic + // here. 
+ error!( + "Ignore data with mismatch schema version during replaying, \ + table:{}, \ + table_id:{:?}, \ + expect:{}, \ + actual:{}, \ + last_sequence:{}, \ + sequence:{}", + table_data.name, + table_data.id, + table_schema_version, + row_group.schema().version(), + last_sequence, + sequence, + ); + + continue; + } + + let index_in_writer = + IndexInWriterSchema::for_same_schema(row_group.schema().num_columns()); + Self::write_to_memtable( + worker_local, + table_data, + sequence, + row_group, + index_in_writer, + ) + .context(ApplyMemTable)?; + } + } + } + + info!( + "Instance replay table log entries end, table:{}, table_id:{:?}, last_sequence:{}", + table_data.name, table_data.id, last_sequence + ); + + table_data.set_last_sequence(last_sequence); + + Ok(()) + } +} diff --git a/analytic_engine/src/instance/read.rs b/analytic_engine/src/instance/read.rs new file mode 100644 index 0000000000..8d47d7d8d3 --- /dev/null +++ b/analytic_engine/src/instance/read.rs @@ -0,0 +1,388 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Read logic of instance + +use std::{ + collections::BTreeMap, + pin::Pin, + task::{Context, Poll}, +}; + +use common_types::{ + projected_schema::ProjectedSchema, record_batch::RecordBatch, schema::RecordSchema, + time::TimeRange, +}; +use common_util::{define_result, runtime::Runtime}; +use futures::stream::Stream; +use log::{debug, error, trace}; +use object_store::ObjectStore; +use snafu::{ResultExt, Snafu}; +use table_engine::{ + stream::{ + self, ErrWithSource, PartitionedStreams, RecordBatchStream, SendableRecordBatchStream, + }, + table::ReadRequest, +}; +use tokio::sync::mpsc::{self, Receiver}; +use wal::manager::WalManager; + +use crate::{ + instance::Instance, + meta::Manifest, + row_iter::{ + chain, + chain::{ChainConfig, ChainIterator}, + dedup::DedupIterator, + merge::{MergeBuilder, MergeConfig, MergeIterator}, + IterOptions, RecordBatchWithKeyIterator, + }, + space::SpaceAndTable, + sst::factory::{Factory, SstReaderOptions}, + table::{ + data::TableData, + version::{ReadView, TableVersion}, + }, + table_options::TableOptions, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to scan memtable, table:{}, err:{}", table, source))] + ScanMemTable { + table: String, + source: crate::memtable::Error, + }, + + #[snafu(display("Failed to build merge iterator, table:{}, err:{}", table, source))] + BuildMergeIterator { + table: String, + source: crate::row_iter::merge::Error, + }, + + #[snafu(display("Failed to build chain iterator, table:{}, err:{}", table, source))] + BuildChainIterator { + table: String, + source: crate::row_iter::chain::Error, + }, +} + +define_result!(Error); + +const RECORD_BATCH_READ_BUF_SIZE: usize = 1000; + +/// Check whether it needs to apply merge sorting when reading the table with +/// the `table_options` by the `read_request`. 
+fn need_merge_sort_streams(table_options: &TableOptions, read_request: &ReadRequest) -> bool { + table_options.need_dedup() || read_request.order.is_in_order() +} + +impl + Instance +{ + /// Read data in multiple time range from table, and return + /// `read_parallelism` output streams. + pub async fn partitioned_read_from_table( + &self, + space_table: &SpaceAndTable, + request: ReadRequest, + ) -> Result { + debug!( + "Instance read from table, space:{}, table:{}, table_id:{:?}, request:{:?}", + space_table.space().name, + space_table.table_data().name, + space_table.table_data().id, + request + ); + + let table_data = space_table.table_data(); + + // Collect metrics. + table_data.metrics.on_read_request_begin(); + + let iter_options = IterOptions::default(); + let table_options = table_data.table_options(); + + if need_merge_sort_streams(&table_data.table_options(), &request) { + let merge_iters = self + .build_merge_iters(table_data, &request, iter_options, &*table_options) + .await?; + self.build_partitioned_streams(&request, merge_iters) + } else { + let chain_iters = self + .build_chain_iters(table_data, &request, &*table_options) + .await?; + self.build_partitioned_streams(&request, chain_iters) + } + } + + fn build_partitioned_streams( + &self, + request: &ReadRequest, + mut partitioned_iters: Vec, + ) -> Result { + let read_parallelism = request.opts.read_parallelism; + + if read_parallelism == 1 && request.order.is_in_desc_order() { + // TODO(xikai): it seems this can be avoided. + partitioned_iters.reverse(); + }; + + // Split iterators into `read_parallelism` groups. 
+ let mut splited_iters: Vec<_> = std::iter::repeat_with(Vec::new) + .take(read_parallelism) + .collect(); + + for (i, time_aligned_iter) in partitioned_iters.into_iter().enumerate() { + splited_iters[i % read_parallelism].push(time_aligned_iter); + } + + let mut streams = Vec::with_capacity(read_parallelism); + for iters in splited_iters { + let stream = iters_to_stream(iters, self.read_runtime(), &request.projected_schema); + streams.push(stream); + } + + assert_eq!(read_parallelism, streams.len()); + + Ok(PartitionedStreams { streams }) + } + + async fn build_merge_iters( + &self, + table_data: &TableData, + request: &ReadRequest, + iter_options: IterOptions, + table_options: &TableOptions, + ) -> Result>> { + // Current visible sequence + let sequence = table_data.last_sequence(); + let projected_schema = request.projected_schema.clone(); + let sst_reader_options = SstReaderOptions { + sst_type: table_data.sst_type, + read_batch_row_num: table_options.num_rows_per_row_group, + reverse: request.order.is_in_desc_order(), + projected_schema: projected_schema.clone(), + predicate: request.predicate.clone(), + meta_cache: self.meta_cache.clone(), + data_cache: self.data_cache.clone(), + runtime: self.read_runtime().clone(), + }; + + let time_range = request.predicate.time_range; + let version = table_data.current_version(); + let read_views = self.partition_ssts_and_memtables(time_range, version, &*table_options); + + let mut iters = Vec::with_capacity(read_views.len()); + for read_view in read_views { + let merge_config = MergeConfig { + request_id: request.request_id, + space_id: table_data.space_id, + table_id: table_data.id, + sequence, + projected_schema: projected_schema.clone(), + predicate: request.predicate.clone(), + sst_factory: self.space_store.sst_factory.clone(), + sst_reader_options: sst_reader_options.clone(), + store: self.space_store.store_ref(), + merge_iter_options: iter_options.clone(), + need_dedup: table_options.need_dedup(), + reverse: 
request.order.is_in_desc_order(), + }; + + let merge_iter = MergeBuilder::new(merge_config) + .sampling_mem(read_view.sampling_mem) + .memtables(read_view.memtables) + .ssts_of_level(read_view.leveled_ssts) + .build() + .await + .context(BuildMergeIterator { + table: &table_data.name, + })?; + let dedup_iter = + DedupIterator::new(request.request_id, merge_iter, iter_options.clone()); + + iters.push(dedup_iter); + } + + Ok(iters) + } + + async fn build_chain_iters( + &self, + table_data: &TableData, + request: &ReadRequest, + table_options: &TableOptions, + ) -> Result> { + let projected_schema = request.projected_schema.clone(); + + assert!(request.order.is_out_of_order()); + + let sst_reader_options = SstReaderOptions { + sst_type: table_data.sst_type, + read_batch_row_num: table_options.num_rows_per_row_group, + // no need to read in order so just read in asc order by default. + reverse: false, + projected_schema: projected_schema.clone(), + predicate: request.predicate.clone(), + meta_cache: self.meta_cache.clone(), + data_cache: self.data_cache.clone(), + runtime: self.read_runtime().clone(), + }; + + let time_range = request.predicate.time_range; + let version = table_data.current_version(); + let read_views = self.partition_ssts_and_memtables(time_range, version, &*table_options); + + let mut iters = Vec::with_capacity(read_views.len()); + for read_view in read_views { + let chain_config = ChainConfig { + request_id: request.request_id, + space_id: table_data.space_id, + table_id: table_data.id, + projected_schema: projected_schema.clone(), + predicate: request.predicate.clone(), + sst_reader_options: sst_reader_options.clone(), + sst_factory: self.space_store.sst_factory.clone(), + store: self.space_store.store_ref(), + }; + let builder = chain::Builder::new(chain_config); + let chain_iter = builder + .sampling_mem(read_view.sampling_mem) + .memtables(read_view.memtables) + .ssts(read_view.leveled_ssts) + .build() + .await + .context(BuildChainIterator { + 
table: &table_data.name, + })?; + + iters.push(chain_iter); + } + + Ok(iters) + } + + fn partition_ssts_and_memtables( + &self, + time_range: TimeRange, + version: &TableVersion, + table_options: &TableOptions, + ) -> Vec { + let read_view = version.pick_read_view(time_range); + + let segment_duration = match table_options.segment_duration { + Some(v) => v.0, + None => { + // Segment duration is unknown, the table maybe still in sampling phase + // or the segment duration is still not applied to the table options, + // just return one partition. + return vec![read_view]; + } + }; + if read_view.contains_sampling() { + // The table contains sampling memtable, just return one partition. + return vec![read_view]; + } + + // Collect the aligned ssts and memtables into the map. + // {aligned timestamp} => {read view} + let mut read_view_by_time = BTreeMap::new(); + for (level, leveled_ssts) in read_view.leveled_ssts.into_iter().enumerate() { + for file in leveled_ssts { + let aligned_ts = file + .time_range() + .inclusive_start() + .truncate_by(segment_duration); + let entry = read_view_by_time + .entry(aligned_ts) + .or_insert_with(ReadView::default); + entry.leveled_ssts[level].push(file); + } + } + + for memtable in read_view.memtables { + let aligned_ts = memtable + .time_range + .inclusive_start() + .truncate_by(segment_duration); + let entry = read_view_by_time + .entry(aligned_ts) + .or_insert_with(ReadView::default); + entry.memtables.push(memtable); + } + + read_view_by_time.into_values().collect() + } +} + +// TODO(xikai): this is a hack way to implement SendableRecordBatchStream for +// MergeIterator. 
+fn iters_to_stream( + collection: T, + runtime: &Runtime, + schema: &ProjectedSchema, +) -> SendableRecordBatchStream +where + T: IntoIterator + Send + 'static, + T::Item: RecordBatchWithKeyIterator, + T::IntoIter: Send, +{ + let (tx, rx) = mpsc::channel(RECORD_BATCH_READ_BUF_SIZE); + let projected_schema = schema.clone(); + + runtime.spawn(async move { + for mut iter in collection { + while let Some(record_batch) = iter.next_batch().await.transpose() { + let record_batch = + record_batch + .map_err(|e| Box::new(e) as _) + .context(ErrWithSource { + msg: "Read record batch", + }); + + // Apply the projection to RecordBatchWithKey and gets the final RecordBatch. + let record_batch = record_batch.and_then(|batch_with_key| { + // TODO(yingwen): Try to use projector to do this, which precompute row + // indexes to project. + batch_with_key + .try_project(&projected_schema) + .map_err(|e| Box::new(e) as _) + .context(ErrWithSource { + msg: "Project record batch", + }) + }); + + trace!("send next record batch:{:?}", record_batch); + if tx.send(record_batch).await.is_err() { + error!("Failed to send record batch from the merge iterator"); + break; + } + } + } + }); + + Box::pin(ChannelledRecordBatchStream { + schema: schema.to_record_schema(), + rx, + }) +} + +pub struct ChannelledRecordBatchStream { + schema: RecordSchema, + rx: Receiver>, +} + +impl Stream for ChannelledRecordBatchStream { + type Item = stream::Result; + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let this = self.get_mut(); + Pin::new(&mut this.rx).poll_recv(cx) + } +} + +impl RecordBatchStream for ChannelledRecordBatchStream { + fn schema(&self) -> &RecordSchema { + &self.schema + } +} diff --git a/analytic_engine/src/instance/write.rs b/analytic_engine/src/instance/write.rs new file mode 100644 index 0000000000..711e0c9b0d --- /dev/null +++ b/analytic_engine/src/instance/write.rs @@ -0,0 +1,464 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +//! Write logic of instance + +use std::sync::Arc; + +use common_types::{ + bytes::ByteVec, + row::RowGroup, + schema::{IndexInWriterSchema, Schema}, +}; +use common_util::{codec::row, define_result}; +use log::{debug, error, info, trace, warn}; +use object_store::ObjectStore; +use proto::table_requests; +use smallvec::SmallVec; +use snafu::{ensure, Backtrace, ResultExt, Snafu}; +use table_engine::table::WriteRequest; +use tokio::sync::oneshot; +use wal::{ + log_batch::{LogWriteBatch, LogWriteEntry}, + manager::{SequenceNumber, WalManager, WriteContext}, +}; + +use crate::{ + instance::{ + flush_compaction::TableFlushOptions, + write_worker, + write_worker::{BackgroundStatus, WorkerLocal, WriteTableCommand}, + Instance, + }, + memtable::{key::KeySequence, PutContext}, + meta::Manifest, + payload::WritePayload, + space::SpaceAndTable, + sst::factory::Factory, + table::{ + data::{TableData, TableDataRef}, + version::MemTableForWrite, + }, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to write to wal, table:{}, err:{}", table, source))] + WriteLogBatch { + table: String, + source: wal::manager::Error, + }, + + #[snafu(display("Failed to write to memtable, table:{}, err:{}", table, source))] + WriteMemTable { + table: String, + source: crate::table::version::Error, + }, + + #[snafu(display("Try to write to a dropped table, table:{}", table))] + WriteDroppedTable { table: String }, + + #[snafu(display( + "Too many rows to write (more than {}), table:{}, rows:{}.\nBacktrace:\n{}", + MAX_ROWS_TO_WRITE, + table, + rows, + backtrace, + ))] + TooManyRows { + table: String, + rows: usize, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to find mutable memtable, table:{}, err:{}", table, source))] + FindMutableMemTable { + table: String, + source: crate::table::data::Error, + }, + #[snafu(display("Failed to write table, source:{}", source,))] + Write { source: write_worker::Error }, + + #[snafu(display("Failed to flush table, table:{}, 
err:{}", table, source))] + FlushTable { + table: String, + source: crate::instance::flush_compaction::Error, + }, + + #[snafu(display( + "Background flush failed, cannot write more data, err:{}.\nBacktrace:\n{}", + msg, + backtrace + ))] + BackgroundFlushFailed { msg: String, backtrace: Backtrace }, + + #[snafu(display("Schema of request is incompatible with table, err:{}", source))] + IncompatSchema { + source: common_types::schema::CompatError, + }, + + #[snafu(display("Failed to encode row group, err:{}", source))] + EncodeRowGroup { + source: common_util::codec::row::Error, + }, + + #[snafu(display("Failed to update sequence of memtable, err:{}", source))] + UpdateMemTableSequence { source: crate::memtable::Error }, +} + +define_result!(Error); + +/// Max rows in a write request, must less than [u32::MAX] +const MAX_ROWS_TO_WRITE: usize = 10_000_000; + +pub struct EncodeContext { + row_group: RowGroup, + index_in_writer: IndexInWriterSchema, + encoded_rows: Vec, +} + +impl EncodeContext { + fn new(row_group: RowGroup) -> Self { + Self { + row_group, + index_in_writer: IndexInWriterSchema::default(), + encoded_rows: Vec::new(), + } + } + + fn encode_rows(&mut self, table_schema: &Schema) -> Result<()> { + // Encode the row group into the buffer, which can be reused to write to + // memtable + row::encode_row_group_for_wal( + &self.row_group, + table_schema, + &self.index_in_writer, + &mut self.encoded_rows, + ) + .context(EncodeRowGroup)?; + + assert_eq!(self.row_group.num_rows(), self.encoded_rows.len()); + + Ok(()) + } +} + +impl< + Wal: WalManager + Send + Sync + 'static, + Meta: Manifest + Send + Sync + 'static, + Store: ObjectStore, + Fa: Factory + Send + Sync + 'static, + > Instance +{ + /// Write data to the table under give space. + pub async fn write_to_table( + &self, + space_table: &SpaceAndTable, + request: WriteRequest, + ) -> Result { + // Collect metrics. 
+ space_table.table_data().metrics.on_write_request_begin(); + + self.validate_before_write(space_table, &request)?; + + // Create a oneshot channel to send/receive write result. + let (tx, rx) = oneshot::channel(); + let cmd = WriteTableCommand { + space_table: space_table.clone(), + request, + tx, + }; + + // Send write request to write worker, actual works done in + // Self::process_write_table_command(). + write_worker::process_command_in_write_worker( + cmd.into_command(), + space_table.table_data(), + rx, + ) + .await + .context(Write) + } + + /// Do the actual write, must called by write worker in write thread + /// sequentially. + pub(crate) async fn process_write_table_command( + self: &Arc, + worker_local: &mut WorkerLocal, + space_table: &SpaceAndTable, + request: WriteRequest, + ) -> Result { + let mut encode_ctx = EncodeContext::new(request.row_group); + + self.preprocess_write(worker_local, space_table, &mut encode_ctx) + .await?; + + let table_data = space_table.table_data(); + let schema = table_data.schema(); + encode_ctx.encode_rows(&schema)?; + + let EncodeContext { + row_group, + index_in_writer, + encoded_rows, + } = encode_ctx; + + let sequence = self + .write_to_wal(worker_local, &**table_data, encoded_rows) + .await?; + + Self::write_to_memtable( + worker_local, + &**table_data, + sequence, + &row_group, + index_in_writer, + ) + .map_err(|e| { + error!( + "Failed to write to memtable, space_table:{:?}, err:{}", + space_table, e + ); + e + })?; + + // Failure of writing memtable may cause inconsecutive sequence. 
        if table_data.last_sequence() + 1 != sequence {
            warn!(
                "Sequence must be consecutive, space_table:{:?}, last_sequence:{}, wal_sequence:{}",
                space_table,
                table_data.last_sequence(),
                sequence
            );
        }

        debug!(
            "Instance write finished, update sequence, space_table:{:?}, last_sequence:{}",
            space_table, sequence
        );

        table_data.set_last_sequence(sequence);

        let num_rows = row_group.num_rows();
        // Collect metrics.
        table_data.metrics.on_write_request_done(num_rows);

        Ok(num_rows)
    }

    /// Return Ok if the request is valid; this is done before entering the
    /// write thread.
    fn validate_before_write(
        &self,
        space_table: &SpaceAndTable,
        request: &WriteRequest,
    ) -> Result<()> {
        // Row index is later cast to u32 (see KeySequence in
        // write_to_memtable), so the row count must stay below u32::MAX.
        ensure!(
            request.row_group.num_rows() < MAX_ROWS_TO_WRITE,
            TooManyRows {
                table: &space_table.table_data().name,
                rows: request.row_group.num_rows(),
            }
        );

        Ok(())
    }

    /// Preprocess before write, check:
    /// - whether table is dropped
    /// - memtable capacity and maybe trigger flush
    ///
    /// Fills [common_types::schema::IndexInWriterSchema] in [EncodeContext]
    async fn preprocess_write(
        self: &Arc,
        worker_local: &mut WorkerLocal,
        space_table: &SpaceAndTable,
        encode_ctx: &mut EncodeContext,
    ) -> Result<()> {
        let space = space_table.space();
        let table_data = space_table.table_data();

        // Reject writes to a dropped table explicitly.
        ensure!(
            !table_data.is_dropped(),
            WriteDroppedTable {
                table: &table_data.name,
            }
        );

        // Checks schema compatibility.
        table_data
            .schema()
            .compatible_for_write(
                encode_ctx.row_group.schema(),
                &mut encode_ctx.index_in_writer,
            )
            .context(IncompatSchema)?;

        // TODO(yingwen): Allow write and retry flush.
        // Check background status; if a background error occurred, writing is
        // not allowed again.
        match &*worker_local.background_status() {
            // Compaction error is ignored now.
+ BackgroundStatus::Ok | BackgroundStatus::CompactionFailed(_) => (), + BackgroundStatus::FlushFailed(e) => { + return BackgroundFlushFailed { msg: e.to_string() }.fail(); + } + } + + if self.should_flush_instance() { + if let Some(space) = self.space_store.find_maximum_memory_usage_space() { + if let Some(table) = space.find_maximum_memory_usage_table() { + info!("Trying to flush table {} bytes {} in space {} because engine total memtable memory usage exceeds db_write_buffer_size {}.", + table.name, + table.memtable_memory_usage(), + space.name, + self.db_write_buffer_size, + ); + self.handle_memtable_flush(worker_local, &table).await?; + } + } + } + + if space.should_flush_space() { + if let Some(table) = space.find_maximum_memory_usage_table() { + info!("Trying to flush table {} bytes {} in space {} because space total memtable memory usage exceeds space_write_buffer_size {}.", + table.name, + table.memtable_memory_usage() , + space.name, + space.write_buffer_size, + ); + self.handle_memtable_flush(worker_local, &table).await?; + } + } + + if table_data.should_flush_table(worker_local) { + self.handle_memtable_flush(worker_local, table_data).await?; + } + + Ok(()) + } + + /// Write log_batch into wal, return the sequence number of log_batch. 
+ async fn write_to_wal( + &self, + _worker_local: &WorkerLocal, + table_data: &TableData, + encoded_rows: Vec, + ) -> Result { + // Convert into pb + let mut write_req_pb = table_requests::WriteRequest::new(); + // Use the table schema instead of the schema in request to avoid schema + // mismatch during replaying + write_req_pb.set_schema(table_data.schema().into()); + write_req_pb.set_rows(encoded_rows.into()); + + let mut log_batch = LogWriteBatch::new(table_data.wal_region_id()); + // Now we only have one request, so no need to use with_capacity + log_batch.push(LogWriteEntry { + payload: WritePayload::Write(&write_req_pb), + }); + + // Write to wal manager + let write_ctx = WriteContext::default(); + let sequence = self + .space_store + .wal_manager + .write(&write_ctx, &log_batch) + .await + .context(WriteLogBatch { + table: &table_data.name, + })?; + + Ok(sequence) + } + + // TODO(yingwen): How to trigger flush if we found memtables are full during + // inserting memtable? RocksDB checks memtable size in MemTableInserter + /// Write data into memtable. + /// + /// The data in `encoded_rows` will be moved to memtable. + /// + /// The len of `row_group` and `encoded_rows` must be equal. + pub(crate) fn write_to_memtable( + worker_local: &WorkerLocal, + table_data: &TableData, + sequence: SequenceNumber, + row_group: &RowGroup, + index_in_writer: IndexInWriterSchema, + ) -> Result<()> { + if row_group.is_empty() { + return Ok(()); + } + + let schema = row_group.schema(); + // Store all memtables we wrote and update their last sequence later. 
+ let mut wrote_memtables: SmallVec<[_; 4]> = SmallVec::new(); + let mut last_mutable_mem: Option = None; + + let mut ctx = PutContext::new(index_in_writer); + for (row_idx, row) in row_group.iter().enumerate() { + // TODO(yingwen): Add RowWithSchema and take RowWithSchema as input, then remove + // this unwrap() + let timestamp = row.timestamp(schema).unwrap(); + // skip expired row + if table_data.is_expired(timestamp) { + trace!("Skip expired row when write to memtable, row:{:?}", row); + continue; + } + if last_mutable_mem.is_none() + || !last_mutable_mem + .as_ref() + .unwrap() + .accept_timestamp(timestamp) + { + // The time range is not processed by current memtable, find next one. + let mutable_mem = table_data + .find_or_create_mutable(worker_local, timestamp, schema) + .context(FindMutableMemTable { + table: &table_data.name, + })?; + wrote_memtables.push(mutable_mem.clone()); + last_mutable_mem = Some(mutable_mem); + } + + // We have check the row num is less than `MAX_ROWS_TO_WRITE`, it is safe to + // cast it to u32 here + let key_seq = KeySequence::new(sequence, row_idx as u32); + // TODO(yingwen): Batch sample timestamp in sampling phase. + last_mutable_mem + .as_ref() + .unwrap() + .put(&mut ctx, key_seq, row, schema, timestamp) + .context(WriteMemTable { + table: &table_data.name, + })?; + } + + // Update last sequence of memtable. + for mem_wrote in wrote_memtables { + mem_wrote + .set_last_sequence(sequence) + .context(UpdateMemTableSequence)?; + } + + Ok(()) + } + + /// Flush memtables of table in background. + /// + /// Only flush mutable memtables, assuming all immutable memtables are + /// flushing. + async fn handle_memtable_flush( + self: &Arc, + worker_local: &mut WorkerLocal, + table_data: &TableDataRef, + ) -> Result<()> { + let opts = TableFlushOptions::default(); + + // Set `block_on_write_thread` to false and let flush do in background. 
+ self.flush_table_in_worker(worker_local, table_data, opts) + .await + .context(FlushTable { + table: &table_data.name, + }) + } +} diff --git a/analytic_engine/src/instance/write_worker.rs b/analytic_engine/src/instance/write_worker.rs new file mode 100644 index 0000000000..41089a2605 --- /dev/null +++ b/analytic_engine/src/instance/write_worker.rs @@ -0,0 +1,970 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Write workers + +use std::{ + collections::HashMap, + future::Future, + sync::{ + atomic::{AtomicBool, AtomicI64, Ordering}, + Arc, + }, + time::Instant, +}; + +use common_util::{ + define_result, + runtime::{JoinHandle, Runtime}, + time::InstantExt, +}; +use futures::future; +use log::{error, info}; +use object_store::ObjectStore; +use snafu::{Backtrace, ResultExt, Snafu}; +use table_engine::{ + engine::DropTableRequest, + table::{ + AlterSchemaRequest, Error as TableError, Result as TableResult, TableId, WriteRequest, + }, +}; +use tokio::sync::{mpsc, oneshot, watch, watch::Ref, Mutex, Notify}; +use wal::{ + log_batch::LogEntry, + manager::{ReadContext, WalManager}, +}; + +use crate::{ + compaction::{TableCompactionRequest, WaitResult}, + instance::{ + alter, drop, + flush_compaction::{self, TableFlushOptions}, + open, write, write_worker, InstanceRef, + }, + meta::Manifest, + payload::ReadPayload, + space::{SpaceAndTable, SpaceId}, + sst::factory::Factory, + table::{data::TableDataRef, metrics::Metrics}, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to wait flush completed, channel disconnected, err:{}", source))] + WaitFlush { + source: Box, + }, + + #[snafu(display( + "Background flush failed, cannot write more data, err:{}.\nBacktrace:\n{}", + msg, + backtrace + ))] + BackgroundFlushFailed { msg: String, backtrace: Backtrace }, + + #[snafu(display( + "Failed to receive cmd result, channel disconnected, table:{}, worker_id:{}.\nBacktrace:\n{}", + table, + worker_id, + backtrace, + ))] + 
ReceiveFromWorker { + table: String, + worker_id: usize, + backtrace: Backtrace, + }, + + #[snafu(display("Channel error, err:{}", source))] + Channel { + source: Box, + }, +} + +define_result!(Error); + +#[derive(Debug)] +pub enum BackgroundStatus { + Ok, + FlushFailed(Arc), + CompactionFailed(Arc), +} + +/// Local state of worker +/// +/// The worker is single threaded and holding this is equivalent to holding a +/// write lock +#[derive(Debug)] +pub struct WorkerLocal { + data: Arc, + background_rx: watch::Receiver, +} + +/// Notifier for the write worker when finishing flushing. +struct FlushNotifier(Arc); + +impl FlushNotifier { + fn new(data: Arc) -> Self { + data.num_background_jobs.fetch_add(1, Ordering::SeqCst); + + Self(data) + } + + /// Mark flush is done and notify the waiter status ok (write thread). + /// Concurrency: + /// - Caller should guarantee that there is only one thread (the flush + /// thread) calling this method + pub fn notify_ok(self) { + // Mark the worker is not flushing. + self.0.set_is_flushing(false); + // Send message to notify waiter, ignore send result. + let _ = self.0.background_tx.send(BackgroundStatus::Ok); + } + + /// Mark flush is done and notify the waiter error (write thread). + /// Concurrency: + /// - Caller should guarantee that there is only one thread (the flush + /// thread) calling this method + pub fn notify_err(self, err: Arc) { + // Mark the worker is not flushing. + self.0.set_is_flushing(false); + // Send message to notify waiter, ignore send result. + let _ = self + .0 + .background_tx + .send(BackgroundStatus::FlushFailed(err)); + } +} + +impl Drop for FlushNotifier { + fn drop(&mut self) { + // SeqCst to ensure subtraction num_background_jobs won't be reordered. + self.0.num_background_jobs.fetch_sub(1, Ordering::SeqCst); + self.0.background_notify.notify_one(); + } +} + +/// Notifier to notify compaction result. If no compaction happened, then the +/// notifier may not be signaled. 
+pub struct CompactionNotifier(Arc); + +impl CompactionNotifier { + fn new(data: Arc) -> Self { + data.num_background_jobs.fetch_add(1, Ordering::SeqCst); + + Self(data) + } + + pub fn notify_ok(self) { + // Send message to notify waiter, ignore send result. + let _ = self.0.background_tx.send(BackgroundStatus::Ok); + } + + pub fn notify_err(self, err: Arc) { + // Send message to notify waiter, ignore send result. + let _ = self + .0 + .background_tx + .send(BackgroundStatus::CompactionFailed(err)); + } +} + +impl Clone for CompactionNotifier { + fn clone(&self) -> Self { + // It will add num_background_jobs in CompactionNotifier::new, + // so we can't derive Clone for CompactionNotifier. + CompactionNotifier::new(self.0.clone()) + } +} + +impl Drop for CompactionNotifier { + fn drop(&mut self) { + // SeqCst to ensure subtraction num_background_jobs won't be reordered. + self.0.num_background_jobs.fetch_sub(1, Ordering::SeqCst); + self.0.background_notify.notify_one(); + } +} + +fn send_flush_result(res_sender: Option>>, res: TableResult<()>) { + if let Some(tx) = res_sender { + if let Err(send_res) = tx.send(res) { + error!("Fail to send flush result, send_res: {:?}", send_res); + } + } +} + +impl WorkerLocal { + #[inline] + pub fn background_status(&self) -> Ref<'_, BackgroundStatus> { + self.background_rx.borrow() + } + + /// Control the flush procedure and ensure multiple flush procedures to be + /// sequential. + /// + /// REQUIRE: should only be called by the write thread. + pub async fn flush_sequentially( + &mut self, + table: String, + metrics: &Metrics, + flush_job: F, + on_flush_success: T, + block_on_write_thread: bool, + res_sender: Option>>, + ) -> Result<()> + where + F: Future> + Send + 'static, + T: Future + Send + 'static, + { + // If flush operation is running, then we need to wait for it to complete first. 
+ // Actually, the loop waiting ensures the multiple flush procedures to be + // sequential, that is to say, at most one flush is being executed at + // the same time. + let mut stall_begin = None; + while self.data.is_flushing() { + if stall_begin.is_none() { + stall_begin = Some(Instant::now()); + } + + self.background_rx + .changed() + .await + .map_err(|e| Box::new(e) as _) + .context(WaitFlush)?; + } + assert!(!self.data.is_flushing()); + + // Report write stall. + if let Some(instant) = stall_begin { + metrics.on_write_stall(instant.saturating_elapsed()); + } + + // Check background status, if background error occurred, current flush is not + // allowed. + match &*self.background_status() { + // Now background compaction error is ignored. + BackgroundStatus::Ok | BackgroundStatus::CompactionFailed(_) => (), + BackgroundStatus::FlushFailed(e) => { + return BackgroundFlushFailed { msg: e.to_string() }.fail(); + } + } + + // TODO(yingwen): Store pending flush requests and retry flush on recoverable + // error, or try to recover from background error. + + // Mark the worker is flushing. + self.data.set_is_flushing(true); + + let worker_data = self.data.clone(); + // Create a notifier, remember to mark flushed and notify wait when we done! + let notifier = FlushNotifier::new(worker_data); + let task = async move { + let flush_res = flush_job.await; + + match flush_res { + Ok(()) => { + notifier.notify_ok(); + on_flush_success.await; + send_flush_result(res_sender, Ok(())); + } + Err(e) => { + let e = Arc::new(e); + notifier.notify_err(e.clone()); + send_flush_result( + res_sender, + Err(TableError::Flush { + source: Box::new(e), + table, + }), + ); + } + } + }; + + if block_on_write_thread { + task.await; + } else { + self.data.runtime.spawn(task); + } + + Ok(()) + } + + pub fn compaction_notifier(&self) -> CompactionNotifier { + let data = self.data.clone(); + CompactionNotifier::new(data) + } +} + +/// Write table command. 
+pub struct WriteTableCommand { + pub space_table: SpaceAndTable, + pub request: WriteRequest, + /// Sender for the worker to return result of write + pub tx: oneshot::Sender>, +} + +impl WriteTableCommand { + /// Convert into [Command] + pub fn into_command(self) -> Command { + Command::Write(self) + } +} + +/// Recover table command. +pub struct RecoverTableCommand { + /// Table to recover + pub table_data: TableDataRef, + /// Sender for the worker to return result of recover + pub tx: oneshot::Sender>, + + // Options for recover: + /// Batch size to read records from wal to replay + pub replay_batch_size: usize, +} + +impl RecoverTableCommand { + /// Convert into [Command] + pub fn into_command(self) -> Command { + Command::Recover(self) + } +} + +/// Drop table command +pub struct DropTableCommand { + pub space_table: SpaceAndTable, + pub request: DropTableRequest, + pub tx: oneshot::Sender>, +} + +impl DropTableCommand { + /// Convert into [Command] + pub fn into_command(self) -> Command { + Command::Drop(self) + } +} + +/// Alter table command. +pub struct AlterSchemaCommand { + pub space_table: SpaceAndTable, + pub request: AlterSchemaRequest, + /// Sender for the worker to return result of alter schema + pub tx: oneshot::Sender>, +} + +impl AlterSchemaCommand { + /// Convert into [Command] + pub fn into_command(self) -> Command { + Command::AlterSchema(self) + } +} + +/// Alter table options command. +pub struct AlterOptionsCommand { + pub space_table: SpaceAndTable, + pub options: HashMap, + /// Sender for the worker to return result of alter schema + pub tx: oneshot::Sender>, +} + +impl AlterOptionsCommand { + /// Convert into [Command] + pub fn into_command(self) -> Command { + Command::AlterOptions(self) + } +} + +/// Flush table request. 
+pub struct FlushTableCommand { + pub space_table: SpaceAndTable, + pub flush_opts: TableFlushOptions, + pub tx: oneshot::Sender>, +} + +impl FlushTableCommand { + /// Convert into [Command] + pub fn into_command(self) -> Command { + Command::Flush(self) + } +} + +/// Compact table request. +pub struct CompactTableCommand { + pub space_table: SpaceAndTable, + pub waiter: Option>>, + pub tx: oneshot::Sender>, +} + +impl CompactTableCommand { + /// Convert into [Command] + pub fn into_command(self) -> Command { + Command::Compact(self) + } +} + +/// Command sent to write worker +pub enum Command { + /// Write to table + Write(WriteTableCommand), + + /// Drop table + Drop(DropTableCommand), + + /// Recover table + Recover(RecoverTableCommand), + + /// Alter table schema + AlterSchema(AlterSchemaCommand), + + /// Alter table modify setting + AlterOptions(AlterOptionsCommand), + + /// Flush table + Flush(FlushTableCommand), + + /// Compact table + Compact(CompactTableCommand), + + /// Exit the worker + Exit, +} + +/// Write handle hold by a table +#[derive(Debug, Clone)] +pub struct WriteHandle { + worker_data: Arc, +} + +impl WriteHandle { + /// Send command to write worker. + /// + /// Panic if channel is disconnected + pub async fn send_command(&self, cmd: Command) { + if self.worker_data.tx.send(cmd).await.is_err() { + error!( + "Failed to send command to worker, worker_id:{}", + self.worker_id() + ); + + panic!("write worker {} disconnected", self.worker_id()); + } + } + + /// Returns the id of the worker + pub fn worker_id(&self) -> usize { + self.worker_data.id + } +} + +pub async fn send_command_to_write_worker(cmd: Command, table_data: &TableDataRef) { + table_data.write_handle.send_command(cmd).await; +} + +pub async fn process_command_in_write_worker( + cmd: Command, + table_data: &TableDataRef, + rx: oneshot::Receiver>, +) -> Result { + send_command_to_write_worker(cmd, table_data).await; + + // Receive alter options result. 
+ match rx.await { + Ok(res) => res.map_err(|e| Box::new(e) as _).context(Channel), + Err(_) => ReceiveFromWorker { + table: &table_data.name, + worker_id: table_data.write_handle.worker_id(), + } + .fail(), + } +} + +pub async fn join_all( + table_vec: &[TableDataRef], + rx_vec: Vec>>, +) -> Result<()> { + let results = future::join_all(rx_vec).await; + for (pos, res) in results.into_iter().enumerate() { + let table_data = &table_vec[pos]; + match res { + Ok(res) => { + res.map_err(|e| Box::new(e) as _).context(Channel)?; + } + Err(_) => { + return ReceiveFromWorker { + table: &table_data.name, + worker_id: table_data.write_handle.worker_id(), + } + .fail() + } + } + } + + Ok(()) +} + +/// Write group options +pub struct Options { + pub space_id: SpaceId, + pub worker_num: usize, + pub runtime: Arc, + /// Capacity of the command channel for each worker + pub command_channel_capacity: usize, +} + +// TODO(yingwen): Add method to stop all workers +/// Write group manages all write worker of a space +#[derive(Debug)] +pub struct WriteGroup { + /// Space of the write group. + space_id: SpaceId, + /// Shared datas of workers. + worker_datas: Vec>, + /// Join handles of workers. 
+ handles: Mutex>>, +} + +impl WriteGroup { + pub fn new( + opts: Options, + instance: InstanceRef, + ) -> Self + where + Wal: WalManager + Send + Sync + 'static, + Meta: Manifest + Send + Sync + 'static, + Store: ObjectStore, + Fa: Factory + Send + Sync + 'static, + { + let mut worker_datas = Vec::with_capacity(opts.worker_num); + let mut handles = Vec::with_capacity(opts.worker_num); + for id in 0..opts.worker_num { + let (tx, rx) = mpsc::channel(opts.command_channel_capacity); + let (background_tx, background_rx) = watch::channel(BackgroundStatus::Ok); + + let data = Arc::new(WorkerSharedData { + space_id: opts.space_id, + id, + tx, + is_flushing: AtomicBool::new(false), + background_tx, + runtime: opts.runtime.clone(), + num_background_jobs: AtomicI64::new(0), + background_notify: Notify::new(), + }); + + let mut worker = WriteWorker { + rx, + instance: instance.clone(), + local: WorkerLocal { + data: data.clone(), + background_rx, + }, + log_entry_buf: Vec::new(), + }; + + let space_id = opts.space_id; + // Spawn a task to run the worker + let handle = opts.runtime.spawn(async move { + worker.run().await; + + info!( + "Write worker waiting background jobs, space_id:{}, id:{}", + space_id, id + ); + + worker.wait_background_jobs_done().await; + + info!("Write worker exit, space_id:{}, id:{}", space_id, id); + }); + + worker_datas.push(data); + handles.push(handle); + } + + Self { + space_id: opts.space_id, + worker_datas, + handles: Mutex::new(handles), + } + } + + /// Stop the write group. 
+ pub async fn stop(&self) { + for data in &self.worker_datas { + if data.tx.send(Command::Exit).await.is_err() { + error!( + "Failed to send exit command, space_id:{}, worker_id:{}", + self.space_id, data.id + ); + } + } + + let mut handles = self.handles.lock().await; + for (i, handle) in handles.iter_mut().enumerate() { + if let Err(e) = handle.await { + error!( + "Failed to join handle, space_id:{}, index:{}, err:{}", + self.space_id, i, e + ); + } + } + + // Clear all handles to avoid await again. + handles.clear(); + } + + /// Choose worker for table with `table_id`. The worker chose should be + /// consistent, so the caller can cached the handle of the worker + /// + /// Returns the WriteHandle of the worker + pub fn choose_worker(&self, table_id: TableId) -> WriteHandle { + let index = table_id.as_u64() as usize % self.worker_datas.len(); + let worker_data = self.worker_datas[index].clone(); + + WriteHandle { worker_data } + } +} + +/// Data of write worker +#[derive(Debug)] +struct WorkerSharedData { + /// Space this worker belongs to + space_id: SpaceId, + /// Id of the write worker + id: usize, + /// Sender to send command to this worker + tx: mpsc::Sender, + + /// Whether the flush job is already running + /// + /// When `is_flushing` is true, no more flush job should be scheduled + is_flushing: AtomicBool, + /// Channel to notify background status + background_tx: watch::Sender, + + /// Background job runtime. + runtime: Arc, + /// Numbers of background jobs. + num_background_jobs: AtomicI64, + /// Notify when background job finished. + background_notify: Notify, +} + +impl WorkerSharedData { + fn is_flushing(&self) -> bool { + self.is_flushing.load(Ordering::Relaxed) + } + + fn set_is_flushing(&self, is_flushing: bool) { + self.is_flushing.store(is_flushing, Ordering::Relaxed); + } +} + +/// Table write worker +/// +/// Each table is managed by exactly one write worker. Write request to a table +/// will be sent to this thread and done in this worker. 
+/// +/// The write worker should ensure there is only one flush thread (task) is +/// running. +struct WriteWorker { + /// Command receiver + rx: mpsc::Receiver, + /// Engine instance + instance: InstanceRef, + /// Worker local states + local: WorkerLocal, + /// Log entry buffer for recover + log_entry_buf: Vec>, +} + +impl< + Wal: WalManager + Send + Sync + 'static, + Meta: Manifest + Send + Sync + 'static, + Store: ObjectStore, + Fa: Factory + Send + Sync + 'static, + > WriteWorker +{ + /// Runs the write loop until stopped + async fn run(&mut self) { + // TODO(yingwen): Maybe batch write tasks to improve performance (group commit) + loop { + let command = match self.rx.recv().await { + Some(cmd) => cmd, + None => { + info!( + "Write worker recv None, exit, space_id:{}, id:{}", + self.space_id(), + self.id() + ); + return; + } + }; + + match command { + Command::Write(cmd) => { + self.handle_write_table(cmd).await; + } + Command::Drop(cmd) => { + self.handle_drop_table(cmd).await; + } + Command::Recover(cmd) => { + self.handle_recover_table(cmd).await; + } + Command::AlterSchema(cmd) => { + self.handle_alter_schema(cmd).await; + } + Command::AlterOptions(cmd) => { + self.handle_alter_options(cmd).await; + } + Command::Flush(cmd) => { + self.handle_flush_table(cmd).await; + } + Command::Compact(cmd) => { + self.handle_compact_table(cmd).await; + } + Command::Exit => { + info!( + "Write worker recv Command::Exit, exit, space_id:{}, id:{}", + self.space_id(), + self.id() + ); + return; + } + } + } + } + + async fn wait_background_jobs_done(&self) { + while self.num_background_jobs() > 0 { + self.wait_for_notify().await; + } + } + + async fn handle_write_table(&mut self, cmd: WriteTableCommand) { + let WriteTableCommand { + space_table, + request, + tx, + } = cmd; + + let write_res = self + .instance + .process_write_table_command(&mut self.local, &space_table, request) + .await; + if let Err(res) = tx.send(write_res) { + error!( + "handle write table failed to send 
result, write_res:{:?}",
                res
            );
        }
    }

    /// Handle the recover command: replay the table's data from wal and send
    /// the result back through `cmd.tx`.
    async fn handle_recover_table(&mut self, cmd: RecoverTableCommand) {
        let RecoverTableCommand {
            table_data,
            tx,
            replay_batch_size,
        } = cmd;

        let read_ctx = ReadContext::default();
        // Reuse the worker-local buffer to read log entries in batches.
        self.log_entry_buf.reserve(replay_batch_size);

        let recover_res = self
            .instance
            .recover_table_from_wal(
                &self.local,
                table_data,
                replay_batch_size,
                &read_ctx,
                &mut self.log_entry_buf,
            )
            .await;
        if let Err(res) = tx.send(recover_res) {
            error!(
                "handle recover table failed to send result, recover_res:{:?}",
                res
            );
        }
    }

    /// Handle the drop table command and send the result back through
    /// `cmd.tx`.
    async fn handle_drop_table(&mut self, cmd: DropTableCommand) {
        let DropTableCommand {
            space_table,
            request,
            tx,
        } = cmd;

        let drop_res = self
            .instance
            .process_drop_table_command(&mut self.local, &space_table, request)
            .await;
        if let Err(res) = tx.send(drop_res) {
            error!(
                "handle drop table failed to send result, drop_res:{:?}",
                res
            );
        }
    }

    /// Handle the alter schema command and send the result back through
    /// `cmd.tx`.
    async fn handle_alter_schema(&mut self, cmd: AlterSchemaCommand) {
        let AlterSchemaCommand {
            space_table,
            request,
            tx,
        } = cmd;

        let alter_res = self
            .instance
            .process_alter_schema_command(&mut self.local, &space_table, request)
            .await
            .map_err(|e| Box::new(e) as Box)
            .context(Channel);
        if let Err(res) = tx.send(alter_res) {
            error!(
                "handle alter schema failed to send result, alter_res:{:?}",
                res
            );
        }
    }

    /// Handle the alter options command and send the result back through
    /// `cmd.tx`.
    async fn handle_alter_options(&mut self, cmd: AlterOptionsCommand) {
        let AlterOptionsCommand {
            space_table,
            options,
            tx,
        } = cmd;

        let alter_res = self
            .instance
            .process_alter_options_command(&mut self.local, &space_table, options)
            .await;
        if let Err(res) = tx.send(alter_res) {
            // Fixed copy-paste: this is the alter *options* path, not alter
            // schema.
            error!(
                "handle alter options failed to send result, alter_res:{:?}",
                res
            );
        }
    }

    /// Handle the flush table command and send the result back through
    /// `cmd.tx`.
    async fn handle_flush_table(&mut self, cmd: FlushTableCommand) {
        let FlushTableCommand {
            space_table,
            flush_opts,
            tx,
        } = cmd;

        let flush_res = self
            .instance
+ .flush_table_in_worker(&mut self.local, space_table.table_data(), flush_opts) + .await; + if let Err(res) = tx.send(flush_res) { + error!( + "handle flush table failed to send result, flush_res:{:?}", + res + ); + } + } + + async fn handle_compact_table(&mut self, cmd: CompactTableCommand) { + let CompactTableCommand { + space_table, + waiter, + tx, + } = cmd; + + let request = TableCompactionRequest { + table_data: space_table.table_data().clone(), + compaction_notifier: self.local.compaction_notifier(), + waiter, + }; + + self.instance.schedule_table_compaction(request).await; + if let Err(_res) = tx.send(Ok(())) { + error!("handle compact table failed to send result"); + } + } + + #[inline] + fn space_id(&self) -> SpaceId { + self.local.data.space_id + } + + #[inline] + fn id(&self) -> usize { + self.local.data.id + } + + #[inline] + fn num_background_jobs(&self) -> i64 { + self.local.data.num_background_jobs.load(Ordering::SeqCst) + } + + async fn wait_for_notify(&self) { + self.local.data.background_notify.notified().await; + } +} + +#[cfg(test)] +pub mod tests { + use common_util::runtime; + + use super::*; + + pub struct MockedWriteHandle { + pub write_handle: WriteHandle, + pub rx: mpsc::Receiver, + pub worker_local: WorkerLocal, + } + + pub struct WriteHandleMocker { + space_id: SpaceId, + runtime: Option>, + } + + impl Default for WriteHandleMocker { + fn default() -> Self { + Self { + space_id: 1, + runtime: None, + } + } + } + + impl WriteHandleMocker { + pub fn space_id(mut self, space_id: SpaceId) -> Self { + self.space_id = space_id; + self + } + + pub fn build(self) -> MockedWriteHandle { + let (tx, rx) = mpsc::channel(1); + let (background_tx, background_rx) = watch::channel(BackgroundStatus::Ok); + let runtime = self.runtime.unwrap_or_else(|| { + let rt = runtime::Builder::default().build().unwrap(); + Arc::new(rt) + }); + + let worker_data = Arc::new(WorkerSharedData { + space_id: self.space_id, + id: 0, + tx, + is_flushing: 
AtomicBool::new(false), + background_tx, + runtime, + num_background_jobs: AtomicI64::new(0), + background_notify: Notify::new(), + }); + + let write_handle = WriteHandle { + worker_data: worker_data.clone(), + }; + + MockedWriteHandle { + write_handle, + rx, + worker_local: WorkerLocal { + data: worker_data, + background_rx, + }, + } + } + } +} diff --git a/analytic_engine/src/lib.rs b/analytic_engine/src/lib.rs new file mode 100644 index 0000000000..a4fc60c14f --- /dev/null +++ b/analytic_engine/src/lib.rs @@ -0,0 +1,98 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Analytic table engine implementations + +mod compaction; +mod context; +mod engine; +mod instance; +pub mod memtable; +mod meta; +mod payload; +pub mod row_iter; +mod sampler; +pub mod setup; +pub mod space; +pub mod sst; +pub mod table; +pub mod table_options; + +#[cfg(any(test, feature = "test"))] +pub mod tests; + +use object_store::disk::File; +use serde_derive::Deserialize; +use wal::rocks_impl::manager::RocksImpl; + +pub use crate::{compaction::scheduler::SchedulerConfig, table_options::TableOptions}; +use crate::{ + engine::TableEngineImpl, + instance::InstanceRef, + meta::details::{ManifestImpl, Options as ManifestOptions}, + sst::factory::FactoryImpl, +}; + +/// Analytic table engine +pub type AnalyticTableEngine = + TableEngineImpl, File, FactoryImpl>; +/// Default instance +pub(crate) type EngineInstance = InstanceRef, File, FactoryImpl>; + +/// Config of analytic engine. +#[derive(Debug, Clone, Deserialize)] +#[serde(default)] +pub struct Config { + /// Data path of the engine. + pub data_path: String, + + /// Batch size to read records from wal to replay. + pub replay_batch_size: usize, + /// Batch size to replay tables. + pub max_replay_tables_per_batch: usize, + // Write group options: + pub write_group_worker_num: usize, + pub write_group_command_channel_cap: usize, + // End of write group options. + /// Default options for table. 
+ pub table_opts: TableOptions, + + pub compaction_config: SchedulerConfig, + + /// sst meta cache capacity. + pub sst_meta_cache_cap: Option, + /// sst data cache capacity. + pub sst_data_cache_cap: Option, + + /// Manifest options. + pub manifest: ManifestOptions, + + // Global write buffer options: + /// The maximum write buffer size used for single space. + pub space_write_buffer_size: usize, + /// The maximum size of all Write Buffers across all spaces. + pub db_write_buffer_size: usize, + // End of global write buffer options. +} + +impl Default for Config { + fn default() -> Self { + Self { + data_path: String::from("/tmp/ceresdbx"), + replay_batch_size: 500, + max_replay_tables_per_batch: 64, + write_group_worker_num: 8, + write_group_command_channel_cap: 128, + table_opts: TableOptions::default(), + compaction_config: SchedulerConfig::default(), + sst_meta_cache_cap: Some(1000), + sst_data_cache_cap: Some(1000), + manifest: ManifestOptions::default(), + /// Zero means disabling this param, give a postive value to enable + /// it. + space_write_buffer_size: 0, + /// Zero means disabling this param, give a postive value to enable + /// it. + db_write_buffer_size: 0, + } + } +} diff --git a/analytic_engine/src/memtable/factory.rs b/analytic_engine/src/memtable/factory.rs new file mode 100644 index 0000000000..0867bba2da --- /dev/null +++ b/analytic_engine/src/memtable/factory.rs @@ -0,0 +1,38 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! MemTable factory + +use std::{fmt, sync::Arc}; + +use arena::CollectorRef; +use common_types::{schema::Schema, SequenceNumber}; +use common_util::define_result; +use snafu::Snafu; + +use crate::memtable::MemTableRef; + +#[derive(Debug, Snafu)] +pub enum Error {} + +define_result!(Error); + +/// MemTable options +pub struct Options { + /// Schema of the skiplist. + pub schema: Schema, + /// Block size of arena in bytes. + pub arena_block_size: u32, + /// Log sequence at the memtable creation. 
+ pub creation_sequence: SequenceNumber, + /// Memory usage colllector + pub collector: CollectorRef, +} + +/// MemTable factory +pub trait Factory: fmt::Debug { + /// Create a new memtable instance + fn create_memtable(&self, opts: Options) -> Result; +} + +/// MemTable Factory reference +pub type FactoryRef = Arc; diff --git a/analytic_engine/src/memtable/key.rs b/analytic_engine/src/memtable/key.rs new file mode 100644 index 0000000000..6c11837028 --- /dev/null +++ b/analytic_engine/src/memtable/key.rs @@ -0,0 +1,249 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Memtable key +//! +//! Some concepts: +//! - User key (row key) is a bytes encoded from the key columns of a row +//! - Internal key contains +//! - user key +//! - memtable key sequence +//! - sequence number +//! - index + +use std::mem; + +use common_types::{ + bytes::{BytesMut, MemBuf, MemBufMut}, + row::Row, + schema::Schema, + SequenceNumber, +}; +use common_util::{ + codec::{memcomparable::MemComparable, Decoder, Encoder}, + define_result, +}; +use snafu::{ensure, Backtrace, ResultExt, Snafu}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to encode key datum, err:{}", source))] + EncodeKeyDatum { + source: common_util::codec::memcomparable::Error, + }, + + #[snafu(display("Failed to encode sequence, err:{}", source))] + EncodeSequence { source: common_types::bytes::Error }, + + #[snafu(display("Failed to encode row index, err:{}", source))] + EncodeIndex { source: common_types::bytes::Error }, + + #[snafu(display("Failed to decode sequence, err:{}", source))] + DecodeSequence { source: common_types::bytes::Error }, + + #[snafu(display("Failed to decode row index, err:{}", source))] + DecodeIndex { source: common_types::bytes::Error }, + + #[snafu(display( + "Insufficent internal key length, len:{}.\nBacktrace:\n{}", + len, + backtrace + ))] + InternalKeyLen { len: usize, backtrace: Backtrace }, +} + +define_result!(Error); + +// u64 + u32 
+const KEY_SEQUENCE_BYTES_LEN: usize = 12; + +/// Row index in the batch +pub type RowIndex = u32; + +/// Sequence number of row in memtable +/// +/// Contains: +/// - sequence number in wal (sequence number of the write batch) +/// - unique index of the row in the write batch +/// +/// Ordering: +/// 1. ordered by sequence desc +/// 2. ordered by index desc +/// +/// The desc order is implemented via MAX - seq +/// +/// The index is used to distinguish rows with same key of the same write batch +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct KeySequence(SequenceNumber, RowIndex); + +impl KeySequence { + pub fn new(sequence: SequenceNumber, index: RowIndex) -> Self { + Self(sequence, index) + } + + #[inline] + pub fn sequence(&self) -> SequenceNumber { + self.0 + } + + #[inline] + pub fn row_index(&self) -> RowIndex { + self.1 + } +} + +// TODO(yingwen): We also need opcode (PUT/DELETE), put it in key or row value +/// Comparable internal key encoder +/// +/// Key order: +/// 1. ordered by user key ascend (key parts of a row) +/// 2. 
ordered by sequence descend +/// +/// Encoding: +/// user_key + sequence +/// +/// REQUIRE: The schema of row to encode matches the Self::schema +pub struct ComparableInternalKey<'a> { + /// Sequence number of the row + sequence: KeySequence, + /// Schema of row + schema: &'a Schema, +} + +impl<'a> ComparableInternalKey<'a> { + pub fn new(sequence: KeySequence, schema: &'a Schema) -> Self { + Self { sequence, schema } + } +} + +impl<'a> Encoder for ComparableInternalKey<'a> { + type Error = Error; + + fn encode(&self, buf: &mut B, value: &Row) -> Result<()> { + let encoder = MemComparable; + for idx in 0..self.schema.num_key_columns() { + // Encode each column in primary key + encoder.encode(buf, &value[idx]).context(EncodeKeyDatum)?; + } + SequenceCodec.encode(buf, &self.sequence)?; + + Ok(()) + } + + fn estimate_encoded_size(&self, value: &Row) -> usize { + let encoder = MemComparable; + let mut total_len = 0; + for idx in 0..self.schema.num_key_columns() { + // Size of each column in primary key + total_len += encoder.estimate_encoded_size(&value[idx]); + } + // The size of sequence + total_len += KEY_SEQUENCE_BYTES_LEN; + + total_len + } +} + +struct SequenceCodec; + +impl Encoder for SequenceCodec { + type Error = Error; + + fn encode(&self, buf: &mut B, value: &KeySequence) -> Result<()> { + // Encode sequence number and index in descend order + encode_sequence_number(buf, value.sequence())?; + let reversed_index = RowIndex::MAX - value.row_index(); + buf.write_u32(reversed_index).context(EncodeIndex)?; + + Ok(()) + } + + fn estimate_encoded_size(&self, _value: &KeySequence) -> usize { + KEY_SEQUENCE_BYTES_LEN + } +} + +impl Decoder for SequenceCodec { + type Error = Error; + + fn decode(&self, buf: &mut B) -> Result { + let sequence = buf.read_u64().context(DecodeSequence)?; + // Reverse sequence + let sequence = SequenceNumber::MAX - sequence; + let row_index = buf.read_u32().context(DecodeIndex)?; + // Reverse row index + let row_index = RowIndex::MAX - 
row_index; + + Ok(KeySequence::new(sequence, row_index)) + } +} + +#[inline] +fn encode_sequence_number(buf: &mut B, sequence: SequenceNumber) -> Result<()> { + // The sequence need to encode in descend order + let reversed_sequence = SequenceNumber::MAX - sequence; + // Encode sequence + buf.write_u64(reversed_sequence).context(EncodeSequence)?; + Ok(()) +} + +// TODO(yingwen): Maybe make decoded internal key a type? + +/// Encode internal key from user key for seek +/// +/// - user_key: the user key to encode +/// - sequence: the sequence number to encode into internal key +/// - scratch: buffer to store the encoded internal key, the scratch will be +/// clear +/// +/// Returns the slice to the encoded internal key +pub fn internal_key_for_seek<'a>( + user_key: &[u8], + sequence: SequenceNumber, + scratch: &'a mut BytesMut, +) -> Result<&'a [u8]> { + scratch.clear(); + + scratch.reserve(user_key.len() + mem::size_of::()); + scratch.extend_from_slice(user_key); + encode_sequence_number(scratch, sequence)?; + + Ok(&scratch[..]) +} + +/// Decode user key and sequence number from the internal key +pub fn user_key_from_internal_key(internal_key: &[u8]) -> Result<(&[u8], KeySequence)> { + // Empty user key is meaningless + ensure!( + internal_key.len() > KEY_SEQUENCE_BYTES_LEN, + InternalKeyLen { + len: internal_key.len(), + } + ); + + let (left, mut right) = internal_key.split_at(internal_key.len() - KEY_SEQUENCE_BYTES_LEN); + // Decode sequence number from right part + let sequence = SequenceCodec.decode(&mut right)?; + + Ok((left, sequence)) +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_sequence_codec() { + let codec = SequenceCodec; + + let sequence = KeySequence::new(123, 456); + assert_eq!(12, codec.estimate_encoded_size(&sequence)); + let mut buf = Vec::new(); + codec.encode(&mut buf, &sequence).unwrap(); + assert_eq!(12, buf.len()); + + let mut b = &buf[..]; + let decoded_sequence = codec.decode(&mut b).unwrap(); + + assert_eq!(sequence, 
decoded_sequence); + } +} diff --git a/analytic_engine/src/memtable/mod.rs b/analytic_engine/src/memtable/mod.rs new file mode 100644 index 0000000000..5074eff34c --- /dev/null +++ b/analytic_engine/src/memtable/mod.rs @@ -0,0 +1,198 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! MemTable + +pub mod factory; +pub mod key; +pub mod skiplist; + +use std::{ops::Bound, sync::Arc}; + +use common_types::{ + bytes::{ByteVec, Bytes}, + projected_schema::ProjectedSchema, + record_batch::RecordBatchWithKey, + row::Row, + schema::{IndexInWriterSchema, Schema}, + SequenceNumber, +}; +use common_util::define_result; +use snafu::{Backtrace, Snafu}; + +use crate::memtable::key::KeySequence; + +const DEFAULT_SCAN_BATCH_SIZE: usize = 500; + +#[derive(Debug, Snafu)] +#[snafu(visibility(pub(crate)))] +pub enum Error { + #[snafu(display("Failed to encode internal key, err:{}", source))] + EncodeInternalKey { source: crate::memtable::key::Error }, + + #[snafu(display("Failed to decode internal key, err:{}", source))] + DecodeInternalKey { source: crate::memtable::key::Error }, + + #[snafu(display("Failed to decode row, err:{}", source))] + DecodeRow { + source: common_util::codec::row::Error, + }, + + #[snafu(display("Failed to append row to batch builder, err:{}", source))] + AppendRow { + source: common_types::record_batch::Error, + }, + + #[snafu(display("Failed to build record batch, err:{}", source,))] + BuildRecordBatch { + source: common_types::record_batch::Error, + }, + + #[snafu(display("Failed to project memtable schema, err:{}", source))] + ProjectSchema { + source: common_types::projected_schema::Error, + }, + + #[snafu(display( + "Invalid sequence number to put, given:{}, last:{}.\nBacktrace:\n{}", + given, + last, + backtrace + ))] + InvalidPutSequence { + given: SequenceNumber, + last: SequenceNumber, + backtrace: Backtrace, + }, + + #[snafu(display("Invalid row, err:{}", source))] + InvalidRow { + source: Box, + }, + + 
#[snafu(display("Fail to iter in reverse order, err:{}", source))] + IterReverse { + source: Box, + }, +} + +define_result!(Error); + +/// Options for put and context for tracing +pub struct PutContext { + /// Buffer for encoding key, can reuse during put + pub key_buf: ByteVec, + /// Buffer for encoding value, can reuse during put + pub value_buf: ByteVec, + /// Used to encode row. + pub index_in_writer: IndexInWriterSchema, +} + +impl PutContext { + pub fn new(index_in_writer: IndexInWriterSchema) -> Self { + Self { + key_buf: ByteVec::new(), + value_buf: ByteVec::new(), + index_in_writer, + } + } +} + +/// Options for scan and context for tracing +#[derive(Debug, Clone)] +pub struct ScanContext { + /// Suggested row number per batch + pub batch_size: usize, +} + +impl Default for ScanContext { + fn default() -> Self { + Self { + batch_size: DEFAULT_SCAN_BATCH_SIZE, + } + } +} + +/// Scan request +/// +/// Now we only support forward scan. +#[derive(Debug, Clone)] +pub struct ScanRequest { + /// The start key of the encoded user key (without sequence). + pub start_user_key: Bound, + /// The end key of the encoded user key (without sequence). + pub end_user_key: Bound, + /// Max visible sequence (inclusive), row key with sequence <= this can be + /// visible. + pub sequence: SequenceNumber, + /// Schema and projection to read. + pub projected_schema: ProjectedSchema, + pub need_dedup: bool, + pub reverse: bool, +} + +/// In memory storage for table's data. +/// +/// # Concurrency +/// The memtable is designed for single-writer and mutltiple-reader usage, so +/// not all function supports concurrent writer, the caller should guarantee not +/// writing to the memtable concurrrently. +// All operation is done in memory, no need to use async trait +pub trait MemTable { + /// Schema of this memtable + /// + /// The schema of a memtable is not allowed to change now. 
Modifying the + /// schema of a table requires a memtable switch and external + /// synchronization + fn schema(&self) -> &Schema; + + /// Peek the min key of this memtable. + fn min_key(&self) -> Option; + + /// Peek the max key of this memtable. + fn max_key(&self) -> Option; + + /// Insert one row into the memtable. + /// + ///.- ctx: The put context + /// - sequence: The sequence of the row + /// - row: The row to insert + /// - schema: The schema of the row + /// + /// REQUIRE: + /// - The schema of RowGroup must equal to the schema of memtable. How to + /// handle duplicate entries is implementation specific. + fn put( + &self, + ctx: &mut PutContext, + sequence: KeySequence, + row: &Row, + schema: &Schema, + ) -> Result<()>; + + /// Scan the memtable. + /// + /// Returns the data in columnar format. The returned rows is guaranteed + /// to be ordered by the primary key. + fn scan(&self, ctx: ScanContext, request: ScanRequest) -> Result; + + /// Returns an estimate of the number of bytes of data in used + fn approximate_memory_usage(&self) -> usize; + + /// Set last sequence of the memtable, returns error if the given `sequence` + /// is less than existing last sequence. + /// + /// REQUIRE: + /// - External synchronization is required. + fn set_last_sequence(&self, sequence: SequenceNumber) -> Result<()>; + + /// Returns the last sequence of the memtable. + /// + /// If the memtable is empty, then the last sequence is 0. + fn last_sequence(&self) -> SequenceNumber; +} + +/// A reference to memtable +pub type MemTableRef = Arc; + +/// A pointer to columnar iterator +pub type ColumnarIterPtr = Box> + Send + Sync>; diff --git a/analytic_engine/src/memtable/skiplist/factory.rs b/analytic_engine/src/memtable/skiplist/factory.rs new file mode 100644 index 0000000000..89dd453587 --- /dev/null +++ b/analytic_engine/src/memtable/skiplist/factory.rs @@ -0,0 +1,32 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Skiplist memtable factory + +use std::sync::{atomic::AtomicU64, Arc}; + +use arena::MonoIncArena; +use skiplist::Skiplist; + +use crate::memtable::{ + factory::{Factory, Options, Result}, + skiplist::{BytewiseComparator, SkiplistMemTable}, + MemTableRef, +}; + +/// Factory to create memtable +#[derive(Debug)] +pub struct SkiplistMemTableFactory; + +impl Factory for SkiplistMemTableFactory { + fn create_memtable(&self, opts: Options) -> Result { + let arena = MonoIncArena::with_collector(opts.arena_block_size as usize, opts.collector); + let skiplist = Skiplist::with_arena(BytewiseComparator, arena); + let memtable = Arc::new(SkiplistMemTable { + schema: opts.schema, + skiplist, + last_sequence: AtomicU64::new(opts.creation_sequence), + }); + + Ok(memtable) + } +} diff --git a/analytic_engine/src/memtable/skiplist/iter.rs b/analytic_engine/src/memtable/skiplist/iter.rs new file mode 100644 index 0000000000..0cf60cc90e --- /dev/null +++ b/analytic_engine/src/memtable/skiplist/iter.rs @@ -0,0 +1,346 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Skiplist memtable iterator + +use std::{cmp::Ordering, iter::Rev, ops::Bound}; + +use arena::{Arena, BasicStats}; +use common_types::{ + bytes::{Bytes, BytesMut}, + projected_schema::{ProjectedSchema, RowProjector}, + record_batch::{RecordBatchWithKey, RecordBatchWithKeyBuilder}, + row::contiguous::{ContiguousRowReader, ProjectedContiguousRow}, + schema::Schema, + SequenceNumber, +}; +use common_util::codec::row; +use log::trace; +use skiplist::{ArenaSlice, IterRef, Skiplist}; +use snafu::ResultExt; + +use crate::memtable::{ + key::{self, KeySequence}, + skiplist::{BytewiseComparator, SkiplistMemTable}, + AppendRow, BuildRecordBatch, DecodeInternalKey, EncodeInternalKey, IterReverse, ProjectSchema, + Result, ScanContext, ScanRequest, +}; + +/// Iterator state +#[derive(Debug, PartialEq)] +enum State { + /// The iterator struct is created but not initialized + Uninitialized, + /// The iterator is initialized (seek) + Initialized, + /// No more element the iterator can return + Finished, +} + +/// Columnar iterator for [SkiplistMemTable] +pub struct ColumnarIterImpl + Clone + Sync + Send> { + /// The internal skiplist iter + iter: IterRef, BytewiseComparator, A>, + + // Schema related: + /// Schema of this memtable, used to decode row + memtable_schema: Schema, + /// Projection of schema to read + projected_schema: ProjectedSchema, + projector: RowProjector, + + // Options related: + batch_size: usize, + + start_user_key: Bound, + end_user_key: Bound, + /// Max visible sequence + sequence: SequenceNumber, + /// State of iterator + state: State, + /// Last internal key this iterator returned + // TODO(yingwen): Wrap a internal key struct? 
+ last_internal_key: Option>, + + /// Dedup rows with key + need_dedup: bool, +} + +impl + Clone + Sync + Send> ColumnarIterImpl { + /// Create a new [ColumnarIterImpl] + pub fn new( + memtable: &SkiplistMemTable, + ctx: ScanContext, + request: ScanRequest, + ) -> Result { + // Create projection for the memtable schema + let projector = request + .projected_schema + .try_project_with_key(&memtable.schema) + .context(ProjectSchema)?; + + let iter = memtable.skiplist.iter(); + let mut columnar_iter = Self { + iter, + memtable_schema: memtable.schema.clone(), + projected_schema: request.projected_schema, + projector, + batch_size: ctx.batch_size, + start_user_key: request.start_user_key, + end_user_key: request.end_user_key, + sequence: request.sequence, + state: State::Uninitialized, + last_internal_key: None, + need_dedup: request.need_dedup, + }; + + columnar_iter.init()?; + + Ok(columnar_iter) + } + + /// Init the iterator, will seek to the proper position for first next() + /// call, so the first entry next() returned is after the + /// `start_user_key`, but we still need to check `end_user_key` + fn init(&mut self) -> Result<()> { + match &self.start_user_key { + Bound::Included(user_key) => { + // Construct seek key + let mut key_buf = BytesMut::new(); + let seek_key = key::internal_key_for_seek(user_key, self.sequence, &mut key_buf) + .context(EncodeInternalKey)?; + + // Seek the skiplist + self.iter.seek(seek_key); + } + Bound::Excluded(user_key) => { + // Construct seek key, just seek to the key with next prefix, so there is no + // need to skip the key until we meet the first key > + // start_user_key + let next_user_key = row::key_prefix_next(user_key); + let mut key_buf = BytesMut::new(); + let seek_key = + key::internal_key_for_seek(&next_user_key, self.sequence, &mut key_buf) + .context(EncodeInternalKey)?; + + // Seek the skiplist + self.iter.seek(seek_key); + } + Bound::Unbounded => self.iter.seek_to_first(), + } + + self.state = State::Initialized; + 
+ Ok(()) + } + + /// Fetch next record batch + fn fetch_next_record_batch(&mut self) -> Result> { + debug_assert_eq!(State::Initialized, self.state); + assert!(self.batch_size > 0); + + let mut builder = RecordBatchWithKeyBuilder::with_capacity( + self.projected_schema.to_record_schema_with_key(), + self.batch_size, + ); + let mut num_rows = 0; + while self.iter.valid() && num_rows < self.batch_size { + if let Some(row) = self.fetch_next_row()? { + let row_reader = ContiguousRowReader::with_schema(&row, &self.memtable_schema); + let projected_row = ProjectedContiguousRow::new(row_reader, &self.projector); + + trace!("Column iterator fetch next row, row:{:?}", projected_row); + + builder + .append_projected_contiguous_row(&projected_row) + .context(AppendRow)?; + num_rows += 1; + } else { + // There is no more row to fetch + self.finish(); + break; + } + } + + if num_rows > 0 { + let batch = builder.build().context(BuildRecordBatch)?; + trace!("column iterator send one batch:{:?}", batch); + + Ok(Some(batch)) + } else { + // If iter is invalid after seek (nothing matched), then it may not be marked as + // finished yet + self.finish(); + Ok(None) + } + } + + /// Fetch next row matched the given condition, the current entry of iter + /// will be considered + /// + /// REQUIRE: The iter is valid + fn fetch_next_row(&mut self) -> Result>> { + debug_assert_eq!(State::Initialized, self.state); + + // TODO(yingwen): Some operation like delete needs to be considered during + // iterating: we need to ignore this key if found a delete mark + while self.iter.valid() { + // Fetch current entry + let key = self.iter.key(); + let (user_key, sequence) = + key::user_key_from_internal_key(key).context(DecodeInternalKey)?; + + // Check user key is still in range + if self.is_after_end_bound(user_key) { + // Out of bound + self.finish(); + return Ok(None); + } + + if self.need_dedup { + // Whether this user key is already returned + let same_key = match &self.last_internal_key { + 
Some(last_internal_key) => { + // TODO(yingwen): Actually this call wont fail, only valid internal key will + // be set as last_internal_key so maybe we can just + // unwrap it? + let (last_user_key, _) = key::user_key_from_internal_key(last_internal_key) + .context(DecodeInternalKey)?; + user_key == last_user_key + } + // This is the first user key + None => false, + }; + + if same_key { + // We meet duplicate key, move forward and continue to find next user key + self.iter.next(); + continue; + } + // Now this is a new user key + } + + // Check whether this key is visible + if !self.is_visible(sequence) { + // The sequence of this key is not visible, move forward + self.iter.next(); + continue; + } + + // This is the row we want + let row = self.iter.value_with_arena(); + + // Store the last key + self.last_internal_key = Some(self.iter.key_with_arena()); + // Move iter forward + self.iter.next(); + + return Ok(Some(row)); + } + + // No more row in range, we can stop the iterator + self.finish(); + Ok(None) + } + + /// Return true if the sequence is visible + #[inline] + fn is_visible(&self, sequence: KeySequence) -> bool { + sequence.sequence() <= self.sequence + } + + /// Return true if the key is after the `end_user_key` bound + fn is_after_end_bound(&self, key: &[u8]) -> bool { + match &self.end_user_key { + Bound::Included(end) => match key.cmp(end) { + Ordering::Less | Ordering::Equal => false, + Ordering::Greater => true, + }, + Bound::Excluded(end) => match key.cmp(end) { + Ordering::Less => false, + Ordering::Equal | Ordering::Greater => true, + }, + // All key is valid + Bound::Unbounded => false, + } + } + + /// Mark the iterator state to finished and return None + fn finish(&mut self) { + self.state = State::Finished; + } +} + +impl + Clone + Sync + Send> Iterator for ColumnarIterImpl { + type Item = Result; + + fn next(&mut self) -> Option { + if self.state != State::Initialized { + return None; + } + + self.fetch_next_record_batch().transpose() + } 
+} + +/// Reversed columnar iterator. +// TODO(xikai): Now the implementation is not perfect: read all the entries +// into a buffer and reverse read it. The memtable should support scan in +// reverse order naturally. +pub struct ReversedColumnarIterator { + iter: I, + reversed_iter: Option>>>, + num_record_batch: usize, +} + +impl ReversedColumnarIterator +where + I: Iterator>, +{ + pub fn new(iter: I, num_rows: usize, batch_size: usize) -> Self { + Self { + iter, + reversed_iter: None, + num_record_batch: num_rows / batch_size, + } + } + + fn init_if_necessary(&mut self) { + if self.reversed_iter.is_some() { + return; + } + + let mut buf = Vec::with_capacity(self.num_record_batch); + for item in &mut self.iter { + buf.push(item); + } + self.reversed_iter = Some(buf.into_iter().rev()); + } +} + +impl Iterator for ReversedColumnarIterator +where + I: Iterator>, +{ + type Item = Result; + + fn next(&mut self) -> Option { + self.init_if_necessary(); + self.reversed_iter + .as_mut() + .unwrap() + .next() + .map(|v| match v { + Ok(mut batch_with_key) => { + batch_with_key + .reverse_data() + .map_err(|e| Box::new(e) as _) + .context(IterReverse)?; + + Ok(batch_with_key) + } + Err(e) => Err(e), + }) + } +} + +// TODO(yingwen): Test diff --git a/analytic_engine/src/memtable/skiplist/mod.rs b/analytic_engine/src/memtable/skiplist/mod.rs new file mode 100644 index 0000000000..2a1459bc80 --- /dev/null +++ b/analytic_engine/src/memtable/skiplist/mod.rs @@ -0,0 +1,363 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
MemTable based on skiplist + +pub mod factory; +pub mod iter; + +use std::{ + cmp::Ordering, + convert::TryInto, + sync::atomic::{self, AtomicU64}, +}; + +use arena::{Arena, BasicStats}; +use common_types::{ + bytes::Bytes, + row::{contiguous::ContiguousRowWriter, Row}, + schema::Schema, + SequenceNumber, +}; +use common_util::codec::Encoder; +use log::{debug, trace}; +use skiplist::{KeyComparator, Skiplist}; +use snafu::{ensure, ResultExt}; + +use crate::memtable::{ + key::{ComparableInternalKey, KeySequence}, + skiplist::iter::{ColumnarIterImpl, ReversedColumnarIterator}, + ColumnarIterPtr, EncodeInternalKey, InvalidPutSequence, InvalidRow, MemTable, PutContext, + Result, ScanContext, ScanRequest, +}; + +/// MemTable implementation based on skiplist +pub struct SkiplistMemTable + Clone + Sync + Send> { + /// Schema of this memtable, is immutable. + schema: Schema, + skiplist: Skiplist, + /// The last sequence of the rows in this memtable. Update to this field + /// require external synchronization. + last_sequence: AtomicU64, +} + +impl + Clone + Sync + Send + 'static> MemTable + for SkiplistMemTable +{ + fn schema(&self) -> &Schema { + &self.schema + } + + fn min_key(&self) -> Option { + let mut iter = self.skiplist.iter(); + iter.seek_to_first(); + if !iter.valid() { + None + } else { + Some(iter.key().to_vec().into()) + } + } + + fn max_key(&self) -> Option { + let mut iter = self.skiplist.iter(); + iter.seek_to_last(); + if !iter.valid() { + None + } else { + Some(iter.key().to_vec().into()) + } + } + + // TODO(yingwen): Encode value if value_buf is not set. + // Now the caller is required to encode the row into the `value_buf` in + // PutContext first. 
+ fn put( + &self, + ctx: &mut PutContext, + sequence: KeySequence, + row: &Row, + schema: &Schema, + ) -> Result<()> { + trace!("skiplist put row, sequence:{:?}, row:{:?}", sequence, row); + + let key_encoder = ComparableInternalKey::new(sequence, schema); + + let internal_key = &mut ctx.key_buf; + // Reset key buffer + internal_key.clear(); + // Reserve capacity for key + internal_key.reserve(key_encoder.estimate_encoded_size(row)); + // Encode key + key_encoder + .encode(internal_key, row) + .context(EncodeInternalKey)?; + + // Encode row value. The ContiguousRowWriter will clear the buf. + let row_value = &mut ctx.value_buf; + let mut row_writer = ContiguousRowWriter::new(row_value, schema, &ctx.index_in_writer); + row_writer + .write_row(row) + .map_err(|e| Box::new(e) as _) + .context(InvalidRow)?; + + self.skiplist.put(internal_key, row_value); + + Ok(()) + } + + fn scan(&self, ctx: ScanContext, request: ScanRequest) -> Result { + debug!( + "Scan skiplist memtable, ctx:{:?}, request:{:?}", + ctx, request + ); + + let num_rows = self.skiplist.len(); + let (reverse, batch_size) = (request.reverse, ctx.batch_size); + let iter = ColumnarIterImpl::new(self, ctx, request)?; + if reverse { + Ok(Box::new(ReversedColumnarIterator::new( + iter, num_rows, batch_size, + ))) + } else { + Ok(Box::new(iter)) + } + } + + fn approximate_memory_usage(&self) -> usize { + // Mem size of skiplist is u32, need to cast to usize + match self.skiplist.mem_size().try_into() { + Ok(v) => v, + // The skiplist already use bytes larger than usize + Err(_) => usize::MAX, + } + } + + fn set_last_sequence(&self, sequence: SequenceNumber) -> Result<()> { + let last = self.last_sequence(); + ensure!( + sequence >= last, + InvalidPutSequence { + given: sequence, + last + } + ); + + self.last_sequence + .store(sequence, atomic::Ordering::Relaxed); + + Ok(()) + } + + fn last_sequence(&self) -> SequenceNumber { + self.last_sequence.load(atomic::Ordering::Relaxed) + } +} + +#[derive(Debug, Clone)] 
+pub struct BytewiseComparator; + +impl KeyComparator for BytewiseComparator { + #[inline] + fn compare_key(&self, lhs: &[u8], rhs: &[u8]) -> Ordering { + lhs.cmp(rhs) + } + + #[inline] + fn same_key(&self, lhs: &[u8], rhs: &[u8]) -> bool { + lhs == rhs + } +} + +#[cfg(test)] +mod tests { + + use std::{ops::Bound, sync::Arc}; + + use arena::NoopCollector; + use common_types::{ + bytes::ByteVec, + datum::Datum, + projected_schema::ProjectedSchema, + record_batch::RecordBatchWithKey, + schema::IndexInWriterSchema, + tests::{build_row, build_schema}, + time::Timestamp, + }; + use common_util::codec::memcomparable::MemComparable; + + use super::*; + use crate::memtable::{ + factory::{Factory, Options}, + skiplist::factory::SkiplistMemTableFactory, + }; + + fn test_memtable_scan_for_scan_request( + schema: Schema, + memtable: Arc, + ) { + let projection: Vec = (0..schema.num_columns()).collect(); + let projected_schema = ProjectedSchema::new(schema, Some(projection)).unwrap(); + + let testcases = vec![ + ( + // limited by sequence + ScanRequest { + start_user_key: Bound::Unbounded, + end_user_key: Bound::Unbounded, + sequence: 2, + projected_schema: projected_schema.clone(), + need_dedup: true, + reverse: false, + }, + vec![ + build_row(b"a", 1, 10.0, "v1"), + build_row(b"b", 2, 10.0, "v2"), + build_row(b"c", 3, 10.0, "v3"), + build_row(b"d", 4, 10.0, "v4"), + build_row(b"e", 5, 10.0, "v5"), + build_row(b"f", 6, 10.0, "v6"), + ], + ), + ( + // limited by sequence and start/end key + ScanRequest { + start_user_key: Bound::Included(build_scan_key("a", 1)), + end_user_key: Bound::Excluded(build_scan_key("e", 5)), + sequence: 2, + projected_schema: projected_schema.clone(), + need_dedup: true, + reverse: false, + }, + vec![ + build_row(b"a", 1, 10.0, "v1"), + build_row(b"b", 2, 10.0, "v2"), + build_row(b"c", 3, 10.0, "v3"), + build_row(b"d", 4, 10.0, "v4"), + ], + ), + ( + // limited by sequence and start/end key + // but seq is one smaller than last case + ScanRequest { + 
start_user_key: Bound::Included(build_scan_key("a", 1)), + end_user_key: Bound::Excluded(build_scan_key("e", 5)), + sequence: 1, + projected_schema, + need_dedup: true, + reverse: false, + }, + vec![ + build_row(b"a", 1, 10.0, "v1"), + build_row(b"b", 2, 10.0, "v2"), + build_row(b"c", 3, 10.0, "v3"), + ], + ), + ]; + + for (req, expected) in testcases { + let scan_ctx = ScanContext::default(); + let iter = memtable.scan(scan_ctx, req).unwrap(); + check_iterator(iter, expected); + } + } + + fn test_memtable_scan_for_projection( + schema: Schema, + memtable: Arc, + ) { + let projection: Vec = (0..2).collect(); + let projected_schema = ProjectedSchema::new(schema, Some(projection)).unwrap(); + + let testcases = vec![( + ScanRequest { + start_user_key: Bound::Included(build_scan_key("a", 1)), + end_user_key: Bound::Excluded(build_scan_key("e", 5)), + sequence: 2, + projected_schema, + need_dedup: true, + reverse: false, + }, + vec![ + build_row_for_two_column(b"a", 1), + build_row_for_two_column(b"b", 2), + build_row_for_two_column(b"c", 3), + build_row_for_two_column(b"d", 4), + ], + )]; + + for (req, expected) in testcases { + let scan_ctx = ScanContext::default(); + let iter = memtable.scan(scan_ctx, req).unwrap(); + check_iterator(iter, expected); + } + } + + #[test] + fn test_memtable_scan() { + let schema = build_schema(); + let factory = SkiplistMemTableFactory; + let memtable = factory + .create_memtable(Options { + schema: schema.clone(), + arena_block_size: 512, + creation_sequence: 1, + collector: Arc::new(NoopCollector {}), + }) + .unwrap(); + + let mut ctx = PutContext::new(IndexInWriterSchema::for_same_schema(schema.num_columns())); + let input = vec![ + (KeySequence::new(1, 1), build_row(b"a", 1, 10.0, "v1")), + (KeySequence::new(1, 2), build_row(b"b", 2, 10.0, "v2")), + ( + KeySequence::new(1, 3), + build_row(b"c", 3, 10.0, "primary_key same with next row"), + ), + (KeySequence::new(1, 4), build_row(b"c", 3, 10.0, "v3")), + (KeySequence::new(2, 1), 
build_row(b"d", 4, 10.0, "v4")), + (KeySequence::new(2, 1), build_row(b"e", 5, 10.0, "v5")), + (KeySequence::new(2, 3), build_row(b"f", 6, 10.0, "v6")), + (KeySequence::new(3, 4), build_row(b"g", 7, 10.0, "v7")), + ]; + + for (seq, row) in input { + memtable.put(&mut ctx, seq, &row, &schema).unwrap(); + } + + test_memtable_scan_for_scan_request(schema.clone(), memtable.clone()); + test_memtable_scan_for_projection(schema, memtable); + } + + fn check_iterator>>( + iter: T, + expected_rows: Vec, + ) { + let mut visited_rows = 0; + for batch in iter { + let batch = batch.unwrap(); + for row_idx in 0..batch.num_rows() { + assert_eq!(batch.clone_row_at(row_idx), expected_rows[visited_rows]); + visited_rows += 1; + } + } + + assert_eq!(visited_rows, expected_rows.len()); + } + + fn build_scan_key(c1: &str, c2: i64) -> Bytes { + let mut buf = ByteVec::new(); + let encoder = MemComparable; + encoder.encode(&mut buf, &Datum::from(c1)).unwrap(); + encoder.encode(&mut buf, &Datum::from(c2)).unwrap(); + + Bytes::from(buf) + } + + pub fn build_row_for_two_column(key1: &[u8], key2: i64) -> Row { + let datums = vec![ + Datum::Varbinary(Bytes::copy_from_slice(key1)), + Datum::Timestamp(Timestamp::new(key2)), + ]; + + Row::from_datums(datums) + } +} diff --git a/analytic_engine/src/meta/details.rs b/analytic_engine/src/meta/details.rs new file mode 100644 index 0000000000..ae9c5a1741 --- /dev/null +++ b/analytic_engine/src/meta/details.rs @@ -0,0 +1,1282 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Implementation of Manifest + +use std::sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, +}; + +use async_trait::async_trait; +use common_types; +use common_util::define_result; +use log::{error, info, warn}; +use serde_derive::Deserialize; +use snafu::{ResultExt, Snafu}; +use tokio::sync::Mutex; +use wal::{ + log_batch::{LogWriteBatch, LogWriteEntry}, + manager::{ + LogIterator, ReadBoundary, ReadContext, ReadRequest, RegionId, SequenceNumber, WalManager, + WriteContext, + }, +}; + +use crate::meta::{ + meta_data::ManifestData, + meta_update::{ + MetaUpdate, MetaUpdateDecoder, MetaUpdatePayload, SnapshotManifestMeta, VersionEditMeta, + }, + Manifest, +}; + +/// The region id manifest used. +const MANIFEST_REGION_ID: RegionId = 1; +/// The region id to store snapshot state. +const SNAPSHOT_STATE_REGION_ID: RegionId = 2; +// The first region id of snapshot region. +const FIRST_SNAPSHOT_REGION_ID: RegionId = 3; +// The second region id of snapshot region. +const SECOND_SNAPSHOT_REGION_ID: RegionId = 4; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to write update to wal, err:{}", source))] + WriteWal { source: wal::manager::Error }, + + #[snafu(display("Failed to read wal, err:{}", source))] + ReadWal { source: wal::manager::Error }, + + #[snafu(display("Failed to read log entry, err:{}", source))] + ReadEntry { source: wal::manager::Error }, + + #[snafu(display("Failed to apply meta update, err:{}", source))] + ApplyUpdate { + source: crate::meta::meta_data::Error, + }, + + #[snafu(display("Failed to clean wal, err:{}", source))] + CleanWal { source: wal::manager::Error }, + + #[snafu(display("Failed to clean snapshot, region_id:{}, err:{}", region_id, source))] + CleanSnapshot { + region_id: RegionId, + source: wal::manager::Error, + }, + + #[snafu(display("Failed to load sequence of manifest, err:{}", source))] + LoadSequence { source: wal::manager::Error }, + + #[snafu(display("Failed to load sequence of snapshot state, err:{}", source))] 
+ LoadSnapshotMetaSequence { source: wal::manager::Error }, + + #[snafu(display("Failed to clean snapshot state, err:{}", source))] + CleanSnapshotState { source: wal::manager::Error }, +} + +define_result!(Error); + +const STORE_UPDATE_BATCH: usize = 500; + +/// Implementation of [MetaUpdateReader] +#[derive(Debug)] +pub struct MetaUpdateReaderImpl { + iter: W::Iterator, +} + +impl MetaUpdateReaderImpl { + async fn next_update(&mut self) -> Result> { + let decoder = MetaUpdateDecoder; + + match self.iter.next_log_entry(&decoder).context(ReadEntry)? { + Some(entry) => Ok(Some(entry.payload)), + None => Ok(None), + } + } +} + +/// State to track manifest snapshot. +#[derive(Debug, Default)] +struct SnapshotState { + /// Meta data of the snapshot of the manifest, `None` if there is no + /// snapshot. + snapshot_meta: Option, +} + +impl SnapshotState { + fn install_snapshot_meta(&mut self, snapshot_meta: SnapshotManifestMeta) { + self.snapshot_meta = Some(snapshot_meta); + } + + fn next_snapshot_region_id(&self) -> RegionId { + match self.snapshot_meta { + Some(snapshot_meta) => { + if snapshot_meta.snapshot_region_id == FIRST_SNAPSHOT_REGION_ID { + SECOND_SNAPSHOT_REGION_ID + } else { + FIRST_SNAPSHOT_REGION_ID + } + } + None => FIRST_SNAPSHOT_REGION_ID, + } + } +} + +#[derive(Debug, Clone, Deserialize)] +pub struct Options { + pub snapshot_every_n_updates: usize, + pub paranoid_checks: bool, +} + +impl Default for Options { + fn default() -> Self { + Self { + snapshot_every_n_updates: 10_000, + paranoid_checks: true, + } + } +} + +// TODO(yingwen): Wrap into an inner struct if there are too many Arc fields. +/// Implementation of [Manifest]. +#[derive(Debug, Clone)] +pub struct ManifestImpl { + /// Region id for this manifest. + manifest_region_id: RegionId, + /// Wal manager, the manifest use its own wal manager instance. + wal_manager: Arc, + opts: Options, + + // Snapshot related: + /// Region id to store snapshot state. 
+ snapshot_state_region_id: RegionId, + snapshot_state: Arc>, + /// Number of updates wrote to wal since last snapshot. + num_updates_since_snapshot: Arc, +} + +impl ManifestImpl { + pub async fn open(wal_manager: W, opts: Options) -> Result { + let mut manifest = Self { + manifest_region_id: MANIFEST_REGION_ID, + wal_manager: Arc::new(wal_manager), + opts, + snapshot_state_region_id: SNAPSHOT_STATE_REGION_ID, + snapshot_state: Arc::new(Mutex::new(SnapshotState::default())), + num_updates_since_snapshot: Arc::new(AtomicUsize::new(0)), + }; + + manifest.load_snapshot_state().await?; + + Ok(manifest) + } + + async fn load_snapshot_state(&mut self) -> Result<()> { + // Load snapshot state. + let mut reader = self.read_updates_from_region( + self.snapshot_state_region_id, + ReadBoundary::Min, + ReadBoundary::Max, + )?; + + let mut last_snapshot_meta = None; + while let Some(update) = reader.next_update().await? { + // If the entry is a snapshot entry. + if let Some(snapshot_meta) = update.snapshot_manifest_meta() { + last_snapshot_meta = Some(snapshot_meta); + } else { + error!( + "Manifest found non snapshot state entry, entry:{:?}", + update + ); + } + } + + let mut snapshot_state = self.snapshot_state.lock().await; + if let Some(snapshot_meta) = last_snapshot_meta { + // Previous snapshot exists. + snapshot_state.install_snapshot_meta(snapshot_meta); + + info!( + "Manifest found snapshot_meta, snapshot_state:{:?}, last_snapshot_meta:{:?}", + snapshot_state, last_snapshot_meta + ); + } + + Ok(()) + } + + fn read_updates_from_region( + &self, + region_id: RegionId, + start: ReadBoundary, + end: ReadBoundary, + ) -> Result> { + let request = ReadRequest { + region_id, + start, + end, + }; + let ctx = ReadContext::default(); + + let iter = self.wal_manager.read(&ctx, &request).context(ReadWal)?; + + Ok(MetaUpdateReaderImpl { iter }) + } + + /// Load meta update from region of given `region_id` and apply into + /// `manifest_data`. 
+ async fn load_data_from_region( + &self, + region_id: RegionId, + manifest_data: &mut ManifestData, + ) -> Result<()> { + self.load_data_from_region_in_range( + region_id, + ReadBoundary::Min, + ReadBoundary::Max, + manifest_data, + ) + .await?; + + Ok(()) + } + + /// Load meta update in given range from region of given `region_id` + /// boundary, and apply into `manifest_data`. Returns number of MetaUpdates + /// loaded. + async fn load_data_from_region_in_range( + &self, + region_id: RegionId, + start: ReadBoundary, + end: ReadBoundary, + manifest_data: &mut ManifestData, + ) -> Result { + let mut reader = self.read_updates_from_region(region_id, start, end)?; + let mut loaded = 0; + + while let Some(update) = reader.next_update().await? { + if let Err(e) = manifest_data.apply_meta_update(update).context(ApplyUpdate) { + if self.opts.paranoid_checks { + return Err(e); + } else { + warn!("Manifest load meta update failed, err:{:?}", e); + continue; + } + } + loaded += 1; + } + Ok(loaded) + } + + /// Load data and create a snapshot. + async fn create_snapshot(&self) -> Result { + info!("Manifest try to create snapshot"); + + // Acquire snapshot lock. + let mut snapshot_state = self.snapshot_state.lock().await; + let last_snapshot_meta = snapshot_state.snapshot_meta; + let next_snapshot_region_id = snapshot_state.next_snapshot_region_id(); + + // Clean next snapshot region. + self.clean_snapshot(next_snapshot_region_id).await?; + + // Load previous snapshot. + let mut manifest_start = ReadBoundary::Min; + let mut manifest_data = ManifestData::default(); + if let Some(snapshot_meta) = last_snapshot_meta { + // Load manifest from last snapshot first. + self.load_data_from_region(snapshot_meta.snapshot_region_id, &mut manifest_data) + .await?; + // The sequence after snapshot. + manifest_start = ReadBoundary::Excluded(snapshot_meta.sequence); + } + + // Get current sequence, data until this sequence can be loaded to create next + // snapshot. 
+ let snapshot_sequence = self + .wal_manager + .sequence_num(self.manifest_region_id) + .context(LoadSequence)?; + + // Load manifest up to `snapshot_sequence`. + let num_loaded_from_manifest = self + .load_data_from_region_in_range( + self.manifest_region_id, + manifest_start, + ReadBoundary::Included(snapshot_sequence), + &mut manifest_data, + ) + .await?; + + // Store snapshot. + self.store_snapshot_to_region(next_snapshot_region_id, &manifest_data) + .await?; + + // Store snapshot state. + let next_snapshot_meta = SnapshotManifestMeta { + snapshot_region_id: next_snapshot_region_id, + sequence: snapshot_sequence, + }; + self.store_snapshot_state(next_snapshot_meta).await?; + + info!( + "Manifest stored snapshot, + next_snapshot_meta:{:?}, + last_snapshot_meta:{:?}, + snapshot_state_before_install:{:?}, + num_updates_since_snapshot:{}", + next_snapshot_meta, + last_snapshot_meta, + snapshot_state, + self.num_updates_since_snapshot() + ); + + // Install new snapshot, also bump next snapshot region id. + snapshot_state.install_snapshot_meta(next_snapshot_meta); + + // Data before sequence of the snapshot can also be removed. 
+ self.wal_manager + .mark_delete_entries_up_to(self.manifest_region_id, snapshot_sequence) + .await + .context(CleanWal)?; + + self.decrease_num_updates(num_loaded_from_manifest); + + info!( + "Manifest create snapshot done, + next_snapshot_meta:{:?}, + last_snapshot_meta:{:?}, + snapshot_state:{:?}, + num_loaded_from_manifest:{}, + num_updates:{}", + next_snapshot_meta, + last_snapshot_meta, + snapshot_state, + num_loaded_from_manifest, + self.num_updates_since_snapshot() + ); + + Ok(manifest_data) + } + + async fn clean_snapshot(&self, snapshot_region_id: RegionId) -> Result<()> { + info!("Clean snapshot, snapshot_region_id:{}", snapshot_region_id); + + self.wal_manager + .mark_delete_entries_up_to(snapshot_region_id, common_types::MAX_SEQUENCE_NUMBER) + .await + .context(CleanSnapshot { + region_id: snapshot_region_id, + }) + .map_err(|e| { + error!( + "Failed to clean snapshot, region_id:{}, err:{}", + snapshot_region_id, e + ); + e + }) + } + + async fn store_snapshot_state(&self, snapshot_meta: SnapshotManifestMeta) -> Result<()> { + // Get current snapshot state sequence. + let snapshot_state_sequence = self + .wal_manager + .sequence_num(self.snapshot_state_region_id) + .context(LoadSnapshotMetaSequence)?; + // Write a snapshot entry into the region. + + self.store_update_to_region( + self.snapshot_state_region_id, + MetaUpdate::SnapshotManifest(snapshot_meta), + ) + .await?; + // Clean old snapshot state. 
+ self.wal_manager + .mark_delete_entries_up_to(self.snapshot_state_region_id, snapshot_state_sequence) + .await + .context(CleanSnapshotState) + } + + async fn store_update_to_region( + &self, + region_id: RegionId, + update: MetaUpdate, + ) -> Result { + info!( + "Manifest impl store update, region_id:{}, update:{:?}", + region_id, update + ); + + let mut log_batch = LogWriteBatch::new(region_id); + log_batch.push(LogWriteEntry { + payload: MetaUpdatePayload::from(update), + }); + + let write_ctx = WriteContext::default(); + + self.wal_manager + .write(&write_ctx, &log_batch) + .await + .context(WriteWal) + } + + async fn store_updates_to_region( + &self, + region_id: RegionId, + updates: &[MetaUpdate], + ) -> Result { + let mut log_batch = LogWriteBatch::new(region_id); + for update in updates { + log_batch.push(LogWriteEntry { + payload: MetaUpdatePayload::from(update), + }); + } + + let write_ctx = WriteContext::default(); + + self.wal_manager + .write(&write_ctx, &log_batch) + .await + .context(WriteWal) + } + + async fn store_snapshot_to_region( + &self, + region_id: RegionId, + snapshot: &ManifestData, + ) -> Result<()> { + info!("Manifest store snapshot to region, region_id:{}", region_id); + + let mut meta_updates = Vec::with_capacity(STORE_UPDATE_BATCH); + + // Store all spaces. + for (space_id, space_meta_data) in &snapshot.spaces { + let space_meta = space_meta_data.space_meta.clone(); + // Add this space. + meta_updates.push(MetaUpdate::AddSpace(space_meta)); + + // Add all tables to the space. + for (table_id, table_meta_data) in &space_meta_data.tables { + let table_meta = table_meta_data.table_meta.clone(); + // Store table meta. + meta_updates.push(MetaUpdate::AddTable(table_meta)); + + // Store version edit. 
+ let version_meta = &table_meta_data.version_meta; + let version_edit_meta = VersionEditMeta { + space_id: *space_id, + table_id: *table_id, + flushed_sequence: version_meta.flushed_sequence, + files_to_add: version_meta.ordered_files(), + files_to_delete: Vec::new(), + }; + meta_updates.push(MetaUpdate::VersionEdit(version_edit_meta)); + + if meta_updates.len() >= STORE_UPDATE_BATCH { + self.store_updates_to_region(region_id, &meta_updates) + .await?; + meta_updates.clear(); + } + } + } + + if !meta_updates.is_empty() { + self.store_updates_to_region(region_id, &meta_updates) + .await?; + meta_updates.clear(); + } + + Ok(()) + } + + #[inline] + fn num_updates_since_snapshot(&self) -> usize { + self.num_updates_since_snapshot.load(Ordering::Relaxed) + } + + // Guarded by snapshot state lock. + #[inline] + fn decrease_num_updates(&self, num: usize) { + if num >= self.num_updates_since_snapshot() { + self.num_updates_since_snapshot.store(0, Ordering::Relaxed); + } else { + self.num_updates_since_snapshot + .fetch_sub(num, Ordering::Relaxed); + } + } +} + +#[async_trait] +impl Manifest for ManifestImpl { + type Error = Error; + + async fn store_update(&self, update: MetaUpdate) -> Result<()> { + self.store_update_to_region(self.manifest_region_id, update) + .await?; + + let num_updates = self + .num_updates_since_snapshot + .fetch_add(1, Ordering::Relaxed); + if num_updates >= self.opts.snapshot_every_n_updates { + info!( + "Enough updates in manifest, trigger snapshot, num_updates:{}", + num_updates + ); + + self.create_snapshot().await?; + } + + Ok(()) + } + + async fn load_data(&self, do_snapshot: bool) -> Result { + if do_snapshot { + let manifest_data = self.create_snapshot().await?; + + Ok(manifest_data) + } else { + let mut manifest_data = ManifestData::default(); + + let last_snapshot_meta = { + let snapshot_state = self.snapshot_state.lock().await; + snapshot_state.snapshot_meta + }; + let mut manifest_start = ReadBoundary::Min; + // Load from snapshot. 
+ if let Some(snapshot_meta) = last_snapshot_meta { + self.load_data_from_region(snapshot_meta.snapshot_region_id, &mut manifest_data) + .await?; + // The sequence after snapshot. + manifest_start = ReadBoundary::Excluded(snapshot_meta.sequence); + } + + // Load remaining data from wal. + self.load_data_from_region_in_range( + self.manifest_region_id, + manifest_start, + ReadBoundary::Max, + &mut manifest_data, + ) + .await?; + + Ok(manifest_data) + } + } +} + +#[cfg(test)] +mod tests { + use std::{path::PathBuf, sync::Arc}; + + use common_types::{column_schema, datum::DatumKind, schema, schema::Schema}; + use common_util::{runtime, runtime::Runtime, tests::init_log_for_test}; + use table_engine::table::TableId; + use wal::rocks_impl::manager::{Builder as WalBuilder, RocksImpl}; + + use super::*; + use crate::{ + meta::{ + details::{ManifestImpl, Options}, + meta_update::{ + AddSpaceMeta, AddTableMeta, AlterOptionsMeta, AlterSchemaMeta, DropTableMeta, + MetaUpdate, VersionEditMeta, + }, + Manifest, + }, + TableOptions, + }; + + fn build_altered_schema(schema: &Schema) -> Schema { + let mut builder = schema::Builder::new().auto_increment_column_id(true); + for column_schema in schema.key_columns() { + builder = builder + .add_key_column(column_schema.clone()) + .expect("should succeed to add key column"); + } + for column_schema in schema.normal_columns() { + builder = builder + .add_normal_column(column_schema.clone()) + .expect("should succeed to add normal column"); + } + builder + .add_normal_column( + column_schema::Builder::new("field3".to_string(), DatumKind::String) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .build() + .unwrap() + } + + fn build_runtime(thread_num: usize) -> Arc { + Arc::new( + runtime::Builder::default() + .worker_threads(thread_num) + .enable_all() + .build() + .unwrap(), + ) + } + + async fn build_manifest( + dir: impl Into, + runtime: Arc, + opts: Options, + ) -> ManifestImpl { + let manifest_wal = 
WalBuilder::with_default_rocksdb_config(dir, runtime.clone()) + .build() + .unwrap(); + + ManifestImpl::open(manifest_wal, opts).await.unwrap() + } + + async fn assert_expected( + dir: impl Into, + runtime: Arc, + opts: Options, + expected: &str, + ) -> Result<()> { + let manifest = build_manifest(dir, runtime, opts).await; + let data = manifest.load_data(false).await?; + assert_eq!(format!("{:#?}", data), expected); + Ok(()) + } + + async fn test_manifest_add_space( + dir: impl Into, + runtime: Arc, + opts: Options, + ) { + let space_id = 10; + let space_name = "test".to_string(); + + let manifest = build_manifest(dir, runtime, opts).await; + let add_space = MetaUpdate::AddSpace(AddSpaceMeta { + space_id, + space_name: space_name.clone(), + }); + manifest.store_update(add_space).await.unwrap(); + let data = manifest.load_data(false).await.unwrap(); + assert_eq!(data.spaces.len(), 1); + assert_eq!(data.spaces.get(&10).unwrap().space_meta.space_id, space_id); + assert_eq!( + data.spaces.get(&10).unwrap().space_meta.space_name, + space_name + ); + } + + async fn check_add_table( + dir: impl Into, + runtime: Arc, + opts: Options, + ) -> Result<()> { + let expected = r#"ManifestData { + spaces: { + 10: SpaceMetaData { + space_meta: AddSpaceMeta { + space_id: 10, + space_name: "test", + }, + tables: { + TableId(100, 0, 100): TableMetaData { + table_meta: AddTableMeta { + space_id: 10, + table_id: TableId(100, 0, 100), + table_name: "test_table", + schema: Schema { + num_key_columns: 2, + timestamp_index: 1, + tsid_index: None, + enable_tsid_primary_key: false, + column_schemas: ColumnSchemas { + columns: [ + ColumnSchema { + id: 1, + name: "key1", + data_type: Varbinary, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 2, + name: "key2", + data_type: Timestamp, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 3, + name: "field1", + data_type: Double, + is_nullable: false, + is_tag: false, + comment: "", + 
}, + ColumnSchema { + id: 4, + name: "field2", + data_type: String, + is_nullable: false, + is_tag: false, + comment: "", + }, + ], + }, + version: 1, + }, + opts: TableOptions { + segment_duration: None, + update_mode: Overwrite, + enable_ttl: true, + ttl: ReadableDuration( + 604800s, + ), + arena_block_size: 2097152, + write_buffer_size: 33554432, + compaction_strategy: Default, + num_rows_per_row_group: 8192, + compression: Zstd, + }, + }, + version_meta: TableVersionMeta { + flushed_sequence: 0, + files: {}, + max_file_id: 0, + }, + }, + }, + }, + }, + last_space_id: 10, +}"#; + assert_expected(dir, runtime, opts, expected).await + } + + async fn test_manifest_add_table( + dir: impl Into, + runtime: Arc, + opts: Options, + ) -> Result<()> { + let space_id = 10; + let manifest = build_manifest(dir, runtime, opts).await; + + let table_id = TableId::from(100); + let table_name = "test_table".to_string(); + let add_table = MetaUpdate::AddTable(AddTableMeta { + space_id, + table_id, + table_name, + schema: common_types::tests::build_schema(), + opts: TableOptions::default(), + }); + manifest.store_update(add_table).await + } + + async fn check_drop_table( + dir: impl Into, + runtime: Arc, + opts: Options, + ) -> Result<()> { + let expected = r#"ManifestData { + spaces: { + 10: SpaceMetaData { + space_meta: AddSpaceMeta { + space_id: 10, + space_name: "test", + }, + tables: {}, + }, + }, + last_space_id: 10, +}"#; + assert_expected(dir, runtime, opts, expected).await + } + + async fn test_manifest_drop_table( + dir: impl Into, + runtime: Arc, + opts: Options, + ) -> Result<()> { + let space_id = 10; + + let manifest = build_manifest(dir, runtime, opts).await; + + let table_id = TableId::from(100); + let table_name = "test_table".to_string(); + let add_table = MetaUpdate::DropTable(DropTableMeta { + space_id, + table_id, + table_name, + }); + manifest.store_update(add_table).await + } + + async fn check_version_edit_with_table( + dir: impl Into, + runtime: Arc, + 
opts: Options, + ) -> Result<()> { + let expected = r#"ManifestData { + spaces: { + 10: SpaceMetaData { + space_meta: AddSpaceMeta { + space_id: 10, + space_name: "test", + }, + tables: { + TableId(100, 0, 100): TableMetaData { + table_meta: AddTableMeta { + space_id: 10, + table_id: TableId(100, 0, 100), + table_name: "test_table", + schema: Schema { + num_key_columns: 2, + timestamp_index: 1, + tsid_index: None, + enable_tsid_primary_key: false, + column_schemas: ColumnSchemas { + columns: [ + ColumnSchema { + id: 1, + name: "key1", + data_type: Varbinary, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 2, + name: "key2", + data_type: Timestamp, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 3, + name: "field1", + data_type: Double, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 4, + name: "field2", + data_type: String, + is_nullable: false, + is_tag: false, + comment: "", + }, + ], + }, + version: 1, + }, + opts: TableOptions { + segment_duration: None, + update_mode: Overwrite, + enable_ttl: true, + ttl: ReadableDuration( + 604800s, + ), + arena_block_size: 2097152, + write_buffer_size: 33554432, + compaction_strategy: Default, + num_rows_per_row_group: 8192, + compression: Zstd, + }, + }, + version_meta: TableVersionMeta { + flushed_sequence: 3, + files: {}, + max_file_id: 0, + }, + }, + }, + }, + }, + last_space_id: 10, +}"#; + assert_expected(dir, runtime, opts, expected).await + } + + async fn check_version_edit_no_table( + dir: impl Into, + runtime: Arc, + opts: Options, + ) -> Result<()> { + let expected = r#"ManifestData { + spaces: { + 10: SpaceMetaData { + space_meta: AddSpaceMeta { + space_id: 10, + space_name: "test", + }, + tables: {}, + }, + }, + last_space_id: 10, +}"#; + assert_expected(dir, runtime, opts, expected).await + } + + async fn test_manifest_version_edit( + dir: impl Into, + runtime: Arc, + opts: Options, + ) -> Result<()> { + let 
space_id = 10; + + let manifest = build_manifest(dir, runtime, opts).await; + + let table_id = TableId::from(100); + let version_edit = MetaUpdate::VersionEdit(VersionEditMeta { + space_id, + table_id, + flushed_sequence: 3, + files_to_add: Vec::new(), + files_to_delete: Vec::new(), + }); + manifest.store_update(version_edit).await + } + + async fn check_alter_schema( + dir: impl Into, + runtime: Arc, + opts: Options, + ) -> Result<()> { + let expected = r#"ManifestData { + spaces: { + 10: SpaceMetaData { + space_meta: AddSpaceMeta { + space_id: 10, + space_name: "test", + }, + tables: { + TableId(100, 0, 100): TableMetaData { + table_meta: AddTableMeta { + space_id: 10, + table_id: TableId(100, 0, 100), + table_name: "test_table", + schema: Schema { + num_key_columns: 2, + timestamp_index: 1, + tsid_index: None, + enable_tsid_primary_key: false, + column_schemas: ColumnSchemas { + columns: [ + ColumnSchema { + id: 1, + name: "key1", + data_type: Varbinary, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 2, + name: "key2", + data_type: Timestamp, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 3, + name: "field1", + data_type: Double, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 4, + name: "field2", + data_type: String, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 5, + name: "field3", + data_type: String, + is_nullable: false, + is_tag: false, + comment: "", + }, + ], + }, + version: 1, + }, + opts: TableOptions { + segment_duration: None, + update_mode: Overwrite, + enable_ttl: true, + ttl: ReadableDuration( + 604800s, + ), + arena_block_size: 2097152, + write_buffer_size: 33554432, + compaction_strategy: Default, + num_rows_per_row_group: 8192, + compression: Zstd, + }, + }, + version_meta: TableVersionMeta { + flushed_sequence: 3, + files: {}, + max_file_id: 0, + }, + }, + }, + }, + }, + last_space_id: 10, +}"#; + 
assert_expected(dir, runtime, opts, expected).await + } + + async fn test_manifest_alter_schema( + dir: impl Into, + runtime: Arc, + opts: Options, + ) -> Result<()> { + let space_id = 10; + let manifest = build_manifest(dir, runtime, opts).await; + + let table_id = TableId::from(100); + let alter_schema = MetaUpdate::AlterSchema(AlterSchemaMeta { + space_id, + table_id, + schema: build_altered_schema(&common_types::tests::build_schema()), + pre_schema_version: 1, + }); + manifest.store_update(alter_schema).await + } + + async fn check_alter_options( + dir: impl Into, + runtime: Arc, + opts: Options, + ) -> Result<()> { + let expected = r#"ManifestData { + spaces: { + 10: SpaceMetaData { + space_meta: AddSpaceMeta { + space_id: 10, + space_name: "test", + }, + tables: { + TableId(100, 0, 100): TableMetaData { + table_meta: AddTableMeta { + space_id: 10, + table_id: TableId(100, 0, 100), + table_name: "test_table", + schema: Schema { + num_key_columns: 2, + timestamp_index: 1, + tsid_index: None, + enable_tsid_primary_key: false, + column_schemas: ColumnSchemas { + columns: [ + ColumnSchema { + id: 1, + name: "key1", + data_type: Varbinary, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 2, + name: "key2", + data_type: Timestamp, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 3, + name: "field1", + data_type: Double, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 4, + name: "field2", + data_type: String, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 5, + name: "field3", + data_type: String, + is_nullable: false, + is_tag: false, + comment: "", + }, + ], + }, + version: 1, + }, + opts: TableOptions { + segment_duration: None, + update_mode: Overwrite, + enable_ttl: false, + ttl: ReadableDuration( + 604800s, + ), + arena_block_size: 2097152, + write_buffer_size: 33554432, + compaction_strategy: Default, + num_rows_per_row_group: 
8192, + compression: Zstd, + }, + }, + version_meta: TableVersionMeta { + flushed_sequence: 3, + files: {}, + max_file_id: 0, + }, + }, + }, + }, + }, + last_space_id: 10, +}"#; + assert_expected(dir, runtime, opts, expected).await + } + + async fn test_manifest_alter_options( + dir: impl Into, + runtime: Arc, + opts: Options, + ) -> Result<()> { + let space_id = 10; + + let manifest = build_manifest(dir, runtime, opts).await; + + let table_id = TableId::from(100); + let alter_options = MetaUpdate::AlterOptions(AlterOptionsMeta { + space_id, + table_id, + options: TableOptions { + enable_ttl: false, + ..Default::default() + }, + }); + manifest.store_update(alter_options).await + } + + #[test] + fn test_manifest() { + init_log_for_test(); + let dir = tempfile::tempdir().unwrap(); + let runtime = build_runtime(2); + let runtime_clone = runtime.clone(); + runtime.block_on(async move { + let opts = Options { + snapshot_every_n_updates: 2, + paranoid_checks: false, + }; + + test_manifest_add_space(dir.path(), runtime_clone.clone(), opts.clone()).await; + + test_manifest_add_table(dir.path(), runtime_clone.clone(), opts.clone()) + .await + .unwrap(); + assert!( + check_add_table(dir.path(), runtime_clone.clone(), opts.clone()) + .await + .is_ok() + ); + + test_manifest_drop_table(dir.path(), runtime_clone.clone(), opts.clone()) + .await + .unwrap(); + assert!( + check_drop_table(dir.path(), runtime_clone.clone(), opts.clone()) + .await + .is_ok() + ); + { + let opts = Options { + snapshot_every_n_updates: 2, + paranoid_checks: true, + }; + test_manifest_version_edit(dir.path(), runtime_clone.clone(), opts.clone()) + .await + .unwrap(); + assert!(check_version_edit_no_table( + dir.path(), + runtime_clone.clone(), + opts.clone() + ) + .await + .is_ok()); + + test_manifest_alter_schema(dir.path(), runtime_clone.clone(), opts.clone()) + .await + .unwrap(); + assert!( + check_alter_schema(dir.path(), runtime_clone.clone(), opts.clone()) + .await + .is_err() + ); + + 
test_manifest_alter_options(dir.path(), runtime_clone.clone(), opts.clone()) + .await + .unwrap(); + assert!(check_alter_options(dir.path(), runtime_clone.clone(), opts) + .await + .is_err()); + } + { + test_manifest_add_table(dir.path(), runtime_clone.clone(), opts.clone()) + .await + .unwrap(); + assert!( + check_add_table(dir.path(), runtime_clone.clone(), opts.clone()) + .await + .is_ok() + ); + + test_manifest_version_edit(dir.path(), runtime_clone.clone(), opts.clone()) + .await + .unwrap(); + assert!(check_version_edit_with_table( + dir.path(), + runtime_clone.clone(), + opts.clone() + ) + .await + .is_ok()); + + test_manifest_alter_schema(dir.path(), runtime_clone.clone(), opts.clone()) + .await + .unwrap(); + assert!( + check_alter_schema(dir.path(), runtime_clone.clone(), opts.clone()) + .await + .is_ok() + ); + + test_manifest_alter_options(dir.path(), runtime_clone.clone(), opts.clone()) + .await + .unwrap(); + assert!(check_alter_options(dir.path(), runtime_clone, opts) + .await + .is_ok()); + } + }); + } +} diff --git a/analytic_engine/src/meta/meta_data.rs b/analytic_engine/src/meta/meta_data.rs new file mode 100644 index 0000000000..07467d9b9f --- /dev/null +++ b/analytic_engine/src/meta/meta_data.rs @@ -0,0 +1,193 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Meta data of manifest. 
+ +use std::collections::BTreeMap; + +use common_util::define_result; +use log::{debug, info}; +use snafu::{ensure, Backtrace, OptionExt, Snafu}; +use table_engine::table::TableId; + +use crate::{ + meta::meta_update::{AddSpaceMeta, AddTableMeta, MetaUpdate}, + space::SpaceId, + table::version::TableVersionMeta, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "Space id corrupted (last >= given), last:{}, given:{}.\nBacktrace:\n{}", + last, + given, + backtrace + ))] + SpaceIdCorrupted { + last: SpaceId, + given: SpaceId, + backtrace: Backtrace, + }, + + #[snafu(display( + "Space of table is missing, maybe corrupted, space_id:{}, table:{}.\nBacktrace:\n{}", + space_id, + table_name, + backtrace, + ))] + TableSpaceMiss { + space_id: SpaceId, + table_name: String, + backtrace: Backtrace, + }, + + #[snafu(display( + "Space is missing, maybe corrupted, space_id:{}.\nBacktrace:\n{}", + space_id, + backtrace, + ))] + SpaceMiss { + space_id: SpaceId, + backtrace: Backtrace, + }, + + #[snafu(display( + "Table is missing, maybe corrupted, space_id:{}, table_id:{}.\nBacktrace:\n{}", + space_id, + table_id, + backtrace, + ))] + TableMiss { + space_id: SpaceId, + table_id: TableId, + backtrace: Backtrace, + }, +} + +define_result!(Error); + +#[derive(Debug)] +pub struct TableMetaData { + pub table_meta: AddTableMeta, + pub version_meta: TableVersionMeta, +} + +#[derive(Debug)] +pub struct SpaceMetaData { + pub space_meta: AddSpaceMeta, + // Use BTreeMap to order table meta by table id. + pub tables: BTreeMap, +} + +/// Holds the final view of the data in manifest. +#[derive(Debug, Default)] +pub struct ManifestData { + // Use BTreeMap to order space meta by space id, so space with smaller id + // can be processed first. This is necessary especially in creating snapshot. 
+ pub spaces: BTreeMap, + pub last_space_id: SpaceId, +} + +impl ManifestData { + pub fn apply_meta_update(&mut self, update: MetaUpdate) -> Result<()> { + debug!("Apply meta update, update:{:?}", update); + + // TODO(yingwen): Ignore space not found error when we support drop space. + match update { + MetaUpdate::AddSpace(meta) => { + ensure!( + self.last_space_id <= meta.space_id, + SpaceIdCorrupted { + last: self.last_space_id, + given: meta.space_id, + } + ); + + self.last_space_id = meta.space_id; + self.spaces.insert( + meta.space_id, + SpaceMetaData { + space_meta: meta, + tables: BTreeMap::new(), + }, + ); + } + MetaUpdate::AddTable(meta) => { + let space = self + .spaces + .get_mut(&meta.space_id) + .context(TableSpaceMiss { + space_id: meta.space_id, + table_name: &meta.table_name, + })?; + space.tables.insert( + meta.table_id, + TableMetaData { + table_meta: meta, + version_meta: TableVersionMeta::default(), + }, + ); + } + MetaUpdate::VersionEdit(meta) => { + let space = self.spaces.get_mut(&meta.space_id).context(SpaceMiss { + space_id: meta.space_id, + })?; + // If there is a background compaction/flush job, then version edit + // may be stored after a drop table entry being stored. We ignore + // that case and won't return error if table is not found. + let table = match space.tables.get_mut(&meta.table_id) { + Some(v) => v, + None => { + info!("Table of version edit not found, meta:{:?}", meta); + + return Ok(()); + } + }; + let edit = meta.into_version_edit(); + table.version_meta.apply_edit(edit); + } + MetaUpdate::AlterSchema(meta) => { + let space = self.spaces.get_mut(&meta.space_id).context(SpaceMiss { + space_id: meta.space_id, + })?; + let table = space.tables.get_mut(&meta.table_id).context(TableMiss { + space_id: meta.space_id, + table_id: meta.table_id, + })?; + + // Update schema of AddTableMeta. 
+ table.table_meta.schema = meta.schema; + } + MetaUpdate::AlterOptions(meta) => { + let space = self.spaces.get_mut(&meta.space_id).context(SpaceMiss { + space_id: meta.space_id, + })?; + let table = space.tables.get_mut(&meta.table_id).context(TableMiss { + space_id: meta.space_id, + table_id: meta.table_id, + })?; + + // Update options of AddTableMeta. + table.table_meta.opts = meta.options; + } + MetaUpdate::DropTable(meta) => { + let space = self.spaces.get_mut(&meta.space_id).context(SpaceMiss { + space_id: meta.space_id, + })?; + + let removed_table = space.tables.remove(&meta.table_id); + + debug!( + "Apply drop table meta update, removed table:{}, removed:{}", + meta.table_name, + removed_table.is_some() + ); + } + MetaUpdate::SnapshotManifest(_) => { + // A snapshot record, no need to handle this. + } + } + + Ok(()) + } +} diff --git a/analytic_engine/src/meta/meta_update.rs b/analytic_engine/src/meta/meta_update.rs new file mode 100644 index 0000000000..06e8f86099 --- /dev/null +++ b/analytic_engine/src/meta/meta_update.rs @@ -0,0 +1,463 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Update to meta + +use std::convert::{TryFrom, TryInto}; + +use common_types::{ + bytes::{MemBuf, MemBufMut, Writer}, + schema::{Schema, Version}, + SequenceNumber, +}; +use common_util::define_result; +use proto::{analytic_common, common as common_pb, meta_update as meta_pb}; +use protobuf::Message; +use snafu::{Backtrace, ResultExt, Snafu}; +use table_engine::table::TableId; +use wal::{ + log_batch::{Payload, PayloadDecoder}, + manager::RegionId, +}; + +use crate::{ + space::SpaceId, + table::version_edit::{AddFile, DeleteFile, VersionEdit}, + TableOptions, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to encode payload, err:{}.\nBacktrace:\n{}", source, backtrace))] + EncodePayloadPb { + source: protobuf::error::ProtobufError, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to convert schema, err:{}", source))] + ConvertSchema { source: common_types::schema::Error }, + + #[snafu(display("Empty meta update.\nBacktrace:\n{}", backtrace))] + EmptyMetaUpdate { backtrace: Backtrace }, + + #[snafu(display("Failed to decode payload, err:{}.\nBacktrace:\n{}", source, backtrace))] + DecodePayloadPb { + source: protobuf::error::ProtobufError, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to convert version edit, err:{}", source))] + ConvertVersionEdit { + source: crate::table::version_edit::Error, + }, +} + +define_result!(Error); + +/// Modifications to meta data in meta +#[derive(Debug, Clone)] +pub enum MetaUpdate { + AddSpace(AddSpaceMeta), + AddTable(AddTableMeta), + DropTable(DropTableMeta), + VersionEdit(VersionEditMeta), + AlterSchema(AlterSchemaMeta), + AlterOptions(AlterOptionsMeta), + SnapshotManifest(SnapshotManifestMeta), +} + +impl MetaUpdate { + pub fn into_pb(self) -> meta_pb::MetaUpdate { + let mut meta_update = meta_pb::MetaUpdate::new(); + + match self { + MetaUpdate::AddSpace(v) => { + meta_update.set_add_space(v.into_pb()); + } + MetaUpdate::AddTable(v) => { + meta_update.set_add_table(v.into_pb()); + } 
+ MetaUpdate::VersionEdit(v) => { + meta_update.set_version_edit(v.into_pb()); + } + MetaUpdate::AlterSchema(v) => { + meta_update.set_alter_schema(v.into_pb()); + } + MetaUpdate::AlterOptions(v) => { + meta_update.set_alter_options(v.into_pb()); + } + MetaUpdate::DropTable(v) => { + meta_update.set_drop_table(v.into_pb()); + } + MetaUpdate::SnapshotManifest(v) => { + meta_update.set_snapshot_manifest(v.into_pb()); + } + } + + meta_update + } + + pub fn snapshot_manifest_meta(&self) -> Option { + if let MetaUpdate::SnapshotManifest(v) = self { + Some(*v) + } else { + None + } + } +} + +impl TryFrom for MetaUpdate { + type Error = Error; + + fn try_from(src: meta_pb::MetaUpdate) -> Result { + let meta_update = match src.meta { + Some(meta_pb::MetaUpdate_oneof_meta::add_space(v)) => { + let add_space = AddSpaceMeta::from(v); + MetaUpdate::AddSpace(add_space) + } + Some(meta_pb::MetaUpdate_oneof_meta::add_table(v)) => { + let add_table = AddTableMeta::try_from(v)?; + MetaUpdate::AddTable(add_table) + } + Some(meta_pb::MetaUpdate_oneof_meta::version_edit(v)) => { + let version_edit = VersionEditMeta::try_from(v)?; + MetaUpdate::VersionEdit(version_edit) + } + Some(meta_pb::MetaUpdate_oneof_meta::alter_schema(v)) => { + let alter_schema = AlterSchemaMeta::try_from(v)?; + MetaUpdate::AlterSchema(alter_schema) + } + Some(meta_pb::MetaUpdate_oneof_meta::alter_options(v)) => { + let alter_options = AlterOptionsMeta::from(v); + MetaUpdate::AlterOptions(alter_options) + } + Some(meta_pb::MetaUpdate_oneof_meta::drop_table(v)) => { + let drop_table = DropTableMeta::from(v); + MetaUpdate::DropTable(drop_table) + } + Some(meta_pb::MetaUpdate_oneof_meta::snapshot_manifest(v)) => { + let snapshot_manifest = SnapshotManifestMeta::from(v); + MetaUpdate::SnapshotManifest(snapshot_manifest) + } + None => { + // Meta update should not be empty. 
+ return EmptyMetaUpdate.fail(); + } + }; + + Ok(meta_update) + } +} + +/// Meta data for a new space +#[derive(Debug, Clone)] +pub struct AddSpaceMeta { + pub space_id: SpaceId, + pub space_name: String, +} + +impl AddSpaceMeta { + fn into_pb(self) -> meta_pb::AddSpaceMeta { + let mut target = meta_pb::AddSpaceMeta::new(); + target.set_space_id(self.space_id); + target.set_space_name(self.space_name); + + target + } +} + +impl From for AddSpaceMeta { + fn from(src: meta_pb::AddSpaceMeta) -> Self { + Self { + space_id: src.space_id, + space_name: src.space_name, + } + } +} + +/// Meta data for a new table +#[derive(Debug, Clone)] +pub struct AddTableMeta { + /// Space id of the table + pub space_id: SpaceId, + pub table_id: TableId, + pub table_name: String, + /// Schema of the table + pub schema: Schema, + // Options needed to persist + pub opts: TableOptions, +} + +impl AddTableMeta { + fn into_pb(self) -> meta_pb::AddTableMeta { + let mut target = meta_pb::AddTableMeta::new(); + target.set_space_id(self.space_id); + target.set_table_id(self.table_id.as_u64()); + target.set_table_name(self.table_name); + target.set_schema(common_pb::TableSchema::from(self.schema)); + target.set_options(analytic_common::TableOptions::from(self.opts)); + + target + } +} + +impl TryFrom for AddTableMeta { + type Error = Error; + + fn try_from(mut src: meta_pb::AddTableMeta) -> Result { + let table_schema = src.take_schema(); + let opts = src.take_options(); + + Ok(Self { + space_id: src.space_id, + table_id: TableId::from(src.table_id), + table_name: src.table_name, + schema: Schema::try_from(table_schema).context(ConvertSchema)?, + opts: TableOptions::from(opts), + }) + } +} + +/// Meta data for dropping a table +#[derive(Debug, Clone)] +pub struct DropTableMeta { + /// Space id of the table + pub space_id: SpaceId, + pub table_id: TableId, + pub table_name: String, +} + +impl DropTableMeta { + fn into_pb(self) -> meta_pb::DropTableMeta { + let mut target = 
meta_pb::DropTableMeta::new(); + target.set_space_id(self.space_id); + target.set_table_id(self.table_id.as_u64()); + target.set_table_name(self.table_name); + + target + } +} + +impl From for DropTableMeta { + fn from(src: meta_pb::DropTableMeta) -> Self { + Self { + space_id: src.space_id, + table_id: TableId::from(src.table_id), + table_name: src.table_name, + } + } +} + +/// Meta data of version edit to table +#[derive(Debug, Clone)] +pub struct VersionEditMeta { + pub space_id: SpaceId, + pub table_id: TableId, + /// Sequence number of the flushed data. Set to 0 if this edit is not + /// created by a flush request. + pub flushed_sequence: SequenceNumber, + pub files_to_add: Vec, + pub files_to_delete: Vec, +} + +impl VersionEditMeta { + fn into_pb(self) -> meta_pb::VersionEditMeta { + let mut target = meta_pb::VersionEditMeta::new(); + target.set_space_id(self.space_id); + target.set_table_id(self.table_id.as_u64()); + target.set_flushed_sequence(self.flushed_sequence); + + let mut files_to_add = Vec::with_capacity(self.files_to_add.len()); + for file in self.files_to_add { + files_to_add.push(file.into_pb()); + } + target.files_to_add = files_to_add.into(); + + let mut files_to_delete = Vec::with_capacity(self.files_to_delete.len()); + for file in self.files_to_delete { + files_to_delete.push(file.into_pb()); + } + target.files_to_delete = files_to_delete.into(); + + target + } + + /// Convert into [crate::table::version_edit::VersionEdit]. The + /// `mems_to_remove` field is left empty. 
+ pub fn into_version_edit(self) -> VersionEdit { + VersionEdit { + mems_to_remove: Vec::new(), + flushed_sequence: self.flushed_sequence, + files_to_add: self.files_to_add, + files_to_delete: self.files_to_delete, + } + } +} + +impl TryFrom for VersionEditMeta { + type Error = Error; + + fn try_from(src: meta_pb::VersionEditMeta) -> Result { + let mut files_to_add = Vec::with_capacity(src.files_to_add.len()); + for file_meta in src.files_to_add { + files_to_add.push(AddFile::try_from(file_meta).context(ConvertVersionEdit)?); + } + + let mut files_to_delete = Vec::with_capacity(src.files_to_delete.len()); + for file_meta in src.files_to_delete { + files_to_delete.push(DeleteFile::try_from(file_meta).context(ConvertVersionEdit)?); + } + + Ok(Self { + space_id: src.space_id, + table_id: TableId::from(src.table_id), + flushed_sequence: src.flushed_sequence, + files_to_add, + files_to_delete, + }) + } +} + +/// Meta data of schema update. +#[derive(Debug, Clone)] +pub struct AlterSchemaMeta { + pub space_id: SpaceId, + pub table_id: TableId, + pub schema: Schema, + pub pre_schema_version: Version, +} + +impl AlterSchemaMeta { + fn into_pb(self) -> meta_pb::AlterSchemaMeta { + let mut target = meta_pb::AlterSchemaMeta::new(); + target.set_space_id(self.space_id); + target.set_table_id(self.table_id.as_u64()); + target.set_schema(common_pb::TableSchema::from(self.schema)); + target.set_pre_schema_version(self.pre_schema_version); + + target + } +} + +impl TryFrom for AlterSchemaMeta { + type Error = Error; + + fn try_from(mut src: meta_pb::AlterSchemaMeta) -> Result { + let table_schema = src.take_schema(); + + Ok(Self { + space_id: src.space_id, + table_id: TableId::from(src.table_id), + schema: Schema::try_from(table_schema).context(ConvertSchema)?, + pre_schema_version: src.pre_schema_version, + }) + } +} + +/// Meta data of options update. 
+#[derive(Debug, Clone)] +pub struct AlterOptionsMeta { + pub space_id: SpaceId, + pub table_id: TableId, + pub options: TableOptions, +} + +impl AlterOptionsMeta { + fn into_pb(self) -> meta_pb::AlterOptionsMeta { + let mut target = meta_pb::AlterOptionsMeta::new(); + target.set_space_id(self.space_id); + target.set_table_id(self.table_id.as_u64()); + target.set_options(analytic_common::TableOptions::from(self.options)); + + target + } +} + +impl From for AlterOptionsMeta { + fn from(mut src: meta_pb::AlterOptionsMeta) -> Self { + let table_options = src.take_options(); + + Self { + space_id: src.space_id, + table_id: TableId::from(src.table_id), + options: TableOptions::from(table_options), + } + } +} + +#[derive(Debug, Clone, Copy)] +pub struct SnapshotManifestMeta { + pub snapshot_region_id: RegionId, + /// The last sequence (inclusive) of the data in this snapshot. + /// + /// Note that the sequence refers to the manifest region. + pub sequence: SequenceNumber, +} + +impl SnapshotManifestMeta { + fn into_pb(self) -> meta_pb::SnapshotManifestMeta { + let mut target = meta_pb::SnapshotManifestMeta::new(); + target.set_region_id(self.snapshot_region_id); + target.set_sequence(self.sequence); + + target + } +} + +impl From for SnapshotManifestMeta { + fn from(src: meta_pb::SnapshotManifestMeta) -> SnapshotManifestMeta { + Self { + snapshot_region_id: src.region_id, + sequence: src.sequence, + } + } +} + +/// An adapter to implement [wal::log_batch::Payload] for +/// [proto::meta_update::MetaUpdate] +#[derive(Debug)] +pub struct MetaUpdatePayload(meta_pb::MetaUpdate); + +impl From for MetaUpdatePayload { + fn from(src: MetaUpdate) -> Self { + MetaUpdatePayload(src.into_pb()) + } +} + +impl From<&MetaUpdate> for MetaUpdatePayload { + fn from(src: &MetaUpdate) -> Self { + MetaUpdatePayload(src.clone().into_pb()) + } +} + +impl Payload for MetaUpdatePayload { + type Error = Error; + + fn encode_size(&self) -> usize { + 
self.0.compute_size().try_into().unwrap_or(usize::MAX) + } + + fn encode_to(&self, buf: &mut B) -> Result<()> { + let mut writer = Writer::new(buf); + self.0 + .write_to_writer(&mut writer) + .context(EncodePayloadPb)?; + Ok(()) + } +} + +/// Decoder to decode MetaUpdate from log entry +pub struct MetaUpdateDecoder; + +impl PayloadDecoder for MetaUpdateDecoder { + type Error = Error; + type Target = MetaUpdate; + + fn decode(&self, buf: &mut B) -> Result { + let meta_update = meta_pb::MetaUpdate::parse_from_bytes(buf.remaining_slice()) + .context(DecodePayloadPb)?; + + let meta_update = MetaUpdate::try_from(meta_update)?; + + Ok(meta_update) + } +} diff --git a/analytic_engine/src/meta/mod.rs b/analytic_engine/src/meta/mod.rs new file mode 100644 index 0000000000..3bea46d26e --- /dev/null +++ b/analytic_engine/src/meta/mod.rs @@ -0,0 +1,29 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Manage meta data of the engine + +pub mod details; +pub mod meta_data; +pub mod meta_update; + +use std::fmt; + +use async_trait::async_trait; + +use crate::meta::{meta_data::ManifestData, meta_update::MetaUpdate}; + +/// Manifest holds meta data of all tables +#[async_trait] +pub trait Manifest: fmt::Debug { + type Error: std::error::Error + Send + Sync + 'static; + + /// Store update to manifest + async fn store_update(&self, update: MetaUpdate) -> Result<(), Self::Error>; + + /// Load all data from manifest. + /// + /// If `do_snapshot` is true, the manifest will try to create a snapshot of + /// the manifest data. The caller should ensure `store_update()` wont be + /// called during loading data. + async fn load_data(&self, do_snapshot: bool) -> Result; +} diff --git a/analytic_engine/src/payload.rs b/analytic_engine/src/payload.rs new file mode 100644 index 0000000000..02cf58fe0a --- /dev/null +++ b/analytic_engine/src/payload.rs @@ -0,0 +1,174 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Payloads to write to wal + +use std::convert::TryInto; + +use common_types::{ + bytes::{MemBuf, MemBufMut, Writer}, + row::{RowGroup, RowGroupBuilder}, + schema::Schema, +}; +use common_util::{ + codec::{row::WalRowDecoder, Decoder}, + define_result, +}; +use proto::table_requests; +use protobuf::Message; +use snafu::{Backtrace, ResultExt, Snafu}; +use wal::log_batch::{Payload, PayloadDecoder}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to encode header, err:{}", source))] + EncodeHeader { source: common_types::bytes::Error }, + + #[snafu(display("Failed to encode body, err:{}.\nBacktrace:\n{}", source, backtrace))] + EncodeBody { + source: protobuf::ProtobufError, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to decode header, err:{}", source))] + DecodeHeader { source: common_types::bytes::Error }, + + #[snafu(display( + "Invalid wal entry header, value:{}.\nBacktrace:\n{}", + value, + backtrace + ))] + InvalidHeader { value: u8, backtrace: Backtrace }, + + #[snafu(display("Failed to decode body, err:{}.\nBacktrace:\n{}", source, backtrace))] + DecodeBody { + source: protobuf::ProtobufError, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to decode schema, err:{}", source))] + DecodeSchema { source: common_types::schema::Error }, + + #[snafu(display("Failed to decode row, err:{}", source))] + DecodeRow { + source: common_util::codec::row::Error, + }, +} + +define_result!(Error); + +/// Wal entry header +#[derive(Clone, Copy)] +enum Header { + Write = 1, +} + +impl Header { + pub fn to_u8(self) -> u8 { + self as u8 + } + + pub fn from_u8(value: u8) -> Option { + match value { + value if value == Self::Write as u8 => Some(Self::Write), + _ => None, + } + } +} + +fn write_header(header: Header, buf: &mut B) -> Result<()> { + buf.write_u8(header.to_u8()).context(EncodeHeader)?; + Ok(()) +} + +/// Header size in bytes +const HEADER_SIZE: usize = 1; + +/// Write request to persist in wal +#[derive(Debug)] +pub enum 
WritePayload<'a> { + Write(&'a table_requests::WriteRequest), +} + +impl<'a> Payload for WritePayload<'a> { + type Error = Error; + + fn encode_size(&self) -> usize { + let body_size = match self { + WritePayload::Write(req) => req.compute_size(), + }; + + HEADER_SIZE + body_size as usize + } + + fn encode_to(&self, buf: &mut B) -> Result<()> { + match self { + WritePayload::Write(req) => { + write_header(Header::Write, buf)?; + let mut writer = Writer::new(buf); + req.write_to_writer(&mut writer).context(EncodeBody)?; + } + } + + Ok(()) + } +} + +/// Payload decoded from wal +#[derive(Debug)] +pub enum ReadPayload { + Write { row_group: RowGroup }, +} + +/// Wal payload decoder +#[derive(Default)] +pub struct WalDecoder; + +impl PayloadDecoder for WalDecoder { + type Error = Error; + type Target = ReadPayload; + + fn decode(&self, buf: &mut B) -> Result { + let header_value = buf.read_u8().context(DecodeHeader)?; + let header = match Header::from_u8(header_value) { + Some(header) => header, + None => { + return InvalidHeader { + value: header_value, + } + .fail() + } + }; + + let payload = match header { + Header::Write => { + let mut write_req_pb: table_requests::WriteRequest = + Message::parse_from_bytes(buf.remaining_slice()).context(DecodeBody)?; + + // Consume and convert schema in pb + let schema: Schema = write_req_pb + .take_schema() + .try_into() + .context(DecodeSchema)?; + + // Consume and convert rows in pb + let encoded_rows = write_req_pb.take_rows().into_vec(); + let mut builder = + RowGroupBuilder::with_capacity(schema.clone(), encoded_rows.len()); + let row_decoder = WalRowDecoder::new(&schema); + for row_bytes in &encoded_rows { + let row = row_decoder + .decode(&mut row_bytes.as_slice()) + .context(DecodeRow)?; + // We skip schema check here + builder.push_checked_row(row); + } + + let row_group = builder.build(); + + ReadPayload::Write { row_group } + } + }; + + Ok(payload) + } +} diff --git a/analytic_engine/src/row_iter/chain.rs 
b/analytic_engine/src/row_iter/chain.rs new file mode 100644 index 0000000000..881c96db3b --- /dev/null +++ b/analytic_engine/src/row_iter/chain.rs @@ -0,0 +1,373 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::{fmt, time::Instant}; + +use async_trait::async_trait; +use common_types::{ + projected_schema::ProjectedSchema, record_batch::RecordBatchWithKey, request_id::RequestId, + schema::RecordSchemaWithKey, +}; +use common_util::define_result; +use futures::StreamExt; +use log::debug; +use object_store::ObjectStore; +use snafu::{ResultExt, Snafu}; +use table_engine::{predicate::PredicateRef, table::TableId}; + +use crate::{ + row_iter::{ + record_batch_stream, record_batch_stream::SequencedRecordBatchStream, + RecordBatchWithKeyIterator, + }, + space::SpaceId, + sst::{ + factory::{Factory, SstReaderOptions}, + file::FileHandle, + }, + table::version::{MemTableVec, SamplingMemTable}, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Fail to build stream from the memtable, err:{}", source))] + BuildStreamFromMemtable { + source: crate::row_iter::record_batch_stream::Error, + }, + + #[snafu(display("Fail to build stream from the sst file, err:{}", source))] + BuildStreamFromSst { + source: crate::row_iter::record_batch_stream::Error, + }, + + #[snafu(display("Fail to poll next record batch, err:{}", source))] + PollNextRecordBatch { + source: Box, + }, +} + +define_result!(Error); + +/// Required parameters to construct the [Builder]. +#[derive(Clone, Debug)] +pub struct ChainConfig<'a, S, Fa> { + pub request_id: RequestId, + pub space_id: SpaceId, + pub table_id: TableId, + /// The projected schema to read. + pub projected_schema: ProjectedSchema, + /// Predicate of the query. + pub predicate: PredicateRef, + + pub sst_reader_options: SstReaderOptions, + pub sst_factory: Fa, + /// Sst storage + pub store: &'a S, +} + +/// Builder for [ChainIterator]. 
+#[must_use] +pub struct Builder<'a, S, Fa> { + config: ChainConfig<'a, S, Fa>, + /// Sampling memtable to read. + sampling_mem: Option, + memtables: MemTableVec, + ssts: Vec>, +} + +impl<'a, S, Fa> Builder<'a, S, Fa> { + pub fn new(config: ChainConfig<'a, S, Fa>) -> Self { + Self { + config, + sampling_mem: None, + memtables: Vec::new(), + ssts: Vec::new(), + } + } + + pub fn sampling_mem(mut self, sampling_mem: Option) -> Self { + self.sampling_mem = sampling_mem; + self + } + + pub fn memtables(mut self, memtables: MemTableVec) -> Self { + self.memtables = memtables; + self + } + + pub fn ssts(mut self, ssts: Vec>) -> Self { + self.ssts = ssts; + self + } +} + +impl<'a, S: ObjectStore, Fa: Factory> Builder<'a, S, Fa> { + pub async fn build(self) -> Result { + let total_sst_streams: usize = self.ssts.iter().map(|v| v.len()).sum(); + let mut total_streams = self.memtables.len() + total_sst_streams; + if self.sampling_mem.is_some() { + total_streams += 1; + } + let mut streams = Vec::with_capacity(total_streams); + + if let Some(v) = &self.sampling_mem { + let stream = record_batch_stream::filtered_stream_from_memtable( + self.config.projected_schema.clone(), + false, + &v.mem, + false, + self.config.predicate.as_ref(), + ) + .context(BuildStreamFromMemtable)?; + streams.push(stream); + } + + for memtable in &self.memtables { + let stream = record_batch_stream::filtered_stream_from_memtable( + self.config.projected_schema.clone(), + false, + // chain iterator only handle the case reading in no order so just read in asc + // order by default. 
+ &memtable.mem, + false, + self.config.predicate.as_ref(), + ) + .context(BuildStreamFromMemtable)?; + streams.push(stream); + } + + for leveled_ssts in &self.ssts { + for sst in leveled_ssts { + let stream = record_batch_stream::filtered_stream_from_sst_file( + self.config.space_id, + self.config.table_id, + sst, + &self.config.sst_factory, + &self.config.sst_reader_options, + self.config.store, + ) + .await + .context(BuildStreamFromSst)?; + streams.push(stream); + } + } + + debug!( + "Build chain iterator, table_id:{:?}, request_id:{}, memtables:{:?}, ssts:{:?}", + self.config.table_id, self.config.request_id, self.memtables, self.ssts + ); + + Ok(ChainIterator { + space_id: self.config.space_id, + table_id: self.config.table_id, + request_id: self.config.request_id, + schema: self.config.projected_schema.to_record_schema_with_key(), + streams, + next_stream_idx: 0, + inited: false, + metrics: Metrics::new(self.memtables.len(), total_sst_streams), + }) + } +} + +/// Metrics for [ChainIterator]. +struct Metrics { + num_memtables: usize, + num_ssts: usize, + /// Total batch fetched. + total_batch_fetched: usize, + /// Total rows fetched. + total_rows_fetched: usize, + /// Create time of the metrics. + create_at: Instant, + /// Inited time of the iterator. 
+ inited_at: Option, +} + +impl Metrics { + fn new(num_memtables: usize, num_ssts: usize) -> Self { + Self { + num_memtables, + num_ssts, + total_batch_fetched: 0, + total_rows_fetched: 0, + create_at: Instant::now(), + inited_at: None, + } + } + + fn set_inited_time(&mut self) { + self.inited_at = Some(Instant::now()); + } +} + +impl fmt::Debug for Metrics { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("Metrics") + .field("num_memtables", &self.num_memtables) + .field("num_ssts", &self.num_ssts) + .field("total_batch_fetched", &self.total_batch_fetched) + .field("total_rows_fetched", &self.total_rows_fetched) + .field("duration_since_create", &self.create_at.elapsed()) + .field("duration_since_init", &self.inited_at.map(|v| v.elapsed())) + .finish() + } +} + +/// ChainIter chains memtables and ssts and reads the [RecordBatch] from them +/// batch by batch. +/// +/// Note: The chain order is `memtable -> sst level 0 -> sst_level 1`. +pub struct ChainIterator { + space_id: SpaceId, + table_id: TableId, + request_id: RequestId, + schema: RecordSchemaWithKey, + streams: Vec, + /// The range of the index is [0, streams.len()] and the iterator is + /// exhausted if it reaches `streams.len()`. + next_stream_idx: usize, + inited: bool, + + // metrics for the iterator. 
+ metrics: Metrics, +} + +impl ChainIterator { + fn init_if_necessary(&mut self) { + if self.inited { + return; + } + self.inited = true; + self.metrics.set_inited_time(); + + debug!("Init ChainIterator, space_id:{}, table_id:{:?}, request_id:{}, total_streams:{}, schema:{:?}", + self.space_id, self.table_id, self.request_id, self.streams.len(), self.schema + ); + } +} + +impl Drop for ChainIterator { + fn drop(&mut self) { + debug!( + "Chain iterator dropped, space_id:{}, table_id:{:?}, request_id:{}, metrics:{:?}", + self.space_id, self.table_id, self.request_id, self.metrics, + ); + } +} + +#[async_trait] +impl RecordBatchWithKeyIterator for ChainIterator { + type Error = Error; + + fn schema(&self) -> &RecordSchemaWithKey { + &self.schema + } + + async fn next_batch(&mut self) -> Result> { + self.init_if_necessary(); + + while self.next_stream_idx < self.streams.len() { + let read_stream = &mut self.streams[self.next_stream_idx]; + let sequenced_record_batch = read_stream + .next() + .await + .transpose() + .context(PollNextRecordBatch)?; + + match sequenced_record_batch { + Some(v) => { + self.metrics.total_rows_fetched += v.num_rows(); + self.metrics.total_batch_fetched += 1; + + if v.num_rows() > 0 { + return Ok(Some(v.record_batch)); + } + } + // Fetch next stream only if the current sequence_record_batch is None. 
+ None => self.next_stream_idx += 1, + } + } + + Ok(None) + } +} + +#[cfg(test)] +mod tests { + use common_types::{ + self, + row::Row, + tests::{build_row, build_schema}, + SequenceNumber, + }; + + use super::*; + use crate::row_iter::tests::check_iterator; + + async fn run_and_check(testcases: Vec<(SequenceNumber, Vec)>) { + let schema = build_schema(); + + let expect_rows: Vec<_> = testcases + .iter() + .flat_map(|(_, rows)| rows.clone()) + .collect(); + + let streams = + record_batch_stream::tests::build_sequenced_record_batch_stream(&schema, testcases); + + let mut chain_iter = ChainIterator { + space_id: 0, + table_id: TableId::MIN, + request_id: RequestId::next_id(), + schema: schema.to_record_schema_with_key(), + streams, + next_stream_idx: 0, + inited: false, + metrics: Metrics::new(0, 0), + }; + + check_iterator(&mut chain_iter, expect_rows).await; + } + + #[tokio::test] + async fn test_chain_multiple_streams() { + let testcases = vec![ + // (sequence, rows) + (10, vec![build_row(b"key4", 1000000, 10.0, "v4")]), + (20, vec![build_row(b"key2", 1000000, 10.0, "v2")]), + (100, vec![build_row(b"key3", 1000000, 10.0, "v3")]), + (1, vec![build_row(b"key1", 1000000, 10.0, "v1")]), + ]; + run_and_check(testcases).await; + } + + #[tokio::test] + async fn test_chain_empty_streams() { + let testcases = vec![ + // (sequence, rows) + (10, vec![]), + (20, vec![]), + (100, vec![]), + (1, vec![]), + ]; + run_and_check(testcases).await; + } + + #[tokio::test] + async fn test_chain_no_streams() { + let testcases = vec![]; + run_and_check(testcases).await; + } + + #[tokio::test] + async fn test_chain_half_empty_streams() { + let testcases = vec![ + // (sequence, rows) + (10, vec![build_row(b"key4", 1000000, 10.0, "v4")]), + (20, vec![]), + (100, vec![]), + (1, vec![build_row(b"key1", 1000000, 10.0, "v1")]), + ]; + run_and_check(testcases).await; + } +} diff --git a/analytic_engine/src/row_iter/dedup.rs b/analytic_engine/src/row_iter/dedup.rs new file mode 100644 index 
0000000000..cd58b0157f --- /dev/null +++ b/analytic_engine/src/row_iter/dedup.rs @@ -0,0 +1,243 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::cmp::Ordering; + +use async_trait::async_trait; +use common_types::{ + record_batch::{RecordBatchWithKey, RecordBatchWithKeyBuilder}, + request_id::RequestId, + row::{Row, RowViewOnBatch, RowWithMeta}, + schema::RecordSchemaWithKey, +}; +use common_util::define_result; +use log::{info, trace}; +use snafu::{ResultExt, Snafu}; + +use crate::row_iter::{IterOptions, RecordBatchWithKeyIterator}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to iterate column, error:{:?}", source))] + IterateColumn { source: common_types::row::Error }, + + #[snafu(display("Failed to build record batch, error:{:?}", source))] + BuildRecordBatch { + source: common_types::record_batch::Error, + }, + + #[snafu(display("Failed to append row, err:{:?}", source))] + AppendRow { + source: common_types::record_batch::Error, + }, + + #[snafu(display("Failed to read data from the sub iterator, err:{:?}", source))] + ReadFromSubIter { + source: Box, + }, +} + +define_result!(Error); + +/// Dedup the elements from the `iter` by choosing the first one in the +/// duplicate rows. +pub struct DedupIterator { + request_id: RequestId, + schema: RecordSchemaWithKey, + record_batch_builder: RecordBatchWithKeyBuilder, + iter: I, + /// Previous row returned. + prev_row: Option, + /// Store which row in record batch is keep, use Vec is a bit faster + /// than a bitmap. 
+ selected_rows: Vec<bool>, + + // Metrics: + total_duplications: usize, + total_selected_rows: usize, +} + +impl<I: RecordBatchWithKeyIterator> DedupIterator<I> { + pub fn new(request_id: RequestId, iter: I, iter_options: IterOptions) -> Self { + let schema = iter.schema(); + + let record_batch_builder = + RecordBatchWithKeyBuilder::with_capacity(schema.clone(), iter_options.batch_size); + Self { + request_id, + schema: schema.clone(), + record_batch_builder, + iter, + prev_row: None, + selected_rows: Vec::new(), + total_duplications: 0, + total_selected_rows: 0, + } + } + + fn dedup_batch(&mut self, record_batch: RecordBatchWithKey) -> Result<RecordBatchWithKey> { + self.selected_rows.clear(); + // Ignore all rows by default. + self.selected_rows.resize(record_batch.num_rows(), false); + + if record_batch.is_empty() { + return Ok(record_batch); + } + + // Dedup batch. + for col_idx in 0..self.schema.num_key_columns() { + let column = record_batch.column(col_idx); + + column.dedup(&mut self.selected_rows); + } + + // Dedup first row in record batch with previous row. + if let Some(prev_row) = &self.prev_row { + let prev_row_view = RowWithMeta { + row: prev_row, + schema: &self.schema, + }; + let curr_row_view = RowViewOnBatch { + record_batch: &record_batch, + // First row. + row_idx: 0, + }; + + let is_equal = matches!( + // TODO(yingwen): Compare row needs clone data of row. + self.schema.compare_row(&prev_row_view, &curr_row_view), + Ordering::Equal + ); + + if is_equal { + // Duplicate of previous row. + self.selected_rows[0] = false; + } + } + + let selected_num = self + .selected_rows + .iter() + .map(|v| if *v { 1 } else { 0 }) + .sum(); + + // Even though all rows are duplicates, we can still use the row pointed to by + // prev_row_idx because they have the same row key. + self.prev_row = Some(record_batch.clone_row_at(record_batch.num_rows() - 1)); + + self.filter_batch(record_batch, selected_num) + } + + /// Filter batch by `selected_rows`.
+ fn filter_batch( + &mut self, + record_batch: RecordBatchWithKey, + selected_num: usize, + ) -> Result { + self.total_selected_rows += selected_num; + self.total_duplications += record_batch.num_rows() - selected_num; + + if selected_num == record_batch.num_rows() { + // No duplicate rows in batch. + return Ok(record_batch); + } + + self.record_batch_builder.clear(); + for (row_idx, selected) in self.selected_rows.iter().enumerate() { + if *selected { + self.record_batch_builder + .append_row_view(&RowViewOnBatch { + record_batch: &record_batch, + row_idx, + }) + .context(AppendRow)?; + } + } + + self.record_batch_builder.build().context(BuildRecordBatch) + } +} + +#[async_trait] +impl RecordBatchWithKeyIterator for DedupIterator { + type Error = Error; + + fn schema(&self) -> &RecordSchemaWithKey { + &self.schema + } + + async fn next_batch(&mut self) -> Result> { + match self + .iter + .next_batch() + .await + .map_err(|e| Box::new(e) as _) + .context(ReadFromSubIter)? + { + Some(record_batch) => { + trace!( + "DedupIterator received next record batch, request_id:{}, batch:{:?}", + self.request_id, + record_batch + ); + + self.dedup_batch(record_batch).map(Some) + } + None => { + info!( + "DedupIterator received none record batch, request_id:{}, total_duplications:{}, total_selected_rows:{}", + self.request_id, self.total_duplications, self.total_selected_rows, + ); + + Ok(None) + } + } + } +} + +#[cfg(test)] +mod tests { + use common_types::tests::{build_row, build_schema}; + + use super::*; + use crate::row_iter::tests::{build_record_batch_with_key, check_iterator, VectorIterator}; + + #[tokio::test] + async fn test_dedup_iterator() { + // first two columns are key columns + let schema = build_schema(); + let iter = VectorIterator::new( + schema.to_record_schema_with_key(), + vec![ + build_record_batch_with_key( + schema.clone(), + vec![ + build_row(b"a", 1, 10.0, "v1"), + build_row(b"a", 1, 10.0, "v"), + build_row(b"a", 2, 10.0, "v2"), + ], + ), + 
build_record_batch_with_key( + schema, + vec![ + build_row(b"a", 2, 10.0, "v"), + build_row(b"a", 3, 10.0, "v3"), + build_row(b"a", 3, 10.0, "v"), + build_row(b"a", 4, 10.0, "v4"), + ], + ), + ], + ); + + let mut iter = DedupIterator::new(RequestId::next_id(), iter, IterOptions::default()); + check_iterator( + &mut iter, + vec![ + build_row(b"a", 1, 10.0, "v1"), + build_row(b"a", 2, 10.0, "v2"), + build_row(b"a", 3, 10.0, "v3"), + build_row(b"a", 4, 10.0, "v4"), + ], + ) + .await; + } +} diff --git a/analytic_engine/src/row_iter/merge.rs b/analytic_engine/src/row_iter/merge.rs new file mode 100644 index 0000000000..49403c90ae --- /dev/null +++ b/analytic_engine/src/row_iter/merge.rs @@ -0,0 +1,957 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::{ + cmp, + cmp::Ordering, + collections::BinaryHeap, + fmt, mem, + ops::{Deref, DerefMut}, + time::{Duration, Instant}, +}; + +use async_trait::async_trait; +use common_types::{ + projected_schema::ProjectedSchema, + record_batch::{RecordBatchWithKey, RecordBatchWithKeyBuilder}, + request_id::RequestId, + row::RowViewOnBatch, + schema::RecordSchemaWithKey, + SequenceNumber, +}; +use common_util::define_result; +use futures::StreamExt; +use log::{debug, info, trace}; +use object_store::ObjectStore; +use snafu::{ensure, Backtrace, ResultExt, Snafu}; +use table_engine::{predicate::PredicateRef, table::TableId}; + +use crate::{ + row_iter::{ + record_batch_stream, + record_batch_stream::{SequencedRecordBatch, SequencedRecordBatchStream}, + IterOptions, RecordBatchWithKeyIterator, + }, + space::SpaceId, + sst::{ + factory::{Factory, SstReaderOptions}, + file::FileHandle, + manager::{FileId, MAX_LEVEL}, + }, + table::version::{MemTableVec, SamplingMemTable}, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "Expect the same schema, expect:{:?}, given:{:?}.\nBacktrace:\n{}", + expect, + given, + backtrace + ))] + MismatchedSchema { + expect: RecordSchemaWithKey, + given: 
RecordSchemaWithKey, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to pull record batch, error:{}", source))] + PullRecordBatch { + source: Box, + }, + + #[snafu(display("Failed to build record batch, error:{}", source))] + BuildRecordBatch { + source: common_types::record_batch::Error, + }, + + #[snafu(display("Failed to append row, err:{:?}", source))] + AppendRow { + source: common_types::record_batch::Error, + }, + + #[snafu(display("Failed to build stream from memtable, err:{}", source))] + BuildStreamFromMemtable { + source: crate::row_iter::record_batch_stream::Error, + }, + + #[snafu(display("Failed to build record batch from sst, err:{}", source))] + BuildStreamFromSst { + source: crate::row_iter::record_batch_stream::Error, + }, +} + +define_result!(Error); + +/// Required parameters to construct the [MergeBuilder] +#[derive(Debug)] +pub struct MergeConfig<'a, S, Fa> { + pub request_id: RequestId, + pub space_id: SpaceId, + pub table_id: TableId, + /// Max visible sequence (inclusive) + pub sequence: SequenceNumber, + /// The projected schema to read. + pub projected_schema: ProjectedSchema, + /// The predicate of the query. + pub predicate: PredicateRef, + + pub sst_reader_options: SstReaderOptions, + pub sst_factory: Fa, + /// Sst storage + pub store: &'a S, + + pub merge_iter_options: IterOptions, + + pub need_dedup: bool, + pub reverse: bool, +} + +/// Builder for building merge stream from memtables and sst files. +#[must_use] +pub struct MergeBuilder<'a, S, Fa> { + config: MergeConfig<'a, S, Fa>, + + /// Sampling memtable to read. + sampling_mem: Option, + /// MemTables to read. + memtables: MemTableVec, + /// Ssts to read of each level. 
+ ssts: Vec>, +} + +impl<'a, S: ObjectStore, Fa: Factory> MergeBuilder<'a, S, Fa> { + pub fn new(config: MergeConfig<'a, S, Fa>) -> Self { + Self { + config, + sampling_mem: None, + memtables: Vec::new(), + ssts: vec![Vec::new(); MAX_LEVEL], + } + } + + pub fn sampling_mem(mut self, sampling_mem: Option) -> Self { + self.sampling_mem = sampling_mem; + self + } + + pub fn memtables(mut self, memtables: MemTableVec) -> Self { + self.memtables = memtables; + self + } + + pub fn ssts_of_level(mut self, ssts: Vec>) -> Self { + self.ssts = ssts; + self + } + + pub fn mut_memtables(&mut self) -> &mut MemTableVec { + &mut self.memtables + } + + /// Returns file handles in `level`, panic if level >= MAX_LEVEL + pub fn mut_ssts_of_level(&mut self, level: u16) -> &mut Vec { + &mut self.ssts[usize::from(level)] + } + + pub async fn build(self) -> Result { + let sst_streams_num: usize = self + .ssts + .iter() + .map(|leveled_ssts| leveled_ssts.len()) + .sum(); + let mut streams_num = sst_streams_num + self.memtables.len(); + if self.sampling_mem.is_some() { + streams_num += 1; + } + let mut streams = Vec::with_capacity(streams_num); + + debug!( + "Build merge iterator, table_id:{:?}, request_id:{}, sampling_mem:{:?}, memtables:{:?}, ssts:{:?}", + self.config.table_id, + self.config.request_id, + self.sampling_mem, + self.memtables, + self.ssts + ); + + if let Some(v) = &self.sampling_mem { + let stream = record_batch_stream::filtered_stream_from_memtable( + self.config.projected_schema.clone(), + self.config.need_dedup, + &v.mem, + self.config.reverse, + self.config.predicate.as_ref(), + ) + .context(BuildStreamFromMemtable)?; + streams.push(stream); + } + + for memtable in &self.memtables { + let stream = record_batch_stream::filtered_stream_from_memtable( + self.config.projected_schema.clone(), + self.config.need_dedup, + &memtable.mem, + self.config.reverse, + self.config.predicate.as_ref(), + ) + .context(BuildStreamFromMemtable)?; + streams.push(stream); + } + + let mut 
sst_ids = Vec::with_capacity(self.ssts.len()); + for leveled_ssts in &self.ssts { + for f in leveled_ssts { + let stream = record_batch_stream::filtered_stream_from_sst_file( + self.config.space_id, + self.config.table_id, + f, + &self.config.sst_factory, + &self.config.sst_reader_options, + self.config.store, + ) + .await + .context(BuildStreamFromSst)?; + streams.push(stream); + sst_ids.push(f.id()); + } + } + + Ok(MergeIterator::new( + self.config.table_id, + self.config.request_id, + // Use the schema after projection as the schema of the merge iterator. + self.config.projected_schema.to_record_schema_with_key(), + streams, + self.config.merge_iter_options, + self.config.reverse, + Metrics::new(self.memtables.len(), sst_streams_num, sst_ids), + )) + } +} + +struct BufferedStreamState { + /// Buffered record batch. + /// + /// invariant: `buffered_record_batch` is not empty. + buffered_record_batch: SequencedRecordBatch, + /// Cursor for reading buffered record batch. + /// + /// `cursor` increases monotonically from 0 to + /// `buffered_record_batch.num_rows()` and `cursor == + /// buffered_record_batch.num_rows()` means no more buffered rows to read. 
+ cursor: usize, +} + +impl BufferedStreamState { + #[inline] + fn is_valid(&self) -> bool { + self.cursor < self.buffered_record_batch.num_rows() + } + + #[inline] + fn is_empty(&self) -> bool { + self.cursor >= self.buffered_record_batch.num_rows() + } + + #[inline] + fn sequence(&self) -> SequenceNumber { + self.buffered_record_batch.sequence + } + + #[inline] + fn first_row(&self) -> RowViewOnBatch<'_> { + assert!(self.is_valid()); + + RowViewOnBatch { + record_batch: &self.buffered_record_batch.record_batch, + row_idx: self.cursor, + } + } + + #[inline] + fn last_row(&self) -> RowViewOnBatch<'_> { + assert!(self.is_valid()); + + RowViewOnBatch { + record_batch: &self.buffered_record_batch.record_batch, + row_idx: self.buffered_record_batch.num_rows() - 1, + } + } + + /// Returns the next available row in the buffer and advance the cursor by + /// one step. + fn next_row(&mut self) -> Option> { + if self.cursor < self.buffered_record_batch.num_rows() { + let row_view = RowViewOnBatch { + record_batch: &self.buffered_record_batch.record_batch, + row_idx: self.cursor, + }; + self.cursor += 1; + Some(row_view) + } else { + None + } + } + + /// Append `len` rows from cursor to the `builder` and advance the cursor. + /// + /// Returns number of rows added. + fn append_rows_to( + &mut self, + builder: &mut RecordBatchWithKeyBuilder, + len: usize, + ) -> Result { + let added = builder + .append_batch_range(&self.buffered_record_batch.record_batch, self.cursor, len) + .context(AppendRow)?; + self.cursor += added; + Ok(added) + } + + /// Take record batch slice with at most `len` rows from cursor and advance + /// the cursor. 
+ fn take_record_batch_slice(&mut self, len: usize) -> RecordBatchWithKey { + let len_to_fetch = cmp::min( + self.buffered_record_batch.record_batch.num_rows() - self.cursor, + len, + ); + let record_batch = self + .buffered_record_batch + .record_batch + .slice(self.cursor, len_to_fetch); + self.cursor += record_batch.num_rows(); + record_batch + } + + #[inline] + fn reset(&mut self, record_batch: SequencedRecordBatch) { + self.buffered_record_batch = record_batch; + self.cursor = 0; + } +} + +struct BufferedStream { + schema: RecordSchemaWithKey, + stream: SequencedRecordBatchStream, + /// `None` state means the stream is exhausted. + state: Option, +} + +impl BufferedStream { + async fn build( + schema: RecordSchemaWithKey, + mut stream: SequencedRecordBatchStream, + metrics: &mut Metrics, + ) -> Result { + // TODO(xikai): do the metrics collection in the `pull_next_non_empty_batch`. + let pull_start = Instant::now(); + let buffered_record_batch = Self::pull_next_non_empty_batch(&mut stream).await?; + metrics.scan_duration += pull_start.elapsed(); + metrics.scan_count += 1; + + let state = buffered_record_batch.map(|v| BufferedStreamState { + buffered_record_batch: v, + cursor: 0, + }); + + Ok(Self { + schema, + stream, + state, + }) + } + + fn sequence_in_buffer(&self) -> SequenceNumber { + self.state.as_ref().unwrap().sequence() + } + + /// REQUIRE: the buffer is not exhausted. + fn first_row_in_buffer(&self) -> RowViewOnBatch<'_> { + self.state.as_ref().unwrap().first_row() + } + + /// REQUIRE: the buffer is not exhausted. + fn last_row_in_buffer(&self) -> RowViewOnBatch<'_> { + self.state.as_ref().unwrap().last_row() + } + + /// REQUIRE: the buffer is not exhausted. + fn next_row_in_buffer(&mut self) -> Option> { + self.state.as_mut().unwrap().next_row() + } + + /// REQUIRE: the buffer is not exhausted. 
+ fn append_rows_to( + &mut self, + builder: &mut RecordBatchWithKeyBuilder, + len: usize, + ) -> Result { + self.state.as_mut().unwrap().append_rows_to(builder, len) + } + + /// REQUIRE: the buffer is not exhausted. + fn take_record_batch_slice(&mut self, len: usize) -> RecordBatchWithKey { + self.state.as_mut().unwrap().take_record_batch_slice(len) + } + + /// Pull the next non empty record batch. + /// + /// The returned record batch is ensured `num_rows() > 0`. + async fn pull_next_non_empty_batch( + stream: &mut SequencedRecordBatchStream, + ) -> Result> { + loop { + match stream.next().await.transpose().context(PullRecordBatch)? { + Some(record_batch) => { + trace!( + "MergeIterator one record batch is fetched:{:?}", + record_batch + ); + + if record_batch.num_rows() > 0 { + return Ok(Some(record_batch)); + } + } + None => return Ok(None), + } + } + } + + /// Pull the next batch if the stream is not exhausted and the inner state + /// is empty. + async fn pull_next_batch_if_necessary(&mut self, metrics: &mut Metrics) -> Result { + let need_pull_new_batch = !self.is_exhausted() && self.state.as_ref().unwrap().is_empty(); + if !need_pull_new_batch { + return Ok(false); + } + + // TODO(xikai): do the metrics collection in the `pull_next_non_empty_batch`. + let pull_start = Instant::now(); + let pulled = match Self::pull_next_non_empty_batch(&mut self.stream).await? { + None => { + self.state = None; + Ok(false) + } + Some(record_batch) => { + self.state.as_mut().unwrap().reset(record_batch); + Ok(true) + } + }; + + metrics.scan_duration += pull_start.elapsed(); + metrics.scan_count += 1; + + pulled + } + + #[inline] + fn is_exhausted(&self) -> bool { + self.state.is_none() + } + + fn into_heaped(self, reverse: bool) -> HeapBufferedStream { + HeapBufferedStream { + stream: self, + reverse, + } + } + + #[inline] + fn schema(&self) -> &RecordSchemaWithKey { + &self.schema + } +} + +/// The wrapper struct determines the compare result for the min binary heap. 
+struct HeapBufferedStream { + stream: BufferedStream, + reverse: bool, +} + +impl HeapBufferedStream { + /// Check whether all the buffered rows in the `stream` are after the + /// `boundary_row`. + /// + /// NOTE: + /// - The first row in the stream is actually the max row if in reverse + /// order and should check whether it is smaller than `boundary_row`. + /// - The first row in the stream is actually the min row if in normal + /// order and should check whether it is greater than `boundary_row`. + fn is_after_boundary( + &self, + schema: &RecordSchemaWithKey, + boundary_row: &RowViewOnBatch, + ) -> bool { + if self.reverse { + // Compare the max row (the first row) of the stream with the boundary row. + // The stream is after the boundary if the max row is smaller than boundary. + // is_after: (boundary_row) > [first_row in buffer] + matches!( + schema.compare_row(boundary_row, &self.first_row_in_buffer()), + Ordering::Greater + ) + } else { + // Compare the min row (the first row) of the stream with the boundary row. + // The stream is after the boundary if the min row is greater than boundary.
+ // is_after: (boundary_row) < [first_row in buffer] + matches!( + schema.compare_row(&self.first_row_in_buffer(), boundary_row), + Ordering::Greater + ) + } + } +} + +impl Deref for HeapBufferedStream { + type Target = BufferedStream; + + fn deref(&self) -> &BufferedStream { + &self.stream + } +} + +impl DerefMut for HeapBufferedStream { + fn deref_mut(&mut self) -> &mut BufferedStream { + &mut self.stream + } +} + +impl PartialEq for HeapBufferedStream { + fn eq(&self, other: &Self) -> bool { + let ordering = self + .schema + .compare_row(&self.first_row_in_buffer(), &other.first_row_in_buffer()); + if let Ordering::Equal = ordering { + self.sequence_in_buffer() == other.sequence_in_buffer() + } else { + false + } + } +} + +impl Eq for HeapBufferedStream {} + +impl PartialOrd for HeapBufferedStream { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for HeapBufferedStream { + fn cmp(&self, other: &Self) -> Ordering { + let ordering = if self.reverse { + // keep the original ordering so the greater row comes before the smaller one. + self.schema + .compare_row(&self.first_row_in_buffer(), &other.first_row_in_buffer()) + } else { + // reverse the original ordering so the smaller row comes before the greater + // one. + self.schema + .compare_row(&other.first_row_in_buffer(), &self.first_row_in_buffer()) + }; + + if let Ordering::Equal = ordering { + // The larger sequence number should always comes before the smaller one. + self.sequence_in_buffer().cmp(&other.sequence_in_buffer()) + } else { + ordering + } + } +} + +pub struct Metrics { + num_memtables: usize, + num_ssts: usize, + sst_ids: Vec, + /// Times to fetch rows from one stream. + times_fetch_rows_from_one: usize, + /// Total rows collected using fetch_rows_from_one_stream(). + total_rows_fetch_from_one: usize, + /// Times to fetch one row from multiple stream. + times_fetch_row_from_multiple: usize, + /// Create time of the metrics. 
+ create_at: Instant, + /// Init time cost of the metrics. + init_duration: Duration, + /// Scan time cost of the metrics. + scan_duration: Duration, + /// Scan count + scan_count: usize, +} + +impl Metrics { + fn new(num_memtables: usize, num_ssts: usize, sst_ids: Vec) -> Self { + Self { + num_memtables, + num_ssts, + sst_ids, + times_fetch_rows_from_one: 0, + total_rows_fetch_from_one: 0, + times_fetch_row_from_multiple: 0, + create_at: Instant::now(), + init_duration: Duration::default(), + scan_duration: Duration::default(), + scan_count: 0, + } + } +} + +impl fmt::Debug for Metrics { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("Metrics") + .field("num_memtables", &self.num_memtables) + .field("num_ssts", &self.num_ssts) + .field("sst_ids", &self.sst_ids) + .field("times_fetch_rows_from_one", &self.times_fetch_rows_from_one) + .field("total_rows_fetch_from_one", &self.total_rows_fetch_from_one) + .field( + "times_fetch_row_from_multiple", + &self.times_fetch_row_from_multiple, + ) + .field("duration_since_create", &self.create_at.elapsed()) + .field("init_duration", &self.init_duration) + .field("scan_duration", &self.scan_duration) + .field("scan_count", &self.scan_count) + .finish() + } +} + +pub struct MergeIterator { + table_id: TableId, + request_id: RequestId, + inited: bool, + schema: RecordSchemaWithKey, + record_batch_builder: RecordBatchWithKeyBuilder, + origin_streams: Vec, + /// Any [BufferedStream] in the hot heap is not empty. + hot: BinaryHeap, + /// Any [BufferedStream] in the cold heap is not empty. 
+ cold: BinaryHeap, + iter_options: IterOptions, + reverse: bool, + metrics: Metrics, +} + +impl MergeIterator { + pub fn new( + table_id: TableId, + request_id: RequestId, + schema: RecordSchemaWithKey, + streams: Vec, + iter_options: IterOptions, + reverse: bool, + metrics: Metrics, + ) -> Self { + let heap_cap = streams.len(); + let record_batch_builder = + RecordBatchWithKeyBuilder::with_capacity(schema.clone(), iter_options.batch_size); + Self { + table_id, + request_id, + inited: false, + schema, + record_batch_builder, + origin_streams: streams, + hot: BinaryHeap::with_capacity(heap_cap), + cold: BinaryHeap::with_capacity(heap_cap), + iter_options, + reverse, + metrics, + } + } + + fn merge_window_end(&self) -> Option { + self.hot.peek().as_ref().map(|v| v.last_row_in_buffer()) + } + + async fn init_if_necessary(&mut self) -> Result<()> { + if self.inited { + return Ok(()); + } + + info!( + "Merge iterator init, table_id:{:?}, request_id:{}, schema:{:?}", + self.table_id, self.request_id, self.schema + ); + let init_start = Instant::now(); + + let current_schema = &self.schema; + for stream in mem::take(&mut self.origin_streams) { + let buffered_stream = + BufferedStream::build(self.schema.clone(), stream, &mut self.metrics).await?; + let stream_schema = buffered_stream.schema(); + ensure!( + current_schema == stream_schema, + MismatchedSchema { + expect: current_schema.clone(), + given: stream_schema.clone(), + } + ); + + if !buffered_stream.is_exhausted() { + self.cold.push(buffered_stream.into_heaped(self.reverse)); + } + } + + self.refill_hot(); + + self.inited = true; + self.metrics.init_duration = init_start.elapsed(); + Ok(()) + } + + fn refill_hot(&mut self) { + while !self.cold.is_empty() { + if !self.hot.is_empty() { + let merge_window_end = self.merge_window_end().unwrap(); + let warmest = self.cold.peek().unwrap(); + if warmest.is_after_boundary(&self.schema, &merge_window_end) { + // if the warmest stream in the cold stream sets is totally after 
the + // merge_window_end then no need to add more streams into + // the hot stream sets for merge sorting. + break; + } + } + + let warmest = self.cold.pop().unwrap(); + self.hot.push(warmest); + } + } + + /// Pull the next batch if necessary and rearrange the heap. + async fn reheap(&mut self, mut buffered_stream: HeapBufferedStream) -> Result<()> { + let pulled_new_batch = buffered_stream + .pull_next_batch_if_necessary(&mut self.metrics) + .await?; + + if buffered_stream.is_exhausted() { + self.refill_hot(); + } else if pulled_new_batch { + // TODO(xikai): it seems no need to decide to which heap push the + // `buffered_stream`. Just put the new batch into the cold heap if + // the max bound of the hottest batch is smaller than the min bound + // of new one. + let cold_new_batch = if let Some(hottest) = self.hot.peek() { + buffered_stream.is_after_boundary(&self.schema, &hottest.last_row_in_buffer()) + } else { + false + }; + + if cold_new_batch { + self.cold.push(buffered_stream); + } else { + self.hot.push(buffered_stream); + } + self.refill_hot(); + } else { + // No new batch is pulled and the `buffered_stream` is not exhausted so just put + // it back to the hot heap. + self.hot.push(buffered_stream); + } + + Ok(()) + } + + /// Fetch at most `num_rows_to_fetch` rows from the hottest + /// `BufferedStream`. + /// + /// If the inner builder is empty, returns a slice of the record batch in + /// the stream.
+ async fn fetch_rows_from_one_stream( + &mut self, + num_rows_to_fetch: usize, + ) -> Result> { + assert_eq!(self.hot.len(), 1); + self.metrics.times_fetch_rows_from_one += 1; + + let mut buffered_stream = self.hot.pop().unwrap(); + + let record_batch = if self.record_batch_builder.is_empty() { + let record_batch = buffered_stream.take_record_batch_slice(num_rows_to_fetch); + + self.metrics.total_rows_fetch_from_one += record_batch.num_rows(); + + Some(record_batch) + } else { + let fetched_row_num = buffered_stream + .append_rows_to(&mut self.record_batch_builder, num_rows_to_fetch)?; + + self.metrics.total_rows_fetch_from_one += fetched_row_num; + + None + }; + + self.reheap(buffered_stream).await?; + + Ok(record_batch) + } + + /// Fetch one row from the hottest `BufferedStream`. + /// + /// REQUIRES: `self.hot` is not empty. + async fn fetch_one_row_from_multiple_streams(&mut self) -> Result<()> { + assert!(!self.hot.is_empty()); + self.metrics.times_fetch_row_from_multiple += 1; + + let mut hottest = self.hot.pop().unwrap(); + let row = hottest.next_row_in_buffer().unwrap(); + self.record_batch_builder + .append_row_view(&row) + .context(AppendRow)?; + self.reheap(hottest).await + } + + /// Fetch the next batch from the streams. + /// + /// `init_if_necessary` should be finished before this method. + async fn fetch_next_batch(&mut self) -> Result> { + self.init_if_necessary().await?; + + self.record_batch_builder.clear(); + + while !self.hot.is_empty() && self.record_batch_builder.len() < self.iter_options.batch_size + { + // no need to do merge sort if only one batch in the hot heap. + if self.hot.len() == 1 { + let fetch_row_num = self.iter_options.batch_size - self.record_batch_builder.len(); + + if let Some(record_batch) = self.fetch_rows_from_one_stream(fetch_row_num).await? { + // The builder is empty and we have fetch a record batch from this stream, just + // return that batch. 
+ return Ok(Some(record_batch)); + } + // Else, some rows may have been pushed into the builder. + } else { + self.fetch_one_row_from_multiple_streams().await?; + } + } + + if self.record_batch_builder.is_empty() { + Ok(None) + } else { + let record_batch = self + .record_batch_builder + .build() + .context(BuildRecordBatch)?; + Ok(Some(record_batch)) + } + } +} + +impl Drop for MergeIterator { + fn drop(&mut self) { + info!( + "Merge iterator dropped, table_id:{:?}, request_id:{}, metrics:{:?}, iter_options:{:?},", + self.table_id, self.request_id, self.metrics, self.iter_options, + ); + } +} + +#[async_trait] +impl RecordBatchWithKeyIterator for MergeIterator { + type Error = Error; + + fn schema(&self) -> &RecordSchemaWithKey { + &self.schema + } + + async fn next_batch(&mut self) -> Result> { + let record_batch = self.fetch_next_batch().await?; + + trace!("MergeIterator send next record batch:{:?}", record_batch); + + Ok(record_batch) + } +} + +#[cfg(test)] +mod tests { + use common_types::{ + self, + tests::{build_row, build_schema}, + }; + + use super::*; + use crate::row_iter::tests::check_iterator; + + #[tokio::test] + async fn test_row_merge_iterator() { + // first two columns are key columns + let schema = build_schema(); + + let testcases = vec![ + // (sequence, rows) + (10, vec![build_row(b"y", 1000000, 10.0, "v4")]), + (20, vec![build_row(b"y", 1000000, 10.0, "v3")]), + (100, vec![build_row(b"b", 1000000, 10.0, "v2")]), + (1, vec![build_row(b"a", 1000000, 10.0, "v1")]), + ]; + + let streams = + record_batch_stream::tests::build_sequenced_record_batch_stream(&schema, testcases); + let mut iter = MergeIterator::new( + TableId::MIN, + RequestId::next_id(), + schema.to_record_schema_with_key(), + streams, + IterOptions::default(), + false, + Metrics::new(1, 1, vec![]), + ); + + check_iterator( + &mut iter, + vec![ + build_row(b"a", 1000000, 10.0, "v1"), + build_row(b"b", 1000000, 10.0, "v2"), + build_row(b"y", 1000000, 10.0, "v3"), + build_row(b"y", 
1000000, 10.0, "v4"), + ], + ) + .await; + } + + #[tokio::test] + async fn test_row_merge_iterator_reverse() { + // first two columns are key columns + let schema = build_schema(); + + let testcases = vec![ + // (sequence, rows) + ( + 10, + vec![ + build_row(b"y", 1000001, 10.0, "v5"), + build_row(b"y", 1000000, 10.0, "v4"), + ], + ), + (20, vec![build_row(b"y", 1000000, 10.0, "v3")]), + (100, vec![build_row(b"b", 1000000, 10.0, "v2")]), + (1, vec![build_row(b"a", 1000000, 10.0, "v1")]), + ]; + + let streams = + record_batch_stream::tests::build_sequenced_record_batch_stream(&schema, testcases); + let mut iter = MergeIterator::new( + TableId::MIN, + RequestId::next_id(), + schema.to_record_schema_with_key(), + streams, + IterOptions::default(), + true, + Metrics::new(1, 1, vec![]), + ); + + check_iterator( + &mut iter, + vec![ + build_row(b"y", 1000001, 10.0, "v5"), + build_row(b"y", 1000000, 10.0, "v3"), + build_row(b"y", 1000000, 10.0, "v4"), + build_row(b"b", 1000000, 10.0, "v2"), + build_row(b"a", 1000000, 10.0, "v1"), + ], + ) + .await; + } +} diff --git a/analytic_engine/src/row_iter/mod.rs b/analytic_engine/src/row_iter/mod.rs new file mode 100644 index 0000000000..8c30523396 --- /dev/null +++ b/analytic_engine/src/row_iter/mod.rs @@ -0,0 +1,87 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Iterators for row. 
+ +use std::{ + pin::Pin, + task::{Context, Poll}, +}; + +use async_trait::async_trait; +use common_types::{record_batch::RecordBatchWithKey, schema::RecordSchemaWithKey}; +use common_util::runtime::Runtime; +use futures::stream::Stream; +use log::{debug, error}; +use tokio::sync::mpsc::{self, Receiver}; + +use crate::sst::builder::{RecordBatchStream, RecordBatchStreamItem}; + +pub mod chain; +pub mod dedup; +pub mod merge; +pub mod record_batch_stream; +#[cfg(test)] +pub mod tests; + +const RECORD_BATCH_READ_BUF_SIZE: usize = 10; + +#[derive(Debug, Clone)] +pub struct IterOptions { + pub batch_size: usize, +} + +impl Default for IterOptions { + fn default() -> Self { + Self { batch_size: 500 } + } +} + +/// The iterator for reading RecordBatch from a table. +/// +/// The `schema()` should be the same as the RecordBatch from `read()`. +/// The reader is exhausted if the `read()` returns the `Ok(None)`. +#[async_trait] +pub trait RecordBatchWithKeyIterator: Send { + type Error: std::error::Error + Send + Sync + 'static; + + fn schema(&self) -> &RecordSchemaWithKey; + + async fn next_batch(&mut self) -> std::result::Result, Self::Error>; +} + +struct ReceiverStream { + rx: Receiver, +} + +impl Stream for ReceiverStream { + type Item = RecordBatchStreamItem; + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let this = self.get_mut(); + Pin::new(&mut this.rx).poll_recv(cx) + } +} + +// TODO(yingwen): This is a hack way to convert an async trait to stream. 
+pub fn record_batch_with_key_iter_to_stream( + mut iter: I, + runtime: &Runtime, +) -> RecordBatchStream { + let (tx, rx) = mpsc::channel(RECORD_BATCH_READ_BUF_SIZE); + runtime.spawn(async move { + while let Some(record_batch) = iter.next_batch().await.transpose() { + let record_batch = record_batch.map_err(|e| Box::new(e) as _); + + debug!( + "compact table send next record batch, batch:{:?}", + record_batch + ); + if tx.send(record_batch).await.is_err() { + error!("Failed to send record batch from the merge iterator"); + break; + } + } + }); + + Box::new(ReceiverStream { rx }) +} diff --git a/analytic_engine/src/row_iter/record_batch_stream.rs b/analytic_engine/src/row_iter/record_batch_stream.rs new file mode 100644 index 0000000000..13cf049b13 --- /dev/null +++ b/analytic_engine/src/row_iter/record_batch_stream.rs @@ -0,0 +1,287 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::ops::Bound; + +use common_types::{ + projected_schema::ProjectedSchema, record_batch::RecordBatchWithKey, SequenceNumber, +}; +use common_util::define_result; +use futures::stream::{self, Stream, StreamExt}; +use log::{error, trace}; +use snafu::{Backtrace, OptionExt, ResultExt, Snafu}; +use table_engine::{ + predicate::{filter_record_batch::RecordBatchFilter, Predicate}, + table::TableId, +}; + +use crate::{ + memtable::{MemTableRef, ScanContext, ScanRequest}, + space::SpaceId, + sst, + sst::{factory::SstReaderOptions, file::FileHandle}, + table::sst_util, +}; + +#[derive(Debug, Snafu)] +#[snafu(visibility(pub))] +pub enum Error { + #[snafu(display( + "No sst reader found, sst_reader_options:{:?}.\nBacktrace:\n{}", + options, + backtrace + ))] + SstReaderNotFound { + options: SstReaderOptions, + backtrace: Backtrace, + }, + + #[snafu(display("Fail to read sst meta, err:{}", source))] + ReadSstMeta { source: crate::sst::reader::Error }, + + #[snafu(display("Fail to read sst data, err:{}", source))] + ReadSstData { source: crate::sst::reader::Error }, 
+ + #[snafu(display("Fail to scan memtable, err:{}", source))] + ScanMemtable { source: crate::memtable::Error }, +} + +define_result!(Error); + +const REBUILD_FILTERED_RECORD_BATCH_MAGNIFICATION: usize = 2; + +// TODO(yingwen): Can we move sequence to RecordBatchWithKey and remove this +// struct? But what is the sequence after merge? +#[derive(Debug)] +pub struct SequencedRecordBatch { + pub record_batch: RecordBatchWithKey, + pub sequence: SequenceNumber, +} + +impl SequencedRecordBatch { + #[inline] + pub fn num_rows(&self) -> usize { + self.record_batch.num_rows() + } +} + +pub type SequencedRecordBatchStream = Box< + dyn Stream< + Item = std::result::Result< + SequencedRecordBatch, + Box, + >, + > + Send + + Unpin, +>; + +/// Filter the `sequenced_record_batch` according to the `filter` if necessary. +/// Returns the original batch if only a small proportion of the batch is +/// filtered out. +/// The `selected_rows_buf` is for reuse. +fn maybe_filter_record_batch( + mut sequenced_record_batch: SequencedRecordBatch, + filter: &RecordBatchFilter, + selected_rows_buf: &mut Vec, +) -> Option { + if filter.is_empty() { + return Some(sequenced_record_batch); + } + + // The filter requires the `selected_rows_buf.len() >= + // sequenced_record_batch.num_rows()`. + selected_rows_buf.resize(sequenced_record_batch.num_rows(), true); + let num_selected_rows = filter.filter( + &sequenced_record_batch.record_batch, + selected_rows_buf.as_mut_slice(), + ); + + trace!( + "filter record batch, selected_rows:{}, origin_rows:{}", + num_selected_rows, + sequenced_record_batch.num_rows() + ); + + // No row is selected. + if num_selected_rows == 0 { + return None; + } + + if num_selected_rows + > sequenced_record_batch.num_rows() / REBUILD_FILTERED_RECORD_BATCH_MAGNIFICATION + { + // just use the original record batch because only a small proportion is + // filtered out. + return Some(sequenced_record_batch); + } + + // select the rows according to the filter result. 
+ if let Err(e) = sequenced_record_batch + .record_batch + .select_data(selected_rows_buf.as_slice()) + { + error!( + "Fail to select record batch, data:{:?}, selected_rows:{:?}, err:{}", + sequenced_record_batch, selected_rows_buf, e, + ); + } + + Some(sequenced_record_batch) +} + +/// Filter the sequenced record batch stream by applying the `predicate`. +/// However, the output record batches is not ensured to meet the requirements +/// of the `predicate`. +pub fn filter_stream( + origin_stream: SequencedRecordBatchStream, + predicate: &Predicate, +) -> SequencedRecordBatchStream { + if predicate.exprs.is_empty() { + return origin_stream; + } + + let mut select_row_buf = Vec::new(); + let filter = RecordBatchFilter::from(predicate.exprs.as_slice()); + let stream = origin_stream.filter_map(move |sequence_record_batch| { + let v = match sequence_record_batch { + Ok(v) => maybe_filter_record_batch(v, &filter, &mut select_row_buf).map(Ok), + Err(e) => Some(Err(e)), + }; + + futures::future::ready(v) + }); + + Box::new(stream) +} + +/// Build filtered (by `predicate`) [SequencedRecordBatchStream] from a +/// memtable. +pub fn filtered_stream_from_memtable( + projected_schema: ProjectedSchema, + need_dedup: bool, + memtable: &MemTableRef, + reverse: bool, + predicate: &Predicate, +) -> Result { + stream_from_memtable(projected_schema, need_dedup, memtable, reverse) + .map(|origin_stream| filter_stream(origin_stream, predicate)) +} + +/// Build [SequencedRecordBatchStream] from a memtable. 
+pub fn stream_from_memtable( + projected_schema: ProjectedSchema, + need_dedup: bool, + memtable: &MemTableRef, + reverse: bool, +) -> Result { + let scan_ctx = ScanContext::default(); + let max_seq = memtable.last_sequence(); + let scan_req = ScanRequest { + start_user_key: Bound::Unbounded, + end_user_key: Bound::Unbounded, + sequence: max_seq, + projected_schema, + need_dedup, + reverse, + }; + + let iter = memtable.scan(scan_ctx, scan_req).context(ScanMemtable)?; + let stream = stream::iter(iter).map(move |v| { + v.map(|record_batch| SequencedRecordBatch { + record_batch, + sequence: max_seq, + }) + .map_err(|e| Box::new(e) as _) + }); + + Ok(Box::new(stream)) +} + +/// Build the filtered by `sst_read_options.predicate` +/// [SequencedRecordBatchStream] from a sst. +pub async fn filtered_stream_from_sst_file( + space_id: SpaceId, + table_id: TableId, + sst_file: &FileHandle, + sst_factory: &Fa, + sst_reader_options: &SstReaderOptions, + store: &S, +) -> Result +where + Fa: sst::factory::Factory, + S: object_store::ObjectStore, +{ + stream_from_sst_file( + space_id, + table_id, + sst_file, + sst_factory, + sst_reader_options, + store, + ) + .await + .map(|origin_stream| filter_stream(origin_stream, sst_reader_options.predicate.as_ref())) +} + +/// Build the [SequencedRecordBatchStream] from a sst. 
+pub async fn stream_from_sst_file( + space_id: SpaceId, + table_id: TableId, + sst_file: &FileHandle, + sst_factory: &Fa, + sst_reader_options: &SstReaderOptions, + store: &S, +) -> Result +where + Fa: sst::factory::Factory, + S: object_store::ObjectStore, +{ + sst_file.read_meter().mark(); + let mut path = store.new_path(); + sst_util::set_sst_file_path(space_id, table_id, sst_file.id(), &mut path); + let mut sst_reader = sst_factory + .new_sst_reader(sst_reader_options, &path, store) + .with_context(|| SstReaderNotFound { + options: sst_reader_options.clone(), + })?; + let meta = sst_reader.meta_data().await.context(ReadSstMeta)?; + let max_seq = meta.max_sequence; + let sst_stream = sst_reader.read().await.context(ReadSstData)?; + + let stream = Box::new(sst_stream.map(move |v| { + v.map(|record_batch| SequencedRecordBatch { + record_batch, + sequence: max_seq, + }) + .map_err(|e| Box::new(e) as _) + })); + + Ok(stream) +} + +#[cfg(test)] +pub mod tests { + use common_types::{row::Row, schema::Schema}; + + use super::*; + use crate::row_iter; + + /// Build [SequencedRecordBatchStream] from the sequenced rows. + pub fn build_sequenced_record_batch_stream( + schema: &Schema, + batches: Vec<(SequenceNumber, Vec)>, + ) -> Vec { + batches + .into_iter() + .map(|(seq, rows)| { + let batch = SequencedRecordBatch { + record_batch: row_iter::tests::build_record_batch_with_key( + schema.clone(), + rows, + ), + sequence: seq, + }; + Box::new(stream::iter(vec![Ok(batch)])) as SequencedRecordBatchStream + }) + .collect() + } +} diff --git a/analytic_engine/src/row_iter/tests.rs b/analytic_engine/src/row_iter/tests.rs new file mode 100644 index 0000000000..ce929b852a --- /dev/null +++ b/analytic_engine/src/row_iter/tests.rs @@ -0,0 +1,93 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +use async_trait::async_trait; +use common_types::{ + projected_schema::ProjectedSchema, + record_batch::{RecordBatchWithKey, RecordBatchWithKeyBuilder}, + row::{ + contiguous::{ContiguousRowReader, ContiguousRowWriter, ProjectedContiguousRow}, + Row, + }, + schema::{IndexInWriterSchema, RecordSchemaWithKey, Schema}, +}; +use common_util::define_result; +use snafu::Snafu; + +use crate::row_iter::RecordBatchWithKeyIterator; + +#[derive(Debug, Snafu)] +pub enum Error {} + +define_result!(Error); + +pub struct VectorIterator { + schema: RecordSchemaWithKey, + items: Vec>, + idx: usize, +} + +impl VectorIterator { + pub fn new(schema: RecordSchemaWithKey, items: Vec) -> Self { + Self { + schema, + items: items.into_iter().map(Some).collect(), + idx: 0, + } + } +} + +#[async_trait] +impl RecordBatchWithKeyIterator for VectorIterator { + type Error = Error; + + fn schema(&self) -> &RecordSchemaWithKey { + &self.schema + } + + async fn next_batch(&mut self) -> Result> { + if self.idx == self.items.len() { + return Ok(None); + } + + let ret = Ok(self.items[self.idx].take()); + self.idx += 1; + + ret + } +} + +pub fn build_record_batch_with_key(schema: Schema, rows: Vec) -> RecordBatchWithKey { + assert!(schema.num_columns() > 1); + let projection: Vec = (0..schema.num_columns()).collect(); + let projected_schema = ProjectedSchema::new(schema.clone(), Some(projection)).unwrap(); + let row_projected_schema = projected_schema.try_project_with_key(&schema).unwrap(); + let mut builder = + RecordBatchWithKeyBuilder::with_capacity(projected_schema.to_record_schema_with_key(), 2); + let index_in_writer = IndexInWriterSchema::for_same_schema(schema.num_columns()); + + let mut buf = Vec::new(); + for row in rows { + let mut writer = ContiguousRowWriter::new(&mut buf, &schema, &index_in_writer); + + writer.write_row(&row).unwrap(); + + let source_row = ContiguousRowReader::with_schema(&buf, &schema); + let projected_row = ProjectedContiguousRow::new(source_row, 
&row_projected_schema); + builder + .append_projected_contiguous_row(&projected_row) + .unwrap(); + } + builder.build().unwrap() +} + +pub async fn check_iterator(iter: &mut T, expected_rows: Vec) { + let mut visited_rows = 0; + while let Some(batch) = iter.next_batch().await.unwrap() { + for row_idx in 0..batch.num_rows() { + assert_eq!(batch.clone_row_at(row_idx), expected_rows[visited_rows]); + visited_rows += 1; + } + } + + assert_eq!(visited_rows, expected_rows.len()); +} diff --git a/analytic_engine/src/sampler.rs b/analytic_engine/src/sampler.rs new file mode 100644 index 0000000000..304d052327 --- /dev/null +++ b/analytic_engine/src/sampler.rs @@ -0,0 +1,448 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Segment duration sampler. + +use std::{ + collections::HashSet, + sync::{Arc, Mutex}, + time::Duration, +}; + +use common_types::time::{TimeRange, Timestamp}; +use common_util::define_result; +use snafu::{ensure, Backtrace, Snafu}; + +use crate::table_options; + +/// Initial size of timestamps set. +const INIT_CAPACITY: usize = 1000; +const HOUR_MS: u64 = 3600 * 1000; +const DAY_MS: u64 = 24 * HOUR_MS; +const AVAILABLE_DURATIONS: [u64; 8] = [ + 2 * HOUR_MS, + DAY_MS, + 7 * DAY_MS, + 30 * DAY_MS, + 180 * DAY_MS, + 360 * DAY_MS, + 5 * 360 * DAY_MS, + 10 * 360 * DAY_MS, +]; +const INTERVAL_RATIO: f64 = 0.9; +/// Expected points per timeseries in a segment, used to pick a proper segment +/// duration. +const POINTS_PER_SERIES: u64 = 100; +/// Max timestamp that wont overflow even using max duration. +const MAX_TIMESTAMP_MS_FOR_DURATION: i64 = + i64::MAX - 2 * AVAILABLE_DURATIONS[AVAILABLE_DURATIONS.len() - 1] as i64; +/// Minimun sample timestamps to compute duration. 
+const MIN_SAMPLES: usize = 2; + +#[derive(Debug, Snafu)] +#[snafu(display( + "Invalid timestamp to collect, timestamp:{:?}.\nBacktrace:\n{}", + timestamp, + backtrace +))] +pub struct Error { + timestamp: Timestamp, + backtrace: Backtrace, +} + +define_result!(Error); + +/// Segment duration sampler. +/// +/// Collects all timestamps and then yield a suggested segment duration to hold +/// all data with similar timestamp interval. +pub trait DurationSampler { + /// Collect a timestamp. + fn collect(&self, timestamp: Timestamp) -> Result<()>; + + /// Returns a suggested duration to partition the timestamps or default + /// duration if no enough timestamp has been sampled. + /// + /// Note that this method may be invoked more than once. + fn suggest_duration(&self) -> Duration; + + /// Returns a vector of time range with suggested duration that can hold all + /// timestamps collected by this sampler. + fn ranges(&self) -> Vec; + + // TODO(yingwen): Memory usage. +} + +pub type SamplerRef = Arc; + +struct State { + /// Deduplicated timestamps. + deduped_timestamps: HashSet, + /// Cached suggested duration. + duration: Option, + /// Sorted timestamps cache, empty if `duration` is None. 
+ sorted_timestamps: Vec, +} + +impl State { + fn clear_cache(&mut self) { + self.duration = None; + self.sorted_timestamps.clear(); + } +} + +pub struct DefaultSampler { + state: Mutex, +} + +impl Default for DefaultSampler { + fn default() -> Self { + Self { + state: Mutex::new(State { + deduped_timestamps: HashSet::with_capacity(INIT_CAPACITY), + duration: None, + sorted_timestamps: Vec::new(), + }), + } + } +} + +impl DurationSampler for DefaultSampler { + fn collect(&self, timestamp: Timestamp) -> Result<()> { + ensure!( + timestamp.as_i64() < MAX_TIMESTAMP_MS_FOR_DURATION, + Context { timestamp } + ); + + let mut state = self.state.lock().unwrap(); + state.deduped_timestamps.insert(timestamp); + state.clear_cache(); + + Ok(()) + } + + fn suggest_duration(&self) -> Duration { + if let Some(v) = self.duration() { + return v; + } + + let timestamps = self.compute_sorted_timestamps(); + let picked = match evaluate_interval(×tamps) { + Some(interval) => pick_duration(interval), + None => table_options::DEFAULT_SEGMENT_DURATION, + }; + + { + // Cache the picked duration. + let mut state = self.state.lock().unwrap(); + state.duration = Some(picked); + state.sorted_timestamps = timestamps; + } + + picked + } + + fn ranges(&self) -> Vec { + let duration = self.suggest_duration(); + let sorted_timestamps = self.cached_sorted_timestamps(); + // This type hint is needed to make `ranges.last()` work. + let mut ranges: Vec = Vec::new(); + + for ts in sorted_timestamps { + if let Some(range) = ranges.last() { + if range.contains(ts) { + continue; + } + } + + // collect() ensures timestamp won't overflow. 
+ let range = TimeRange::bucket_of(ts, duration).unwrap(); + ranges.push(range); + } + + ranges + } +} + +impl DefaultSampler { + fn cached_sorted_timestamps(&self) -> Vec { + self.state.lock().unwrap().sorted_timestamps.clone() + } + + fn compute_sorted_timestamps(&self) -> Vec { + let mut timestamps: Vec<_> = { + let state = self.state.lock().unwrap(); + state.deduped_timestamps.iter().copied().collect() + }; + + timestamps.sort_unstable(); + + timestamps + } + + fn duration(&self) -> Option { + self.state.lock().unwrap().duration + } +} + +fn evaluate_interval(sorted_timestamps: &[Timestamp]) -> Option { + if sorted_timestamps.len() < MIN_SAMPLES { + return None; + } + + let mut intervals = Vec::with_capacity(sorted_timestamps.len()); + for i in 0..sorted_timestamps.len() - 1 { + let current = sorted_timestamps[i]; + let next = sorted_timestamps[i + 1]; + let interval = next.as_i64() - current.as_i64(); + intervals.push(interval); + } + + intervals.sort_unstable(); + + let mut index = (intervals.len() as f64 * INTERVAL_RATIO) as usize; + if index > 1 { + index -= 1; + }; + let selected = intervals[index]; + // Interval should larger than 0. + assert!(selected > 0); + + Some(selected as u64) +} + +fn pick_duration(interval: u64) -> Duration { + let scaled_interval = interval.checked_mul(POINTS_PER_SERIES).unwrap_or(u64::MAX); + for du_ms in AVAILABLE_DURATIONS { + if du_ms > scaled_interval { + return Duration::from_millis(du_ms); + } + } + + // No duration larger than scaled interval, returns the largest duration. 
+ let du_ms = AVAILABLE_DURATIONS[AVAILABLE_DURATIONS.len() - 1]; + + Duration::from_millis(du_ms) +} + +#[cfg(test)] +mod tests { + use super::*; + + const SEC_MS: u64 = 1000; + const MIN_MS: u64 = 60 * SEC_MS; + + #[test] + fn test_pick_duration() { + let cases = [ + (1, 2 * HOUR_MS), + (5 * SEC_MS, 2 * HOUR_MS), + (15 * SEC_MS, 2 * HOUR_MS), + (MIN_MS, 2 * HOUR_MS), + (5 * MIN_MS, DAY_MS), + (10 * MIN_MS, DAY_MS), + (30 * MIN_MS, 7 * DAY_MS), + (HOUR_MS, 7 * DAY_MS), + (4 * HOUR_MS, 30 * DAY_MS), + (8 * HOUR_MS, 180 * DAY_MS), + (DAY_MS, 180 * DAY_MS), + (3 * DAY_MS, 360 * DAY_MS), + (7 * DAY_MS, 5 * 360 * DAY_MS), + (30 * DAY_MS, 10 * 360 * DAY_MS), + (360 * DAY_MS, 10 * 360 * DAY_MS), + (10 * 360 * DAY_MS, 10 * 360 * DAY_MS), + (20 * 360 * DAY_MS, 10 * 360 * DAY_MS), + ]; + + for (i, (interval, expect)) in cases.iter().enumerate() { + assert_eq!( + *expect, + pick_duration(*interval).as_millis() as u64, + "Case {}", + i + ); + } + } + + #[test] + fn test_empty_sampler() { + let sampler = DefaultSampler::default(); + + assert_eq!( + table_options::DEFAULT_SEGMENT_DURATION, + sampler.suggest_duration() + ); + assert!(sampler.ranges().is_empty()); + } + + #[test] + fn test_one_sample() { + let sampler = DefaultSampler::default(); + + sampler.collect(Timestamp::new(0)).unwrap(); + + assert_eq!( + table_options::DEFAULT_SEGMENT_DURATION, + sampler.suggest_duration() + ); + let time_range = + TimeRange::bucket_of(Timestamp::new(0), table_options::DEFAULT_SEGMENT_DURATION) + .unwrap(); + assert_eq!(&[time_range], &sampler.ranges()[..]); + } + + #[test] + fn test_all_sample_same() { + let sampler = DefaultSampler::default(); + + let ts = Timestamp::now(); + for _ in 0..5 { + sampler.collect(ts).unwrap(); + } + + assert_eq!( + table_options::DEFAULT_SEGMENT_DURATION, + sampler.suggest_duration() + ); + let time_range = TimeRange::bucket_of(ts, table_options::DEFAULT_SEGMENT_DURATION).unwrap(); + assert_eq!(&[time_range], &sampler.ranges()[..]); + } + + #[test] + fn 
test_collect_invalid() { + let sampler = DefaultSampler::default(); + + assert!(sampler + .collect(Timestamp::new(MAX_TIMESTAMP_MS_FOR_DURATION - 1)) + .is_ok()); + assert!(sampler + .collect(Timestamp::new(MAX_TIMESTAMP_MS_FOR_DURATION)) + .is_err()); + } + + #[test] + fn test_sampler_cache() { + let sampler = DefaultSampler::default(); + + let ts1 = Timestamp::now(); + for i in 0..3 { + sampler + .collect(Timestamp::new(ts1.as_i64() + i * SEC_MS as i64)) + .unwrap(); + } + + assert_eq!( + table_options::DEFAULT_SEGMENT_DURATION, + sampler.suggest_duration() + ); + let time_range1 = + TimeRange::bucket_of(ts1, table_options::DEFAULT_SEGMENT_DURATION).unwrap(); + assert_eq!(&[time_range1], &sampler.ranges()[..]); + + // A new timestamp is sampled. + let ts2 = Timestamp::new(ts1.as_i64() + DAY_MS as i64); + sampler.collect(ts2).unwrap(); + + assert!(sampler.state.lock().unwrap().duration.is_none()); + assert!(sampler.state.lock().unwrap().sorted_timestamps.is_empty()); + + assert_eq!( + table_options::DEFAULT_SEGMENT_DURATION, + sampler.suggest_duration() + ); + let time_range2 = + TimeRange::bucket_of(ts2, table_options::DEFAULT_SEGMENT_DURATION).unwrap(); + assert_eq!(&[time_range1, time_range2], &sampler.ranges()[..]); + } + + fn test_suggest_duration_and_ranges_case( + timestamps: &[i64], + duration: u64, + ranges: &[(i64, i64)], + ) { + let sampler = DefaultSampler::default(); + + for ts in timestamps { + sampler.collect(Timestamp::new(*ts)).unwrap(); + } + + assert_eq!(Duration::from_millis(duration), sampler.suggest_duration()); + + let suggested_ranges = sampler.ranges(); + for (range, suggested_range) in ranges.iter().zip(suggested_ranges) { + assert_eq!(range.0, suggested_range.inclusive_start().as_i64()); + assert_eq!(range.1, suggested_range.exclusive_end().as_i64()); + } + } + + #[test] + fn test_suggest_duration_and_ranges() { + test_suggest_duration_and_ranges_case( + // Intervals: 3, 5 + &[100, 103, 108], + 2 * HOUR_MS, + &[(0, 2 * HOUR_MS as i64)], 
+ ); + + let now_ts = Timestamp::now(); + let now = now_ts.as_i64(); + let sec_ms_i64 = SEC_MS as i64; + + let bucket = TimeRange::bucket_of(now_ts, Duration::from_millis(2 * HOUR_MS)).unwrap(); + let expect_range = ( + bucket.inclusive_start().as_i64(), + bucket.exclusive_end().as_i64(), + ); + test_suggest_duration_and_ranges_case( + // Intervals: 5s, 5s, 5s, 5s, 100s, + &[ + now, + now + 5 * sec_ms_i64, + now + 2 * 5 * sec_ms_i64, + now + 3 * 5 * sec_ms_i64, + now + 4 * 5 * sec_ms_i64, + now + 4 * 5 * sec_ms_i64 + 100 * sec_ms_i64, + ], + 2 * HOUR_MS, + &[expect_range], + ); + + // Same with previous case, but shuffle the input timestamps. + test_suggest_duration_and_ranges_case( + &[ + now + 3 * 5 * sec_ms_i64, + now, + now + 5 * sec_ms_i64, + now + 4 * 5 * sec_ms_i64, + now + 2 * 5 * sec_ms_i64, + now + 4 * 5 * sec_ms_i64 + 100 * sec_ms_i64, + ], + 2 * HOUR_MS, + &[expect_range], + ); + + test_suggest_duration_and_ranges_case( + // Intervals: nine 5s and one 8h + &[ + now + 5 * 5 * sec_ms_i64 + 8 * HOUR_MS as i64, + now, + now + 5 * sec_ms_i64, + now + 2 * 5 * sec_ms_i64, + now + 7 * 5 * sec_ms_i64 + 8 * HOUR_MS as i64, + now + 3 * 5 * sec_ms_i64, + now + 4 * 5 * sec_ms_i64, + now + 4 * 5 * sec_ms_i64 + 8 * HOUR_MS as i64, + now + 6 * 5 * sec_ms_i64 + 8 * HOUR_MS as i64, + now + 8 * 5 * sec_ms_i64 + 8 * HOUR_MS as i64, + now + 9 * 5 * sec_ms_i64 + 8 * HOUR_MS as i64, + ], + 2 * HOUR_MS, + &[ + expect_range, + ( + expect_range.0 + 8 * HOUR_MS as i64, + expect_range.1 + 8 * HOUR_MS as i64, + ), + ], + ); + } +} diff --git a/analytic_engine/src/setup.rs b/analytic_engine/src/setup.rs new file mode 100644 index 0000000000..80e673778a --- /dev/null +++ b/analytic_engine/src/setup.rs @@ -0,0 +1,103 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Setup the analytic engine + +use std::{path::Path, sync::Arc}; + +use common_util::define_result; +use object_store::disk::File; +use parquet::{ + cache::{LruDataCache, LruMetaCache}, + DataCacheRef, MetaCacheRef, +}; +use snafu::{ResultExt, Snafu}; +use table_engine::engine::EngineRuntimes; +use wal::{manager, rocks_impl::manager::Builder as WalBuilder}; + +use crate::{ + context::OpenContext, engine::TableEngineImpl, instance::Instance, meta::details::ManifestImpl, + sst::factory::FactoryImpl, AnalyticTableEngine, Config, EngineInstance, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to open engine instance, err:{}", source))] + OpenInstance { + source: crate::instance::open::Error, + }, + + #[snafu(display("Failed to open wal, err:{}", source))] + OpenWal { source: manager::error::Error }, + + #[snafu(display("Failed to open wal for manifest, err:{}", source))] + OpenManifestWal { source: manager::error::Error }, + + #[snafu(display("Failed to open manifest, err:{}", source))] + OpenManifest { source: crate::meta::details::Error }, +} + +define_result!(Error); + +const WAL_DIR_NAME: &str = "wal"; +const MANIFEST_DIR_NAME: &str = "manifest"; +const STORE_DIR_NAME: &str = "store"; + +/// Open an [AnalyticTableEngine] instance +pub async fn open_analytic_table_engine( + config: Config, + engine_runtimes: Arc, +) -> Result { + let instance = open_instance(config.clone(), engine_runtimes).await?; + + Ok(TableEngineImpl::new(instance)) +} + +async fn open_instance( + config: Config, + engine_runtimes: Arc, +) -> Result { + let write_runtime = engine_runtimes.write_runtime.clone(); + let data_path = Path::new(&config.data_path); + let wal_path = data_path.join(WAL_DIR_NAME); + let wal_manager = WalBuilder::with_default_rocksdb_config(wal_path, write_runtime.clone()) + .build() + .context(OpenWal)?; + + let manifest_path = data_path.join(MANIFEST_DIR_NAME); + let manifest_wal = WalBuilder::with_default_rocksdb_config(manifest_path, 
write_runtime) + .build() + .context(OpenManifestWal)?; + + let manifest = ManifestImpl::open(manifest_wal, config.manifest.clone()) + .await + .context(OpenManifest)?; + + let meta_cache: Option = + if let Some(sst_meta_cache_cap) = &config.sst_meta_cache_cap { + Some(Arc::new(LruMetaCache::new(*sst_meta_cache_cap))) + } else { + None + }; + + let data_cache: Option = + if let Some(sst_data_cache_cap) = &config.sst_data_cache_cap { + Some(Arc::new(LruDataCache::new(*sst_data_cache_cap))) + } else { + None + }; + + let sst_path = data_path.join(STORE_DIR_NAME); + let store = File::new(sst_path); + let open_ctx = OpenContext { + config, + runtimes: engine_runtimes, + meta_cache, + data_cache, + }; + + let instance = Instance::open(open_ctx, manifest, wal_manager, store, FactoryImpl) + .await + .context(OpenInstance)?; + + Ok(instance) +} diff --git a/analytic_engine/src/space.rs b/analytic_engine/src/space.rs new file mode 100644 index 0000000000..d7ab539571 --- /dev/null +++ b/analytic_engine/src/space.rs @@ -0,0 +1,305 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Table space +//! +//! A table space acts like a namespace of a bunch of tables, tables under +//! 
different space can use same table name + +use std::{ + fmt, + sync::{Arc, RwLock}, +}; + +use arena::CollectorRef; +use common_util::define_result; +use log::info; +use snafu::{Backtrace, ResultExt, Snafu}; +use table_engine::{engine::CreateTableRequest, table::TableId}; +use tokio::sync::Mutex; + +use crate::{ + instance::{mem_collector::MemUsageCollector, write_worker::WriteGroup}, + meta::{ + meta_update::{AddTableMeta, MetaUpdate}, + Manifest, + }, + sst::file::FilePurger, + table::data::{TableData, TableDataRef, TableDataSet}, + TableOptions, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Table already exists, table:{}.\nBacktrace:\n{}", table, backtrace))] + TableExists { table: String, backtrace: Backtrace }, + + #[snafu(display("Failed to create table data, table:{}, err:{}", table, source))] + CreateTableData { + table: String, + source: crate::table::data::Error, + }, + + #[snafu(display("Failed to store meta data, err:{}", source))] + WriteMeta { + source: Box, + }, +} + +define_result!(Error); + +impl From for table_engine::engine::Error { + fn from(err: Error) -> Self { + match err { + Error::TableExists { table, backtrace } => Self::TableExists { table, backtrace }, + Error::CreateTableData { ref table, .. } => Self::InvalidArguments { + table: table.clone(), + source: Box::new(err), + }, + Error::WriteMeta { .. 
} => Self::WriteMeta { + source: Box::new(err), + }, + } + } +} + +/// Holds references to the table data and its space +/// +/// REQUIRE: The table must belongs to the space +#[derive(Clone)] +pub struct SpaceAndTable { + /// The space of the table + space: SpaceRef, + /// Data of the table + table_data: TableDataRef, +} + +impl SpaceAndTable { + /// Create SpaceAndTable + /// + /// REQUIRE: The table must belongs to the space + pub fn new(space: SpaceRef, table_data: TableDataRef) -> Self { + // Checks table is in space + debug_assert!(space + .table_datas + .read() + .unwrap() + .find_table(&table_data.name) + .is_some()); + + Self { space, table_data } + } + + /// Get space info + #[inline] + pub fn space(&self) -> &SpaceRef { + &self.space + } + + /// Get table data + #[inline] + pub fn table_data(&self) -> &TableDataRef { + &self.table_data + } +} + +impl fmt::Debug for SpaceAndTable { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("SpaceAndTable") + .field("space_id", &self.space.id) + .field("space_name", &self.space.name) + .field("table_id", &self.table_data.id) + .field("table_name", &self.table_data.name) + .finish() + } +} + +/// Name type of space +// TODO(yingwen): Or use binary string? +pub type SpaceName = String; +/// Reference of space name +pub type SpaceNameRef<'a> = &'a str; +/// Space id +// TODO(yingwen): Or just use something like uuid as space id? 
+pub type SpaceId = u32; + +/// A space can hold mulitple tables +pub struct Space { + /// Space id + pub id: SpaceId, + /// Space name + pub name: SpaceName, + /// Data of tables in this space + /// + /// Adding table into it should acquire the space lock first, then the write + /// lock + table_datas: RwLock, + /// Space lock + /// + /// Persisting meta update of this space is protected by this lock + mutex: Mutex<()>, + + /// Write workers + pub write_group: WriteGroup, + /// Space memtable memory usage collector + pub mem_usage_collector: Arc, + /// The maximum write buffer size used for single space. + pub write_buffer_size: usize, +} + +impl Space { + pub fn new( + id: SpaceId, + name: SpaceName, + write_buffer_size: usize, + write_group: WriteGroup, + engine_mem_collector: CollectorRef, + ) -> Self { + Self { + id, + name, + table_datas: RwLock::new(TableDataSet::new()), + mutex: Mutex::new(()), + write_group, + mem_usage_collector: Arc::new(MemUsageCollector::with_parent(engine_mem_collector)), + write_buffer_size, + } + } + + /// Returns true when space total memtable memory usage reaches + /// space_write_buffer_size limit. + #[inline] + pub fn should_flush_space(&self) -> bool { + self.write_buffer_size > 0 && self.memtable_memory_usage() >= self.write_buffer_size + } + + /// Find the table in space which it's memtable consumes maximum memory. + #[inline] + pub fn find_maximum_memory_usage_table(&self) -> Option { + self.table_datas + .read() + .unwrap() + .find_maximum_memory_usage_table() + } + + #[inline] + pub fn memtable_memory_usage(&self) -> usize { + self.mem_usage_collector.total_memory_allocated() + } + + pub async fn close(&self) -> Result<()> { + // Stop the write group. 
+ self.write_group.stop().await; + + Ok(()) + } + + /// Create a table under this space + /// + /// Returns error if the table already exists + pub async fn create_table( + &self, + request: CreateTableRequest, + manifest: &Meta, + table_opts: &TableOptions, + purger: &FilePurger, + ) -> Result { + info!( + "Space create table, space_id:{}, space_name:{}, request:{:?}", + self.id, self.name, request + ); + + // Checks whether the table is exists + if self.find_table(&request.table_name).is_some() { + return TableExists { + table: request.table_name, + } + .fail(); + } + + // Choose a write worker for this table + let write_handle = self.write_group.choose_worker(request.table_id); + + let _lock = self.mutex.lock().await; + + // Double check for table existence under space lock + if self.find_table(&request.table_name).is_some() { + return TableExists { + table: request.table_name, + } + .fail(); + } + + // Store table info into meta + let update = MetaUpdate::AddTable(AddTableMeta { + space_id: self.id, + table_id: request.table_id, + table_name: request.table_name.clone(), + schema: request.table_schema.clone(), + opts: table_opts.clone(), + }); + manifest + .store_update(update) + .await + .map_err(|e| Box::new(e) as _) + .context(WriteMeta)?; + + // Update memory state + let table_name = request.table_name.clone(); + let table_data = Arc::new( + TableData::new( + self.id, + request, + write_handle, + table_opts.clone(), + purger, + self.mem_usage_collector.clone(), + ) + .context(CreateTableData { table: &table_name })?, + ); + + self.insert_table(table_data.clone()); + + Ok(table_data) + } + + /// Insert table data into space memory state if the table is + /// absent. 
For internal use only + /// + /// Panic if the table is already exists + pub(crate) fn insert_table(&self, table_data: TableDataRef) { + let success = self + .table_datas + .write() + .unwrap() + .insert_if_absent(table_data); + assert!(success); + } + + /// Find table under this space by table name + pub fn find_table(&self, table_name: &str) -> Option { + self.table_datas.read().unwrap().find_table(table_name) + } + + /// Find table under this space by its id + pub fn find_table_by_id(&self, table_id: TableId) -> Option { + self.table_datas.read().unwrap().find_table_by_id(table_id) + } + + /// Remove table under this space by table name + pub fn remove_table(&self, table_name: &str) -> Option { + self.table_datas.write().unwrap().remove_table(table_name) + } + + /// Returns the total table num in this space + pub fn table_num(&self) -> usize { + self.table_datas.read().unwrap().table_num() + } + + /// List all tables of this space to `tables` + pub fn list_all_tables(&self, tables: &mut Vec) { + self.table_datas.read().unwrap().list_all_tables(tables) + } +} + +/// A reference to space +pub type SpaceRef = Arc; diff --git a/analytic_engine/src/sst/builder.rs b/analytic_engine/src/sst/builder.rs new file mode 100644 index 0000000000..3eecbcdf2a --- /dev/null +++ b/analytic_engine/src/sst/builder.rs @@ -0,0 +1,76 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Sst builder trait definition + +use async_trait::async_trait; +use common_types::{record_batch::RecordBatchWithKey, request_id::RequestId}; +use futures::Stream; + +use crate::sst::file::SstMetaData; + +pub mod error { + use common_util::define_result; + use snafu::{Backtrace, Snafu}; + + #[derive(Debug, Snafu)] + #[snafu(visibility(pub))] + pub enum Error { + #[snafu(display("Failed to persist sst content, path:{}, err:{}", path, source))] + Persist { + path: String, + source: Box, + }, + + #[snafu(display("Failed to encode meta data, err:{}", source))] + EncodeMetaData { + source: Box, + }, + + #[snafu(display("Failed to get sst file size, path:{}", path))] + GetFileSize { path: String }, + + #[snafu(display( + "Failed to encode record batch into sst, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + EncodeRecordBatch { + source: Box, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to poll record batch, err:{}", source))] + PollRecordBatch { + source: Box, + }, + } + + define_result!(Error); +} + +pub use error::*; + +pub type RecordBatchStreamItem = + std::result::Result>; +// TODO(yingwen): SstReader also has a RecordBatchStream, can we use same type? +pub type RecordBatchStream = Box + Send + Unpin>; + +#[derive(Debug, Copy, Clone)] +pub struct SstInfo { + pub file_size: usize, + pub row_num: usize, +} + +/// The builder for sst. +/// +/// The caller provides a stream of [RecordBatch] and the builder takes +/// responsibilities for persisting the records. +#[async_trait] +pub trait SstBuilder { + async fn build( + &mut self, + request_id: RequestId, + meta: &SstMetaData, + record_stream: RecordBatchStream, + ) -> Result; +} diff --git a/analytic_engine/src/sst/factory.rs b/analytic_engine/src/sst/factory.rs new file mode 100644 index 0000000000..f910468515 --- /dev/null +++ b/analytic_engine/src/sst/factory.rs @@ -0,0 +1,87 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Factory for different kinds sst builder and reader. + +use std::{fmt::Debug, sync::Arc}; + +use common_types::projected_schema::ProjectedSchema; +use common_util::runtime::Runtime; +use object_store::ObjectStore; +use parquet::{DataCacheRef, MetaCacheRef}; +use table_engine::predicate::PredicateRef; + +use crate::{ + sst::{ + builder::SstBuilder, + parquet::{builder::ParquetSstBuilder, reader::ParquetSstReader}, + reader::SstReader, + }, + table_options::Compression, +}; + +pub trait Factory: Clone { + fn new_sst_reader<'a, S: ObjectStore>( + &self, + options: &SstReaderOptions, + path: &'a S::Path, + storage: &'a S, + ) -> Option>; + + fn new_sst_builder<'a, S: ObjectStore>( + &self, + options: &SstBuilderOptions, + path: &'a S::Path, + storage: &'a S, + ) -> Option>; +} + +#[derive(Debug, Copy, Clone)] +pub enum SstType { + Parquet, +} + +#[derive(Debug, Clone)] +pub struct SstReaderOptions { + pub sst_type: SstType, + pub read_batch_row_num: usize, + pub reverse: bool, + pub projected_schema: ProjectedSchema, + pub predicate: PredicateRef, + pub meta_cache: Option, + pub data_cache: Option, + pub runtime: Arc, +} + +#[derive(Debug, Clone)] +pub struct SstBuilderOptions { + pub sst_type: SstType, + pub num_rows_per_row_group: usize, + pub compression: Compression, +} + +#[derive(Debug, Clone)] +pub struct FactoryImpl; + +impl Factory for FactoryImpl { + fn new_sst_reader<'a, S: ObjectStore>( + &self, + options: &SstReaderOptions, + path: &'a S::Path, + storage: &'a S, + ) -> Option> { + match options.sst_type { + SstType::Parquet => Some(Box::new(ParquetSstReader::new(path, storage, options))), + } + } + + fn new_sst_builder<'a, S: ObjectStore>( + &self, + options: &SstBuilderOptions, + path: &'a S::Path, + storage: &'a S, + ) -> Option> { + match options.sst_type { + SstType::Parquet => Some(Box::new(ParquetSstBuilder::new(path, storage, options))), + } + } +} diff --git a/analytic_engine/src/sst/file.rs b/analytic_engine/src/sst/file.rs new file mode 100644 
index 0000000000..00bf345e66 --- /dev/null +++ b/analytic_engine/src/sst/file.rs @@ -0,0 +1,699 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Sst file and storage info + +use std::{ + borrow::Borrow, + cmp, + collections::{BTreeMap, HashSet}, + convert::TryFrom, + fmt, + fmt::Debug, + hash::{Hash, Hasher}, + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, + }, +}; + +use common_types::{ + bytes::Bytes, + schema::Schema, + time::{TimeRange, Timestamp}, + SequenceNumber, +}; +use common_util::{ + define_result, + metric::Meter, + runtime::{JoinHandle, Runtime}, +}; +use log::{debug, error, info}; +use object_store::{path::ObjectStorePath, ObjectStore}; +use proto::{common::TimeRange as TimeRangePb, sst::SstMetaData as SstMetaDataPb}; +use snafu::{ResultExt, Snafu}; +use table_engine::table::TableId; +use tokio::sync::{ + mpsc::{self, UnboundedReceiver, UnboundedSender}, + Mutex, +}; + +use crate::{space::SpaceId, sst::manager::FileId, table::sst_util}; + +/// Error of sst file. +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to convert time range, err:{}", source))] + ConvertTimeRange { source: common_types::time::Error }, + + #[snafu(display("Failed to convert table schema, err:{}", source))] + ConvertTableSchema { source: common_types::schema::Error }, + + #[snafu(display("Failed to join purger, err:{}", source))] + StopPurger { source: common_util::runtime::Error }, +} + +define_result!(Error); + +pub type Level = u16; + +// TODO(yingwen): Order or split file by time range to speed up filter (even in +// level 0). +/// Manage files of single level +pub struct LevelHandler { + pub level: Level, + /// All files in current level. 
+ files: FileHandleSet, +} + +impl LevelHandler { + pub fn new(level: u16) -> Self { + Self { + level, + files: FileHandleSet::default(), + } + } + + #[inline] + pub fn insert(&mut self, file: FileHandle) { + self.files.insert(file); + } + + pub fn latest_sst(&self) -> Option { + self.files.latest() + } + + pub fn pick_ssts(&self, time_range: TimeRange) -> Vec { + if self.level == 0 { + self.files.files_by_time_range(time_range) + } else { + Vec::new() + } + } + + #[inline] + pub fn remove_ssts(&mut self, file_ids: &[FileId]) { + self.files.remove_by_ids(file_ids); + } + + pub fn iter_ssts(&self) -> Iter { + let iter = self.files.file_map.values(); + Iter(iter) + } + + #[inline] + pub fn collect_expired( + &self, + expire_time: Option, + expired_files: &mut Vec, + ) { + self.files.collect_expired(expire_time, expired_files); + } + + #[inline] + pub fn has_expired_sst(&self, expire_time: Option) -> bool { + self.files.has_expired_sst(expire_time) + } +} + +pub struct Iter<'a>(std::collections::btree_map::Values<'a, FileOrdKey, FileHandle>); + +impl<'a> Iterator for Iter<'a> { + type Item = &'a FileHandle; + + fn next(&mut self) -> Option { + self.0.next() + } +} + +#[derive(Clone)] +pub struct FileHandle { + inner: Arc, +} + +impl PartialEq for FileHandle { + fn eq(&self, other: &Self) -> bool { + self.id() == other.id() + } +} + +impl Eq for FileHandle {} + +impl Hash for FileHandle { + fn hash(&self, state: &mut H) { + self.id().hash(state); + } +} + +impl FileHandle { + pub fn new(meta: FileMeta, purge_queue: FilePurgeQueue) -> Self { + Self { + inner: Arc::new(FileHandleInner { + meta, + purge_queue, + being_compacted: AtomicBool::new(false), + metrics: SstMetrics::default(), + }), + } + } + + #[inline] + pub fn read_meter(&self) -> Arc { + self.inner.metrics.read_meter.clone() + } + + #[inline] + pub fn row_num(&self) -> u64 { + self.inner.meta.meta.row_num + } + + #[inline] + pub fn id(&self) -> FileId { + self.inner.meta.id + } + + #[inline] + pub fn 
id_ref(&self) -> &FileId { + &self.inner.meta.id + } + + #[inline] + pub fn intersect_with_time_range(&self, time_range: TimeRange) -> bool { + self.inner.meta.intersect_with_time_range(time_range) + } + + #[inline] + pub fn min_key(&self) -> Bytes { + self.inner.meta.meta.min_key.clone() + } + + #[inline] + pub fn max_key(&self) -> Bytes { + self.inner.meta.meta.max_key.clone() + } + + #[inline] + pub fn time_range(&self) -> TimeRange { + self.inner.meta.meta.time_range + } + + #[inline] + pub fn time_range_ref(&self) -> &TimeRange { + &self.inner.meta.meta.time_range + } + + #[inline] + pub fn max_sequence(&self) -> SequenceNumber { + self.inner.meta.meta.max_sequence + } + + #[inline] + pub fn being_compacted(&self) -> bool { + self.inner.being_compacted.load(Ordering::Relaxed) + } + + #[inline] + pub fn size(&self) -> u64 { + self.inner.meta.meta.size + } + + #[inline] + pub fn set_being_compacted(&self, value: bool) { + self.inner.being_compacted.store(value, Ordering::Relaxed); + } +} + +impl fmt::Debug for FileHandle { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("FileHandle") + .field("meta", &self.inner.meta) + .field("being_compacted", &self.being_compacted()) + .field("metrics", &self.inner.metrics) + .finish() + } +} + +struct SstMetrics { + pub read_meter: Arc, + pub key_num: usize, +} + +impl Default for SstMetrics { + fn default() -> Self { + SstMetrics { + read_meter: Arc::new(Meter::new()), + key_num: 0, + } + } +} + +impl fmt::Debug for SstMetrics { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("SstMetrics") + .field("read_meter", &self.read_meter.h2_rate()) + .field("key_num", &self.key_num) + .finish() + } +} + +struct FileHandleInner { + meta: FileMeta, + purge_queue: FilePurgeQueue, + /// The file is being compacting. 
+ being_compacted: AtomicBool, + metrics: SstMetrics, +} + +impl Drop for FileHandleInner { + fn drop(&mut self) { + debug!("FileHandle is dropped, meta:{:?}", self.meta); + + // Push file cannot block or be async because we are in drop(). + self.purge_queue.push_file(self.meta.id); + } +} + +/// Used to order [FileHandle] by (end_time, start_time, file_id) +#[derive(PartialEq, Eq, PartialOrd, Ord)] +struct FileOrdKey { + exclusive_end: Timestamp, + inclusive_start: Timestamp, + file_id: FileId, +} + +impl FileOrdKey { + fn for_seek(exclusive_end: Timestamp) -> Self { + Self { + exclusive_end, + inclusive_start: Timestamp::MIN, + file_id: 0, + } + } + + fn key_of(file: &FileHandle) -> Self { + Self { + exclusive_end: file.time_range().exclusive_end(), + inclusive_start: file.time_range().inclusive_start(), + file_id: file.id(), + } + } +} + +/// Used to index [FileHandle] by file_id +struct FileHandleHash(FileHandle); + +impl PartialEq for FileHandleHash { + fn eq(&self, other: &Self) -> bool { + self.0.id() == other.0.id() + } +} + +impl Eq for FileHandleHash {} + +impl Hash for FileHandleHash { + fn hash(&self, state: &mut H) { + self.0.id().hash(state); + } +} + +impl Borrow for FileHandleHash { + #[inline] + fn borrow(&self) -> &FileId { + self.0.id_ref() + } +} + +#[derive(Default)] +struct FileHandleSet { + /// Files ordered by time range and id. + file_map: BTreeMap, + /// Files indexed by file id, used to speed up removal. + id_to_files: HashSet, +} + +impl FileHandleSet { + fn latest(&self) -> Option { + if let Some(file) = self.file_map.values().rev().next() { + return Some(file.clone()); + } + None + } + + fn files_by_time_range(&self, time_range: TimeRange) -> Vec { + // Seek to first sst whose end time >= time_range.inclusive_start(). + let seek_key = FileOrdKey::for_seek(time_range.inclusive_start()); + self.file_map + .range(seek_key..) 
+ .into_iter() + .filter_map(|(_key, file)| { + if file.intersect_with_time_range(time_range) { + Some(file.clone()) + } else { + None + } + }) + .collect() + } + + fn insert(&mut self, file: FileHandle) { + self.file_map + .insert(FileOrdKey::key_of(&file), file.clone()); + self.id_to_files.insert(FileHandleHash(file)); + } + + fn remove_by_ids(&mut self, file_ids: &[FileId]) { + for file_id in file_ids { + if let Some(file) = self.id_to_files.take(file_id) { + let key = FileOrdKey::key_of(&file.0); + self.file_map.remove(&key); + } + } + } + + /// Collect ssts with time range is expired. + fn collect_expired(&self, expire_time: Option, expired_files: &mut Vec) { + for file in self.file_map.values() { + if file.time_range().is_expired(expire_time) { + expired_files.push(file.clone()); + } else { + // Files are sorted by end time first, so there is no more file whose end time + // is less than `expire_time`. + break; + } + } + } + + fn has_expired_sst(&self, expire_time: Option) -> bool { + // Files are sorted by end time first, so check first file is enough. 
+ if let Some(file) = self.file_map.values().next() { + return file.time_range().is_expired(expire_time); + } + + false + } +} + +/// Meta of a sst file, immutable once created +#[derive(Debug, Clone)] +pub struct FileMeta { + /// Id of the sst file + pub id: FileId, + pub meta: SstMetaData, +} + +impl FileMeta { + pub fn intersect_with_time_range(&self, time_range: TimeRange) -> bool { + self.meta.time_range.intersect_with(time_range) + } +} + +/// Meta data of a sst file, immutable once created +#[derive(Debug, Clone, PartialEq)] +pub struct SstMetaData { + pub min_key: Bytes, + pub max_key: Bytes, + /// Time Range of the sst + pub time_range: TimeRange, + /// Max sequence number in the sst + pub max_sequence: SequenceNumber, + pub schema: Schema, + /// file size in bytes + pub size: u64, + // total row number + pub row_num: u64, +} + +impl From for SstMetaDataPb { + fn from(src: SstMetaData) -> Self { + let mut target = SstMetaDataPb::default(); + target.set_min_key(src.min_key.to_vec()); + target.set_max_key(src.max_key.to_vec()); + target.set_max_sequence(src.max_sequence); + let time_range = TimeRangePb::from(src.time_range); + target.set_time_range(time_range); + target.set_schema(src.schema.into()); + target.set_size(src.size); + target.set_row_num(src.row_num); + + target + } +} + +impl TryFrom for SstMetaData { + type Error = Error; + + fn try_from(mut src: SstMetaDataPb) -> Result { + let time_range = TimeRange::try_from(src.take_time_range()).context(ConvertTimeRange)?; + let schema = Schema::try_from(src.take_schema()).context(ConvertTableSchema)?; + Ok(Self { + min_key: src.min_key.into(), + max_key: src.max_key.into(), + time_range, + max_sequence: src.max_sequence, + schema, + size: src.size, + row_num: src.row_num, + }) + } +} + +// Queue to store files to be deleted for a table. +#[derive(Clone)] +pub struct FilePurgeQueue { + // Wrap a inner struct to avoid storing space/table ids for each file. 
+ inner: Arc, +} + +impl FilePurgeQueue { + pub fn new(space_id: SpaceId, table_id: TableId, sender: UnboundedSender) -> Self { + Self { + inner: Arc::new(FilePurgeQueueInner { + space_id, + table_id, + sender, + closed: AtomicBool::new(false), + }), + } + } + + /// Close the purge queue, then all request pushed to this queue will be + /// ignored. This is mainly used to avoid files being deleted after the + /// db is closed. + pub fn close(&self) { + self.inner.closed.store(true, Ordering::SeqCst); + } + + fn push_file(&self, file_id: FileId) { + if self.inner.closed.load(Ordering::SeqCst) { + return; + } + + // Send the file id via a channel to file purger and delete the file from sst + // store in background. + let request = FilePurgeRequest { + space_id: self.inner.space_id, + table_id: self.inner.table_id, + file_id, + }; + + if let Err(send_res) = self.inner.sender.send(Request::Purge(request)) { + error!( + "Failed to send delete file request, request:{:?}", + send_res.0 + ); + } + } +} + +struct FilePurgeQueueInner { + space_id: SpaceId, + table_id: TableId, + closed: AtomicBool, + sender: UnboundedSender, +} + +#[derive(Debug)] +pub struct FilePurgeRequest { + space_id: SpaceId, + table_id: TableId, + file_id: FileId, +} + +#[derive(Debug)] +pub enum Request { + Purge(FilePurgeRequest), + Exit, +} + +/// Background file purger. +pub struct FilePurger { + sender: UnboundedSender, + handle: Mutex>>, +} + +impl FilePurger { + pub fn start( + runtime: &Runtime, + store: Arc, + ) -> Self { + // We must use unbound channel, so the sender wont block when the handle is + // dropped. + let (tx, rx) = mpsc::unbounded_channel(); + + // Spawn a background job to purge files. 
+ let handle = runtime.spawn(async { + Self::purge_file_loop(store, rx).await; + }); + + Self { + sender: tx, + handle: Mutex::new(Some(handle)), + } + } + + pub async fn stop(&self) -> Result<()> { + info!("Try to stop file purger"); + + if self.sender.send(Request::Exit).is_err() { + error!("File purge task already exited"); + } + + let mut handle = self.handle.lock().await; + // Also clear the handle to avoid await a ready future. + if let Some(h) = handle.take() { + h.await.context(StopPurger)?; + } + + Ok(()) + } + + pub fn create_purge_queue(&self, space_id: SpaceId, table_id: TableId) -> FilePurgeQueue { + FilePurgeQueue::new(space_id, table_id, self.sender.clone()) + } + + async fn purge_file_loop( + store: Arc, + mut receiver: UnboundedReceiver, + ) { + info!("File purger start"); + + while let Some(request) = receiver.recv().await { + match request { + Request::Purge(purge_request) => { + let mut sst_file_path = store.new_path(); + sst_util::set_sst_file_path( + purge_request.space_id, + purge_request.table_id, + purge_request.file_id, + &mut sst_file_path, + ); + + info!( + "File purger delete file, purge_request:{:?}, sst_file_path:{}", + purge_request, + sst_file_path.display() + ); + + if let Err(e) = store.delete(&sst_file_path).await { + error!( + "File purger failed to delete file, sst_file_path:{}, err:{}", + sst_file_path.display(), + e + ); + } + } + Request::Exit => break, + } + } + + info!("File purger exit"); + } +} + +/// Merge sst meta of given `files`, panic if `files` is empty. +/// +/// The size and row_num of the merged meta is initialized to 0. +pub fn merge_sst_meta(files: &[FileHandle], schema: Schema) -> SstMetaData { + let mut min_key = files[0].min_key(); + let mut max_key = files[0].max_key(); + let mut time_range_start = files[0].time_range().inclusive_start(); + let mut time_range_end = files[0].time_range().exclusive_end(); + let mut max_sequence = files[0].max_sequence(); + + if files.len() > 1 { + for file in &files[1..] 
{ + min_key = cmp::min(file.min_key(), min_key); + max_key = cmp::max(file.max_key(), max_key); + time_range_start = cmp::min(file.time_range().inclusive_start(), time_range_start); + time_range_end = cmp::max(file.time_range().exclusive_end(), time_range_end); + max_sequence = cmp::max(file.max_sequence(), max_sequence); + } + } + + SstMetaData { + min_key, + max_key, + time_range: TimeRange::new(time_range_start, time_range_end).unwrap(), + max_sequence, + schema, + // we don't know file size and total row number yet + size: 0, + row_num: 0, + } +} + +#[cfg(test)] +pub mod tests { + use super::*; + + pub struct FilePurgerMocker; + + impl FilePurgerMocker { + pub fn mock() -> FilePurger { + let (sender, _receiver) = mpsc::unbounded_channel(); + + FilePurger { + sender, + handle: Mutex::new(None), + } + } + } + + #[must_use] + pub struct SstMetaDataMocker { + schema: Schema, + time_range: TimeRange, + max_sequence: SequenceNumber, + } + + impl SstMetaDataMocker { + pub fn new(schema: Schema) -> Self { + Self { + schema, + time_range: TimeRange::min_to_max(), + max_sequence: 1, + } + } + + pub fn time_range(mut self, range: TimeRange) -> Self { + self.time_range = range; + self + } + + pub fn max_sequence(mut self, max_sequence: SequenceNumber) -> Self { + self.max_sequence = max_sequence; + self + } + + pub fn build(&self) -> SstMetaData { + SstMetaData { + min_key: Bytes::new(), + max_key: Bytes::new(), + time_range: self.time_range, + max_sequence: self.max_sequence, + schema: self.schema.clone(), + size: 0, + row_num: 0, + } + } + } +} diff --git a/analytic_engine/src/sst/manager.rs b/analytic_engine/src/sst/manager.rs new file mode 100644 index 0000000000..2d64a8fafb --- /dev/null +++ b/analytic_engine/src/sst/manager.rs @@ -0,0 +1,159 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Multi-level SST management + +use common_types::time::{TimeRange, Timestamp}; + +use crate::{ + compaction::ExpiredFiles, + sst::file::{FileHandle, FileMeta, FilePurgeQueue, Iter, Level, LevelHandler}, +}; + +/// Id for a sst file +pub type FileId = u64; +/// We use two level merge tree, the max level should less than u16::MAX +pub const MAX_LEVEL: usize = 2; + +/// A table level manager that manages all the sst files of the table +pub struct LevelsController { + levels: Vec, + purge_queue: FilePurgeQueue, +} + +impl Drop for LevelsController { + fn drop(&mut self) { + // Close the purge queue to avoid files being deleted. + self.purge_queue.close(); + } +} + +impl LevelsController { + /// Create an empty LevelsController + pub fn new(purge_queue: FilePurgeQueue) -> Self { + let mut levels = Vec::with_capacity(MAX_LEVEL); + for level in 0..MAX_LEVEL { + levels.push(LevelHandler::new(level as Level)); + } + + Self { + levels, + purge_queue, + } + } + + /// Add sst file to level + /// + /// Panic: If the level is greater than the max level + pub fn add_sst_to_level(&mut self, level: Level, file_meta: FileMeta) { + let level_handler = &mut self.levels[usize::from(level)]; + let file = FileHandle::new(file_meta, self.purge_queue.clone()); + + level_handler.insert(file); + } + + pub fn latest_sst(&self, level: Level) -> Option { + self.levels[usize::from(level)].latest_sst() + } + + /// Pick the ssts and collect it by `append_sst`. + pub fn pick_ssts( + &self, + time_range: TimeRange, + mut append_sst: impl FnMut(Level, &[FileHandle]), + ) { + for level_handler in self.levels.iter() { + let ssts = level_handler.pick_ssts(time_range); + append_sst(level_handler.level, &ssts); + } + } + + /// Remove sst files from level. 
+ /// + /// Panic: If the level is greater than the max level + pub fn remove_ssts_from_level(&mut self, level: Level, file_ids: &[FileId]) { + let level_handler = &mut self.levels[usize::from(level)]; + level_handler.remove_ssts(file_ids); + } + + /// Total number of levels. + pub fn num_levels(&self) -> Level { + self.levels.len() as Level + } + + /// Iter ssts at given `level`. + /// + /// Panic if level is out of bound. + pub fn iter_ssts_at_level(&self, level: Level) -> Iter { + let level_handler = &self.levels[usize::from(level)]; + level_handler.iter_ssts() + } + + pub fn collect_expired_at_level( + &self, + level: Level, + expire_time: Option, + ) -> Vec { + let level_handler = &self.levels[usize::from(level)]; + let mut expired = Vec::new(); + level_handler.collect_expired(expire_time, &mut expired); + + expired + } + + pub fn has_expired_sst(&self, expire_time: Option) -> bool { + self.levels + .iter() + .any(|level_handler| level_handler.has_expired_sst(expire_time)) + } + + pub fn expired_ssts(&self, expire_time: Option) -> Vec { + let mut expired = Vec::new(); + let num_levels = self.num_levels(); + for level in 0..num_levels { + let files = self.collect_expired_at_level(level, expire_time); + expired.push(ExpiredFiles { level, files }); + } + + expired + } +} + +#[cfg(test)] +pub mod tests { + use table_engine::table::TableId; + use tokio::sync::mpsc; + + use crate::sst::{ + file::{FileMeta, FilePurgeQueue, SstMetaData}, + manager::{FileId, LevelsController}, + }; + + #[must_use] + #[derive(Default)] + pub struct LevelsControllerMockBuilder { + sst_meta_vec: Vec, + } + + impl LevelsControllerMockBuilder { + pub fn add_sst(mut self, mut sst_meta: Vec) -> Self { + self.sst_meta_vec.append(&mut sst_meta); + self + } + + pub fn build(self) -> LevelsController { + let (tx, _rx) = mpsc::unbounded_channel(); + let file_purge_queue = FilePurgeQueue::new(100, TableId::from(101), tx); + let mut levels_controller = LevelsController::new(file_purge_queue); + for 
(id, sst_meta) in self.sst_meta_vec.into_iter().enumerate() { + levels_controller.add_sst_to_level( + 0, + FileMeta { + id: id as FileId, + meta: sst_meta, + }, + ); + } + levels_controller + } + } +} diff --git a/analytic_engine/src/sst/mod.rs b/analytic_engine/src/sst/mod.rs new file mode 100644 index 0000000000..a6fec9162b --- /dev/null +++ b/analytic_engine/src/sst/mod.rs @@ -0,0 +1,10 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! SST (Sorted String Table) file + +pub mod builder; +pub mod factory; +pub mod file; +pub mod manager; +pub mod parquet; +pub mod reader; diff --git a/analytic_engine/src/sst/parquet/builder.rs b/analytic_engine/src/sst/parquet/builder.rs new file mode 100644 index 0000000000..8bba10cc79 --- /dev/null +++ b/analytic_engine/src/sst/parquet/builder.rs @@ -0,0 +1,560 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Sst builder implementation based on parquet. + +use std::{ + io::SeekFrom, + pin::Pin, + sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, Mutex, + }, + task::{Context, Poll}, +}; + +use arrow_deps::{ + arrow::record_batch::RecordBatch as ArrowRecordBatch, + datafusion::parquet::basic::Compression, + parquet::{ + arrow::ArrowWriter, + file::{properties::WriterProperties, writer::TryClone}, + }, +}; +use async_trait::async_trait; +use common_types::{bytes::BufMut, request_id::RequestId}; +use futures::AsyncRead; +use log::debug; +use object_store::{path::ObjectStorePath, ObjectStore}; +use snafu::{ensure, ResultExt}; + +use crate::sst::{ + builder::{RecordBatchStream, SstBuilder, *}, + factory::SstBuilderOptions, + file::SstMetaData, + parquet::encoding, +}; + +/// The implementation of sst based on parquet and object storage. +#[derive(Debug)] +pub struct ParquetSstBuilder<'a, S: ObjectStore> { + /// The path where the data is persisted. + path: &'a S::Path, + /// The storage where the data is persist. + storage: &'a S, + /// Max row group size. 
+ num_rows_per_row_group: usize, + compression: Compression, +} + +impl<'a, S: ObjectStore> ParquetSstBuilder<'a, S> { + pub fn new(path: &'a S::Path, storage: &'a S, options: &SstBuilderOptions) -> Self { + Self { + path, + storage, + num_rows_per_row_group: options.num_rows_per_row_group, + compression: options.compression.into(), + } + } +} + +/// A memory writer implementing the [ParquetWriter]. +/// +/// The writer accepts the encoded bytes by parquet format and provides the byte +/// stream to the reader. +#[derive(Clone, Debug)] +struct EncodingBuffer { + // In order to reuse the buffer, the buffer must be wrapped in the Arc and the Mutex because + // the writer is consumed when building a ArrowWriter. + inner: Arc>, +} + +impl Default for EncodingBuffer { + fn default() -> Self { + Self { + inner: Arc::new(Mutex::new(EncodingBufferInner { + bytes_written: 0, + read_offset: 0, + buf: Vec::new(), + })), + } + } +} + +impl std::io::Write for EncodingBuffer { + fn write(&mut self, buf: &[u8]) -> std::io::Result { + let mut inner = self.inner.lock().unwrap(); + inner.write(buf) + } + + fn flush(&mut self) -> std::io::Result<()> { + let mut inner = self.inner.lock().unwrap(); + inner.flush() + } +} + +impl std::io::Seek for EncodingBuffer { + fn seek(&mut self, pos: SeekFrom) -> std::io::Result { + let mut inner = self.inner.lock().unwrap(); + inner.seek(pos) + } +} + +impl TryClone for EncodingBuffer { + fn try_clone(&self) -> std::io::Result { + Ok(self.clone()) + } +} + +impl EncodingBuffer { + fn read(&self, read_buf: &mut [u8]) -> usize { + let mut inner = self.inner.lock().unwrap(); + inner.read(read_buf) + } +} + +/// The underlying buffer implementing [ParquetWriter]. +/// +/// Provides the write function for [ArrowWriter] and read function for +/// [AsyncRead]. 
+#[derive(Clone, Debug)] +struct EncodingBufferInner { + bytes_written: usize, + read_offset: usize, + buf: Vec, +} + +impl std::io::Write for EncodingBufferInner { + /// Write the `buf` to the `self.buf`. + /// + /// The readable bytes should be exhausted before writing new bytes. + /// `self.bytes_written` and `self.read_offset` is updated after writing. + fn write(&mut self, buf: &[u8]) -> std::io::Result { + if self.read_offset != 0 { + assert_eq!(self.buf.len(), self.read_offset); + self.buf.clear(); + self.buf.reserve(buf.len()); + // reset the read offset + self.read_offset = 0; + } + + let bytes_written = self.buf.write(buf)?; + // accumulate the written bytes + self.bytes_written += bytes_written; + + Ok(bytes_written) + } + + /// Actually nothing to flush. + fn flush(&mut self) -> std::io::Result<()> { + Ok(()) + } +} + +impl std::io::Seek for EncodingBufferInner { + /// Given the assumption that the seek usage of the [ParquetWriter] in the + /// parquet project is just `seek(SeekFrom::Current(0))`, the + /// implementation panics if seek to a different target. + fn seek(&mut self, pos: SeekFrom) -> std::io::Result { + if let SeekFrom::Current(offset) = pos { + assert_eq!(offset, 0); + return Ok(self.bytes_written as u64); + } + + unreachable!("Only can handle the case where seek to current(0)") + } +} + +impl EncodingBufferInner { + /// Read the content in `self.buf[self.offset..]` into `read_buf`. + /// + /// When finishing reading, advance the `self.offset`. + fn read(&mut self, mut read_buf: &mut [u8]) -> usize { + if self.read_offset >= self.buf.len() { + return 0; + } + let remaining_size = self.buf.len() - self.read_offset; + + let read_len = remaining_size.min(read_buf.len()); + read_buf.put(&self.buf[self.read_offset..self.read_offset + read_len]); + + self.advance(read_len); + read_len + } + + /// Advance the `self.offset` by `len`. + /// + /// Caller should ensures the advanced offset wont exceed `self.buf.len()`. 
+ fn advance(&mut self, len: usize) { + self.read_offset += len; + + assert!(self.read_offset <= self.buf.len()); + } +} + +/// RecordBytesReader provides AsyncRead implementation for the encoded records +/// by parquet. +struct RecordBytesReader { + request_id: RequestId, + record_stream: RecordBatchStream, + encoding_buffer: EncodingBuffer, + arrow_writer: Mutex>>, + num_rows_per_row_group: usize, + compression: Compression, + meta_data: SstMetaData, + total_row_num: Arc, + arrow_record_batch_vec: Vec, + // Whether the underlying `record_stream` is finished + stream_finished: bool, + + fetched_row_num: usize, +} + +/// Build the write properties containing the sst meta data. +fn build_write_properties( + num_rows_per_row_group: usize, + compression: Compression, + meta_data: &SstMetaData, +) -> Result { + let meta_data_kv = encoding::encode_sst_meta_data(meta_data.clone()) + .map_err(|e| Box::new(e) as _) + .context(EncodeMetaData)?; + + Ok(WriterProperties::builder() + .set_key_value_metadata(Some(vec![meta_data_kv])) + .set_max_row_group_size(num_rows_per_row_group) + .set_compression(compression) + .build()) +} + +/// Encode the record batch with [ArrowWriter] and the encoded contents is +/// written to the [EncodingBuffer]. 
+// TODO(xikai): too many parameters +fn encode_record_batch( + arrow_writer: &mut Option>, + num_rows_per_row_group: usize, + compression: Compression, + meta_data: &SstMetaData, + mem_buf_writer: EncodingBuffer, + arrow_record_batch_vec: Vec, +) -> Result { + if arrow_record_batch_vec.is_empty() { + return Ok(0); + } + + let arrow_schema = arrow_record_batch_vec[0].schema(); + + // create arrow writer if not exist + if arrow_writer.is_none() { + let write_props = build_write_properties(num_rows_per_row_group, compression, meta_data)?; + let writer = ArrowWriter::try_new(mem_buf_writer, arrow_schema.clone(), Some(write_props)) + .map_err(|e| Box::new(e) as _) + .context(EncodeRecordBatch)?; + *arrow_writer = Some(writer); + } + + let record_batch = ArrowRecordBatch::concat(&arrow_schema, &arrow_record_batch_vec) + .map_err(|e| Box::new(e) as _) + .context(EncodeRecordBatch)?; + + arrow_writer + .as_mut() + .unwrap() + .write(&record_batch) + .map_err(|e| Box::new(e) as _) + .context(EncodeRecordBatch)?; + + Ok(record_batch.num_rows()) +} + +fn close_writer(arrow_writer: &mut Option>) -> Result<()> { + if let Some(arrow_writer) = arrow_writer { + arrow_writer + .close() + .map_err(|e| Box::new(e) as _) + .context(EncodeRecordBatch)?; + } + + Ok(()) +} + +impl AsyncRead for RecordBytesReader { + fn poll_read( + self: Pin<&mut Self>, + cx: &mut Context<'_>, + buf: &mut [u8], + ) -> Poll> { + let mut reader = self.get_mut(); + let size = reader.encoding_buffer.read(buf); + if size > 0 { + return Poll::Ready(Ok(size)); + } + + // The stream is also finished + if reader.stream_finished { + return Poll::Ready(Ok(0)); + } + + // FIXME(xikai): no data may cause empty sst file. + // fetch more rows from the stream. 
+ while reader.fetched_row_num < reader.num_rows_per_row_group { + match Pin::new(reader.record_stream.as_mut()).poll_next(cx) { + Poll::Pending => return Poll::Pending, + Poll::Ready(v) => match v { + Some(record_batch) => match record_batch.context(PollRecordBatch) { + Ok(record_batch) => { + assert!( + !record_batch.is_empty(), + "found empty record batch, request id:{}", + reader.request_id + ); + + reader.fetched_row_num += record_batch.num_rows(); + reader + .arrow_record_batch_vec + .push(record_batch.into_record_batch().into_arrow_record_batch()); + } + Err(e) => { + return Poll::Ready(Err(std::io::Error::new( + std::io::ErrorKind::Other, + e, + ))) + } + }, + None => { + reader.stream_finished = true; + debug!( + "Record stream finished, request_id:{}, batch_len:{}, fetched_row_num:{}, num_rows_per_row_group:{}", + reader.request_id, + reader.arrow_record_batch_vec.len(), + reader.fetched_row_num, + reader.num_rows_per_row_group, + ); + break; + } + }, + } + } + + assert!(reader.stream_finished || reader.fetched_row_num >= reader.num_rows_per_row_group); + + // Reset fetched row num. 
+ reader.fetched_row_num = 0; + match encode_record_batch( + reader.arrow_writer.get_mut().unwrap(), + reader.num_rows_per_row_group, + reader.compression, + &reader.meta_data, + reader.encoding_buffer.clone(), + std::mem::take(&mut reader.arrow_record_batch_vec), + ) { + Err(e) => return Poll::Ready(Err(std::io::Error::new(std::io::ErrorKind::Other, e))), + Ok(row_num) => { + reader.total_row_num.fetch_add(row_num, Ordering::Relaxed); + } + } + + if reader.stream_finished { + if let Err(e) = close_writer(reader.arrow_writer.get_mut().unwrap()) { + return Poll::Ready(Err(std::io::Error::new(std::io::ErrorKind::Other, e))); + } + } + + Poll::Ready(Ok(reader.encoding_buffer.read(buf))) + } +} + +#[async_trait] +impl<'a, S: ObjectStore> SstBuilder for ParquetSstBuilder<'a, S> { + async fn build( + &mut self, + request_id: RequestId, + meta: &SstMetaData, + record_stream: RecordBatchStream, + ) -> Result { + debug!( + "Build parquet file, request_id:{}, meta:{:?}, num_rows_per_row_group:{}", + request_id, meta, self.num_rows_per_row_group + ); + + let total_row_num = Arc::new(AtomicUsize::new(0)); + let reader = RecordBytesReader { + request_id, + record_stream, + encoding_buffer: EncodingBuffer::default(), + arrow_writer: Mutex::new(None), + num_rows_per_row_group: self.num_rows_per_row_group, + compression: self.compression, + total_row_num: total_row_num.clone(), + arrow_record_batch_vec: Vec::new(), + // TODO(xikai): should we avoid this clone? 
+ meta_data: meta.to_owned(), + stream_finished: false, + fetched_row_num: 0, + }; + + self.storage + .put(self.path, reader, None) + .await + .map_err(|e| Box::new(e) as _) + .context(Persist { + path: self.path.display(), + })?; + + let result = self + .storage + .list_with_delimiter(self.path) + .await + .map_err(|e| Box::new(e) as _) + .context(Persist { + path: self.path.display(), + })?; + + ensure!( + result.objects.len() == 1, + GetFileSize { + path: self.path.display(), + } + ); + + Ok(SstInfo { + file_size: result.objects[0].size, + row_num: total_row_num.load(Ordering::Relaxed), + }) + } +} + +#[cfg(test)] +mod tests { + + use common_types::{ + bytes::Bytes, + projected_schema::ProjectedSchema, + tests::{build_row, build_schema}, + time::{TimeRange, Timestamp}, + }; + use common_util::runtime::{self, Runtime}; + use futures::stream; + use object_store::disk::File; + use table_engine::predicate::Predicate; + use tempfile::tempdir; + + use super::*; + use crate::{ + row_iter::tests::build_record_batch_with_key, + sst::{ + factory::{Factory, FactoryImpl, SstBuilderOptions, SstReaderOptions, SstType}, + parquet::reader::ParquetSstReader, + reader::{tests::check_stream, SstReader}, + }, + table_options, + }; + + // TODO(xikai): add test for reverse reader + + #[test] + fn test_parquet_build_and_read() { + let runtime = Arc::new(runtime::Builder::default().build().unwrap()); + parquet_write_and_then_read_back(runtime.clone(), 3, vec![3, 3, 3, 3, 3]); + // TODO: num_rows should be [4, 4, 4, 3]? + parquet_write_and_then_read_back(runtime.clone(), 4, vec![4, 2, 4, 2, 3]); + // TODO: num_rows should be [5, 5, 5]? 
+ parquet_write_and_then_read_back(runtime, 5, vec![5, 1, 5, 1, 3]); + } + + fn parquet_write_and_then_read_back( + runtime: Arc, + num_rows_per_row_group: usize, + expected_num_rows: Vec, + ) { + runtime.block_on(async { + let sst_factory = FactoryImpl; + let sst_builder_options = SstBuilderOptions { + sst_type: SstType::Parquet, + num_rows_per_row_group, + compression: table_options::Compression::Uncompressed, + }; + + let dir = tempdir().unwrap(); + let root = dir.path(); + let store = File::new(root); + let mut sst_file_path = store.new_path(); + sst_file_path.set_file_name("data.par"); + + let schema = build_schema(); + let projected_schema = ProjectedSchema::no_projection(schema.clone()); + let sst_meta = SstMetaData { + min_key: Bytes::from_static(b"100"), + max_key: Bytes::from_static(b"200"), + time_range: TimeRange::new_unchecked(Timestamp::new(1), Timestamp::new(2)), + max_sequence: 200, + schema: schema.clone(), + size: 10, + row_num: 2, + }; + + let mut counter = 10; + let record_batch_stream = Box::new(stream::poll_fn(move |ctx| -> Poll> { + counter -= 1; + if counter == 0 { + return Poll::Ready(None); + } else if counter % 2 == 0 { + ctx.waker().wake_by_ref(); + return Poll::Pending; + } + + // reach here when counter is 9 7 5 3 1 + let ts = 100 + counter; + let rows = vec![ + build_row(b"a", ts, 10.0, "v4"), + build_row(b"b", ts, 10.0, "v4"), + build_row(b"c", ts, 10.0, "v4"), + ]; + let batch = build_record_batch_with_key(schema.clone(), rows); + Poll::Ready(Some(Ok(batch))) + })); + + let mut builder = sst_factory + .new_sst_builder(&sst_builder_options, &sst_file_path, &store) + .unwrap(); + let sst_info = builder + .build(RequestId::next_id(), &sst_meta, record_batch_stream) + .await + .unwrap(); + + assert_eq!(15, sst_info.row_num); + + // read sst back to test + let sst_reader_options = SstReaderOptions { + sst_type: SstType::Parquet, + read_batch_row_num: 5, + reverse: false, + projected_schema, + predicate: 
Arc::new(Predicate::new(TimeRange::min_to_max())), + meta_cache: None, + data_cache: None, + runtime: runtime.clone(), + }; + + let mut reader = ParquetSstReader::new(&sst_file_path, &store, &sst_reader_options); + assert_eq!(reader.meta_data().await.unwrap(), &sst_meta); + assert_eq!( + expected_num_rows, + reader + .row_groups() + .await + .iter() + .map(|g| g.num_rows()) + .collect::>() + ); + + let mut stream = reader.read().await.unwrap(); + let mut expect_rows = vec![]; + for counter in &[9, 7, 5, 3, 1] { + expect_rows.push(build_row(b"a", 100 + counter, 10.0, "v4")); + expect_rows.push(build_row(b"b", 100 + counter, 10.0, "v4")); + expect_rows.push(build_row(b"c", 100 + counter, 10.0, "v4")); + } + check_stream(&mut stream, expect_rows).await; + }); + } +} diff --git a/analytic_engine/src/sst/parquet/encoding.rs b/analytic_engine/src/sst/parquet/encoding.rs new file mode 100644 index 0000000000..ddb916b14d --- /dev/null +++ b/analytic_engine/src/sst/parquet/encoding.rs @@ -0,0 +1,152 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +use std::convert::TryFrom; + +use arrow_deps::parquet::file::metadata::KeyValue; +use common_types::bytes::{BytesMut, MemBufMut, Writer}; +use common_util::define_result; +use proto::sst::SstMetaData as SstMetaDataPb; +use protobuf::Message; +use snafu::{ensure, Backtrace, OptionExt, ResultExt, Snafu}; + +use crate::sst::file::SstMetaData; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "Failed to encode sst meta data, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + EncodeIntoPb { + source: protobuf::ProtobufError, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to decode sst meta data, base64 of meta value:{}, err:{}.\nBacktrace:\n{}", + meta_value, + source, + backtrace, + ))] + DecodeFromPb { + meta_value: String, + source: protobuf::ProtobufError, + backtrace: Backtrace, + }, + + #[snafu(display( + "Invalid meta key, expect:{}, given:{}.\nBacktrace:\n{}", + expect, + given, + backtrace + ))] + InvalidMetaKey { + expect: String, + given: String, + backtrace: Backtrace, + }, + + #[snafu(display("Base64 meta value not found.\nBacktrace:\n{}", backtrace))] + Base64MetaValueNotFound { backtrace: Backtrace }, + + #[snafu(display( + "Invalid base64 meta value length, base64 of meta value:{}.\nBacktrace:\n{}", + meta_value, + backtrace, + ))] + InvalidBase64MetaValueLen { + meta_value: String, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to decode base64 meta value, base64 of meta value:{}, err:{}", + meta_value, + source + ))] + DecodeBase64MetaValue { + meta_value: String, + source: base64::DecodeError, + }, + + #[snafu(display( + "Invalid meta value length, base64 of meta value:{}.\nBacktrace:\n{}", + meta_value, + backtrace + ))] + InvalidMetaValueLen { + meta_value: String, + backtrace: Backtrace, + }, + + #[snafu(display( + "Invalid meta value header, base64 of meta value:{}.\nBacktrace:\n{}", + meta_value, + backtrace + ))] + InvalidMetaValueHeader { + meta_value: String, + backtrace: Backtrace, + }, + + 
#[snafu(display("Failed to convert sst meta data from protobuf, err:{}", source))] + ConvertSstMetaData { source: crate::sst::file::Error }, +} + +define_result!(Error); + +pub const META_KEY: &str = "meta"; +pub const META_VALUE_HEADER: u8 = 0; + +/// Encode the sst meta data into binary key value pair. +pub fn encode_sst_meta_data(meta_data: SstMetaData) -> Result { + let meta_data_pb = SstMetaDataPb::from(meta_data); + + let mut buf = BytesMut::with_capacity(meta_data_pb.compute_size() as usize + 1); + buf.write_u8(META_VALUE_HEADER) + .expect("Should write header into the buffer successfully"); + + // encode the sst meta data into protobuf binary + { + let mut writer = Writer::new(&mut buf); + meta_data_pb + .write_to_writer(&mut writer) + .context(EncodeIntoPb)?; + } + Ok(KeyValue { + key: META_KEY.to_string(), + value: Some(base64::encode(buf.as_ref())), + }) +} + +/// Decode the sst meta data from the binary key value pair. +pub fn decode_sst_meta_data(kv: &KeyValue) -> Result { + ensure!( + kv.key == META_KEY, + InvalidMetaKey { + expect: META_KEY, + given: &kv.key, + } + ); + + let meta_value = kv.value.as_ref().context(Base64MetaValueNotFound)?; + ensure!( + !meta_value.is_empty(), + InvalidBase64MetaValueLen { meta_value } + ); + + let raw_bytes = base64::decode(meta_value).context(DecodeBase64MetaValue { meta_value })?; + + ensure!(!raw_bytes.is_empty(), InvalidMetaValueLen { meta_value }); + + ensure!( + raw_bytes[0] == META_VALUE_HEADER, + InvalidMetaValueHeader { meta_value } + ); + + let meta_data_pb: SstMetaDataPb = + Message::parse_from_bytes(&raw_bytes[1..]).context(DecodeFromPb { meta_value })?; + + SstMetaData::try_from(meta_data_pb).context(ConvertSstMetaData) +} diff --git a/analytic_engine/src/sst/parquet/mod.rs b/analytic_engine/src/sst/parquet/mod.rs new file mode 100644 index 0000000000..aaf82e4671 --- /dev/null +++ b/analytic_engine/src/sst/parquet/mod.rs @@ -0,0 +1,7 @@ +// Copyright 2022 CeresDB Project Authors. 
Licensed under Apache-2.0. + +//! Sst implementation based on parquet. + +pub mod builder; +pub mod encoding; +pub mod reader; diff --git a/analytic_engine/src/sst/parquet/reader.rs b/analytic_engine/src/sst/parquet/reader.rs new file mode 100644 index 0000000000..f515855ff7 --- /dev/null +++ b/analytic_engine/src/sst/parquet/reader.rs @@ -0,0 +1,371 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Sst reader implementation based on parquet. + +use std::{ + fs::File, + pin::Pin, + sync::Arc, + task::{Context, Poll}, + time::Instant, +}; + +use arrow_deps::{ + arrow::{error::Result as ArrowResult, record_batch::RecordBatch}, + parquet::{ + arrow::{ArrowReader, ParquetFileArrowReader}, + file::{metadata::RowGroupMetaData, reader::FileReader}, + }, +}; +use async_trait::async_trait; +use common_types::{ + projected_schema::{ProjectedSchema, RowProjector}, + record_batch::{ArrowRecordBatchProjector, RecordBatchWithKey}, + schema::Schema, +}; +use common_util::runtime::Runtime; +use futures::Stream; +use log::{debug, error, trace}; +use object_store::{path::ObjectStorePath, ObjectStore}; +use parquet::{ + reverse_reader::Builder as ReverseRecordBatchReaderBuilder, CachableSerializedFileReader, + DataCacheRef, MetaCacheRef, +}; +use snafu::{ensure, OptionExt, ResultExt}; +use table_engine::predicate::PredicateRef; +use tokio::sync::mpsc::{self, Receiver, Sender}; + +use crate::sst::{ + factory::SstReaderOptions, + file::SstMetaData, + parquet::encoding, + reader::{error::*, SstReader}, +}; + +const DEFAULT_CHANNEL_CAP: usize = 1000; + +pub async fn read_sst_meta( + storage: &S, + path: &S::Path, + meta_cache: &Option, + data_cache: &Option, +) -> Result<(CachableSerializedFileReader, SstMetaData)> { + let file = storage + .get(path) + .await + .map_err(|e| Box::new(e) as _) + .with_context(|| ReadPersist { + path: path.display(), + })?; + + // generate the file reader + let file_reader = CachableSerializedFileReader::new( + path.display(), 
+ file, + meta_cache.clone(), + data_cache.clone(), + ) + .map_err(|e| Box::new(e) as _) + .with_context(|| ReadPersist { + path: path.display(), + })?; + + // parse sst meta data + let sst_meta = { + let kv_metas = file_reader + .metadata() + .file_metadata() + .key_value_metadata() + .as_ref() + .context(SstMetaNotFound)?; + + ensure!(!kv_metas.is_empty(), EmptySstMeta); + + encoding::decode_sst_meta_data(&kv_metas[0]) + .map_err(|e| Box::new(e) as _) + .context(DecodeSstMeta)? + }; + + Ok((file_reader, sst_meta)) +} + +/// The implementation of sst based on parquet and object storage. +pub struct ParquetSstReader<'a, S: ObjectStore> { + /// The path where the data is persisted. + path: &'a S::Path, + /// The storage where the data is persist. + storage: &'a S, + projected_schema: ProjectedSchema, + predicate: PredicateRef, + meta_data: Option, + file_reader: Option>, + /// The batch of rows in one `record_batch`. + batch_size: usize, + /// Read the rows in reverse order. + reverse: bool, + channel_cap: usize, + + meta_cache: Option, + data_cache: Option, + + runtime: Arc, +} + +impl<'a, S: ObjectStore> ParquetSstReader<'a, S> { + pub fn new(path: &'a S::Path, storage: &'a S, options: &SstReaderOptions) -> Self { + Self { + path, + storage, + projected_schema: options.projected_schema.clone(), + predicate: options.predicate.clone(), + meta_data: None, + file_reader: None, + batch_size: options.read_batch_row_num, + reverse: options.reverse, + channel_cap: DEFAULT_CHANNEL_CAP, + meta_cache: options.meta_cache.clone(), + data_cache: options.data_cache.clone(), + runtime: options.runtime.clone(), + } + } +} + +impl<'a, S: ObjectStore> ParquetSstReader<'a, S> { + async fn init_if_necessary(&mut self) -> Result<()> { + if self.meta_data.is_some() { + return Ok(()); + } + + let (file_reader, sst_meta) = + read_sst_meta(self.storage, self.path, &self.meta_cache, &self.data_cache).await?; + + self.file_reader = Some(file_reader); + self.meta_data = Some(sst_meta); + + 
Ok(()) + } + + fn read_record_batches(&mut self, tx: Sender>) -> Result<()> { + let path = self.path.display(); + ensure!(self.file_reader.is_some(), ReadAgain { path }); + + let file_reader = self.file_reader.take().unwrap(); + let batch_size = self.batch_size; + let schema = { + let meta_data = self.meta_data.as_ref().unwrap(); + meta_data.schema.clone() + }; + let projected_schema = self.projected_schema.clone(); + let row_projector = projected_schema + .try_project_with_key(&schema) + .map_err(|e| Box::new(e) as _) + .context(Projection)?; + let predicate = self.predicate.clone(); + let reverse = self.reverse; + + let _ = self.runtime.spawn_blocking(move || { + debug!( + "begin reading record batch from the sst:{}, predicate:{:?}, projection:{:?}", + path, predicate, projected_schema, + ); + + let mut send_failed = false; + let send = |v| -> Result<()> { + tx.blocking_send(v) + .map_err(|e| { + send_failed = true; + Box::new(e) as _ + }) + .context(Other)?; + Ok(()) + }; + + let reader = ProjectAndFilterReader { + file_path: path.clone(), + file_reader: Some(file_reader), + schema, + projected_schema, + row_projector, + predicate, + batch_size, + reverse, + }; + + let start_fetch = Instant::now(); + match reader.fetch_and_send_record_batch(send) { + Ok(row_num) => { + debug!( + "finish reading record batch({} rows) from the sst:{}, time cost:{:?}", + row_num, + path, + start_fetch.elapsed(), + ); + } + Err(e) => { + if send_failed { + error!("fail to send the fetched record batch result, err:{}", e); + } else { + error!( + "failed to read record batch from the sst:{}, err:{}", + path, e + ); + let _ = tx.blocking_send(Err(e)); + } + } + } + }); + + Ok(()) + } + + #[cfg(test)] + pub(crate) async fn row_groups(&mut self) -> &[RowGroupMetaData] { + self.init_if_necessary().await.unwrap(); + self.file_reader.as_ref().unwrap().metadata().row_groups() + } +} + +/// A reader for projection and filter on the parquet file. 
+struct ProjectAndFilterReader { + file_path: String, + file_reader: Option>, + schema: Schema, + projected_schema: ProjectedSchema, + row_projector: RowProjector, + predicate: PredicateRef, + batch_size: usize, + reverse: bool, +} + +impl ProjectAndFilterReader { + fn build_row_group_predicate(&self) -> Box bool + 'static> { + assert!(self.file_reader.is_some()); + + let row_groups = self.file_reader.as_ref().unwrap().metadata().row_groups(); + let filter_results = self.predicate.filter_row_groups(&self.schema, row_groups); + + trace!("Finish build row group predicate, predicate:{:?}, schema:{:?}, filter_results:{:?}, row_groups meta data:{:?}", self.predicate, self.schema, filter_results, row_groups); + + Box::new(move |_, idx: usize| filter_results[idx]) + } + + /// Generate the reader which has processed projection and filter. + /// This `file_reader` is consumed after calling this method. + fn project_and_filter_reader( + &mut self, + ) -> Result>>> { + assert!(self.file_reader.is_some()); + + let row_group_predicate = self.build_row_group_predicate(); + let mut file_reader = self.file_reader.take().unwrap(); + file_reader.filter_row_groups(&row_group_predicate); + + if self.reverse { + let mut builder = + ReverseRecordBatchReaderBuilder::new(Arc::new(file_reader), self.batch_size); + if !self.projected_schema.is_all_projection() { + builder = builder.projection(Some(self.row_projector.existed_source_projection())); + } + + let reverse_reader = builder + .build() + .map_err(|e| Box::new(e) as _) + .context(DecodeRecordBatch)?; + + Ok(Box::new(reverse_reader)) + } else { + let mut arrow_reader = ParquetFileArrowReader::new(Arc::new(file_reader)); + + let reader = if self.projected_schema.is_all_projection() { + arrow_reader.get_record_reader(self.batch_size) + } else { + let projection = self.row_projector.existed_source_projection(); + arrow_reader.get_record_reader_by_columns(projection, self.batch_size) + }; + let reader = reader + .map_err(|e| Box::new(e) 
as _) + .context(DecodeRecordBatch)?; + + Ok(Box::new(reader)) + } + } + + /// Fetch the record batch from the `reader` and send them. + /// Returns the fetched row number. + fn fetch_and_send_record_batch( + mut self, + mut send: impl FnMut(Result) -> Result<()>, + ) -> Result { + let reader = self.project_and_filter_reader()?; + + let arrow_record_batch_projector = ArrowRecordBatchProjector::from(self.row_projector); + let mut row_num = 0; + for record_batch in reader { + trace!( + "Fetch one record batch from sst:{}, num_rows:{:?}", + self.file_path, + record_batch.as_ref().map(|v| v.num_rows()) + ); + + match record_batch + .map_err(|e| Box::new(e) as _) + .context(DecodeRecordBatch) + { + Ok(record_batch) => { + row_num += record_batch.num_rows(); + + let record_batch_with_key = arrow_record_batch_projector + .project_to_record_batch_with_key(record_batch) + .map_err(|e| Box::new(e) as _) + .context(DecodeRecordBatch); + + send(record_batch_with_key)?; + } + Err(e) => { + send(Err(e))?; + break; + } + }; + } + + Ok(row_num) + } +} + +struct RecordBatchReceiver { + rx: Receiver>, +} + +impl Stream for RecordBatchReceiver { + type Item = Result; + + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + self.as_mut().rx.poll_recv(cx) + } +} + +#[async_trait] +impl<'a, S: ObjectStore> SstReader for ParquetSstReader<'a, S> { + async fn meta_data(&mut self) -> Result<&SstMetaData> { + self.init_if_necessary().await?; + Ok(self.meta_data.as_ref().unwrap()) + } + + // TODO(yingwen): Project the schema in parquet + async fn read( + &mut self, + ) -> Result> + Send + Unpin>> { + debug!( + "read sst:{}, projected_schema:{:?}, predicate:{:?}", + self.path.display(), + self.projected_schema, + self.predicate + ); + + self.init_if_necessary().await?; + let (tx, rx) = mpsc::channel::>(self.channel_cap); + self.read_record_batches(tx)?; + + Ok(Box::new(RecordBatchReceiver { rx })) + } +} diff --git a/analytic_engine/src/sst/reader.rs 
b/analytic_engine/src/sst/reader.rs new file mode 100644 index 0000000000..ab76c9a044 --- /dev/null +++ b/analytic_engine/src/sst/reader.rs @@ -0,0 +1,90 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Sst reader trait definition. + +use async_trait::async_trait; +use common_types::record_batch::RecordBatchWithKey; +use futures::Stream; + +use crate::sst::file::SstMetaData; + +pub mod error { + use common_util::define_result; + use snafu::{Backtrace, Snafu}; + + #[derive(Debug, Snafu)] + #[snafu(visibility(pub))] + pub enum Error { + #[snafu(display("Try to read again, path:{}.\nBacktrace:\n{}", path, backtrace))] + ReadAgain { backtrace: Backtrace, path: String }, + + #[snafu(display("Fail to read persisted file, path:{}, err:{}", path, source))] + ReadPersist { + path: String, + source: Box, + }, + + #[snafu(display("Failed to decode record batch, err:{}", source))] + DecodeRecordBatch { + source: Box, + }, + + #[snafu(display("Failed to decode sst meta data, err:{}", source))] + DecodeSstMeta { + source: Box, + }, + + #[snafu(display("Sst meta data is not found.\nBacktrace:\n{}", backtrace))] + SstMetaNotFound { backtrace: Backtrace }, + + #[snafu(display("Fail to projection, err:{}", source))] + Projection { + source: Box, + }, + + #[snafu(display("Sst meta data is empty.\nBacktrace:\n{}", backtrace))] + EmptySstMeta { backtrace: Backtrace }, + + #[snafu(display("Other kind of error:{}", source))] + Other { + source: Box, + }, + } + + define_result!(Error); +} + +pub use error::*; + +#[async_trait] +pub trait SstReader { + async fn meta_data(&mut self) -> Result<&SstMetaData>; + + async fn read( + &mut self, + ) -> Result> + Send + Unpin>>; +} + +#[cfg(test)] +pub mod tests { + use common_types::row::Row; + use futures::StreamExt; + + use super::*; + + pub async fn check_stream(stream: &mut S, expected_rows: Vec) + where + S: Stream> + Unpin, + { + let mut visited_rows = 0; + while let Some(batch) = stream.next().await { + let 
batch = batch.unwrap(); + for row_idx in 0..batch.num_rows() { + assert_eq!(batch.clone_row_at(row_idx), expected_rows[visited_rows]); + visited_rows += 1; + } + } + + assert_eq!(visited_rows, expected_rows.len()); + } +} diff --git a/analytic_engine/src/table/data.rs b/analytic_engine/src/table/data.rs new file mode 100644 index 0000000000..88dde35166 --- /dev/null +++ b/analytic_engine/src/table/data.rs @@ -0,0 +1,713 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Table data + +use std::{ + collections::HashMap, + convert::TryInto, + sync::{ + atomic::{AtomicBool, AtomicU32, AtomicU64, Ordering}, + Arc, Mutex, + }, + time::Duration, +}; + +use arc_swap::ArcSwap; +use arena::CollectorRef; +use common_types::{ + schema::{Schema, Version}, + time::{TimeRange, Timestamp}, + SequenceNumber, +}; +use common_util::define_result; +use log::{debug, info}; +use object_store::path::ObjectStorePath; +use snafu::{Backtrace, OptionExt, ResultExt, Snafu}; +use table_engine::{engine::CreateTableRequest, table::TableId}; +use wal::manager::RegionId; + +use crate::{ + instance::write_worker::{WorkerLocal, WriteHandle}, + memtable::{ + factory::{FactoryRef as MemTableFactoryRef, Options as MemTableOptions}, + skiplist::factory::SkiplistMemTableFactory, + }, + meta::meta_update::AddTableMeta, + space::SpaceId, + sst::{factory::SstType, file::FilePurger, manager::FileId}, + table::{ + metrics::Metrics, + sst_util, + version::{MemTableForWrite, MemTableState, SamplingMemTable, TableVersion}, + }, + TableOptions, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to create memtable, err:{}", source))] + CreateMemTable { + source: crate::memtable::factory::Error, + }, + + #[snafu(display( + "Failed to find or create memtable, timestamp overflow, timestamp:{:?}, duration:{:?}.\nBacktrace:\n{}", + timestamp, + duration, + backtrace, + ))] + TimestampOverflow { + timestamp: Timestamp, + duration: Duration, + backtrace: Backtrace, + 
}, + + #[snafu(display("Failed to find memtable for write, err:{}", source))] + FindMemTable { + source: crate::table::version::Error, + }, +} + +define_result!(Error); + +pub type MemTableId = u64; + +/// Data of a table +pub struct TableData { + /// Id of this table + pub id: TableId, + /// Name of this table + pub name: String, + /// Schema of this table + schema: Mutex, + /// Space id of this table + pub space_id: SpaceId, + /// The sst type of this table + pub sst_type: SstType, + + /// Mutable memtable memory size limitation + mutable_limit: AtomicU32, + /// Options of this table. + /// + /// Most modification to `opts` can be done by replacing the old options + /// with a new one. However, altering the segment duration should be done + /// carefully to avoid the reader seeing inconsistent segment duration + /// and memtables/ssts during query/compaction/flush . + opts: ArcSwap, + /// MemTable factory of this table + memtable_factory: MemTableFactoryRef, + /// Space memtable memory usage collector + mem_usage_collector: CollectorRef, + + /// Current table version + current_version: TableVersion, + /// Last sequence visible to the reads + /// + /// Write to last_sequence should be guarded by a mutex and only done by + /// single writer, but reads are allowed to be done concurrently without + /// mutex protected + last_sequence: AtomicU64, + /// Handle to the write worker + pub write_handle: WriteHandle, + /// Auto incremented id to track memtable, reset on engine open + /// + /// Allocating memtable id should be guarded by write lock + last_memtable_id: AtomicU64, + + /// Last id of the sst file + /// + /// Write to last_file_id require external synchronization + last_file_id: AtomicU64, + + /// Flag denoting whether the table is dropped + /// + /// No write/alter is allowed if the table is dropped. + dropped: AtomicBool, + + /// Metrics of this table. 
+ pub metrics: Metrics, +} + +impl Drop for TableData { + fn drop(&mut self) { + debug!("TableData is dropped, id:{}, name:{}", self.id, self.name); + } +} + +#[inline] +fn get_mutable_limit(opts: &TableOptions) -> u32 { + opts.write_buffer_size * 7 / 8 +} + +impl TableData { + /// Create a new TableData + /// + /// This function should only be called when a new table is creating and + /// there is no existing data of the table + pub fn new( + space_id: SpaceId, + request: CreateTableRequest, + write_handle: WriteHandle, + table_opts: TableOptions, + purger: &FilePurger, + mem_usage_collector: CollectorRef, + ) -> Result { + // FIXME(yingwen): Validate TableOptions, such as bucket_duration >= + // segment_duration and bucket_duration is aligned to segment_duration + + let memtable_factory = Arc::new(SkiplistMemTableFactory); + let purge_queue = purger.create_purge_queue(space_id, request.table_id); + let current_version = TableVersion::new(purge_queue); + let metrics = Metrics::new(&request.table_name); + + Ok(Self { + id: request.table_id, + name: request.table_name, + schema: Mutex::new(request.table_schema), + space_id, + // TODO(xikai): sst type should be decided by the `request`. 
+ sst_type: SstType::Parquet, + mutable_limit: AtomicU32::new(get_mutable_limit(&table_opts)), + opts: ArcSwap::new(Arc::new(table_opts)), + memtable_factory, + mem_usage_collector, + current_version, + last_sequence: AtomicU64::new(0), + write_handle, + last_memtable_id: AtomicU64::new(0), + last_file_id: AtomicU64::new(0), + dropped: AtomicBool::new(false), + metrics, + }) + } + + /// Recover table from add table meta + /// + /// This won't recover the sequence number, which will be set after the wal is replayed + pub fn recover_from_add( + add_meta: AddTableMeta, + write_handle: WriteHandle, + purger: &FilePurger, + mem_usage_collector: CollectorRef, + ) -> Result { + let memtable_factory = Arc::new(SkiplistMemTableFactory); + let purge_queue = purger.create_purge_queue(add_meta.space_id, add_meta.table_id); + let current_version = TableVersion::new(purge_queue); + let metrics = Metrics::new(&add_meta.table_name); + + Ok(Self { + id: add_meta.table_id, + name: add_meta.table_name, + schema: Mutex::new(add_meta.schema), + space_id: add_meta.space_id, + // TODO(xikai): it should be recovered from `add_meta` struct. + sst_type: SstType::Parquet, + mutable_limit: AtomicU32::new(get_mutable_limit(&add_meta.opts)), + opts: ArcSwap::new(Arc::new(add_meta.opts)), + memtable_factory, + mem_usage_collector, + current_version, + last_sequence: AtomicU64::new(0), + write_handle, + last_memtable_id: AtomicU64::new(0), + last_file_id: AtomicU64::new(0), + dropped: AtomicBool::new(false), + metrics, + }) + } + + /// Get current schema of the table. + pub fn schema(&self) -> Schema { + self.schema.lock().unwrap().clone() + } + + /// Set current schema of the table. + pub fn set_schema(&self, schema: Schema) { + *self.schema.lock().unwrap() = schema; + } + + /// Get current version of schema. 
+ pub fn schema_version(&self) -> Version { + self.schema.lock().unwrap().version() + } + + /// Get current table version + #[inline] + pub fn current_version(&self) -> &TableVersion { + &self.current_version + } + + /// Get the wal region id of this table + /// + /// Now we just use table id as region id + #[inline] + pub fn wal_region_id(&self) -> RegionId { + self.id.as_u64() + } + + /// Get last sequence number + #[inline] + pub fn last_sequence(&self) -> SequenceNumber { + self.last_sequence.load(Ordering::Acquire) + } + + /// Set last sequence number + #[inline] + pub fn set_last_sequence(&self, seq: SequenceNumber) { + self.last_sequence.store(seq, Ordering::Release); + } + + #[inline] + pub fn table_options(&self) -> Arc { + self.opts.load().clone() + } + + /// Update table options. + /// + /// REQUIRE: The write lock is held. + #[inline] + pub fn set_table_options(&self, _write_lock: &WorkerLocal, opts: TableOptions) { + self.mutable_limit + .store(get_mutable_limit(&opts), Ordering::Relaxed); + self.opts.store(Arc::new(opts)) + } + + #[inline] + pub fn is_dropped(&self) -> bool { + self.dropped.load(Ordering::SeqCst) + } + + /// Set the table is dropped and forbid any writes/alter on this table. + #[inline] + pub fn set_dropped(&self) { + self.dropped.store(true, Ordering::SeqCst); + } + + /// Returns total memtable memory usage in bytes. + #[inline] + pub fn memtable_memory_usage(&self) -> usize { + self.current_version.total_memory_usage() + } + + /// Find memtable for given timestamp to insert, create if not exists + /// + /// If the memtable schema is outdated, switch all memtables and create the + /// needed mutable memtable by current schema. 
The returned memtable is + /// guaranteed to have same schema of current table + /// + /// REQUIRE: The write lock is held + pub fn find_or_create_mutable( + &self, + write_lock: &WorkerLocal, + timestamp: Timestamp, + table_schema: &Schema, + ) -> Result { + let schema_version = table_schema.version(); + let last_sequence = self.last_sequence(); + + if let Some(mem) = self + .current_version + .memtable_for_write(write_lock, timestamp, schema_version) + .context(FindMemTable)? + { + return Ok(mem); + } + + // Mutable memtable for this timestamp not found, need to create a new one. + let table_options = self.table_options(); + let memtable_opts = MemTableOptions { + schema: table_schema.clone(), + arena_block_size: table_options.arena_block_size, + creation_sequence: last_sequence, + collector: self.mem_usage_collector.clone(), + }; + let mem = self + .memtable_factory + .create_memtable(memtable_opts) + .context(CreateMemTable)?; + + match table_options.segment_duration() { + Some(segment_duration) => { + let time_range = TimeRange::bucket_of(timestamp, segment_duration).context( + TimestampOverflow { + timestamp, + duration: segment_duration, + }, + )?; + let mem_state = MemTableState { + mem, + time_range, + id: self.alloc_memtable_id(), + }; + + // Insert memtable into mutable memtables of current version. + self.current_version.insert_mutable(mem_state.clone()); + + Ok(MemTableForWrite::Normal(mem_state)) + } + None => { + let sampling_mem = SamplingMemTable::new(mem, self.alloc_memtable_id()); + + // Set sampling memtables of current version. 
+ self.current_version.set_sampling(sampling_mem.clone()); + + Ok(MemTableForWrite::Sampling(sampling_mem)) + } + } + } + + /// Returns true if the memory usage of this table reaches flush threshold + /// + /// REQUIRE: Do in write worker + pub fn should_flush_table(&self, _worker_local: &WorkerLocal) -> bool { + // Fall back to usize::MAX if it fails to convert write_buffer_size into + // usize (overflow) + let max_write_buffer_size = self + .table_options() + .write_buffer_size + .try_into() + .unwrap_or(usize::MAX); + let mutable_limit = self + .mutable_limit + .load(Ordering::Relaxed) + .try_into() + .unwrap_or(usize::MAX); + + let mutable_usage = self.current_version.mutable_memory_usage(); + let total_usage = self.current_version.total_memory_usage(); + + // Inspired by https://github.com/facebook/rocksdb/blob/main/include/rocksdb/write_buffer_manager.h#L94 + if mutable_usage > mutable_limit { + info!( + "TableData should flush, table:{}, table_id:{}, mutable_usage:{}, mutable_limit: {}, total_usage:{}, max_write_buffer_size:{}", + self.name, self.id, mutable_usage, mutable_limit, total_usage, max_write_buffer_size + ); + return true; + } + + // If the memory exceeds the buffer size, we trigger more aggressive + // flush. But if already more than half memory is being flushed, + // triggering more flush may not help. We will hold it instead. 
+ let should_flush = + total_usage >= max_write_buffer_size && mutable_usage >= max_write_buffer_size / 2; + + debug!( + "Check should flush, table:{}, table_id:{}, mutable_usage:{}, mutable_limit: {}, total_usage:{}, max_write_buffer_size:{}", + self.name, self.id, mutable_usage, mutable_limit, total_usage, max_write_buffer_size + ); + + if should_flush { + info!( + "TableData should flush, table:{}, table_id:{}, mutable_usage:{}, mutable_limit: {}, total_usage:{}, max_write_buffer_size:{}", + self.name, self.id, mutable_usage, mutable_limit, total_usage, max_write_buffer_size + ); + } + + should_flush + } + + /// Set `last_file_id`, mainly used in recover + /// + /// This operation require external synchronization + pub fn set_last_file_id(&self, last_file_id: FileId) { + self.last_file_id.store(last_file_id, Ordering::Relaxed); + } + + /// Returns the last file id + pub fn last_file_id(&self) -> FileId { + self.last_file_id.load(Ordering::Relaxed) + } + + /// Alloc a file id for a new file + pub fn alloc_file_id(&self) -> FileId { + let last = self.last_file_id.fetch_add(1, Ordering::Relaxed); + last + 1 + } + + /// Set the sst file path into the object storage path. 
+ pub fn set_sst_file_path(&self, file_id: FileId, path: &mut impl ObjectStorePath) { + sst_util::set_sst_file_path(self.space_id, self.id, file_id, path) + } + + /// Allocate next memtable id + fn alloc_memtable_id(&self) -> MemTableId { + let last = self.last_memtable_id.fetch_add(1, Ordering::Relaxed); + last + 1 + } + + /// Returns last memtable id + pub fn last_memtable_id(&self) -> MemTableId { + self.last_memtable_id.load(Ordering::Relaxed) + } + + pub fn dedup(&self) -> bool { + self.table_options().need_dedup() + } + + pub fn is_expired(&self, timestamp: Timestamp) -> bool { + self.table_options().is_expired(timestamp) + } +} + +/// Table data reference +pub type TableDataRef = Arc; + +/// Manages TableDataRef +pub struct TableDataSet { + /// Name to table data + table_datas: HashMap, + /// Id to table data + id_to_tables: HashMap, +} + +impl TableDataSet { + /// Create an empty TableDataSet + pub fn new() -> Self { + Self { + table_datas: HashMap::new(), + id_to_tables: HashMap::new(), + } + } + + /// Insert if absent, if successfully inserted, return true and return + /// false if the data already exists + pub fn insert_if_absent(&mut self, table_data_ref: TableDataRef) -> bool { + let table_name = &table_data_ref.name; + if self.table_datas.contains_key(table_name) { + return false; + } + self.table_datas + .insert(table_name.to_string(), table_data_ref.clone()); + self.id_to_tables.insert(table_data_ref.id, table_data_ref); + true + } + + /// Find table by table name + pub fn find_table(&self, table_name: &str) -> Option { + self.table_datas.get(table_name).cloned() + } + + /// Find table by table id + pub fn find_table_by_id(&self, table_id: TableId) -> Option { + self.id_to_tables.get(&table_id).cloned() + } + + /// Remove table by table name + pub fn remove_table(&mut self, table_name: &str) -> Option { + let table = self.table_datas.remove(table_name)?; + self.id_to_tables.remove(&table.id); + Some(table) + } + + /// Returns the total table num in 
this set + pub fn table_num(&self) -> usize { + self.table_datas.len() + } + + /// Find the table which consumes maximum memtable memory usag. + pub fn find_maximum_memory_usage_table(&self) -> Option { + self.table_datas + .values() + .max_by_key(|t| t.memtable_memory_usage()) + .cloned() + } + + /// List all tables to `tables` + pub fn list_all_tables(&self, tables: &mut Vec) { + for table_data in self.table_datas.values().cloned() { + tables.push(table_data); + } + } +} + +impl Default for TableDataSet { + fn default() -> Self { + Self::new() + } +} + +#[cfg(test)] +pub mod tests { + use std::sync::Arc; + + use arena::NoopCollector; + use common_types::datum::DatumKind; + use common_util::config::ReadableDuration; + use table_engine::engine::TableState; + + use super::*; + use crate::{ + instance::write_worker::tests::WriteHandleMocker, + memtable::{factory::Factory, MemTableRef}, + sst::file::tests::FilePurgerMocker, + table_options, + tests::table, + }; + + const DEFAULT_SPACE_ID: SpaceId = 1; + + fn default_schema() -> Schema { + table::create_schema_builder( + &[("key", DatumKind::Timestamp)], + &[("value", DatumKind::Double)], + ) + .build() + .unwrap() + } + + #[derive(Default)] + pub struct MemTableMocker; + + impl MemTableMocker { + pub fn build(&self) -> MemTableRef { + let memtable_opts = MemTableOptions { + schema: default_schema(), + arena_block_size: 1024 * 1024, + creation_sequence: 1000, + collector: Arc::new(NoopCollector), + }; + + let factory = SkiplistMemTableFactory; + factory.create_memtable(memtable_opts).unwrap() + } + } + + #[must_use] + pub struct TableDataMocker { + table_id: TableId, + table_name: String, + write_handle: Option, + } + + impl TableDataMocker { + pub fn table_id(mut self, table_id: TableId) -> Self { + self.table_id = table_id; + self + } + + pub fn table_name(mut self, table_name: String) -> Self { + self.table_name = table_name; + self + } + + pub fn write_handle(mut self, write_handle: WriteHandle) -> Self { + 
self.write_handle = Some(write_handle); + self + } + + pub fn build(self) -> TableData { + let space_id = DEFAULT_SPACE_ID; + let table_schema = default_schema(); + let create_request = CreateTableRequest { + catalog_name: "test_catalog".to_string(), + schema_name: "public".to_string(), + table_id: self.table_id, + table_name: self.table_name, + table_schema, + partition_info: None, + engine: table_engine::ANALYTIC_ENGINE_TYPE.to_string(), + options: HashMap::new(), + state: TableState::Stable, + }; + + let write_handle = self.write_handle.unwrap_or_else(|| { + let mocked_write_handle = WriteHandleMocker::default().space_id(space_id).build(); + mocked_write_handle.write_handle + }); + let table_opts = TableOptions::default(); + let purger = FilePurgerMocker::mock(); + let collector = Arc::new(NoopCollector); + + TableData::new( + space_id, + create_request, + write_handle, + table_opts, + &purger, + collector, + ) + .unwrap() + } + } + + impl Default for TableDataMocker { + fn default() -> Self { + Self { + table_id: table::new_table_id(2, 1), + table_name: "mocked_table".to_string(), + write_handle: None, + } + } + } + + #[test] + fn test_new_table_data() { + let table_id = table::new_table_id(100, 30); + let table_name = "new_table".to_string(); + let table_data = TableDataMocker::default() + .table_id(table_id) + .table_name(table_name.clone()) + .build(); + + assert_eq!(table_id, table_data.id); + assert_eq!(table_name, table_data.name); + assert_eq!(table_data.id.as_u64(), table_data.wal_region_id()); + assert_eq!(0, table_data.last_sequence()); + assert!(!table_data.is_dropped()); + assert_eq!(0, table_data.last_file_id()); + assert_eq!(0, table_data.last_memtable_id()); + assert!(table_data.dedup()); + } + + #[test] + fn test_find_or_create_mutable() { + let mocked_write_handle = WriteHandleMocker::default() + .space_id(DEFAULT_SPACE_ID) + .build(); + let table_data = TableDataMocker::default() + .write_handle(mocked_write_handle.write_handle) + .build(); + 
let worker_local = mocked_write_handle.worker_local; + let schema = table_data.schema(); + + // Create sampling memtable. + let zero_ts = Timestamp::new(0); + let mutable = table_data + .find_or_create_mutable(&worker_local, zero_ts, &schema) + .unwrap(); + assert!(mutable.accept_timestamp(zero_ts)); + let sampling_mem = mutable.as_sampling(); + let sampling_id = sampling_mem.id; + assert_eq!(1, sampling_id); + + // Test memtable is reused. + let now_ts = Timestamp::now(); + let mutable = table_data + .find_or_create_mutable(&worker_local, now_ts, &schema) + .unwrap(); + assert!(mutable.accept_timestamp(now_ts)); + let sampling_mem = mutable.as_sampling(); + // Use same sampling memtable. + assert_eq!(sampling_id, sampling_mem.id); + + let current_version = table_data.current_version(); + // Set segment duration manually. + let mut table_opts = (*table_data.table_options()).clone(); + table_opts.segment_duration = + Some(ReadableDuration(table_options::DEFAULT_SEGMENT_DURATION)); + table_data.set_table_options(&worker_local, table_opts); + // Freeze sampling memtable. + current_version.freeze_sampling(&worker_local); + + // A new mutable memtable should be created. + let mutable = table_data + .find_or_create_mutable(&worker_local, now_ts, &schema) + .unwrap(); + assert!(mutable.accept_timestamp(now_ts)); + let mem_state = mutable.as_normal(); + assert_eq!(2, mem_state.id); + let time_range = + TimeRange::bucket_of(now_ts, table_options::DEFAULT_SEGMENT_DURATION).unwrap(); + assert_eq!(time_range, mem_state.time_range); + } +} diff --git a/analytic_engine/src/table/metrics.rs b/analytic_engine/src/table/metrics.rs new file mode 100644 index 0000000000..0a5d801796 --- /dev/null +++ b/analytic_engine/src/table/metrics.rs @@ -0,0 +1,229 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Metrics of table. 
+ +use std::time::Duration; + +use lazy_static::lazy_static; +use prometheus::{ + exponential_buckets, local::LocalHistogram, register_histogram_vec, register_int_counter_vec, + Histogram, HistogramVec, IntCounter, IntCounterVec, +}; + +const KB: f64 = 1024.0; + +lazy_static! { + // Counters: + static ref TABLE_WRITE_REQUEST_COUNTER: IntCounterVec = register_int_counter_vec!( + "table_write_request_counter", + "Write request counter of table", + &["table"] + ) + .unwrap(); + static ref TABLE_WRITE_ROWS_COUNTER: IntCounterVec = register_int_counter_vec!( + "table_write_rows_counter", + "Number of rows wrote to table", + &["table"] + ) + .unwrap(); + static ref TABLE_READ_REQUEST_COUNTER: IntCounterVec = register_int_counter_vec!( + "table_read_request_counter", + "Read request counter of table", + &["table"] + ) + .unwrap(); + // End of counters. + + // Histograms: + // Buckets: 0, 0.002, .., 0.002 * 4^9 + static ref TABLE_FLUSH_DURATION_HISTOGRAM: HistogramVec = register_histogram_vec!( + "table_flush_duration", + "Histogram for flush duration of the table in seconds", + &["table"], + exponential_buckets(0.002, 4.0, 10).unwrap() + ).unwrap(); + // Buckets: 0, 1, .., 2^7 + static ref TABLE_FLUSH_SST_NUM_HISTOGRAM: HistogramVec = register_histogram_vec!( + "table_flush_sst_num", + "Histogram for number of ssts flushed by the table", + &["table"], + exponential_buckets(1.0, 2.0, 8).unwrap() + ).unwrap(); + // Buckets: 0, 1, ..., 4^11 (4GB) + static ref TABLE_FLUSH_SST_SIZE_HISTOGRAM: HistogramVec = register_histogram_vec!( + "table_flush_sst_size", + "Histogram for size of ssts flushed by the table in KB", + &["table"], + exponential_buckets(1.0, 4.0, 12).unwrap() + ).unwrap(); + + // Buckets: 0, 0.02, .., 0.02 * 4^9 + static ref TABLE_COMPACT_DURATION_HISTOGRAM: HistogramVec = register_histogram_vec!( + "table_compaction_duration", + "Histogram for compaction duration of the table in seconds", + &["table"], + exponential_buckets(0.02, 4.0, 10).unwrap() + ).unwrap(); 
+ // Buckets: 0, 1, .., 2^7 + static ref TABLE_COMPACTION_SST_NUM_HISTOGRAM: HistogramVec = register_histogram_vec!( + "table_compaction_sst_num", + "Histogram for number of ssts compacted by the table", + &["table"], + exponential_buckets(1.0, 2.0, 8).unwrap() + ).unwrap(); + // Buckets: 0, 1, ..., 4^11 (4GB) + static ref TABLE_COMPACTION_SST_SIZE_HISTOGRAM: HistogramVec = register_histogram_vec!( + "table_compaction_sst_size", + "Histogram for size of ssts compacted by the table in KB", + &["table", "type"], + exponential_buckets(1.0, 4.0, 12).unwrap() + ).unwrap(); + // Buckets: 0, 1, ..., 10^12 (1 trillion) + static ref TABLE_COMPACTION_SST_ROW_NUM_HISTOGRAM: HistogramVec = register_histogram_vec!( + "table_compaction_sst_row_num", + "Histogram for row num of ssts compacted by the table", + &["table", "type"], + exponential_buckets(1.0, 10.0, 13).unwrap() + ).unwrap(); + + // Buckets: 0, 0.01, .., 0.01 * 2^12 + static ref TABLE_WRITE_STALL_DURATION_HISTOGRAM: HistogramVec = register_histogram_vec!( + "table_write_stall_duration", + "Histogram for write stall duration of the table in seconds", + &["table"], + exponential_buckets(0.01, 2.0, 13).unwrap() + ).unwrap(); + // End of histograms. +} + +/// Table metrics. +/// +/// Now the registered labels won't be removed from the metrics vec to avoid panic +/// on concurrent removal. +pub struct Metrics { + // Counters: + pub write_request_counter: IntCounter, + write_rows_counter: IntCounter, + pub read_request_counter: IntCounter, + // End of counters. + + // Histograms: + pub flush_duration_histogram: Histogram, + flush_sst_num_histogram: Histogram, + flush_sst_size_histogram: Histogram, + + pub compaction_duration_histogram: Histogram, + compaction_sst_num_histogram: Histogram, + compaction_input_sst_size_histogram: Histogram, + compaction_output_sst_size_histogram: Histogram, + compaction_input_sst_row_num_histogram: Histogram, + compaction_output_sst_row_num_histogram: Histogram, + + // Write stall metrics. 
+ write_stall_duration_histogram: Histogram, + // End of histograms. +} + +impl Metrics { + pub fn new(table_name: &str) -> Self { + Self { + write_request_counter: TABLE_WRITE_REQUEST_COUNTER.with_label_values(&[table_name]), + write_rows_counter: TABLE_WRITE_ROWS_COUNTER.with_label_values(&[table_name]), + read_request_counter: TABLE_READ_REQUEST_COUNTER.with_label_values(&[table_name]), + + flush_duration_histogram: TABLE_FLUSH_DURATION_HISTOGRAM + .with_label_values(&[table_name]), + flush_sst_num_histogram: TABLE_FLUSH_SST_NUM_HISTOGRAM.with_label_values(&[table_name]), + flush_sst_size_histogram: TABLE_FLUSH_SST_SIZE_HISTOGRAM + .with_label_values(&[table_name]), + + compaction_duration_histogram: TABLE_COMPACT_DURATION_HISTOGRAM + .with_label_values(&[table_name]), + compaction_sst_num_histogram: TABLE_COMPACTION_SST_NUM_HISTOGRAM + .with_label_values(&[table_name]), + compaction_input_sst_size_histogram: TABLE_COMPACTION_SST_SIZE_HISTOGRAM + .with_label_values(&[table_name, "input"]), + compaction_output_sst_size_histogram: TABLE_COMPACTION_SST_SIZE_HISTOGRAM + .with_label_values(&[table_name, "output"]), + compaction_input_sst_row_num_histogram: TABLE_COMPACTION_SST_ROW_NUM_HISTOGRAM + .with_label_values(&[table_name, "input"]), + compaction_output_sst_row_num_histogram: TABLE_COMPACTION_SST_ROW_NUM_HISTOGRAM + .with_label_values(&[table_name, "output"]), + + write_stall_duration_histogram: TABLE_WRITE_STALL_DURATION_HISTOGRAM + .with_label_values(&[table_name]), + } + } + + #[inline] + pub fn on_write_request_begin(&self) { + self.write_request_counter.inc(); + } + + #[inline] + pub fn on_write_request_done(&self, num_rows: usize) { + self.write_rows_counter.inc_by(num_rows as u64); + } + + #[inline] + pub fn on_read_request_begin(&self) { + self.read_request_counter.inc(); + } + + #[inline] + pub fn on_write_stall(&self, duration: Duration) { + self.write_stall_duration_histogram + .observe(duration.as_secs_f64()); + } + + pub fn 
local_flush_metrics(&self) -> LocalFlushMetrics { + LocalFlushMetrics { + flush_duration_histogram: self.flush_duration_histogram.local(), + flush_sst_num_histogram: self.flush_sst_num_histogram.local(), + flush_sst_size_histogram: self.flush_sst_size_histogram.local(), + } + } + + pub fn compaction_observe_sst_num(&self, sst_num: usize) { + self.compaction_sst_num_histogram.observe(sst_num as f64); + } + + pub fn compaction_observe_input_sst_size(&self, sst_size: u64) { + // Convert bytes to KB. + self.compaction_input_sst_size_histogram + .observe(sst_size as f64 / KB); + } + + pub fn compaction_observe_output_sst_size(&self, sst_size: u64) { + // Convert bytes to KB. + self.compaction_output_sst_size_histogram + .observe(sst_size as f64 / KB); + } + + pub fn compaction_observe_input_sst_row_num(&self, sst_row_num: u64) { + self.compaction_input_sst_row_num_histogram + .observe(sst_row_num as f64); + } + + pub fn compaction_observe_output_sst_row_num(&self, sst_row_num: u64) { + self.compaction_output_sst_row_num_histogram + .observe(sst_row_num as f64); + } +} + +pub struct LocalFlushMetrics { + pub flush_duration_histogram: LocalHistogram, + flush_sst_num_histogram: LocalHistogram, + flush_sst_size_histogram: LocalHistogram, +} + +impl LocalFlushMetrics { + pub fn observe_sst_num(&self, sst_num: usize) { + self.flush_sst_num_histogram.observe(sst_num as f64); + } + + pub fn observe_sst_size(&self, sst_size: u64) { + // Convert bytes to KB. + self.flush_sst_size_histogram.observe(sst_size as f64 / KB); + } +} diff --git a/analytic_engine/src/table/mod.rs b/analytic_engine/src/table/mod.rs new file mode 100644 index 0000000000..0f5598f0c1 --- /dev/null +++ b/analytic_engine/src/table/mod.rs @@ -0,0 +1,270 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Table implementation + +use std::{collections::HashMap, fmt, sync::Arc}; + +use arrow_deps::datafusion::logical_plan::{Column, Expr}; +use async_trait::async_trait; +use common_types::{row::Row, schema::Schema, time::TimeRange}; +use futures::TryStreamExt; +use object_store::ObjectStore; +use snafu::{ensure, OptionExt, ResultExt}; +use table_engine::{ + predicate::Predicate, + stream::{PartitionedStreams, SendableRecordBatchStream}, + table::{ + AlterOptions, AlterSchema, AlterSchemaRequest, Compact, Flush, FlushRequest, Get, + GetInvalidPrimaryKey, GetNullPrimaryKey, GetRequest, ReadOptions, ReadOrder, ReadRequest, + Result, Scan, Table, TableId, TableStats, Write, WriteRequest, + }, +}; +use tokio::sync::oneshot; +use wal::manager::WalManager; + +use crate::{ + instance::{flush_compaction::TableFlushOptions, InstanceRef}, + meta::Manifest, + space::SpaceAndTable, + sst::factory::Factory, +}; + +pub mod data; +pub mod metrics; +pub mod sst_util; +pub mod version; +pub mod version_edit; + +// TODO(yingwen): How to handle drop table? 
+ +/// Table trait implementation +pub struct TableImpl { + /// Space and table info + space_table: SpaceAndTable, + /// Instance + instance: InstanceRef, + /// Engine type + engine_type: String, +} + +impl TableImpl { + pub fn new( + space_table: SpaceAndTable, + instance: InstanceRef, + engine_type: String, + ) -> Self { + Self { + space_table, + instance, + engine_type, + } + } +} + +impl fmt::Debug for TableImpl { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("TableImpl") + .field("space_table", &self.space_table) + .finish() + } +} + +#[async_trait] +impl< + Wal: WalManager + Send + Sync + 'static, + Meta: Manifest + Send + Sync + 'static, + Store: ObjectStore, + Fa: Factory + Send + Sync + 'static, + > Table for TableImpl +{ + fn name(&self) -> &str { + &self.space_table.table_data().name + } + + fn id(&self) -> TableId { + self.space_table.table_data().id + } + + fn schema(&self) -> Schema { + self.space_table.table_data().schema() + } + + fn options(&self) -> HashMap { + self.space_table.table_data().table_options().to_raw_map() + } + + fn engine_type(&self) -> &str { + &self.engine_type + } + + fn stats(&self) -> TableStats { + let metrics = &self.space_table.table_data().metrics; + + TableStats { + num_write: metrics.write_request_counter.get(), + num_read: metrics.read_request_counter.get(), + num_flush: metrics.flush_duration_histogram.get_sample_count(), + } + } + + async fn write(&self, request: WriteRequest) -> Result { + let num_rows = self + .instance + .write_to_table(&self.space_table, request) + .await + .map_err(|e| Box::new(e) as _) + .context(Write { table: self.name() })?; + Ok(num_rows) + } + + async fn read(&self, mut request: ReadRequest) -> Result { + request.opts.read_parallelism = 1; + let mut streams = self + .instance + .partitioned_read_from_table(&self.space_table, request) + .await + .map_err(|e| Box::new(e) as _) + .context(Scan { table: self.name() })?; + + assert_eq!(streams.streams.len(), 1); + 
let stream = streams.streams.pop().unwrap(); + + Ok(stream) + } + + async fn get(&self, request: GetRequest) -> Result> { + let schema = request.projected_schema.to_record_schema_with_key(); + let primary_key_columns = schema.key_columns(); + ensure!( + primary_key_columns.len() == request.primary_key.len(), + GetInvalidPrimaryKey { + schema: schema.clone(), + primary_key_columns, + } + ); + + let mut primary_key_exprs: Vec = Vec::with_capacity(request.primary_key.len()); + for (primary_key_value, column_schema) in + request.primary_key.iter().zip(primary_key_columns.iter()) + { + let v = primary_key_value + .as_scalar_value() + .with_context(|| GetNullPrimaryKey { + schema: schema.clone(), + primary_key_columns, + })?; + primary_key_exprs.push( + Expr::Column(Column::from_qualified_name(&column_schema.name)).eq(Expr::Literal(v)), + ); + } + + let read_request = ReadRequest { + request_id: request.request_id, + opts: ReadOptions::default(), + projected_schema: request.projected_schema, + predicate: Arc::new(Predicate { + exprs: primary_key_exprs, + time_range: TimeRange::min_to_max(), + }), + order: ReadOrder::None, + }; + let mut batch_stream = self + .read(read_request) + .await + .map_err(|e| Box::new(e) as _) + .context(Scan { table: self.name() })?; + + let mut result_columns = Vec::with_capacity(schema.num_columns()); + + while let Some(batch) = batch_stream + .try_next() + .await + .map_err(|e| Box::new(e) as _) + .context(Get { table: self.name() })? 
+ { + let row_num = batch.num_rows(); + if row_num == 0 { + return Ok(None); + } + for row_idx in 0..row_num { + for col_idx in 0..batch.num_columns() { + let col = batch.column(col_idx); + result_columns.push(col.datum(row_idx)); + } + + if request.primary_key == result_columns[..schema.num_key_columns()] { + return Ok(Some(Row::from_datums(result_columns))); + } + result_columns.clear(); + } + } + + Ok(None) + } + + async fn partitioned_read(&self, request: ReadRequest) -> Result { + let streams = self + .instance + .partitioned_read_from_table(&self.space_table, request) + .await + .map_err(|e| Box::new(e) as _) + .context(Scan { table: self.name() })?; + + Ok(streams) + } + + async fn alter_schema(&self, request: AlterSchemaRequest) -> Result { + self.instance + .alter_schema_of_table(&self.space_table, request) + .await + .map_err(|e| Box::new(e) as _) + .context(AlterSchema { table: self.name() })?; + Ok(1) + } + + async fn alter_options(&self, options: HashMap) -> Result { + self.instance + .alter_options_of_table(&self.space_table, options) + .await + .map_err(|e| Box::new(e) as _) + .context(AlterOptions { table: self.name() })?; + Ok(1) + } + + async fn flush(&self, request: FlushRequest) -> Result<()> { + let mut rx_opt = None; + let flush_opts = TableFlushOptions { + compact_after_flush: request.compact_after_flush, + // Never block write thread + block_on_write_thread: false, + res_sender: if request.sync { + let (tx, rx) = oneshot::channel(); + rx_opt = Some(rx); + Some(tx) + } else { + None + }, + }; + + self.instance + .flush_table(&self.space_table, flush_opts) + .await + .map_err(|e| Box::new(e) as _) + .context(Flush { table: self.name() })?; + if let Some(rx) = rx_opt { + rx.await + .map_err(|e| Box::new(e) as _) + .context(Flush { table: self.name() })??; + } + Ok(()) + } + + async fn compact(&self) -> Result<()> { + self.instance + .manual_compact_table(&self.space_table) + .await + .map_err(|e| Box::new(e) as _) + .context(Compact { table: 
self.name() })?; + Ok(()) + } +} diff --git a/analytic_engine/src/table/sst_util.rs b/analytic_engine/src/table/sst_util.rs new file mode 100644 index 0000000000..b5d760a079 --- /dev/null +++ b/analytic_engine/src/table/sst_util.rs @@ -0,0 +1,27 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! utilities for sst. + +use object_store::path::ObjectStorePath; +use table_engine::table::TableId; + +use crate::{space::SpaceId, sst::manager::FileId}; + +const SST_FILE_SUFFIX: &str = "sst"; + +#[inline] +/// Generate the sst file name. +pub fn sst_file_name(id: FileId) -> String { + format!("{}.{}", id, SST_FILE_SUFFIX) +} + +/// Set the sst file path. +pub fn set_sst_file_path( + space_id: SpaceId, + table_id: TableId, + file_id: FileId, + path: &mut P, +) { + path.push_all_dirs([space_id.to_string().as_str(), table_id.to_string().as_str()]); + path.set_file_name(sst_file_name(file_id)); +} diff --git a/analytic_engine/src/table/version.rs b/analytic_engine/src/table/version.rs new file mode 100644 index 0000000000..b0e4e2b977 --- /dev/null +++ b/analytic_engine/src/table/version.rs @@ -0,0 +1,1096 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Table version + +use std::{ + cmp, + collections::{BTreeMap, HashMap}, + fmt, + ops::Bound, + sync::{Arc, RwLock}, + time::Duration, +}; + +use common_types::{ + row::Row, + schema::{self, Schema}, + time::{TimeRange, Timestamp}, + SequenceNumber, +}; +use common_util::define_result; +use snafu::{ensure, Backtrace, ResultExt, Snafu}; + +use crate::{ + compaction::{ + picker::{self, CompactionPickerRef, PickerContext}, + CompactionTask, ExpiredFiles, + }, + instance::write_worker::WorkerLocal, + memtable::{self, key::KeySequence, MemTableRef, PutContext}, + sampler::{DefaultSampler, SamplerRef}, + sst::{ + file::{FileHandle, FilePurgeQueue}, + manager::{FileId, LevelsController, MAX_LEVEL}, + }, + table::{ + data::MemTableId, + version_edit::{AddFile, VersionEdit}, + }, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "Schema mismatch, memtable_version:{}, given:{}.\nBacktrace:\n{}", + memtable_version, + given, + backtrace + ))] + SchemaMismatch { + memtable_version: schema::Version, + given: schema::Version, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to put memtable, err:{}", source))] + PutMemTable { source: crate::memtable::Error }, + + #[snafu(display("Failed to collect timestamp, err:{}", source))] + CollectTimestamp { source: crate::sampler::Error }, +} + +define_result!(Error); + +/// Memtable for sampling timestamp. +#[derive(Clone)] +pub struct SamplingMemTable { + pub mem: MemTableRef, + pub id: MemTableId, + /// If freezed is true, the sampling is finished and no more data should be + /// inserted into this memtable. Otherwise, the memtable is active and all + /// data should ONLY write to this memtable instead of mutable memtable. 
+ pub freezed: bool, + pub sampler: SamplerRef, +} + +impl SamplingMemTable { + pub fn new(memtable: MemTableRef, id: MemTableId) -> Self { + SamplingMemTable { + mem: memtable, + id, + freezed: false, + sampler: Arc::new(DefaultSampler::default()), + } + } + + fn memory_usage(&self) -> usize { + self.mem.approximate_memory_usage() + } + + /// Suggest segment duration, if there is no sampled timestamp, returns + /// default segment duration. + fn suggest_segment_duration(&self) -> Duration { + self.sampler.suggest_duration() + } +} + +impl fmt::Debug for SamplingMemTable { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("SamplingMemTable") + .field("id", &self.id) + .field("freezed", &self.freezed) + .finish() + } +} + +/// Memtable with additional meta data +#[derive(Clone)] +pub struct MemTableState { + /// The mutable memtable + pub mem: MemTableRef, + /// The `time_range` is estimated via the time range of the first row group + /// write to this memtable and is aligned to segment size + pub time_range: TimeRange, + /// Id of the memtable, newer memtable has greater id + pub id: MemTableId, +} + +impl MemTableState { + #[inline] + pub fn last_sequence(&self) -> SequenceNumber { + self.mem.last_sequence() + } +} + +impl fmt::Debug for MemTableState { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("MemTableState") + .field("time_range", &self.time_range) + .field("id", &self.id) + .field("last_sequence", &self.mem.last_sequence()) + .finish() + } +} + +// TODO(yingwen): Replace by Either. 
+#[derive(Clone)] +pub enum MemTableForWrite { + Sampling(SamplingMemTable), + Normal(MemTableState), +} + +impl MemTableForWrite { + #[inline] + pub fn set_last_sequence(&self, seq: SequenceNumber) -> memtable::Result<()> { + self.memtable().set_last_sequence(seq) + } + + #[inline] + pub fn accept_timestamp(&self, timestamp: Timestamp) -> bool { + match self { + MemTableForWrite::Sampling(_) => true, + MemTableForWrite::Normal(v) => v.time_range.contains(timestamp), + } + } + + #[inline] + pub fn put( + &self, + ctx: &mut PutContext, + sequence: KeySequence, + row: &Row, + schema: &Schema, + timestamp: Timestamp, + ) -> Result<()> { + match self { + MemTableForWrite::Sampling(v) => { + v.mem.put(ctx, sequence, row, schema).context(PutMemTable)?; + + // Collect the timstamp of this row. + v.sampler.collect(timestamp).context(CollectTimestamp)?; + + Ok(()) + } + MemTableForWrite::Normal(v) => { + v.mem.put(ctx, sequence, row, schema).context(PutMemTable) + } + } + } + + #[inline] + fn memtable(&self) -> &MemTableRef { + match self { + MemTableForWrite::Sampling(v) => &v.mem, + MemTableForWrite::Normal(v) => &v.mem, + } + } + + #[cfg(test)] + pub fn as_sampling(&self) -> &SamplingMemTable { + match self { + MemTableForWrite::Sampling(v) => v, + MemTableForWrite::Normal(_) => panic!(), + } + } + + #[cfg(test)] + pub fn as_normal(&self) -> &MemTableState { + match self { + MemTableForWrite::Sampling(_) => panic!(), + MemTableForWrite::Normal(v) => v, + } + } +} + +#[derive(Debug, Default)] +pub struct FlushableMemTables { + pub sampling_mem: Option, + pub memtables: MemTableVec, +} + +impl FlushableMemTables { + #[inline] + pub fn is_empty(&self) -> bool { + self.sampling_mem.is_none() && self.memtables.is_empty() + } + + pub fn ids(&self) -> Vec { + let mut memtable_ids = Vec::with_capacity(self.memtables.len() + 1); + if let Some(v) = &self.sampling_mem { + memtable_ids.push(v.id); + } + for mem in &self.memtables { + memtable_ids.push(mem.id); + } + + memtable_ids + 
} +} + +/// Vec to store memtables +pub type MemTableVec = Vec; + +/// MemTableView holds all memtables of the table +#[derive(Debug)] +struct MemTableView { + /// The memtable for sampling timestamp to suggest segment duration. + /// + /// This memtable is special and may contains data in differnt segment, so + /// can not be moved into immutable memtable set. + sampling_mem: Option, + /// Mutable memtables arranged by its time range. + mutables: MutableMemTableSet, + /// Immutable memtables set, lookup by memtable id is fast. + immutables: ImmutableMemTableSet, +} + +impl MemTableView { + fn new() -> Self { + Self { + sampling_mem: None, + mutables: MutableMemTableSet::new(), + immutables: ImmutableMemTableSet(BTreeMap::new()), + } + } + + /// Get the memory usage of mutable memtables. + fn mutable_memory_usage(&self) -> usize { + self.mutables.memory_usage() + + self + .sampling_mem + .as_ref() + .map(|v| v.memory_usage()) + .unwrap_or(0) + } + + /// Get the total memory usage of mutable and immutable memtables. + fn total_memory_usage(&self) -> usize { + let mutable_usage = self.mutable_memory_usage(); + let immutable_usage = self.immutables.memory_usage(); + + mutable_usage + immutable_usage + } + + /// Switch all memtables or just sample the segment duration. + /// + /// If the sampling memtable is still active, return the suggested segment + /// duration or move all mutable memtables into immutable memtables if + /// the sampling memtable is freezed and returns None. + /// + /// Instead of replace the old memtable by a new memtable, we just move the + /// old memtable to immutable memtables and left mutable memtables + /// empty. New mutable memtable will be constructed via put request. + fn switch_memtables_or_suggest_duration(&mut self) -> Option { + if let Some(v) = &mut self.sampling_mem { + if !v.freezed { + // Other memtable should be empty during sampling phase. 
+ assert!(self.mutables.is_empty()); + assert!(self.immutables.is_empty()); + + // The sampling memtable is still active, we need to compute the + // segment duration and then freeze the memtable. + let segment_duration = v.suggest_segment_duration(); + + // But we cannot freeze the sampling memtable now, because the + // segment duration may not yet have been persisted. + return Some(segment_duration); + } + } + + self.mutables.move_to_inmem(&mut self.immutables); + + None + } + + fn freeze_sampling_memtable(&mut self) { + if let Some(v) = &mut self.sampling_mem { + v.freezed = true; + } + } + + /// Returns the memtables that need to be flushed. + /// - Id of returned memtables are no greater than `max_memtable_id`. + /// - The last sequences of the returned memtables are continuous and can + /// be used as flushed sequence. + /// - All memtables with same last sequence must be picked to the same + /// MemTableVec, so we can update flushed sequence safely (The + /// `max_memtable_id` should also guarantee this). + /// - If a frozen memtable exists, that memtable will be returned if memtable + /// id is no greater than `max_memtable_id` (The memtable id should always + /// be less than `max_memtable_id`). + /// + /// Now the returned memtables are also ordered by memtable id, but this may + /// change in the future. + fn pick_memtables_to_flush(&self, max_memtable_id: MemTableId, mems: &mut FlushableMemTables) { + if let Some(v) = &self.sampling_mem { + if v.id <= max_memtable_id { + mems.sampling_mem = Some(v.clone()); + } + } + + for mem in self.immutables.0.values() { + if mem.id <= max_memtable_id { + mems.memtables.push(mem.clone()); + } + } + } + + /// Remove memtable from immutables or sampling memtable. 
+ #[inline] + fn remove_immutable_or_sampling(&mut self, id: MemTableId) { + if let Some(v) = &self.sampling_mem { + if v.id == id { + self.sampling_mem = None; + return; + } + } + + self.immutables.0.remove(&id); + } + + /// Collect memtables itersect with `time_range` + fn memtables_for_read( + &self, + time_range: TimeRange, + mems: &mut MemTableVec, + sampling_mem: &mut Option, + ) { + self.mutables.memtables_for_read(time_range, mems); + + self.immutables.memtables_for_read(time_range, mems); + + *sampling_mem = self.sampling_mem.clone(); + } +} + +/// Mutable memtables +/// +/// All mutable memtables ordered by their end time (exclusive), their time +/// range may overlaps if `alter segment duration` is supported +/// +/// We choose end time so we can use BTreeMap::range to find the first range +/// that may contains a given timestamp (end >= timestamp) +#[derive(Debug)] +struct MutableMemTableSet(BTreeMap); + +impl MutableMemTableSet { + fn new() -> Self { + Self(BTreeMap::new()) + } + + /// Get memtale by timestamp for write + fn memtable_for_write(&self, timestamp: Timestamp) -> Option<&MemTableState> { + // Find the first memtable whose end time (exclusive) > timestamp + if let Some((_, memtable)) = self + .0 + .range((Bound::Excluded(timestamp), Bound::Unbounded)) + .next() + { + if memtable.time_range.contains(timestamp) { + return Some(memtable); + } + } + + None + } + + /// Insert memtable, the caller should guarantee the key of memtable is not + /// present. + fn insert(&mut self, memtable: MemTableState) -> Option { + // Use end time of time range as key + let end = memtable.time_range.exclusive_end(); + self.0.insert(end, memtable) + } + + fn memory_usage(&self) -> usize { + self.0 + .values() + .map(|m| m.mem.approximate_memory_usage()) + .sum() + } + + /// Move all mutable memtables to immutable memtables. 
+ fn move_to_inmem(&mut self, immem: &mut ImmutableMemTableSet) { + for m in self.0.values() { + let state = m.clone(); + + immem.0.insert(m.id, state); + } + self.0.clear(); + } + + fn memtables_for_read(&self, time_range: TimeRange, mems: &mut MemTableVec) { + // Seek to first memtable whose end time (exclusive) > time_range.start + let inclusive_start = time_range.inclusive_start(); + let iter = self + .0 + .range((Bound::Excluded(inclusive_start), Bound::Unbounded)); + for (_end_ts, mem) in iter { + // We need to iterate all candidate memtables as their start time is unspecific + if mem.time_range.intersect_with(time_range) { + mems.push(mem.clone()); + } + } + } + + fn is_empty(&self) -> bool { + self.0.is_empty() + } +} + +/// Immutable memtables set +/// +/// MemTables are ordered by memtable id, so lookup by memtable id is fast +#[derive(Debug)] +struct ImmutableMemTableSet(BTreeMap); + +impl ImmutableMemTableSet { + /// Memory used by all immutable memtables + fn memory_usage(&self) -> usize { + self.0 + .values() + .map(|m| m.mem.approximate_memory_usage()) + .sum() + } + + fn memtables_for_read(&self, time_range: TimeRange, mems: &mut MemTableVec) { + for mem in self.0.values() { + if mem.time_range.intersect_with(time_range) { + mems.push(mem.clone()); + } + } + } + + fn is_empty(&self) -> bool { + self.0.is_empty() + } +} + +pub type LeveledFiles = Vec>; + +/// Memtable/sst to read for given time range. +pub struct ReadView { + pub sampling_mem: Option, + pub memtables: MemTableVec, + /// Ssts to read in each level. + /// + /// The `ReadView` MUST ensure the length of `leveled_ssts` >= MAX_LEVEL. 
+ pub leveled_ssts: LeveledFiles, +} + +impl Default for ReadView { + fn default() -> Self { + Self { + sampling_mem: None, + memtables: Vec::new(), + leveled_ssts: vec![Vec::new(); MAX_LEVEL], + } + } +} + +impl ReadView { + pub fn contains_sampling(&self) -> bool { + self.sampling_mem.is_some() + } +} + +/// Data of TableVersion +struct TableVersionInner { + /// All memtables + memtable_view: MemTableView, + /// All ssts + levels: LevelsController, + + /// The earliest sequence number of the entries already flushed (inclusive). + /// All log entry with sequence <= `flushed_sequence` can be deleted + flushed_sequence: SequenceNumber, +} + +impl TableVersionInner { + fn memtable_for_write( + &self, + _write_lock: &WorkerLocal, + timestamp: Timestamp, + ) -> Option { + if let Some(mem) = self.memtable_view.sampling_mem.clone() { + if !mem.freezed { + // If sampling memtable is not freezed. + return Some(MemTableForWrite::Sampling(mem)); + } + } + + self.memtable_view + .mutables + .memtable_for_write(timestamp) + .cloned() + .map(MemTableForWrite::Normal) + } +} + +// TODO(yingwen): How to support snapshot? +/// Table version +/// +/// Holds memtables and sst meta data of a table +/// +/// Switching memtable, memtable to level 0 file, addition/deletion to files +/// should be done atomically. 
+pub struct TableVersion { + inner: RwLock, +} + +impl TableVersion { + /// Create an empty table version + pub fn new(purge_queue: FilePurgeQueue) -> Self { + Self { + inner: RwLock::new(TableVersionInner { + memtable_view: MemTableView::new(), + levels: LevelsController::new(purge_queue), + flushed_sequence: 0, + }), + } + } + + /// See [MemTableView::mutable_memory_usage] + pub fn mutable_memory_usage(&self) -> usize { + self.inner + .read() + .unwrap() + .memtable_view + .mutable_memory_usage() + } + + /// See [MemTableView::total_memory_usage] + pub fn total_memory_usage(&self) -> usize { + self.inner + .read() + .unwrap() + .memtable_view + .total_memory_usage() + } + + /// Switch all mutable memtables or just return the suggested segment + /// duration if sampling memtable is still active. + /// + /// Returns a duration if a sampled segment duration needs to be persisted. + /// + /// REQUIRE: Do in write worker + pub fn switch_memtables_or_suggest_duration( + &self, + _worker_local: &WorkerLocal, + ) -> Option { + self.inner + .write() + .unwrap() + .memtable_view + .switch_memtables_or_suggest_duration() + } + + /// Stop timestamp sampling and freezed the sampling memtable. + /// + /// REQUIRE: Do in write worker + pub fn freeze_sampling(&self, _worker_local: &WorkerLocal) { + self.inner + .write() + .unwrap() + .memtable_view + .freeze_sampling_memtable(); + } + + /// See [MemTableView::pick_memtables_to_flush] + pub fn pick_memtables_to_flush( + &self, + max_memtable_id: MemTableId, + mems: &mut FlushableMemTables, + ) { + self.inner + .read() + .unwrap() + .memtable_view + .pick_memtables_to_flush(max_memtable_id, mems); + } + + /// Get memtable by timestamp for write. + /// + /// The returned schema is guaranteed to have schema with same version as + /// `schema_version`. Return None if the schema of existing memtable has + /// different schema. 
+ pub fn memtable_for_write( + &self, + write_lock: &WorkerLocal, + timestamp: Timestamp, + schema_version: schema::Version, + ) -> Result> { + // Find memtable by timestamp + let mutable = { + let inner = self.inner.read().unwrap(); + match inner.memtable_for_write(write_lock, timestamp) { + Some(v) => v, + None => return Ok(None), + } + }; + + // We consider the schemas are same if they have the same version. + ensure!( + mutable.memtable().schema().version() == schema_version, + SchemaMismatch { + memtable_version: mutable.memtable().schema().version(), + given: schema_version, + } + ); + + Ok(Some(mutable)) + } + + /// Insert memtable into mutable memtable set. + pub fn insert_mutable(&self, mem_state: MemTableState) { + let mut inner = self.inner.write().unwrap(); + let old_memtable = inner.memtable_view.mutables.insert(mem_state.clone()); + assert!( + old_memtable.is_none(), + "Find a duplicate memtable, new_memtable:{:?}, old_memtable:{:?}, memtable_view:{:#?}", + mem_state, + old_memtable, + inner.memtable_view + ); + } + + /// Set sampling memtable. + /// + /// Panic if the sampling memtable of this version is not None. + pub fn set_sampling(&self, sampling_mem: SamplingMemTable) { + let mut inner = self.inner.write().unwrap(); + assert!(inner.memtable_view.sampling_mem.is_none()); + inner.memtable_view.sampling_mem = Some(sampling_mem); + } + + /// Atomically apply the edit to the version. + pub fn apply_edit(&self, edit: VersionEdit) { + let mut inner = self.inner.write().unwrap(); + + // TODO(yingwen): else, log warning + inner.flushed_sequence = cmp::max(inner.flushed_sequence, edit.flushed_sequence); + + // Add sst files to level first. + for add_file in edit.files_to_add { + inner.levels.add_sst_to_level(add_file.level, add_file.file); + } + + // Remove ssts from level. + for delete_file in edit.files_to_delete { + inner + .levels + .remove_ssts_from_level(delete_file.level, &[delete_file.file_id]); + } + + // Remove immutable memtables. 
+ for mem_id in edit.mems_to_remove { + inner.memtable_view.remove_immutable_or_sampling(mem_id); + } + } + + /// Atomically apply the meta to the version, useful in recover. + pub fn apply_meta(&self, meta: TableVersionMeta) { + let mut inner = self.inner.write().unwrap(); + + inner.flushed_sequence = cmp::max(inner.flushed_sequence, meta.flushed_sequence); + + for add_file in meta.files.into_values() { + inner.levels.add_sst_to_level(add_file.level, add_file.file); + } + } + + pub fn pick_read_view(&self, time_range: TimeRange) -> ReadView { + let mut sampling_mem = None; + let mut memtables = MemTableVec::new(); + let mut leveled_ssts = vec![Vec::new(); MAX_LEVEL]; + + { + // Pick memtables for read. + let inner = self.inner.read().unwrap(); + + inner + .memtable_view + .memtables_for_read(time_range, &mut memtables, &mut sampling_mem); + + // Pick ssts for read. + inner.levels.pick_ssts(time_range, |level, ssts| { + leveled_ssts[level as usize].extend_from_slice(ssts) + }); + } + + ReadView { + sampling_mem, + memtables, + leveled_ssts, + } + } + + /// Pick ssts for compaction using given `picker`. + pub fn pick_for_compaction( + &self, + picker_ctx: PickerContext, + picker: &CompactionPickerRef, + ) -> picker::Result { + let inner = self.inner.read().unwrap(); + + picker.pick_compaction(picker_ctx, &inner.levels) + } + + pub fn has_expired_sst(&self, expire_time: Option) -> bool { + let inner = self.inner.read().unwrap(); + + inner.levels.has_expired_sst(expire_time) + } + + pub fn expired_ssts(&self, expire_time: Option) -> Vec { + let inner = self.inner.read().unwrap(); + + inner.levels.expired_ssts(expire_time) + } +} + +/// During recovery, we apply all version edit to [TableVersionMeta] first, then +/// apply the version meta to the table, so we can avoid adding removed ssts to +/// the version. 
+#[derive(Debug, Default)] +pub struct TableVersionMeta { + pub flushed_sequence: SequenceNumber, + files: HashMap, + max_file_id: FileId, +} + +impl TableVersionMeta { + pub fn apply_edit(&mut self, edit: VersionEdit) { + self.flushed_sequence = cmp::max(self.flushed_sequence, edit.flushed_sequence); + + for add_file in edit.files_to_add { + self.max_file_id = cmp::max(self.max_file_id, add_file.file.id); + + self.files.insert(add_file.file.id, add_file); + } + + for delete_file in edit.files_to_delete { + self.files.remove(&delete_file.file_id); + } + } + + /// Returns the max file id in the files to add. + pub fn max_file_id_to_add(&self) -> FileId { + self.max_file_id + } + + pub fn ordered_files(&self) -> Vec { + let mut files_vec: Vec<_> = self.files.values().cloned().collect(); + files_vec.sort_unstable_by_key(|file| file.file.id); + + files_vec + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::{ + instance::write_worker::tests::WriteHandleMocker, + sst::file::tests::{FilePurgerMocker, SstMetaDataMocker}, + table::{data::tests::MemTableMocker, version_edit::tests::AddFileMocker}, + table_options, + tests::table, + }; + + fn new_table_version() -> TableVersion { + let purger = FilePurgerMocker::mock(); + let queue = purger.create_purge_queue(1, table::new_table_id(2, 2)); + TableVersion::new(queue) + } + + #[test] + fn test_empty_table_version() { + let mocked_write_handle = WriteHandleMocker::default().build(); + let worker_local = mocked_write_handle.worker_local; + let version = new_table_version(); + + let ts = Timestamp::now(); + assert!(!version.has_expired_sst(None)); + assert!(!version.has_expired_sst(Some(ts))); + + assert_eq!(0, version.mutable_memory_usage()); + assert_eq!(0, version.total_memory_usage()); + + { + let inner = version.inner.read().unwrap(); + let memtable_view = &inner.memtable_view; + assert!(memtable_view.sampling_mem.is_none()); + assert!(memtable_view.mutables.is_empty()); + 
assert!(memtable_view.immutables.is_empty()); + } + + let mut flushable_mems = FlushableMemTables::default(); + let max_memtable_id = 1000; + version.pick_memtables_to_flush(max_memtable_id, &mut flushable_mems); + assert!(flushable_mems.is_empty()); + + let read_view = version.pick_read_view(TimeRange::min_to_max()); + assert!(!read_view.contains_sampling()); + + assert!(read_view.sampling_mem.is_none()); + assert!(read_view.memtables.is_empty()); + for ssts in read_view.leveled_ssts { + assert!(ssts.is_empty()); + } + + let now = Timestamp::now(); + let mutable = version.memtable_for_write(&worker_local, now, 1).unwrap(); + assert!(mutable.is_none()); + + // Nothing to switch. + assert!(version + .switch_memtables_or_suggest_duration(&worker_local) + .is_none()); + } + + fn check_flushable_mem_with_sampling( + flushable_mems: &FlushableMemTables, + memtable_id: MemTableId, + ) { + assert!(!flushable_mems.is_empty()); + assert_eq!( + memtable_id, + flushable_mems.sampling_mem.as_ref().unwrap().id + ); + assert!(flushable_mems.memtables.is_empty()); + } + + #[test] + fn test_table_version_sampling() { + let mocked_write_handle = WriteHandleMocker::default().build(); + let worker_local = mocked_write_handle.worker_local; + let version = new_table_version(); + + let memtable = MemTableMocker::default().build(); + let schema = memtable.schema().clone(); + + let memtable_id = 1; + let sampling_mem = SamplingMemTable::new(memtable, memtable_id); + + version.set_sampling(sampling_mem); + + // Should write to sampling memtable. 
+ let now = Timestamp::now(); + let mutable = version + .memtable_for_write(&worker_local, now, schema.version()) + .unwrap() + .unwrap(); + let actual_memtable = mutable.as_sampling(); + assert_eq!(memtable_id, actual_memtable.id); + + let mutable = version + .memtable_for_write(&worker_local, Timestamp::new(1234), schema.version()) + .unwrap() + .unwrap(); + let actual_memtable = mutable.as_sampling(); + assert_eq!(memtable_id, actual_memtable.id); + + // Sampling memtable should always be read. + let read_view = version.pick_read_view(TimeRange::new(0.into(), 123.into()).unwrap()); + assert!(read_view.contains_sampling()); + assert_eq!(memtable_id, read_view.sampling_mem.unwrap().id); + + let mut flushable_mems = FlushableMemTables::default(); + version.pick_memtables_to_flush(memtable_id, &mut flushable_mems); + check_flushable_mem_with_sampling(&flushable_mems, memtable_id); + } + + #[test] + fn test_table_version_sampling_switch() { + let worker_local = WriteHandleMocker::default().build().worker_local; + let version = new_table_version(); + + let memtable = MemTableMocker::default().build(); + let schema = memtable.schema().clone(); + + let memtable_id = 1; + let sampling_mem = SamplingMemTable::new(memtable, memtable_id); + + version.set_sampling(sampling_mem); + + let duration = version + .switch_memtables_or_suggest_duration(&worker_local) + .unwrap(); + assert_eq!(table_options::DEFAULT_SEGMENT_DURATION, duration); + + // Flushable memtables only contains sampling memtable. + let mut flushable_mems = FlushableMemTables::default(); + version.pick_memtables_to_flush(memtable_id, &mut flushable_mems); + check_flushable_mem_with_sampling(&flushable_mems, memtable_id); + + // Write to memtable after switch and before freezed. + let now = Timestamp::now(); + let mutable = version + .memtable_for_write(&worker_local, now, schema.version()) + .unwrap() + .unwrap(); + // Still write to sampling memtable. 
+ let actual_memtable = mutable.as_sampling(); + assert_eq!(memtable_id, actual_memtable.id); + + // Switch still return duration before freezed. + let duration = version + .switch_memtables_or_suggest_duration(&worker_local) + .unwrap(); + assert_eq!(table_options::DEFAULT_SEGMENT_DURATION, duration); + + // Flushable memtables only contains sampling memtable before sampling + // memtable is freezed. + let mut flushable_mems = FlushableMemTables::default(); + version.pick_memtables_to_flush(memtable_id, &mut flushable_mems); + check_flushable_mem_with_sampling(&flushable_mems, memtable_id); + } + + #[test] + fn test_table_version_sampling_freeze() { + let worker_local = WriteHandleMocker::default().build().worker_local; + let version = new_table_version(); + + let memtable = MemTableMocker::default().build(); + let schema = memtable.schema().clone(); + + let memtable_id1 = 1; + let sampling_mem = SamplingMemTable::new(memtable, memtable_id1); + + version.set_sampling(sampling_mem); + assert_eq!( + table_options::DEFAULT_SEGMENT_DURATION, + version + .switch_memtables_or_suggest_duration(&worker_local) + .unwrap() + ); + + // Freeze the sampling memtable. + version.freeze_sampling(&worker_local); + + // No memtable after switch and freezed. + let now = Timestamp::now(); + assert!(version + .memtable_for_write(&worker_local, now, schema.version()) + .unwrap() + .is_none()); + + // Still flushable after freezed. + let mut flushable_mems = FlushableMemTables::default(); + version.pick_memtables_to_flush(memtable_id1, &mut flushable_mems); + assert!(flushable_mems.sampling_mem.unwrap().freezed); + + let time_range = + TimeRange::bucket_of(now, table_options::DEFAULT_SEGMENT_DURATION).unwrap(); + + // Sampling memtable still readable after freezed. 
+ let read_view = version.pick_read_view(time_range); + assert!(read_view.contains_sampling()); + assert_eq!(memtable_id1, read_view.sampling_mem.as_ref().unwrap().id); + assert!(read_view.sampling_mem.unwrap().freezed); + + let memtable = MemTableMocker::default().build(); + let memtable_id2 = 2; + let mem_state = MemTableState { + mem: memtable, + time_range, + id: memtable_id2, + }; + // Insert a mutable memtable. + version.insert_mutable(mem_state); + + // Write to mutable memtable. + let mutable = version + .memtable_for_write(&worker_local, now, schema.version()) + .unwrap() + .unwrap(); + let mutable = mutable.as_normal(); + assert_eq!(time_range, mutable.time_range); + assert_eq!(memtable_id2, mutable.id); + + // Need to read sampling memtable and mutable memtable. + let read_view = version.pick_read_view(time_range); + assert_eq!(memtable_id1, read_view.sampling_mem.as_ref().unwrap().id); + assert_eq!(1, read_view.memtables.len()); + assert_eq!(memtable_id2, read_view.memtables[0].id); + + // Switch mutable memtable. + assert!(version + .switch_memtables_or_suggest_duration(&worker_local) + .is_none()); + // No memtable after switch. + let now = Timestamp::now(); + assert!(version + .memtable_for_write(&worker_local, now, schema.version()) + .unwrap() + .is_none()); + + // Two memtables to flush. 
+ let mut flushable_mems = FlushableMemTables::default(); + version.pick_memtables_to_flush(memtable_id2, &mut flushable_mems); + assert!(flushable_mems.sampling_mem.unwrap().freezed); + assert_eq!(1, flushable_mems.memtables.len()); + assert_eq!(memtable_id2, flushable_mems.memtables[0].id); + } + + #[test] + fn test_table_version_sampling_apply_edit() { + let worker_local = WriteHandleMocker::default().build().worker_local; + let version = new_table_version(); + + let memtable = MemTableMocker::default().build(); + let schema = memtable.schema().clone(); + + let memtable_id1 = 1; + let sampling_mem = SamplingMemTable::new(memtable, memtable_id1); + + // Prepare sampling memtable. + version.set_sampling(sampling_mem); + version.freeze_sampling(&worker_local); + + let now = Timestamp::now(); + let time_range = + TimeRange::bucket_of(now, table_options::DEFAULT_SEGMENT_DURATION).unwrap(); + + // Prepare mutable memtable. + let memtable = MemTableMocker::default().build(); + let memtable_id2 = 2; + let mem_state = MemTableState { + mem: memtable, + time_range, + id: memtable_id2, + }; + // Insert a mutable memtable. + version.insert_mutable(mem_state); + + // Switch memtable. + assert!(version + .switch_memtables_or_suggest_duration(&worker_local) + .is_none()); + + let max_sequence = 120; + let file_id = 13; + // TO simplify test, we only create one sst. + let sst_meta = SstMetaDataMocker::new(schema) + .time_range(time_range) + .max_sequence(max_sequence) + .build(); + let add_file = AddFileMocker::new(sst_meta).file_id(file_id).build(); + let edit = VersionEdit { + flushed_sequence: max_sequence, + mems_to_remove: vec![memtable_id1, memtable_id2], + files_to_add: vec![add_file], + files_to_delete: vec![], + }; + version.apply_edit(edit); + + // Only pick ssts after flushed. 
+ let read_view = version.pick_read_view(time_range); + assert!(!read_view.contains_sampling()); + assert!(read_view.sampling_mem.is_none()); + assert!(read_view.memtables.is_empty()); + assert_eq!(1, read_view.leveled_ssts[0].len()); + assert_eq!(file_id, read_view.leveled_ssts[0][0].id()); + } +} diff --git a/analytic_engine/src/table/version_edit.rs b/analytic_engine/src/table/version_edit.rs new file mode 100644 index 0000000000..97f09e5454 --- /dev/null +++ b/analytic_engine/src/table/version_edit.rs @@ -0,0 +1,176 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Version edits + +use std::convert::{TryFrom, TryInto}; + +use common_types::{bytes::Bytes, schema::Schema, time::TimeRange, SequenceNumber}; +use common_util::define_result; +use proto::meta_update as meta_pb; +use snafu::{Backtrace, ResultExt, Snafu}; + +use crate::{ + sst::{ + file::{FileMeta, SstMetaData}, + manager::FileId, + }, + table::data::MemTableId, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Invalid level:{}, err:{}.\nBacktrace:\n{}", level, source, backtrace))] + InvalidLevel { + level: u32, + source: std::num::TryFromIntError, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to convert time range, err:{}", source))] + ConvertTimeRange { source: common_types::time::Error }, + + #[snafu(display("Fail to convert table schema, err:{}", source))] + ConvertTableSchema { source: common_types::schema::Error }, +} + +define_result!(Error); + +/// Meta data of a new file. +#[derive(Debug, Clone)] +pub struct AddFile { + /// The level of the file intended to add. + pub level: u16, + /// Meta data of the file to add. 
+ pub file: FileMeta, +} + +impl AddFile { + /// Convert into protobuf struct + pub fn into_pb(self) -> meta_pb::AddFileMeta { + let mut target = meta_pb::AddFileMeta::new(); + target.set_level(self.level.into()); + target.set_file_id(self.file.id); + target.set_min_key(self.file.meta.min_key.to_vec()); + target.set_max_key(self.file.meta.max_key.to_vec()); + target.set_time_range(self.file.meta.time_range.into()); + target.set_max_seq(self.file.meta.max_sequence); + target.set_schema(self.file.meta.schema.into()); + target.set_size(self.file.meta.size); + target.set_row_num(self.file.meta.row_num); + + target + } +} + +impl TryFrom for AddFile { + type Error = Error; + + fn try_from(mut src: meta_pb::AddFileMeta) -> Result { + let time_range = TimeRange::try_from(src.take_time_range()).context(ConvertTimeRange)?; + let schema = Schema::try_from(src.take_schema()).context(ConvertTableSchema)?; + Ok(Self { + level: src + .level + .try_into() + .context(InvalidLevel { level: src.level })?, + file: FileMeta { + id: src.file_id, + meta: SstMetaData { + min_key: Bytes::from(src.min_key), + max_key: Bytes::from(src.max_key), + time_range, + max_sequence: src.max_seq, + schema, + size: src.size, + row_num: src.row_num, + }, + }, + }) + } +} + +/// Meta data of the file to delete. +#[derive(Debug, Clone)] +pub struct DeleteFile { + /// The level of the file intended to delete. + pub level: u16, + /// Id of the file to delete. 
+ pub file_id: FileId, +} + +impl DeleteFile { + /// Convert into protobuf struct + pub fn into_pb(self) -> meta_pb::DeleteFileMeta { + let mut target = meta_pb::DeleteFileMeta::new(); + target.set_level(self.level.into()); + target.set_file_id(self.file_id); + + target + } +} + +impl TryFrom for DeleteFile { + type Error = Error; + + fn try_from(src: meta_pb::DeleteFileMeta) -> Result { + let level = src + .level + .try_into() + .context(InvalidLevel { level: src.level })?; + + Ok(Self { + level, + file_id: src.file_id, + }) + } +} + +/// Edit to the [TableVersion], which should be done atomically +#[derive(Debug)] +pub struct VersionEdit { + /// The last sequence already flushed. This field is not guaranteed to be + /// set if the version edit is created by a non-flush operation (such as + /// compaction). + pub flushed_sequence: SequenceNumber, + /// Id of memtables to remove from immutable memtable lists. + pub mems_to_remove: Vec, + /// Sst files to add. + pub files_to_add: Vec, + /// Sst files to delete. + pub files_to_delete: Vec, +} + +#[cfg(test)] +pub mod tests { + use super::*; + + #[must_use] + pub struct AddFileMocker { + file_id: FileId, + sst_meta: SstMetaData, + } + + impl AddFileMocker { + pub fn new(sst_meta: SstMetaData) -> Self { + Self { + file_id: 1, + sst_meta, + } + } + + pub fn file_id(mut self, file_id: FileId) -> Self { + self.file_id = file_id; + self + } + + pub fn build(&self) -> AddFile { + AddFile { + level: 0, + file: FileMeta { + id: self.file_id, + meta: self.sst_meta.clone(), + }, + } + } + } +} diff --git a/analytic_engine/src/table_options.rs b/analytic_engine/src/table_options.rs new file mode 100644 index 0000000000..badac47830 --- /dev/null +++ b/analytic_engine/src/table_options.rs @@ -0,0 +1,553 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Constants for table options. 
+ +use std::{collections::HashMap, string::ToString, time::Duration}; + +use arrow_deps::datafusion::parquet::basic::Compression as ParquetCompression; +use common_types::time::Timestamp; +use common_util::{ + config::{ReadableDuration, ReadableSize}, + define_result, + time::DurationExt, +}; +use proto::analytic_common::{ + CompactionOptions as CompactionOptionsPb, CompactionStrategy as CompactionStrategyPb, + Compression as CompressionPb, TableOptions as TableOptionsPb, UpdateMode as UpdateModePb, +}; +use serde_derive::Deserialize; +use snafu::{Backtrace, GenerateBacktrace, ResultExt, Snafu}; +use table_engine::OPTION_KEY_ENABLE_TTL; + +use crate::compaction::{ + CompactionStrategy, SizeTieredCompactionOptions, TimeWindowCompactionOptions, +}; + +pub const SEGMENT_DURATION: &str = "segment_duration"; +pub const ENABLE_TTL: &str = OPTION_KEY_ENABLE_TTL; +pub const TTL: &str = "ttl"; +pub const ARENA_BLOCK_SIZE: &str = "arena_block_size"; +pub const WRITE_BUFFER_SIZE: &str = "write_buffer_size"; +pub const COMPACTION_STRATEGY: &str = "compaction_strategy"; +pub const NUM_ROWS_PER_ROW_GROUP: &str = "num_rows_per_row_group"; +pub const UPDATE_MODE: &str = "update_mode"; +pub const COMPRESSION: &str = "compression"; + +const UPDATE_MODE_OVERWRITE: &str = "OVERWRITE"; +const UPDATE_MODE_APPEND: &str = "APPEND"; +const COMPRESSION_UNCOMPRESSED: &str = "UNCOMPRESSED"; +const COMPRESSION_LZ4: &str = "LZ4"; +const COMPRESSION_SNAPPY: &str = "SNAPPY"; +const COMPRESSION_ZSTD: &str = "ZSTD"; +const AT_LEAST_OPTIONS_NUM: usize = 9; + +/// Default bucket duration (1d) +const BUCKET_DURATION_1D: Duration = Duration::from_secs(24 * 60 * 60); +/// Default duration of a segment (2h). +pub const DEFAULT_SEGMENT_DURATION: Duration = Duration::from_secs(60 * 60 * 2); +/// Default arena block size (2M). +const DEFAULT_ARENA_BLOCK_SIZE: u32 = 2 * 1024 * 1024; +/// Default write buffer size (32M). +const DEFAULT_WRITE_BUFFER_SIZE: u32 = 32 * 1024 * 1024; +/// Default ttl of table (7d). 
+const DEFAULT_TTL: Duration = Duration::from_secs(7 * 24 * 60 * 60); +/// Default row number of a row group. +const DEFAULT_NUM_ROW_PER_ROW_GROUP: usize = 8192; + +/// Max arena block size (2G) +const MAX_ARENA_BLOCK_SIZE: u32 = 2 * 1024 * 1024 * 1024; +/// Min arena block size (1K) +const MIN_ARENA_BLOCK_SIZE: u32 = 1024; +const MIN_NUM_ROWS_PER_ROW_GROUP: usize = 100; +const MAX_NUM_ROWS_PER_ROW_GROUP: usize = 10_000_000; + +#[derive(Debug, Snafu)] +#[allow(clippy::enum_variant_names)] +pub enum Error { + #[snafu(display("Failed to parse duration, err:{}.\nBacktrace:\n{}", err, backtrace))] + ParseDuration { err: String, backtrace: Backtrace }, + + #[snafu(display("Failed to parse size, err:{}.\nBacktrace:\n{}", err, backtrace))] + ParseSize { err: String, backtrace: Backtrace }, + + #[snafu(display("Failed to parse compaction strategy: {}, err: {}", value, source))] + ParseStrategy { + value: String, + source: crate::compaction::Error, + }, + #[snafu(display("Failed to parse int, err:{}.\nBacktrace:\n{}", source, backtrace))] + ParseInt { + source: std::num::ParseIntError, + backtrace: Backtrace, + }, + #[snafu(display("Failed to parse bool, err:{}.\nBacktrace:\n{}", source, backtrace))] + ParseBool { + source: std::str::ParseBoolError, + backtrace: Backtrace, + }, + #[snafu(display( + "Failed to parse update mode, raw str:{}.\nBacktrace:\n{}", + s, + backtrace + ))] + ParseUpdateMode { s: String, backtrace: Backtrace }, + #[snafu(display( + "Failed to parse compression, name:{}.\nBacktrace:\n{}", + name, + backtrace + ))] + ParseCompressionName { name: String, backtrace: Backtrace }, +} + +define_result!(Error); + +#[derive(Debug, Clone, Deserialize)] +pub enum UpdateMode { + Overwrite, + Append, +} + +impl UpdateMode { + pub fn parse_from(s: &str) -> Result { + if s.eq_ignore_ascii_case(UPDATE_MODE_OVERWRITE) { + Ok(UpdateMode::Overwrite) + } else if s.eq_ignore_ascii_case(UPDATE_MODE_APPEND) { + Ok(UpdateMode::Append) + } else { + ParseUpdateMode { s 
}.fail() + } + } +} + +impl ToString for UpdateMode { + fn to_string(&self) -> String { + match self { + UpdateMode::Append => UPDATE_MODE_APPEND.to_string(), + UpdateMode::Overwrite => UPDATE_MODE_OVERWRITE.to_string(), + } + } +} + +#[derive(Debug, Clone, Copy, Deserialize)] +pub enum Compression { + Uncompressed, + Lz4, + Snappy, + Zstd, +} + +impl Compression { + pub fn parse_from(name: &str) -> Result { + if name.eq_ignore_ascii_case(COMPRESSION_UNCOMPRESSED) { + Ok(Compression::Uncompressed) + } else if name.eq_ignore_ascii_case(COMPRESSION_LZ4) { + Ok(Compression::Lz4) + } else if name.eq_ignore_ascii_case(COMPRESSION_SNAPPY) { + Ok(Compression::Snappy) + } else if name.eq_ignore_ascii_case(COMPRESSION_ZSTD) { + Ok(Compression::Zstd) + } else { + ParseCompressionName { name }.fail() + } + } +} + +impl ToString for Compression { + fn to_string(&self) -> String { + match self { + Compression::Uncompressed => COMPRESSION_UNCOMPRESSED.to_string(), + Compression::Lz4 => COMPRESSION_LZ4.to_string(), + Compression::Snappy => COMPRESSION_SNAPPY.to_string(), + Compression::Zstd => COMPRESSION_ZSTD.to_string(), + } + } +} + +impl From for CompressionPb { + fn from(compression: Compression) -> Self { + match compression { + Compression::Uncompressed => CompressionPb::UNCOMPRESSED, + Compression::Lz4 => CompressionPb::LZ4, + Compression::Snappy => CompressionPb::SNAPPY, + Compression::Zstd => CompressionPb::ZSTD, + } + } +} + +impl From for Compression { + fn from(compression: CompressionPb) -> Self { + match compression { + CompressionPb::UNCOMPRESSED => Compression::Uncompressed, + CompressionPb::LZ4 => Compression::Lz4, + CompressionPb::SNAPPY => Compression::Snappy, + CompressionPb::ZSTD => Compression::Zstd, + } + } +} + +impl From for ParquetCompression { + fn from(compression: Compression) -> Self { + match compression { + Compression::Uncompressed => ParquetCompression::UNCOMPRESSED, + Compression::Lz4 => ParquetCompression::LZ4, + Compression::Snappy => 
ParquetCompression::SNAPPY, + Compression::Zstd => ParquetCompression::ZSTD, + } + } +} + +/// Options for a table. +#[derive(Debug, Clone, Deserialize)] +#[serde(default)] +pub struct TableOptions { + // The following options are immutable once table was created. + /// Segment duration of the table. + /// + /// `None` means the table is doing the segment duration sampling and + /// the actual duration is still unknown. + pub segment_duration: Option, + /// Table update mode, now support Overwrite(Default) and Append + pub update_mode: UpdateMode, + + // The following options can be altered. + /// Enable ttl + pub enable_ttl: bool, + /// Time-to-live of the data. + pub ttl: ReadableDuration, + /// Arena block size of memtable. + pub arena_block_size: u32, + /// Write buffer size of memtable. + pub write_buffer_size: u32, + /// Compaction strategy of the table. + pub compaction_strategy: CompactionStrategy, + /// Row number in a row group. + pub num_rows_per_row_group: usize, + /// Table Compression + pub compression: Compression, +} + +impl TableOptions { + #[inline] + pub fn segment_duration(&self) -> Option { + self.segment_duration.map(|v| v.0) + } + + #[inline] + pub fn ttl(&self) -> Option { + if self.enable_ttl { + Some(self.ttl) + } else { + None + } + } + + // for show create table + pub fn to_raw_map(&self) -> HashMap { + let mut m = HashMap::with_capacity(AT_LEAST_OPTIONS_NUM); + m.insert( + SEGMENT_DURATION.to_string(), + self.segment_duration + .map(|v| v.to_string()) + .unwrap_or_else(String::new), + ); + m.insert(UPDATE_MODE.to_string(), self.update_mode.to_string()); + m.insert(ENABLE_TTL.to_string(), self.enable_ttl.to_string()); + m.insert(TTL.to_string(), format!("{}", self.ttl)); + m.insert( + ARENA_BLOCK_SIZE.to_string(), + format!("{}", self.arena_block_size), + ); + m.insert( + WRITE_BUFFER_SIZE.to_string(), + format!("{}", self.write_buffer_size), + ); + self.compaction_strategy.fill_raw_map(&mut m); + m.insert( + 
NUM_ROWS_PER_ROW_GROUP.to_string(), + format!("{}", self.num_rows_per_row_group), + ); + m.insert(COMPRESSION.to_string(), self.compression.to_string()); + + assert!(m.len() >= AT_LEAST_OPTIONS_NUM); + + m + } + + /// Sanitize options silently. + pub fn sanitize(&mut self) { + let one_day_secs = BUCKET_DURATION_1D.as_secs(); + + if let Some(segment_duration) = self.segment_duration { + let mut segment_duration_secs = segment_duration.as_secs(); + if segment_duration_secs == 0 { + segment_duration_secs = DEFAULT_SEGMENT_DURATION.as_secs() + }; + self.segment_duration = Some(ReadableDuration::secs(segment_duration_secs)); + } + + let ttl_secs = self.ttl.as_secs(); + // Ttl must align to day. + let ttl_secs = ttl_secs / one_day_secs * one_day_secs; + self.ttl = ReadableDuration::secs(ttl_secs); + + if self.arena_block_size < MIN_ARENA_BLOCK_SIZE { + self.arena_block_size = MIN_ARENA_BLOCK_SIZE; + } + + if self.arena_block_size > MAX_ARENA_BLOCK_SIZE { + self.arena_block_size = MAX_ARENA_BLOCK_SIZE; + } + + if self.num_rows_per_row_group < MIN_NUM_ROWS_PER_ROW_GROUP { + self.num_rows_per_row_group = MIN_NUM_ROWS_PER_ROW_GROUP; + } + + if self.num_rows_per_row_group > MAX_NUM_ROWS_PER_ROW_GROUP { + self.num_rows_per_row_group = MAX_NUM_ROWS_PER_ROW_GROUP; + } + } + + pub fn need_dedup(&self) -> bool { + match self.update_mode { + UpdateMode::Overwrite => true, + UpdateMode::Append => false, + } + } + + pub fn is_expired(&self, timestamp: Timestamp) -> bool { + self.enable_ttl && timestamp.is_expired(Timestamp::expire_time(self.ttl.0)) + } +} + +impl From for CompactionOptionsPb { + fn from(opts: SizeTieredCompactionOptions) -> Self { + let mut target = CompactionOptionsPb::new(); + target.set_bucket_low(opts.bucket_low); + target.set_bucket_high(opts.bucket_high); + target.set_min_sstable_size(opts.min_sstable_size.0 as u32); + target.set_max_threshold(opts.max_threshold as u32); + target.set_min_threshold(opts.min_threshold as u32); + + target + } +} + +impl From for 
SizeTieredCompactionOptions { + fn from(opts: CompactionOptionsPb) -> Self { + Self { + bucket_low: opts.bucket_low, + bucket_high: opts.bucket_high, + min_sstable_size: ReadableSize(opts.min_sstable_size.into()), + min_threshold: opts.min_threshold as usize, + max_threshold: opts.max_threshold as usize, + } + } +} + +impl From for CompactionOptionsPb { + fn from(opts: TimeWindowCompactionOptions) -> Self { + let mut target = CompactionOptionsPb::new(); + target.set_bucket_low(opts.size_tiered.bucket_low); + target.set_bucket_high(opts.size_tiered.bucket_high); + target.set_min_sstable_size(opts.size_tiered.min_sstable_size.0 as u32); + target.set_min_threshold(opts.size_tiered.min_threshold as u32); + target.set_max_threshold(opts.size_tiered.max_threshold as u32); + target.set_timestamp_resolution(opts.timestamp_resolution.into()); + + target + } +} + +impl From for TimeWindowCompactionOptions { + fn from(opts: CompactionOptionsPb) -> Self { + let size_tiered: SizeTieredCompactionOptions = opts.clone().into(); + + Self { + size_tiered, + timestamp_resolution: opts.timestamp_resolution.into(), + } + } +} + +impl From for TableOptionsPb { + fn from(opts: TableOptions) -> Self { + let mut target = TableOptionsPb::new(); + if let Some(segment_duration) = opts.segment_duration { + target.set_segment_duration(segment_duration.0.as_millis_u64()); + target.set_sampling_segment_duration(false); + } else { + // The segment duration is unknown. 
+ target.set_sampling_segment_duration(true); + } + target.set_enable_ttl(opts.enable_ttl); + target.set_ttl(opts.ttl.0.as_millis_u64()); + target.set_arena_block_size(opts.arena_block_size); + target.set_num_rows_per_row_group(opts.num_rows_per_row_group as u64); + + match opts.compaction_strategy { + CompactionStrategy::Default => { + target.set_compaction_strategy(CompactionStrategyPb::DEFAULT); + } + CompactionStrategy::SizeTiered(opts) => { + target.set_compaction_strategy(CompactionStrategyPb::SIZE_TIERED); + target.set_compaction_options(opts.into()); + } + CompactionStrategy::TimeWindow(opts) => { + target.set_compaction_strategy(CompactionStrategyPb::TIME_WINDOW); + target.set_compaction_options(opts.into()); + } + } + + match opts.update_mode { + UpdateMode::Overwrite => { + target.set_update_mode(UpdateModePb::Overwrite); + } + UpdateMode::Append => { + target.set_update_mode(UpdateModePb::Append); + } + } + + target.set_write_buffer_size(opts.write_buffer_size); + target.set_compression(opts.compression.into()); + + target + } +} + +impl From for TableOptions { + fn from(opts: TableOptionsPb) -> Self { + let compaction_strategy = match opts.compaction_strategy { + CompactionStrategyPb::DEFAULT => CompactionStrategy::default(), + CompactionStrategyPb::SIZE_TIERED => { + let opts = opts + .compaction_options + .map(SizeTieredCompactionOptions::from) + .unwrap_or_default(); + CompactionStrategy::SizeTiered(opts) + } + CompactionStrategyPb::TIME_WINDOW => { + let opts = opts + .compaction_options + .map(TimeWindowCompactionOptions::from) + .unwrap_or_default(); + CompactionStrategy::TimeWindow(opts) + } + }; + + let update_mode = match opts.update_mode { + UpdateModePb::Overwrite => UpdateMode::Overwrite, + UpdateModePb::Append => UpdateMode::Append, + }; + let segment_duration = if opts.sampling_segment_duration { + None + } else if opts.segment_duration == 0 { + // If segment duration is still zero. 
If the data had been used by an elder + // version release that not yet support sampling, the + // `sampling_segment_duration` flag would be truncated after + // manifest snapshot, but left segment duration zero. + Some(DEFAULT_SEGMENT_DURATION.into()) + } else { + Some(Duration::from_millis(opts.segment_duration).into()) + }; + + Self { + segment_duration, + enable_ttl: opts.enable_ttl, + ttl: Duration::from_millis(opts.ttl).into(), + arena_block_size: opts.arena_block_size, + compaction_strategy, + num_rows_per_row_group: opts.num_rows_per_row_group as usize, + update_mode, + write_buffer_size: opts.write_buffer_size, + compression: opts.compression.into(), + } + } +} + +impl Default for TableOptions { + fn default() -> Self { + Self { + segment_duration: None, + enable_ttl: true, + ttl: DEFAULT_TTL.into(), + arena_block_size: DEFAULT_ARENA_BLOCK_SIZE, + compaction_strategy: CompactionStrategy::default(), + num_rows_per_row_group: DEFAULT_NUM_ROW_PER_ROW_GROUP, + update_mode: UpdateMode::Overwrite, + write_buffer_size: DEFAULT_WRITE_BUFFER_SIZE, + compression: Compression::Zstd, + } + } +} + +pub fn merge_table_options_for_create( + options: &HashMap, + table_opts: &TableOptions, +) -> Result { + merge_table_options(options, table_opts, true) +} + +pub fn merge_table_options_for_alter( + options: &HashMap, + table_opts: &TableOptions, +) -> Result { + merge_table_options(options, table_opts, false) +} + +/// The options will override the old options. 
+fn merge_table_options( + options: &HashMap, + table_old_opts: &TableOptions, + is_create: bool, +) -> Result { + let mut table_opts = table_old_opts.clone(); + if is_create { + if let Some(v) = options.get(SEGMENT_DURATION) { + table_opts.segment_duration = Some(parse_duration(v)?); + } + if let Some(v) = options.get(UPDATE_MODE) { + table_opts.update_mode = UpdateMode::parse_from(v)?; + } + } + + if let Some(v) = options.get(TTL) { + table_opts.ttl = parse_duration(v)?; + } + if let Some(v) = options.get(OPTION_KEY_ENABLE_TTL) { + table_opts.enable_ttl = v.parse::().context(ParseBool)?; + } + if let Some(v) = options.get(ARENA_BLOCK_SIZE) { + let size = parse_size(v)?; + table_opts.arena_block_size = size.0 as u32; + } + if let Some(v) = options.get(WRITE_BUFFER_SIZE) { + let size = parse_size(v)?; + table_opts.write_buffer_size = size.0 as u32; + } + if let Some(v) = options.get(COMPACTION_STRATEGY) { + table_opts.compaction_strategy = + CompactionStrategy::parse_from(v, options).context(ParseStrategy { value: v })?; + } + if let Some(v) = options.get(NUM_ROWS_PER_ROW_GROUP) { + table_opts.num_rows_per_row_group = v.parse().context(ParseInt)?; + } + if let Some(v) = options.get(COMPRESSION) { + table_opts.compression = Compression::parse_from(v)?; + } + Ok(table_opts) +} + +fn parse_duration(v: &str) -> Result { + v.parse::() + .map_err(|err| Error::ParseDuration { + err, + backtrace: Backtrace::generate(), + }) +} + +fn parse_size(v: &str) -> Result { + v.parse::().map_err(|err| Error::ParseSize { + err, + backtrace: Backtrace::generate(), + }) +} diff --git a/analytic_engine/src/tests/alter_test.rs b/analytic_engine/src/tests/alter_test.rs new file mode 100644 index 0000000000..2bdc74f50b --- /dev/null +++ b/analytic_engine/src/tests/alter_test.rs @@ -0,0 +1,449 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Alter test + +use std::collections::{BTreeMap, HashMap}; + +use common_types::{ + column_schema, + datum::DatumKind, + row::{RowGroup, RowGroupBuilder}, + schema::{self, Schema}, + time::Timestamp, +}; +use log::info; +use table_engine::table::AlterSchemaRequest; + +use crate::{ + table_options::TableOptions, + tests::{ + row_util, + table::{self, FixedSchemaTable}, + util::{Null, TestContext, TestEnv}, + }, +}; + +#[test] +fn test_alter_table_add_column() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + + env.block_on(async { + test_ctx.open().await; + + let test_table1 = "test_table1"; + let fixed_schema_table = test_ctx.create_fixed_schema_table(test_table1).await; + + let start_ms = test_ctx.start_ms(); + let rows = [ + ( + "key1", + Timestamp::new(start_ms), + "tag1-1", + 11.0, + 110.0, + "tag2-1", + ), + ( + "key2", + Timestamp::new(start_ms), + "tag1-2", + 12.0, + 110.0, + "tag2-2", + ), + ]; + + // Write data to table. + let row_group = fixed_schema_table.rows_to_row_group(&rows); + test_ctx.write_to_table(test_table1, row_group).await; + + alter_schema_same_schema_version_case(&test_ctx, test_table1).await; + + alter_schema_old_pre_version_case(&test_ctx, test_table1).await; + + alter_schema_add_column_case(&mut test_ctx, test_table1, start_ms, false).await; + + // Prepare another table for alter. 
+ let test_table2 = "test_table2"; + test_ctx.create_fixed_schema_table(test_table2).await; + let row_group = fixed_schema_table.rows_to_row_group(&rows); + test_ctx.write_to_table(test_table2, row_group).await; + + alter_schema_add_column_case(&mut test_ctx, test_table2, start_ms, true).await; + }); +} + +// Add two columns: +// - add_string +// - add_double +fn add_columns(schema_builder: schema::Builder) -> schema::Builder { + schema_builder + .add_normal_column( + column_schema::Builder::new("add_string".to_string(), DatumKind::String) + .is_nullable(true) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("add_double".to_string(), DatumKind::Double) + .is_nullable(true) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() +} + +async fn alter_schema_same_schema_version_case(test_ctx: &TestContext, table_name: &str) { + info!("test alter_schema_same_schema_version_case"); + + let mut schema_builder = FixedSchemaTable::default_schema_builder(); + schema_builder = add_columns(schema_builder); + let new_schema = schema_builder.build().unwrap(); + + let table = test_ctx.table(table_name); + let old_schema = table.schema(); + + let request = AlterSchemaRequest { + schema: new_schema, + pre_schema_version: old_schema.version(), + }; + + let res = test_ctx.try_alter_schema(table_name, request).await; + assert!(res.is_err()); +} + +async fn alter_schema_old_pre_version_case(test_ctx: &TestContext, table_name: &str) { + info!("test alter_schema_old_pre_version_case"); + + let mut schema_builder = FixedSchemaTable::default_schema_builder(); + schema_builder = add_columns(schema_builder); + + let table = test_ctx.table(table_name); + let old_schema = table.schema(); + + let new_schema = schema_builder + .version(old_schema.version() + 1) + .build() + .unwrap(); + + let request = AlterSchemaRequest { + schema: new_schema, + pre_schema_version: old_schema.version() - 1, + }; 
+ + let res = test_ctx.try_alter_schema(table_name, request).await; + assert!(res.is_err()); +} + +async fn alter_schema_add_column_case( + test_ctx: &mut TestContext, + table_name: &str, + start_ms: i64, + flush: bool, +) { + info!( + "test alter_schema_add_column_case, table_name:{}", + table_name + ); + + let mut schema_builder = FixedSchemaTable::default_schema_builder(); + schema_builder = add_columns(schema_builder); + + let old_schema = test_ctx.table(table_name).schema(); + + let new_schema = schema_builder + .version(old_schema.version() + 1) + .build() + .unwrap(); + + let request = AlterSchemaRequest { + schema: new_schema.clone(), + pre_schema_version: old_schema.version(), + }; + + let affected = test_ctx + .try_alter_schema(table_name, request) + .await + .unwrap(); + assert_eq!(1, affected); + + let rows = [ + ( + "key1", + Timestamp::new(start_ms + 10), + "tag1-1", + 11.0, + 110.0, + "tag2-1", + "add1-1", + 210.0, + ), + ( + "key2", + Timestamp::new(start_ms + 10), + "tag1-2", + 12.0, + 110.0, + "tag2-2", + "add1-2", + 220.0, + ), + ]; + let rows_vec = row_util::new_rows_8(&rows); + let row_group = RowGroupBuilder::with_rows(new_schema.clone(), rows_vec) + .unwrap() + .build(); + + // Write data with new schema. + test_ctx.write_to_table(table_name, row_group).await; + + if flush { + test_ctx.flush_table(table_name).await; + } + + let new_schema_rows = [ + // We need to check null datum, so tuples have different types and we need to + // convert it into row first. 
+ row_util::new_row_8(( + "key1", + Timestamp::new(start_ms), + "tag1-1", + 11.0, + 110.0, + "tag2-1", + Null, + Null, + )), + row_util::new_row_8(( + "key1", + Timestamp::new(start_ms + 10), + "tag1-1", + 11.0, + 110.0, + "tag2-1", + "add1-1", + 210.0, + )), + row_util::new_row_8(( + "key2", + Timestamp::new(start_ms), + "tag1-2", + 12.0, + 110.0, + "tag2-2", + Null, + Null, + )), + row_util::new_row_8(( + "key2", + Timestamp::new(start_ms + 10), + "tag1-2", + 12.0, + 110.0, + "tag2-2", + "add1-2", + 220.0, + )), + ]; + let new_schema_row_group = + RowGroupBuilder::with_rows(new_schema.clone(), new_schema_rows.to_vec()) + .unwrap() + .build(); + + // Read data using new schema. + check_read_row_group( + test_ctx, + "Test read new schema after add columns", + table_name, + &new_schema, + &new_schema_row_group, + ) + .await; + + let old_schema_rows = [ + ( + "key1", + Timestamp::new(start_ms), + "tag1-1", + 11.0, + 110.0, + "tag2-1", + ), + ( + "key1", + Timestamp::new(start_ms + 10), + "tag1-1", + 11.0, + 110.0, + "tag2-1", + ), + ( + "key2", + Timestamp::new(start_ms), + "tag1-2", + 12.0, + 110.0, + "tag2-2", + ), + ( + "key2", + Timestamp::new(start_ms + 10), + "tag1-2", + 12.0, + 110.0, + "tag2-2", + ), + ]; + let old_schema_rows_vec = row_util::new_rows_6(&old_schema_rows); + let old_schema_row_group = RowGroupBuilder::with_rows(old_schema.clone(), old_schema_rows_vec) + .unwrap() + .build(); + + // Read data using old schema. + check_read_row_group( + test_ctx, + "Test read old schema after add columns", + table_name, + &old_schema, + &old_schema_row_group, + ) + .await; + + // Reopen db. + test_ctx.reopen_with_tables(&[table_name]).await; + + // Read again after reopen. 
+ check_read_row_group( + test_ctx, + "Test read after reopen", + table_name, + &new_schema, + &new_schema_row_group, + ) + .await; +} + +async fn check_read_row_group( + test_ctx: &TestContext, + msg: &str, + table_name: &str, + schema: &Schema, + row_group: &RowGroup, +) { + for read_opts in table::read_opts_list() { + info!("{}, opts:{:?}", msg, read_opts); + + let record_batches = test_ctx + .read_table( + table_name, + table::new_read_all_request(schema.clone(), read_opts), + ) + .await; + + table::assert_batch_eq_to_row_group(&record_batches, row_group); + } +} + +#[test] +fn test_alter_table_options() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + + env.block_on(async { + test_ctx.open().await; + + let test_table1 = "test_table1"; + test_ctx.create_fixed_schema_table(test_table1).await; + + let opts = test_ctx.table(test_table1).options(); + + let default_opts_map = default_options(); + + assert_options_eq(&default_opts_map, &opts); + + alter_immutable_option_case(&test_ctx, test_table1, "segment_duration", "20d").await; + + alter_immutable_option_case(&test_ctx, test_table1, "bucket_duration", "20d").await; + + alter_immutable_option_case(&test_ctx, test_table1, "update_mode", "Append").await; + + alter_mutable_option_case(&mut test_ctx, test_table1, "enable_ttl", "false").await; + alter_mutable_option_case(&mut test_ctx, test_table1, "enable_ttl", "true").await; + + alter_mutable_option_case(&mut test_ctx, test_table1, "arena_block_size", "10240").await; + + alter_mutable_option_case(&mut test_ctx, test_table1, "write_buffer_size", "1024000").await; + + alter_mutable_option_case( + &mut test_ctx, + test_table1, + "num_rows_per_row_group", + "10000", + ) + .await; + }); +} + +async fn alter_immutable_option_case( + test_ctx: &TestContext, + table_name: &str, + opt_key: &str, + opt_value: &str, +) { + let old_opts = test_ctx.table(table_name).options(); + + let mut new_opts = HashMap::new(); + 
new_opts.insert(opt_key.to_string(), opt_value.to_string()); + + let affected = test_ctx + .try_alter_options(table_name, new_opts) + .await + .unwrap(); + assert_eq!(1, affected); + + let opts_after_alter = test_ctx.table(table_name).options(); + assert_options_eq(&old_opts, &opts_after_alter); +} + +async fn alter_mutable_option_case( + test_ctx: &mut TestContext, + table_name: &str, + opt_key: &str, + opt_value: &str, +) { + let mut expect_opts = test_ctx.table(table_name).options(); + expect_opts.insert(opt_key.to_string(), opt_value.to_string()); + + let mut new_opts = HashMap::new(); + new_opts.insert(opt_key.to_string(), opt_value.to_string()); + + let affected = test_ctx + .try_alter_options(table_name, new_opts) + .await + .unwrap(); + assert_eq!(1, affected); + + let opts_after_alter = test_ctx.table(table_name).options(); + assert_options_eq(&expect_opts, &opts_after_alter); + + // Reopen table. + test_ctx.reopen_with_tables(&[table_name]).await; + + let opts_after_alter = test_ctx.table(table_name).options(); + assert_options_eq(&expect_opts, &opts_after_alter); +} + +fn assert_options_eq(left: &HashMap, right: &HashMap) { + let sorted_left: BTreeMap<_, _> = left.iter().collect(); + let sorted_right: BTreeMap<_, _> = right.iter().collect(); + + assert_eq!(sorted_left, sorted_right); +} + +fn default_options() -> HashMap { + let table_opts = TableOptions::default(); + + table_opts.to_raw_map() +} diff --git a/analytic_engine/src/tests/compaction_test.rs b/analytic_engine/src/tests/compaction_test.rs new file mode 100644 index 0000000000..6a5b300eb3 --- /dev/null +++ b/analytic_engine/src/tests/compaction_test.rs @@ -0,0 +1,90 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Compaction integration tests. 
+ +use common_types::time::Timestamp; +use table_engine::table::FlushRequest; + +use crate::{ + compaction::SizeTieredCompactionOptions, + tests::util::{self, TestEnv}, +}; + +#[test] +fn test_table_compact_current_segment() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + + env.block_on(async { + test_ctx.open().await; + + let test_table1 = "test_table1"; + let fixed_schema_table = test_ctx.create_fixed_schema_table(test_table1).await; + let default_opts = SizeTieredCompactionOptions::default(); + + let mut expect_rows = Vec::new(); + + let start_ms = test_ctx.start_ms(); + // Write more than ensure compaction will be triggered. + for offset in 0..default_opts.max_threshold as i64 * 2 { + let rows = [ + ( + "key1", + Timestamp::new(start_ms + offset), + "tag1-1", + 11.0, + 110.0, + "tag2-1", + ), + ( + "key2", + Timestamp::new(start_ms + offset), + "tag1-2", + 12.0, + 110.0, + "tag2-2", + ), + ]; + expect_rows.extend_from_slice(&rows); + let row_group = fixed_schema_table.rows_to_row_group(&rows); + + test_ctx.write_to_table(test_table1, row_group).await; + + // Flush table and generate sst. + test_ctx + .flush_table_with_request( + test_table1, + FlushRequest { + // Don't trigger a compaction. + compact_after_flush: false, + sync: true, + }, + ) + .await; + } + + expect_rows.sort_unstable_by_key(|row_tuple| (row_tuple.0, row_tuple.1)); + + util::check_read( + &test_ctx, + &fixed_schema_table, + "Test read after flush", + test_table1, + &expect_rows, + ) + .await; + + // Trigger a compaction. + test_ctx.compact_table(test_table1).await; + + // Check read after compaction. 
+ util::check_read( + &test_ctx, + &fixed_schema_table, + "Test read after compaction", + test_table1, + &expect_rows, + ) + .await; + }); +} diff --git a/analytic_engine/src/tests/drop_test.rs b/analytic_engine/src/tests/drop_test.rs new file mode 100644 index 0000000000..7d12baa536 --- /dev/null +++ b/analytic_engine/src/tests/drop_test.rs @@ -0,0 +1,231 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Drop table tests + +use std::collections::HashMap; + +use common_types::{column_schema, datum::DatumKind, time::Timestamp}; +use table_engine::table::AlterSchemaRequest; + +use crate::tests::{ + table::FixedSchemaTable, + util::{self, TestEnv}, +}; + +#[test] +fn test_drop_table_once() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + + env.block_on(async { + test_ctx.open().await; + + let test_table1 = "test_table1"; + test_ctx.create_fixed_schema_table(test_table1).await; + + assert!(test_ctx.drop_table(test_table1).await); + + let table_opt = test_ctx.try_open_table(test_table1).await.unwrap(); + assert!(table_opt.is_none()); + + test_ctx.reopen().await; + + let table_opt = test_ctx.try_open_table(test_table1).await.unwrap(); + assert!(table_opt.is_none()); + }); +} + +#[test] +fn test_drop_table_again() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + + env.block_on(async { + test_ctx.open().await; + + let test_table1 = "test_table1"; + test_ctx.create_fixed_schema_table(test_table1).await; + + assert!(test_ctx.drop_table(test_table1).await); + + assert!(!test_ctx.drop_table(test_table1).await); + + let table_opt = test_ctx.try_open_table(test_table1).await.unwrap(); + assert!(table_opt.is_none()); + }); +} + +#[test] +fn test_drop_create_table_mixed() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + + env.block_on(async { + test_ctx.open().await; + + let test_table1 = "test_table1"; + 
test_ctx.create_fixed_schema_table(test_table1).await; + + assert!(test_ctx.drop_table(test_table1).await); + + // Create another table after dropped. + let test_table2 = "test_table2"; + test_ctx.create_fixed_schema_table(test_table2).await; + + let table_opt = test_ctx.try_open_table(test_table1).await.unwrap(); + assert!(table_opt.is_none()); + + test_ctx.reopen().await; + + let table_opt = test_ctx.try_open_table(test_table1).await.unwrap(); + assert!(table_opt.is_none()); + // Table 2 is still exists. + assert!(test_ctx + .try_open_table(test_table2) + .await + .unwrap() + .is_some()); + }); +} + +fn test_drop_create_same_table_case(flush: bool) { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + + env.block_on(async { + test_ctx.open().await; + + let test_table1 = "test_table1"; + let fixed_schema_table = test_ctx.create_fixed_schema_table(test_table1).await; + + // Write data to table1. + let start_ms = test_ctx.start_ms(); + let rows = [( + "key1", + Timestamp::new(start_ms), + "tag1-1", + 11.0, + 110.0, + "tag2-1", + )]; + let row_group = fixed_schema_table.rows_to_row_group(&rows); + test_ctx.write_to_table(test_table1, row_group).await; + + if flush { + test_ctx.flush_table(test_table1).await; + } + + assert!(test_ctx.drop_table(test_table1).await); + + // Create same table again. + let test_table1 = "test_table1"; + test_ctx.create_fixed_schema_table(test_table1).await; + + // No data exists. + util::check_read( + &test_ctx, + &fixed_schema_table, + "Test read table", + test_table1, + &[], + ) + .await; + + test_ctx.reopen_with_tables(&[test_table1]).await; + + // No data exists. 
+ util::check_read( + &test_ctx, + &fixed_schema_table, + "Test read table after reopen", + test_table1, + &[], + ) + .await; + }); +} + +#[test] +fn test_drop_create_same_table() { + test_drop_create_same_table_case(false); + + test_drop_create_same_table_case(true); +} + +#[test] +fn test_alter_schema_drop_create() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + + env.block_on(async { + test_ctx.open().await; + + let test_table1 = "test_table1"; + test_ctx.create_fixed_schema_table(test_table1).await; + + // Alter schema. + let old_schema = test_ctx.table(test_table1).schema(); + let schema_builder = FixedSchemaTable::default_schema_builder() + .add_normal_column( + column_schema::Builder::new("add_double".to_string(), DatumKind::Double) + .is_nullable(true) + .build() + .unwrap(), + ) + .unwrap(); + let new_schema = schema_builder + .version(old_schema.version() + 1) + .build() + .unwrap(); + let request = AlterSchemaRequest { + schema: new_schema.clone(), + pre_schema_version: old_schema.version(), + }; + let affected = test_ctx + .try_alter_schema(test_table1, request) + .await + .unwrap(); + assert_eq!(1, affected); + + // Drop table. + assert!(test_ctx.drop_table(test_table1).await); + + // Create same table again. + let test_table1 = "test_table1"; + test_ctx.create_fixed_schema_table(test_table1).await; + + test_ctx.reopen_with_tables(&[test_table1]).await; + }); +} + +#[test] +fn test_alter_options_drop_create() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + + env.block_on(async { + test_ctx.open().await; + + let test_table1 = "test_table1"; + test_ctx.create_fixed_schema_table(test_table1).await; + + // Alter options. + let mut new_opts = HashMap::new(); + new_opts.insert("arena_block_size".to_string(), "10240".to_string()); + + let affected = test_ctx + .try_alter_options(test_table1, new_opts) + .await + .unwrap(); + assert_eq!(1, affected); + + // Drop table. 
+ assert!(test_ctx.drop_table(test_table1).await); + + // Create same table again. + let test_table1 = "test_table1"; + test_ctx.create_fixed_schema_table(test_table1).await; + + test_ctx.reopen_with_tables(&[test_table1]).await; + }); +} diff --git a/analytic_engine/src/tests/mod.rs b/analytic_engine/src/tests/mod.rs new file mode 100644 index 0000000000..3ed5f527e0 --- /dev/null +++ b/analytic_engine/src/tests/mod.rs @@ -0,0 +1,17 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Test suits and intergration tests. + +#[cfg(test)] +mod alter_test; +#[cfg(test)] +mod compaction_test; +#[cfg(test)] +mod drop_test; +#[cfg(test)] +mod open_test; +#[cfg(test)] +mod read_write_test; +pub mod row_util; +pub mod table; +pub mod util; diff --git a/analytic_engine/src/tests/open_test.rs b/analytic_engine/src/tests/open_test.rs new file mode 100644 index 0000000000..6c3afc0578 --- /dev/null +++ b/analytic_engine/src/tests/open_test.rs @@ -0,0 +1,18 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Engine open test. + +use crate::tests::util::TestEnv; + +#[test] +fn test_open_engine() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + + env.block_on(async { + test_ctx.open().await; + + // Reopen engine. + test_ctx.reopen().await; + }); +} diff --git a/analytic_engine/src/tests/read_write_test.rs b/analytic_engine/src/tests/read_write_test.rs new file mode 100644 index 0000000000..c190817470 --- /dev/null +++ b/analytic_engine/src/tests/read_write_test.rs @@ -0,0 +1,735 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Read write test. 
+ +use std::{thread, time}; + +use common_types::time::Timestamp; +use log::info; +use table_engine::table::ReadOrder; + +use crate::{ + table_options, + tests::util::{self, TestEnv}, +}; + +#[test] +fn test_multi_table_read_write() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + + env.block_on(async { + test_ctx.open().await; + + let test_table1 = "test_multi_table_read_write1"; + let test_table2 = "test_multi_table_read_write2"; + let test_table3 = "test_multi_table_read_write3"; + + let fixed_schema_table = test_ctx.create_fixed_schema_table(test_table1).await; + let _ = test_ctx.create_fixed_schema_table(test_table2).await; + let _ = test_ctx.create_fixed_schema_table(test_table3).await; + + let start_ms = test_ctx.start_ms(); + let rows = [ + // One bucket. + ( + "key1", + Timestamp::new(start_ms), + "tag1-1", + 11.0, + 110.0, + "tag2-1", + ), + ( + "key2", + Timestamp::new(start_ms), + "tag1-2", + 12.0, + 110.0, + "tag2-2", + ), + ( + "key2", + Timestamp::new(start_ms + 1), + "tag1-3", + 13.0, + 110.0, + "tag2-3", + ), + ( + "key3", + Timestamp::new(start_ms + 2), + "tag1-4", + 13.0, + 110.0, + "tag2-4", + ), + ( + "key4", + Timestamp::new(start_ms + 3), + "tag1-5", + 13.0, + 110.0, + "tag2-5", + ), + // Next bucket. + ( + "key5", + Timestamp::new( + start_ms + 1 + 2 * table_options::DEFAULT_SEGMENT_DURATION.as_millis() as i64, + ), + "tag-5-3", + 33.0, + 310.0, + "tag-5-3", + ), + ]; + + // Write data to table. + let row_group1 = fixed_schema_table.rows_to_row_group(&rows); + let row_group2 = fixed_schema_table.rows_to_row_group(&rows); + let row_group3 = fixed_schema_table.rows_to_row_group(&rows); + test_ctx.write_to_table(test_table1, row_group1).await; + test_ctx.write_to_table(test_table2, row_group2).await; + test_ctx.write_to_table(test_table3, row_group3).await; + + // Read with different opts. 
+ util::check_read( + &test_ctx, + &fixed_schema_table, + "Test read write table1", + test_table1, + &rows, + ) + .await; + + util::check_read( + &test_ctx, + &fixed_schema_table, + "Test read write table2", + test_table2, + &rows, + ) + .await; + + util::check_read( + &test_ctx, + &fixed_schema_table, + "Test read write table3", + test_table3, + &rows, + ) + .await; + + // Reopen db. + test_ctx + .reopen_with_tables(&[test_table1, test_table2, test_table3]) + .await; + + // Read with different opts again. + util::check_read( + &test_ctx, + &fixed_schema_table, + "Test read write table1 after reopen", + test_table1, + &rows, + ) + .await; + util::check_read( + &test_ctx, + &fixed_schema_table, + "Test read write table2 after reopen", + test_table2, + &rows, + ) + .await; + util::check_read( + &test_ctx, + &fixed_schema_table, + "Test read write table3 after reopen", + test_table3, + &rows, + ) + .await; + }); +} + +#[test] +fn test_table_write_read() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + + env.block_on(async { + test_ctx.open().await; + + let test_table1 = "test_table1"; + let fixed_schema_table = test_ctx.create_fixed_schema_table(test_table1).await; + + let start_ms = test_ctx.start_ms(); + let rows = [ + ( + "key1", + Timestamp::new(start_ms), + "tag1-1", + 11.0, + 110.0, + "tag2-1", + ), + ( + "key2", + Timestamp::new(start_ms), + "tag1-2", + 12.0, + 110.0, + "tag2-2", + ), + ( + "key2", + Timestamp::new(start_ms + 1), + "tag1-3", + 13.0, + 110.0, + "tag2-3", + ), + ]; + let row_group = fixed_schema_table.rows_to_row_group(&rows); + + // Write data to table. + test_ctx.write_to_table(test_table1, row_group).await; + + // Read with different opts. + util::check_read( + &test_ctx, + &fixed_schema_table, + "Test read write table", + test_table1, + &rows, + ) + .await; + + // Reopen db. + test_ctx.reopen_with_tables(&[test_table1]).await; + + // Read with different opts again. 
+ util::check_read( + &test_ctx, + &fixed_schema_table, + "Test read write table after reopen", + test_table1, + &rows, + ) + .await; + }); +} + +#[test] +fn test_table_write_get() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + + env.block_on(async { + test_ctx.open().await; + + let test_table1 = "test_table1"; + let fixed_schema_table = test_ctx.create_fixed_schema_table(test_table1).await; + + let start_ms = test_ctx.start_ms(); + let rows = [ + ( + "key1", + Timestamp::new(start_ms), + "tag1-1", + 11.0, + 110.0, + "tag2-1", + ), + ( + "key2", + Timestamp::new(start_ms), + "tag1-2", + 12.0, + 110.0, + "tag2-2", + ), + ( + "key2", + Timestamp::new(start_ms + 1), + "tag1-3", + 13.0, + 110.0, + "tag2-3", + ), + ]; + let row_group = fixed_schema_table.rows_to_row_group(&rows); + + // Write data to table. + test_ctx.write_to_table(test_table1, row_group).await; + + util::check_get( + &test_ctx, + &fixed_schema_table, + "Try to get row", + test_table1, + &rows, + ) + .await; + + // Reopen db. 
+ test_ctx.reopen_with_tables(&[test_table1]).await; + + util::check_get( + &test_ctx, + &fixed_schema_table, + "Try to get row after reopen", + test_table1, + &rows, + ) + .await; + }); +} + +#[test] +fn test_table_write_get_override() { + test_table_write_get_override_case(FlushPoint::NoFlush); + + test_table_write_get_override_case(FlushPoint::AfterFirstWrite); + + test_table_write_get_override_case(FlushPoint::AfterOverwrite); + + test_table_write_get_override_case(FlushPoint::FirstAndOverwrite); +} + +#[derive(Debug)] +enum FlushPoint { + NoFlush, + AfterFirstWrite, + AfterOverwrite, + FirstAndOverwrite, +} + +fn test_table_write_get_override_case(flush_point: FlushPoint) { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + + env.block_on(async { + info!( + "test_table_write_get_override_case, flush_point:{:?}", + flush_point + ); + + test_ctx.open().await; + + let test_table1 = "test_table1"; + let fixed_schema_table = test_ctx.create_fixed_schema_table(test_table1).await; + + let start_ms = test_ctx.start_ms(); + { + let rows = [ + ( + "key1", + Timestamp::new(start_ms), + "tag1-1", + 11.0, + 110.0, + "tag2-1", + ), + ( + "key2", + Timestamp::new(start_ms), + "tag1-2", + 12.0, + 110.0, + "tag2-2", + ), + ( + "key3", + Timestamp::new(start_ms + 10), + "tag1-3", + 13.0, + 110.0, + "tag2-3", + ), + ( + "key2", + Timestamp::new(start_ms + 1), + "tag1-3", + 13.0, + 110.0, + "tag2-3", + ), + ]; + let row_group = fixed_schema_table.rows_to_row_group(&rows); + + // Write data to table. 
+ test_ctx.write_to_table(test_table1, row_group).await; + } + + if let FlushPoint::AfterFirstWrite | FlushPoint::FirstAndOverwrite = flush_point { + test_ctx.flush_table(test_table1).await; + } + + // Override some rows + { + let rows = [ + ( + "key2", + Timestamp::new(start_ms), + "tag1-2-copy", + 112.0, + 210.0, + "tag2-2-copy", + ), + ( + "key2", + Timestamp::new(start_ms + 1), + "tag1-3-copy", + 113.0, + 210.0, + "tag2-3-copy", + ), + ]; + let row_group = fixed_schema_table.rows_to_row_group(&rows); + + test_ctx.write_to_table(test_table1, row_group).await; + } + + if let FlushPoint::AfterOverwrite | FlushPoint::FirstAndOverwrite = flush_point { + test_ctx.flush_table(test_table1).await; + } + + let expect_rows = [ + ( + "key1", + Timestamp::new(start_ms), + "tag1-1", + 11.0, + 110.0, + "tag2-1", + ), + ( + "key2", + Timestamp::new(start_ms), + "tag1-2-copy", + 112.0, + 210.0, + "tag2-2-copy", + ), + ( + "key2", + Timestamp::new(start_ms + 1), + "tag1-3-copy", + 113.0, + 210.0, + "tag2-3-copy", + ), + ( + "key3", + Timestamp::new(start_ms + 10), + "tag1-3", + 13.0, + 110.0, + "tag2-3", + ), + ]; + + util::check_get( + &test_ctx, + &fixed_schema_table, + "Try to get row", + test_table1, + &expect_rows, + ) + .await; + + // Reopen db. 
+ test_ctx.reopen_with_tables(&[test_table1]).await; + + util::check_get( + &test_ctx, + &fixed_schema_table, + "Try to get row after reopen", + test_table1, + &expect_rows, + ) + .await; + }); +} + +#[test] +fn test_db_write_buffer_size() { + let mut env = TestEnv::builder().build(); + env.config.db_write_buffer_size = 1; + test_write_buffer_size_overflow("db_write_buffer_size_test", env); +} + +#[test] +fn test_space_write_buffer_size() { + let mut env = TestEnv::builder().build(); + env.config.space_write_buffer_size = 1; + test_write_buffer_size_overflow("space_write_buffer_size_test", env); +} + +fn test_write_buffer_size_overflow(test_table_name: &str, env: TestEnv) { + let mut test_ctx = env.new_context(); + + env.block_on(async { + test_ctx.open().await; + + let fixed_schema_table = test_ctx.create_fixed_schema_table(test_table_name).await; + + let table = test_ctx.table(test_table_name); + let old_stats = table.stats(); + + let start_ms = test_ctx.start_ms(); + let rows1 = [ + ( + "key1", + Timestamp::new(start_ms), + "tag1-1", + 11.0, + 110.0, + "tag2-1", + ), + ( + "key2", + Timestamp::new(start_ms), + "tag1-2", + 12.0, + 110.0, + "tag2-2", + ), + ( + "key2", + Timestamp::new(start_ms + 1), + "tag1-3", + 13.0, + 110.0, + "tag2-3", + ), + ]; + let row_group = fixed_schema_table.rows_to_row_group(&rows1); + // Write rows1 to table. + test_ctx.write_to_table(test_table_name, row_group).await; + + let stats = table.stats(); + assert_eq!(old_stats.num_read, stats.num_read); + assert_eq!(old_stats.num_write + 1, stats.num_write); + assert_eq!(old_stats.num_flush, stats.num_flush); + + let rows2 = [ + ( + "key4", + Timestamp::new(start_ms + 2), + "tag1-4", + 11.0, + 110.0, + "tag2-4", + ), + ( + "key5", + Timestamp::new(start_ms + 3), + "tag1-5", + 12.0, + 110.0, + "tag2-5", + ), + ]; + + let row_group = fixed_schema_table.rows_to_row_group(&rows2); + // Write rowss2 to table. 
+ test_ctx.write_to_table(test_table_name, row_group).await; + + let mut rows = Vec::new(); + rows.extend_from_slice(&rows1); + rows.extend_from_slice(&rows2); + + // TODO(boyan) a better way to wait table flushing finishes. + thread::sleep(time::Duration::from_millis(500)); + + // Read with different opts. + util::check_read( + &test_ctx, + &fixed_schema_table, + "Test read write table", + test_table_name, + &rows, + ) + .await; + + let stats = table.stats(); + assert_eq!(old_stats.num_read + 5, stats.num_read); + assert_eq!(old_stats.num_write + 2, stats.num_write); + // Flush when reaches (db/space) write_buffer size limitation. + assert_eq!(old_stats.num_flush + 1, stats.num_flush); + + drop(table); + // Reopen db. + test_ctx.reopen_with_tables(&[test_table_name]).await; + + // Read with different opts again. + util::check_read( + &test_ctx, + &fixed_schema_table, + "Test read write table after reopen", + test_table_name, + &rows, + ) + .await; + }); +} + +#[test] +fn test_table_write_read_reverse() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + + env.block_on(async { + test_ctx.open().await; + + let test_table = "test_table"; + let fixed_schema_table = test_ctx.create_fixed_schema_table(test_table).await; + + let start_ms = test_ctx.start_ms(); + let rows = [ + ( + "key1", + Timestamp::new(start_ms), + "tag1-1", + 11.0, + 110.0, + "tag2-1", + ), + // update the first row + ( + "key1", + Timestamp::new(start_ms), + "tag1-2", + 12.0, + 110.0, + "tag2-2", + ), + ( + "key1", + Timestamp::new(start_ms + 1), + "tag1-2", + 12.0, + 110.0, + "tag2-2", + ), + ( + "key2", + Timestamp::new(start_ms), + "tag1-3", + 13.0, + 110.0, + "tag2-3", + ), + ( + "key2", + Timestamp::new(start_ms + 1), + "tag1-3", + 13.0, + 110.0, + "tag2-3", + ), + ]; + let expect_reversed_rows = vec![rows[4], rows[3], rows[2], rows[1]]; + let row_group = fixed_schema_table.rows_to_row_group(&rows); + + // Write data to table. 
+ test_ctx.write_to_table(test_table, row_group).await; + + // Read reverse + util::check_read_with_order( + &test_ctx, + &fixed_schema_table, + "Test read write table", + test_table, + &expect_reversed_rows, + ReadOrder::Desc, + ) + .await; + }); +} + +#[test] +fn test_table_write_read_reverse_after_flush() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + + env.block_on(async { + test_ctx.open().await; + + let test_table = "test_table"; + let fixed_schema_table = test_ctx.create_fixed_schema_table(test_table).await; + + let start_ms = test_ctx.start_ms(); + let rows1 = [ + ( + "key1", + Timestamp::new(start_ms), + "tag1-1", + 11.0, + 110.0, + "tag2-1", + ), + ( + "key2", + Timestamp::new(start_ms), + "tag1-3", + 13.0, + 110.0, + "tag2-3", + ), + ( + "key2", + Timestamp::new(start_ms + 1), + "tag1-3", + 13.0, + 110.0, + "tag2-3", + ), + ]; + + let rows2 = vec![ + // update the first row + ( + "key1", + Timestamp::new(start_ms), + "tag1-2", + 12.0, + 110.0, + "tag2-2", + ), + ( + "key1", + Timestamp::new(start_ms + 1), + "tag1-2", + 12.0, + 110.0, + "tag2-2", + ), + ]; + + let expect_reversed_rows = vec![rows1[2], rows1[1], rows2[1], rows2[0]]; + let row_group1 = fixed_schema_table.rows_to_row_group(&rows1); + // Write data to table and flush + test_ctx.write_to_table(test_table, row_group1).await; + test_ctx.flush_table(test_table).await; + + let row_group2 = fixed_schema_table.rows_to_row_group(&rows2); + // Write data to table and not flush + test_ctx.write_to_table(test_table, row_group2).await; + + // Read reverse + util::check_read_with_order( + &test_ctx, + &fixed_schema_table, + "Test read write table", + test_table, + &expect_reversed_rows, + ReadOrder::Desc, + ) + .await; + }); +} diff --git a/analytic_engine/src/tests/row_util.rs b/analytic_engine/src/tests/row_util.rs new file mode 100644 index 0000000000..eaf7b592ed --- /dev/null +++ b/analytic_engine/src/tests/row_util.rs @@ -0,0 +1,93 @@ +// Copyright 2022 CeresDB 
Project Authors. Licensed under Apache-2.0. + +//! Row utils + +use common_types::{datum::Datum, row::Row}; + +pub fn new_row_6(data: (C0, C1, C2, C3, C4, C5)) -> Row +where + C0: Into, + C1: Into, + C2: Into, + C3: Into, + C4: Into, + C5: Into, +{ + let cols = vec![ + data.0.into(), + data.1.into(), + data.2.into(), + data.3.into(), + data.4.into(), + data.5.into(), + ]; + + Row::from_datums(cols) +} + +pub fn assert_row_eq_6(data: (C0, C1, C2, C3, C4, C5), row: Row) +where + C0: Into, + C1: Into, + C2: Into, + C3: Into, + C4: Into, + C5: Into, +{ + let expect_row = new_row_6(data); + assert_eq!(expect_row, row); +} + +pub fn new_row_8(data: (C0, C1, C2, C3, C4, C5, C6, C7)) -> Row +where + C0: Into, + C1: Into, + C2: Into, + C3: Into, + C4: Into, + C5: Into, + C6: Into, + C7: Into, +{ + let cols = vec![ + data.0.into(), + data.1.into(), + data.2.into(), + data.3.into(), + data.4.into(), + data.5.into(), + data.6.into(), + data.7.into(), + ]; + + Row::from_datums(cols) +} + +pub fn new_rows_6(data: &[(C0, C1, C2, C3, C4, C5)]) -> Vec +where + C0: Into + Clone, + C1: Into + Clone, + C2: Into + Clone, + C3: Into + Clone, + C4: Into + Clone, + C5: Into + Clone, +{ + data.iter().cloned().map(new_row_6).collect() +} + +#[allow(clippy::type_complexity)] +pub fn new_rows_8( + data: &[(C0, C1, C2, C3, C4, C5, C6, C7)], +) -> Vec +where + C0: Into + Clone, + C1: Into + Clone, + C2: Into + Clone, + C3: Into + Clone, + C4: Into + Clone, + C5: Into + Clone, + C6: Into + Clone, + C7: Into + Clone, +{ + data.iter().cloned().map(new_row_8).collect() +} diff --git a/analytic_engine/src/tests/table.rs b/analytic_engine/src/tests/table.rs new file mode 100644 index 0000000000..8d3d7a83e1 --- /dev/null +++ b/analytic_engine/src/tests/table.rs @@ -0,0 +1,331 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Utils to create table. 
+ +use std::{collections::HashMap, sync::Arc}; + +use common_types::{ + column_schema, + datum::{Datum, DatumKind}, + projected_schema::ProjectedSchema, + record_batch::RecordBatch, + request_id::RequestId, + row::{Row, RowGroup, RowGroupBuilder}, + schema::{self, Schema}, + time::{TimeRange, Timestamp}, +}; +use common_util::config::ReadableDuration; +use table_engine::{ + self, + engine::{CreateTableRequest, TableState}, + predicate::Predicate, + table::{GetRequest, ReadOptions, ReadOrder, ReadRequest, SchemaId, TableId, TableSeq}, +}; + +use crate::{table_options, tests::row_util}; + +pub fn new_table_id(schema_id: u16, table_seq: u32) -> TableId { + TableId::new(SchemaId::from(schema_id), TableSeq::from(table_seq)) +} + +pub type RowTuple<'a> = (&'a str, Timestamp, &'a str, f64, f64, &'a str); +pub type RowTupleOpt<'a> = ( + &'a str, + Timestamp, + Option<&'a str>, + Option, + Option, + Option<&'a str>, +); +pub type KeyTuple<'a> = (&'a str, Timestamp); + +pub struct FixedSchemaTable { + create_request: CreateTableRequest, +} + +impl FixedSchemaTable { + pub fn builder() -> Builder { + Builder::default() + } + + fn default_schema() -> Schema { + Self::default_schema_builder().build().unwrap() + } + + pub fn default_schema_builder() -> schema::Builder { + create_schema_builder( + // Key columns + &[("key", DatumKind::String), ("ts", DatumKind::Timestamp)], + // Normal columns + &[ + ("string_tag", DatumKind::String), + ("double_field1", DatumKind::Double), + ("double_field2", DatumKind::Double), + ("string_field2", DatumKind::String), + ], + ) + } + + #[inline] + pub fn create_request(&self) -> &CreateTableRequest { + &self.create_request + } + + #[inline] + pub fn segment_duration_ms(&self) -> i64 { + table_options::DEFAULT_SEGMENT_DURATION.as_millis() as i64 + } + + // Format of data: (key string, timestamp, string_tag, double_field1, + // double_field2, string_field2) + fn new_row(data: RowTuple) -> Row { + row_util::new_row_6(data) + } + + pub fn 
rows_to_row_group(&self, data: &[RowTuple]) -> RowGroup { + let rows = data + .iter() + .copied() + .map(FixedSchemaTable::new_row) + .collect(); + + self.new_row_group(rows) + } + + pub fn rows_opt_to_row_group(&self, data: &[RowTupleOpt]) -> RowGroup { + let rows = data + .iter() + .copied() + .map(FixedSchemaTable::new_row_opt) + .collect(); + + self.new_row_group(rows) + } + + fn new_row_group(&self, rows: Vec) -> RowGroup { + RowGroupBuilder::with_rows(self.create_request.table_schema.clone(), rows) + .unwrap() + .build() + } + + fn new_row_opt(data: RowTupleOpt) -> Row { + row_util::new_row_6(data) + } + + pub fn new_read_all_request(&self, opts: ReadOptions, read_order: ReadOrder) -> ReadRequest { + new_read_all_request_with_order(self.create_request.table_schema.clone(), opts, read_order) + } + + pub fn new_get_request(&self, key: KeyTuple) -> GetRequest { + let primary_key = vec![key.0.into(), key.1.into()]; + + GetRequest { + request_id: RequestId::next_id(), + projected_schema: ProjectedSchema::no_projection( + self.create_request.table_schema.clone(), + ), + primary_key, + } + } + + pub fn new_get_request_from_row(&self, data: RowTuple) -> GetRequest { + self.new_get_request((data.0, data.1)) + } + + pub fn assert_batch_eq_to_rows(&self, record_batches: &[RecordBatch], rows: &[RowTuple]) { + let row_group = self.rows_to_row_group(rows); + assert_batch_eq_to_row_group(record_batches, &row_group); + } + + pub fn assert_row_eq(&self, data: RowTuple, row: Row) { + row_util::assert_row_eq_6(data, row); + } +} + +pub fn read_opts_list() -> Vec { + vec![ + ReadOptions::default(), + ReadOptions { + batch_size: 1, + read_parallelism: 1, + }, + ReadOptions { + batch_size: 1, + read_parallelism: 4, + }, + ReadOptions { + batch_size: 100, + read_parallelism: 1, + }, + ReadOptions { + batch_size: 100, + read_parallelism: 4, + }, + ] +} + +pub fn new_read_all_request_with_order( + schema: Schema, + opts: ReadOptions, + order: ReadOrder, +) -> ReadRequest { + 
ReadRequest { + request_id: RequestId::next_id(), + opts, + projected_schema: ProjectedSchema::no_projection(schema), + predicate: Arc::new(Predicate::new(TimeRange::min_to_max())), + order, + } +} + +pub fn new_read_all_request(schema: Schema, opts: ReadOptions) -> ReadRequest { + new_read_all_request_with_order(schema, opts, ReadOrder::None) +} + +pub fn assert_batch_eq_to_row_group(record_batches: &[RecordBatch], row_group: &RowGroup) { + if record_batches.is_empty() { + assert!(row_group.is_empty()); + } + + for record_batch in record_batches { + assert_eq!( + record_batch.schema().columns(), + row_group.schema().columns() + ); + } + + let mut cursor = RecordBatchesCursor::new(record_batches); + + for row in row_group.iter() { + for (column_idx, datum) in row.iter().enumerate() { + assert_eq!( + &cursor.datum(column_idx), + datum, + "record_batches:{:?}, row_group:{:?}", + record_batches, + row_group + ); + } + cursor.step(); + } +} + +struct RecordBatchesCursor<'a> { + record_batches: &'a [RecordBatch], + batch_idx: usize, + row_idx_in_batch: usize, +} + +impl<'a> RecordBatchesCursor<'a> { + fn new(record_batches: &[RecordBatch]) -> RecordBatchesCursor { + RecordBatchesCursor { + record_batches, + batch_idx: 0, + row_idx_in_batch: 0, + } + } + + fn step(&mut self) { + if self.batch_idx >= self.record_batches.len() { + return; + } + + self.row_idx_in_batch += 1; + if self.row_idx_in_batch >= self.record_batches[self.batch_idx].num_rows() { + self.batch_idx += 1; + self.row_idx_in_batch = 0; + } + } + + fn datum(&self, column_idx: usize) -> Datum { + let record_batch = &self.record_batches[self.batch_idx]; + let column_in_batch = record_batch.column(column_idx); + column_in_batch.datum(self.row_idx_in_batch) + } +} + +#[must_use] +pub struct Builder { + create_request: CreateTableRequest, +} + +impl Builder { + pub fn table_name(mut self, table_name: String) -> Self { + self.create_request.table_name = table_name; + self + } + + pub fn table_id(mut self, 
table_id: TableId) -> Self { + self.create_request.table_id = table_id; + self + } + + pub fn enable_ttl(mut self, enable_ttl: bool) -> Self { + self.create_request.options.insert( + table_engine::OPTION_KEY_ENABLE_TTL.to_string(), + enable_ttl.to_string(), + ); + self + } + + pub fn ttl(mut self, duration: ReadableDuration) -> Self { + self.create_request + .options + .insert(table_options::TTL.to_string(), duration.to_string()); + self + } + + pub fn build_fixed(self) -> FixedSchemaTable { + FixedSchemaTable { + create_request: self.create_request, + } + } +} + +impl Default for Builder { + fn default() -> Self { + Self { + create_request: CreateTableRequest { + catalog_name: "ceresdb".to_string(), + schema_name: "public".to_string(), + table_id: new_table_id(2, 1), + table_name: "test_table".to_string(), + table_schema: FixedSchemaTable::default_schema(), + partition_info: None, + engine: table_engine::ANALYTIC_ENGINE_TYPE.to_string(), + options: HashMap::new(), + state: TableState::Stable, + }, + } + } +} + +// Format of input slice: &[ ( column name, column type ) ] +pub fn create_schema_builder( + key_tuples: &[(&str, DatumKind)], + normal_tuples: &[(&str, DatumKind)], +) -> schema::Builder { + assert!(!key_tuples.is_empty()); + + let mut schema_builder = schema::Builder::with_capacity(key_tuples.len() + normal_tuples.len()) + .auto_increment_column_id(true); + + for tuple in key_tuples { + // Key column is not nullable. 
+ let column_schema = column_schema::Builder::new(tuple.0.to_string(), tuple.1) + .is_nullable(false) + .build() + .expect("Should succeed to build key column schema"); + schema_builder = schema_builder.add_key_column(column_schema).unwrap(); + } + + for tuple in normal_tuples { + let column_schema = column_schema::Builder::new(tuple.0.to_string(), tuple.1) + .is_nullable(true) + .build() + .expect("Should succeed to build normal column schema"); + schema_builder = schema_builder.add_normal_column(column_schema).unwrap(); + } + + schema_builder +} diff --git a/analytic_engine/src/tests/util.rs b/analytic_engine/src/tests/util.rs new file mode 100644 index 0000000000..31afc1b582 --- /dev/null +++ b/analytic_engine/src/tests/util.rs @@ -0,0 +1,404 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Test utils. + +use std::{collections::HashMap, future::Future, sync::Arc}; + +use common_types::{ + datum::Datum, + record_batch::RecordBatch, + row::{Row, RowGroup}, + time::Timestamp, +}; +use common_util::{config::ReadableDuration, runtime}; +use futures::stream::StreamExt; +use log::info; +use table_engine::{ + engine::{ + CreateTableRequest, DropTableRequest, EngineRuntimes, OpenTableRequest, + Result as EngineResult, TableEngine, + }, + table::{ + AlterSchemaRequest, FlushRequest, GetRequest, ReadOrder, ReadRequest, Result, TableId, + TableRef, WriteRequest, + }, +}; +use tempfile::TempDir; + +use crate::{ + setup, + tests::table::{self, FixedSchemaTable, RowTuple}, + AnalyticTableEngine, Config, EngineInstance, +}; + +const DAY_MS: i64 = 24 * 60 * 60 * 1000; + +/// Helper struct to create a null datum. 
+pub struct Null; + +impl From for Datum { + fn from(_data: Null) -> Datum { + Datum::Null + } +} + +pub async fn check_read_with_order( + test_ctx: &TestContext, + fixed_schema_table: &FixedSchemaTable, + msg: &str, + table_name: &str, + rows: &[RowTuple<'_>], + read_order: ReadOrder, +) { + for read_opts in table::read_opts_list() { + info!("{}, opts:{:?}", msg, read_opts); + + let record_batches = test_ctx + .read_table( + table_name, + fixed_schema_table.new_read_all_request(read_opts, read_order), + ) + .await; + + fixed_schema_table.assert_batch_eq_to_rows(&record_batches, rows); + } +} + +pub async fn check_read( + test_ctx: &TestContext, + fixed_schema_table: &FixedSchemaTable, + msg: &str, + table_name: &str, + rows: &[RowTuple<'_>], +) { + check_read_with_order( + test_ctx, + fixed_schema_table, + msg, + table_name, + rows, + ReadOrder::None, + ) + .await +} + +pub async fn check_get( + test_ctx: &TestContext, + fixed_schema_table: &FixedSchemaTable, + msg: &str, + table_name: &str, + rows: &[RowTuple<'_>], +) { + for row_data in rows { + let request = fixed_schema_table.new_get_request_from_row(*row_data); + + info!("{}, request:{:?}, row_data:{:?}", msg, request, row_data); + + let row = test_ctx.get_from_table(table_name, request).await.unwrap(); + + fixed_schema_table.assert_row_eq(*row_data, row); + } +} + +pub struct TestContext { + pub config: Config, + runtimes: Arc, + pub engine: Option, + last_table_seq: u32, + + name_to_tables: HashMap, +} + +impl TestContext { + pub async fn open(&mut self) { + let engine = setup::open_analytic_table_engine(self.config.clone(), self.runtimes.clone()) + .await + .unwrap(); + + self.engine = Some(engine); + } + + pub async fn reopen(&mut self) { + { + // Close all tables. + self.name_to_tables.clear(); + + // Close engine. 
+ let engine = self.engine.take().unwrap(); + engine.close().await.unwrap(); + } + + self.open().await; + } + + pub async fn reopen_with_tables(&mut self, tables: &[&str]) { + { + // Close all tables. + self.name_to_tables.clear(); + + // Close engine. + let engine = self.engine.take().unwrap(); + engine.close().await.unwrap(); + } + + self.open().await; + + for name in tables { + self.open_table(name).await; + } + } + + async fn open_table(&mut self, table_name: &str) { + let table = self + .engine() + .open_table(OpenTableRequest { + catalog_name: "ceresdb".to_string(), + schema_name: "public".to_string(), + table_name: table_name.to_string(), + engine: table_engine::ANALYTIC_ENGINE_TYPE.to_string(), + }) + .await + .unwrap() + .unwrap(); + + self.name_to_tables.insert(table_name.to_string(), table); + } + + pub async fn try_open_table(&mut self, table_name: &str) -> EngineResult> { + let table_opt = self + .engine() + .open_table(OpenTableRequest { + catalog_name: "ceresdb".to_string(), + schema_name: "public".to_string(), + table_name: table_name.to_string(), + engine: table_engine::ANALYTIC_ENGINE_TYPE.to_string(), + }) + .await?; + + let table = match table_opt { + Some(v) => v, + None => return Ok(None), + }; + + self.name_to_tables + .insert(table_name.to_string(), table.clone()); + + Ok(Some(table)) + } + + pub async fn drop_table(&mut self, table_name: &str) -> bool { + let request = DropTableRequest { + catalog_name: "ceresdb".to_string(), + schema_name: "public".to_string(), + table_name: table_name.to_string(), + engine: table_engine::ANALYTIC_ENGINE_TYPE.to_string(), + }; + + let ret = self.engine().drop_table(request).await.unwrap(); + + self.name_to_tables.remove(table_name); + + ret + } + + /// 3 days ago. 
+ pub fn start_ms(&self) -> i64 { + Timestamp::now().as_i64() - 3 * DAY_MS + } + + pub async fn create_fixed_schema_table(&mut self, table_name: &str) -> FixedSchemaTable { + let fixed_schema_table = FixedSchemaTable::builder() + .table_name(table_name.to_string()) + .table_id(self.next_table_id()) + .ttl("7d".parse::().unwrap()) + .build_fixed(); + + self.create_table(fixed_schema_table.create_request().clone()) + .await; + + fixed_schema_table + } + + async fn create_table(&mut self, create_request: CreateTableRequest) { + let table_name = create_request.table_name.clone(); + let table = self.engine().create_table(create_request).await.unwrap(); + + self.name_to_tables.insert(table_name.to_string(), table); + } + + pub async fn write_to_table(&self, table_name: &str, row_group: RowGroup) { + let table = self.table(table_name); + + table.write(WriteRequest { row_group }).await.unwrap(); + } + + pub async fn read_table( + &self, + table_name: &str, + read_request: ReadRequest, + ) -> Vec { + let table = self.table(table_name); + + let mut stream = table.read(read_request).await.unwrap(); + let mut record_batches = Vec::new(); + while let Some(batch) = stream.next().await { + let batch = batch.unwrap(); + + record_batches.push(batch); + } + + record_batches + } + + pub async fn partitioned_read_table( + &self, + table_name: &str, + read_request: ReadRequest, + ) -> Vec { + let table = self.table(table_name); + + let streams = table.partitioned_read(read_request).await.unwrap(); + let mut record_batches = Vec::new(); + + for mut stream in streams.streams { + while let Some(batch) = stream.next().await { + let batch = batch.unwrap(); + + record_batches.push(batch); + } + } + + record_batches + } + + pub async fn get_from_table(&self, table_name: &str, request: GetRequest) -> Option { + let table = self.table(table_name); + + table.get(request).await.unwrap() + } + + pub async fn flush_table(&self, table_name: &str) { + let table = self.table(table_name); + + 
table.flush(FlushRequest::default()).await.unwrap(); + } + + pub async fn flush_table_with_request(&self, table_name: &str, request: FlushRequest) { + let table = self.table(table_name); + + table.flush(request).await.unwrap(); + } + + pub async fn compact_table(&self, table_name: &str) { + let table = self.table(table_name); + + table.compact().await.unwrap(); + } + + pub async fn try_alter_schema( + &self, + table_name: &str, + request: AlterSchemaRequest, + ) -> Result { + let table = self.table(table_name); + + table.alter_schema(request).await + } + + pub async fn try_alter_options( + &self, + table_name: &str, + opts: HashMap, + ) -> Result { + let table = self.table(table_name); + + table.alter_options(opts).await + } + + pub fn table(&self, table_name: &str) -> TableRef { + self.name_to_tables.get(table_name).cloned().unwrap() + } + + #[inline] + pub fn engine(&self) -> AnalyticTableEngine { + self.engine.clone().unwrap() + } + + #[inline] + pub fn instance(&self) -> EngineInstance { + self.engine().instance() + } + + fn next_table_id(&mut self) -> TableId { + self.last_table_seq += 1; + table::new_table_id(2, self.last_table_seq) + } +} + +pub struct TestEnv { + _dir: TempDir, + pub config: Config, + pub runtimes: Arc, +} + +impl TestEnv { + pub fn builder() -> Builder { + Builder::default() + } + + pub fn new_context(&self) -> TestContext { + TestContext { + config: self.config.clone(), + runtimes: self.runtimes.clone(), + engine: None, + last_table_seq: 1, + name_to_tables: HashMap::new(), + } + } + + pub fn block_on(&self, future: F) -> F::Output { + self.runtimes.bg_runtime.block_on(future) + } +} + +pub struct Builder { + num_workers: usize, +} + +impl Builder { + pub fn build(self) -> TestEnv { + // Init log for test. 
+ common_util::tests::init_log_for_test(); + + let dir = tempfile::tempdir().unwrap(); + + let config = Config { + data_path: dir.path().to_str().unwrap().to_string(), + ..Default::default() + }; + + let runtime = Arc::new( + runtime::Builder::default() + .worker_threads(self.num_workers) + .enable_all() + .build() + .unwrap(), + ); + + TestEnv { + _dir: dir, + config, + runtimes: Arc::new(EngineRuntimes { + read_runtime: runtime.clone(), + write_runtime: runtime.clone(), + bg_runtime: runtime, + }), + } + } +} + +impl Default for Builder { + fn default() -> Self { + Self { num_workers: 2 } + } +} diff --git a/arrow_deps/Cargo.toml b/arrow_deps/Cargo.toml new file mode 100644 index 0000000000..e7cac70aa2 --- /dev/null +++ b/arrow_deps/Cargo.toml @@ -0,0 +1,19 @@ +[package] +name = "arrow_deps" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +arrow = "7.0.0" +parquet = "7.0.0" + +[dependencies.uncover] +git = "https://github.com/matklad/uncover.git" +rev = "1d0770d997e29731b287e9e11e4ffbbea5f456da" + +[dependencies.datafusion] +git = "https://github.com/apache/arrow-datafusion.git" +rev = "444c153863520072ea22d4f8c498dee39437516d" diff --git a/arrow_deps/src/display.rs b/arrow_deps/src/display.rs new file mode 100644 index 0000000000..be037d882e --- /dev/null +++ b/arrow_deps/src/display.rs @@ -0,0 +1,428 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Functions for printing array values, as strings, for debugging +//! purposes. See the `pretty` crate for additional functions for +//! record batch pretty printing. + +// Copy from arrow + +use std::sync::Arc; + +use arrow::{ + array::{self, Array, DictionaryArray}, + datatypes::{ + ArrowNativeType, ArrowPrimitiveType, DataType, Int16Type, Int32Type, Int64Type, Int8Type, + IntervalUnit, TimeUnit, UInt16Type, UInt32Type, UInt64Type, UInt8Type, + }, + error::{ArrowError, Result}, +}; + +macro_rules! make_string { + ($array_type:ty, $column: ident, $row: ident) => {{ + let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); + + let s = if array.is_null($row) { + "".to_string() + } else { + array.value($row).to_string() + }; + + Ok(s) + }}; +} + +macro_rules! make_string_interval_year_month { + ($column: ident, $row: ident) => {{ + let array = $column + .as_any() + .downcast_ref::() + .unwrap(); + + let s = if array.is_null($row) { + "NULL".to_string() + } else { + let interval = array.value($row) as f64; + let years = (interval / 12_f64).floor(); + let month = interval - (years * 12_f64); + + format!( + "{} years {} mons 0 days 0 hours 0 mins 0.00 secs", + years, month, + ) + }; + + Ok(s) + }}; +} + +macro_rules! 
make_string_interval_day_time { + ($column: ident, $row: ident) => {{ + let array = $column + .as_any() + .downcast_ref::() + .unwrap(); + + let s = if array.is_null($row) { + "NULL".to_string() + } else { + let value: u64 = array.value($row) as u64; + + let days_parts: i32 = ((value & 0xFFFFFFFF00000000) >> 32) as i32; + let milliseconds_part: i32 = (value & 0xFFFFFFFF) as i32; + + let secs = milliseconds_part / 1000; + let mins = secs / 60; + let hours = mins / 60; + + let secs = secs - (mins * 60); + let mins = mins - (hours * 60); + + format!( + "0 years 0 mons {} days {} hours {} mins {}.{:02} secs", + days_parts, + hours, + mins, + secs, + (milliseconds_part % 1000), + ) + }; + + Ok(s) + }}; +} + +macro_rules! make_string_interval_month_day_nano { + ($column: ident, $row: ident) => {{ + let array = $column + .as_any() + .downcast_ref::() + .unwrap(); + + let s = if array.is_null($row) { + "NULL".to_string() + } else { + let value: u128 = array.value($row) as u128; + + let months_part: i32 = ((value & 0xFFFFFFFF000000000000000000000000) >> 96) as i32; + let days_part: i32 = ((value & 0xFFFFFFFF0000000000000000) >> 64) as i32; + let nanoseconds_part: i64 = (value & 0xFFFFFFFFFFFFFFFF) as i64; + + let secs = nanoseconds_part / 1000000000; + let mins = secs / 60; + let hours = mins / 60; + + let secs = secs - (mins * 60); + let mins = mins - (hours * 60); + + format!( + "0 years {} mons {} days {} hours {} mins {}.{:02} secs", + months_part, + days_part, + hours, + mins, + secs, + (nanoseconds_part % 1000000000), + ) + }; + + Ok(s) + }}; +} + +macro_rules! make_string_date { + ($array_type:ty, $column: ident, $row: ident) => {{ + let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); + + let s = if array.is_null($row) { + "".to_string() + } else { + array + .value_as_date($row) + .map(|d| d.to_string()) + .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string()) + }; + + Ok(s) + }}; +} + +macro_rules! 
make_string_time { + ($array_type:ty, $column: ident, $row: ident) => {{ + let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); + + let s = if array.is_null($row) { + "".to_string() + } else { + array + .value_as_time($row) + .map(|d| d.to_string()) + .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string()) + }; + + Ok(s) + }}; +} + +macro_rules! make_string_datetime { + ($array_type:ty, $column: ident, $row: ident) => {{ + let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); + + let s = if array.is_null($row) { + "".to_string() + } else { + array + .value_as_datetime($row) + .map(|d| d.to_string()) + .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string()) + }; + + Ok(s) + }}; +} + +// It's not possible to do array.value($row).to_string() for &[u8], let's format +// it as hex +macro_rules! make_string_hex { + ($array_type:ty, $column: ident, $row: ident) => {{ + let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); + + let s = if array.is_null($row) { + "".to_string() + } else { + let mut tmp = "".to_string(); + + for character in array.value($row) { + tmp += &format!("{:02x}", character); + } + + tmp + }; + + Ok(s) + }}; +} + +macro_rules! make_string_from_list { + ($column: ident, $row: ident) => {{ + let list = $column + .as_any() + .downcast_ref::() + .ok_or(ArrowError::InvalidArgumentError(format!( + "Repl error: could not convert list column to list array." + )))? + .value($row); + let string_values = (0..list.len()) + .map(|i| array_value_to_string(&list.clone(), i)) + .collect::>>()?; + Ok(format!("[{}]", string_values.join(", "))) + }}; +} + +macro_rules! make_string_from_fixed_size_list { + ($column: ident, $row: ident) => {{ + let list = $column + .as_any() + .downcast_ref::() + .ok_or(ArrowError::InvalidArgumentError(format!( + "Repl error: could not convert list column to list array." + )))? 
+ .value($row); + let string_values = (0..list.len()) + .map(|i| array_value_to_string(&list.clone(), i)) + .collect::>>()?; + Ok(format!("[{}]", string_values.join(", "))) + }}; +} + +#[inline(always)] +pub fn make_string_from_decimal(column: &Arc, row: usize) -> Result { + let array = column + .as_any() + .downcast_ref::() + .unwrap(); + + let formatted_decimal = array.value_as_string(row); + Ok(formatted_decimal) +} + +fn append_struct_field_string( + target: &mut String, + name: &str, + field_col: &Arc, + row: usize, +) -> Result<()> { + target.push('"'); + target.push_str(name); + target.push_str("\": "); + + if field_col.is_null(row) { + target.push_str("null"); + } else { + match field_col.data_type() { + DataType::Utf8 | DataType::LargeUtf8 => { + target.push('"'); + target.push_str(array_value_to_string(field_col, row)?.as_str()); + target.push('"'); + } + _ => { + target.push_str(array_value_to_string(field_col, row)?.as_str()); + } + } + } + + Ok(()) +} + +/// Get the value at the given row in an array as a String. +/// +/// Note this function is quite inefficient and is unlikely to be +/// suitable for converting large arrays or record batches. 
+pub fn array_value_to_string(column: &array::ArrayRef, row: usize) -> Result { + if column.is_null(row) { + return Ok("".to_string()); + } + match column.data_type() { + DataType::Utf8 => make_string!(array::StringArray, column, row), + DataType::LargeUtf8 => make_string!(array::LargeStringArray, column, row), + DataType::Binary => make_string_hex!(array::BinaryArray, column, row), + DataType::LargeBinary => make_string_hex!(array::LargeBinaryArray, column, row), + DataType::FixedSizeBinary(_) => { + make_string_hex!(array::FixedSizeBinaryArray, column, row) + } + DataType::Boolean => make_string!(array::BooleanArray, column, row), + DataType::Int8 => make_string!(array::Int8Array, column, row), + DataType::Int16 => make_string!(array::Int16Array, column, row), + DataType::Int32 => make_string!(array::Int32Array, column, row), + DataType::Int64 => make_string!(array::Int64Array, column, row), + DataType::UInt8 => make_string!(array::UInt8Array, column, row), + DataType::UInt16 => make_string!(array::UInt16Array, column, row), + DataType::UInt32 => make_string!(array::UInt32Array, column, row), + DataType::UInt64 => make_string!(array::UInt64Array, column, row), + DataType::Float16 => make_string!(array::Float16Array, column, row), + DataType::Float32 => make_string!(array::Float32Array, column, row), + DataType::Float64 => make_string!(array::Float64Array, column, row), + DataType::Decimal(..) 
=> make_string_from_decimal(column, row), + DataType::Timestamp(unit, _) if *unit == TimeUnit::Second => { + make_string_datetime!(array::TimestampSecondArray, column, row) + } + DataType::Timestamp(unit, _) if *unit == TimeUnit::Millisecond => { + make_string_datetime!(array::TimestampMillisecondArray, column, row) + } + DataType::Timestamp(unit, _) if *unit == TimeUnit::Microsecond => { + make_string_datetime!(array::TimestampMicrosecondArray, column, row) + } + DataType::Timestamp(unit, _) if *unit == TimeUnit::Nanosecond => { + make_string_datetime!(array::TimestampNanosecondArray, column, row) + } + DataType::Date32 => make_string_date!(array::Date32Array, column, row), + DataType::Date64 => make_string_date!(array::Date64Array, column, row), + DataType::Time32(unit) if *unit == TimeUnit::Second => { + make_string_time!(array::Time32SecondArray, column, row) + } + DataType::Time32(unit) if *unit == TimeUnit::Millisecond => { + make_string_time!(array::Time32MillisecondArray, column, row) + } + DataType::Time64(unit) if *unit == TimeUnit::Microsecond => { + make_string_time!(array::Time64MicrosecondArray, column, row) + } + DataType::Time64(unit) if *unit == TimeUnit::Nanosecond => { + make_string_time!(array::Time64NanosecondArray, column, row) + } + DataType::Interval(unit) => match unit { + IntervalUnit::DayTime => { + make_string_interval_day_time!(column, row) + } + IntervalUnit::YearMonth => { + make_string_interval_year_month!(column, row) + } + IntervalUnit::MonthDayNano => { + make_string_interval_month_day_nano!(column, row) + } + }, + DataType::List(_) => make_string_from_list!(column, row), + DataType::Dictionary(index_type, _value_type) => match **index_type { + DataType::Int8 => dict_array_value_to_string::(column, row), + DataType::Int16 => dict_array_value_to_string::(column, row), + DataType::Int32 => dict_array_value_to_string::(column, row), + DataType::Int64 => dict_array_value_to_string::(column, row), + DataType::UInt8 => 
dict_array_value_to_string::(column, row), + DataType::UInt16 => dict_array_value_to_string::(column, row), + DataType::UInt32 => dict_array_value_to_string::(column, row), + DataType::UInt64 => dict_array_value_to_string::(column, row), + _ => Err(ArrowError::InvalidArgumentError(format!( + "Pretty printing not supported for {:?} due to index type", + column.data_type() + ))), + }, + DataType::FixedSizeList(_, _) => make_string_from_fixed_size_list!(column, row), + DataType::Struct(_) => { + let st = column + .as_any() + .downcast_ref::() + .ok_or_else(|| { + ArrowError::InvalidArgumentError( + "Repl error: could not convert struct column to struct array.".to_string(), + ) + })?; + + let mut s = String::new(); + s.push('{'); + let mut kv_iter = st.columns().into_iter().zip(st.column_names().into_iter()); + if let Some((col, name)) = kv_iter.next() { + append_struct_field_string(&mut s, name, col, row)?; + } + for (col, name) in kv_iter { + s.push_str(", "); + append_struct_field_string(&mut s, name, col, row)?; + } + s.push('}'); + + Ok(s) + } + _ => Err(ArrowError::InvalidArgumentError(format!( + "Pretty printing not implemented for {:?} type", + column.data_type() + ))), + } +} + +/// Converts the value of the dictionary array at `row` to a String +fn dict_array_value_to_string( + colum: &array::ArrayRef, + row: usize, +) -> Result { + let dict_array = colum.as_any().downcast_ref::>().unwrap(); + + let keys_array = dict_array.keys(); + + if keys_array.is_null(row) { + return Ok(String::from("")); + } + + let dict_index = keys_array.value(row).to_usize().ok_or_else(|| { + ArrowError::InvalidArgumentError(format!( + "Can not convert value {:?} at index {:?} to usize for string conversion.", + keys_array.value(row), + row + )) + })?; + + array_value_to_string(dict_array.values(), dict_index) +} diff --git a/arrow_deps/src/lib.rs b/arrow_deps/src/lib.rs new file mode 100644 index 0000000000..b1ead9249e --- /dev/null +++ b/arrow_deps/src/lib.rs @@ -0,0 +1,14 @@ +// 
Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! This crate exists to add a dependency on (likely as yet +//! unpublished) versions of arrow / datafusion so we can +//! manage the version used by ceresdbx in a single crate. + +pub mod display; +pub mod util; + +// export arrow and datafusion publically so we can have a single +// reference in cargo +pub use arrow; +pub use datafusion; +pub use parquet; diff --git a/arrow_deps/src/util.rs b/arrow_deps/src/util.rs new file mode 100644 index 0000000000..661fa919dd --- /dev/null +++ b/arrow_deps/src/util.rs @@ -0,0 +1,133 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! utilities for manipulating arrow/parquet/datafusion data structures. + +use std::convert::TryFrom; + +use arrow::{ + array::UInt32Array, + compute, + error::{ArrowError, Result}, + record_batch::RecordBatch, +}; + +/// Select the data in the [`RecordBatch`] by read and copy from the source +/// `batch`. +pub fn select_record_batch(batch: &RecordBatch, selected_rows: &[bool]) -> Result { + assert_eq!(batch.num_rows(), selected_rows.len()); + let selected_columns = { + // ensure the the selected_rows.len() is not greater than u32::MAX. + let _ = u32::try_from(selected_rows.len()).map_err(|e| { + ArrowError::InvalidArgumentError(format!( + "too many rows in a batch, convert usize to u32 failed, num_rows:{}, err:{}", + batch.num_rows(), + e + )) + })?; + + let selected_index_iter = selected_rows + .iter() + .enumerate() + .filter_map(|(idx, selected)| if *selected { Some(idx as u32) } else { None }); + // TODO(xikai): avoid this memory allocation. 
+ let indices = UInt32Array::from_iter_values(selected_index_iter); + + let mut cols = Vec::with_capacity(batch.num_columns()); + for orig_col_data in batch.columns() { + let new_col_data = compute::take(orig_col_data.as_ref(), &indices, None)?; + cols.push(new_col_data); + } + + cols + }; + + RecordBatch::try_new(batch.schema(), selected_columns) +} + +/// Reverse the data in the [`RecordBatch`] by read and copy from the source +/// `batch`. +pub fn reverse_record_batch(batch: &RecordBatch) -> Result { + let reversed_columns = { + let num_rows = u32::try_from(batch.num_rows()).map_err(|e| { + ArrowError::InvalidArgumentError(format!( + "too many rows in a batch, convert usize to u32 failed, num_rows:{}, err:{}", + batch.num_rows(), + e + )) + })?; + // TODO(xikai): avoid this memory allocation. + let indices = UInt32Array::from_iter_values((0..num_rows).into_iter().rev()); + + let mut cols = Vec::with_capacity(batch.num_columns()); + for orig_col_data in batch.columns() { + let new_col_data = compute::take(orig_col_data.as_ref(), &indices, None)?; + cols.push(new_col_data); + } + + cols + }; + + RecordBatch::try_new(batch.schema(), reversed_columns) +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use arrow::{ + array::Int32Array, + datatypes::{DataType, Field, Schema}, + }; + + use super::*; + + #[test] + fn test_reverse_record_batch() { + let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])); + let (ids, reverse_ids) = { + let mut source = vec![1, 2, 3, 4, 5]; + let arr = Int32Array::from(source.clone()); + source.reverse(); + let reversed_arr = Int32Array::from(source); + (arr, reversed_arr) + }; + + let batch = + RecordBatch::try_new(schema.clone(), vec![Arc::new(ids)]).expect("build record batch"); + let expect_reversed_batch = + RecordBatch::try_new(schema, vec![Arc::new(reverse_ids)]).expect("build record batch"); + let reversed_batch = reverse_record_batch(&batch).expect("reverse record batch"); + + 
assert_eq!(expect_reversed_batch, reversed_batch); + } + + #[test] + fn test_reverse_empty_record_batch() { + let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])); + let arr = Int32Array::from(Vec::::new()); + + let batch = RecordBatch::try_new(schema, vec![Arc::new(arr)]).expect("build record batch"); + let reversed_batch = reverse_record_batch(&batch).expect("reverse record batch"); + + assert_eq!(batch, reversed_batch); + } + + #[test] + fn test_select_record_batch() { + let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])); + let (ids, expect_selected_ids, selected_rows) = { + let arr = Int32Array::from(vec![1, 2, 3, 4, 5]); + let selected_arr = Int32Array::from(vec![2, 3, 5]); + (arr, selected_arr, vec![false, true, true, false, true]) + }; + let batch = + RecordBatch::try_new(schema.clone(), vec![Arc::new(ids)]).expect("build record batch"); + let selected_batch = + select_record_batch(&batch, &selected_rows).expect("select record batch"); + let expect_selected_batch = + RecordBatch::try_new(schema, vec![Arc::new(expect_selected_ids)]) + .expect("build record batch"); + + assert_eq!(selected_batch, expect_selected_batch); + } +} diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml new file mode 100644 index 0000000000..e453bd8eb6 --- /dev/null +++ b/benchmarks/Cargo.toml @@ -0,0 +1,34 @@ +[package] +name = "benchmarks" +version = "0.1.0" +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +arena = { path = "../components/arena" } +arrow2 = { version = "0.7.0", features = [ "io_parquet" ] } +arrow_deps = { path = "../arrow_deps" } +analytic_engine = { path = "../analytic_engine" } +clap = "2.0" +common_types = { path = "../common_types" } +common_util = { path = "../common_util" } +env_logger = "0.6" +futures = "0.3" +log = "0.4" +object_store = { path = "../components/object_store" } +parquet = { path = 
"../components/parquet"} +serde = "1.0" +serde_derive = "1.0" +table_engine = { path = "../table_engine" } +tokio = { version = "1.0", features = ["sync"] } + +[dev-dependencies] +criterion = "0.3" + +[[bench]] +name = "bench" +harness = false + +[[bin]] +name = "sst-tools" diff --git a/benchmarks/README.md b/benchmarks/README.md new file mode 100644 index 0000000000..65cc001e80 --- /dev/null +++ b/benchmarks/README.md @@ -0,0 +1,25 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +# Benchmarks + +## Test Data +todo + +## Config +A config template can be found in `config/bench.toml`. + +## Run benchmarks +In root directory of `ceresdbx` (not this directory `ceresdbx/benchmarks`), run the following command: +```bash +ANALYTIC_BENCH_CONFIG_PATH=/path/to/bench.toml cargo bench -p benchmarks +``` + +Print logs: +```bash +RUST_LOG=info ANALYTIC_BENCH_CONFIG_PATH=/path/to/bench.toml cargo bench -p benchmarks +``` + +Run specific bench: +```bash +ANALYTIC_BENCH_CONFIG_PATH=/path/to/bench.toml cargo bench -p benchmarks -- read_parquet +``` diff --git a/benchmarks/bench.toml b/benchmarks/bench.toml new file mode 100644 index 0000000000..e182151bdb --- /dev/null +++ b/benchmarks/bench.toml @@ -0,0 +1,45 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +[sst_bench] +store_path = "/Users/chunshao.rcs/Desktop/work/gitlab/ceresdbx/1/1" +# store_path = "/Users/yingwen.yyw/data/antmonitor_mid_table_4022" +sst_file_name = "37.sst" +runtime_thread_num = 1 +bench_measurement_time = "30s" +max_projections = 5 +read_batch_row_num = 500 +sst_meta_cache_cap = 1000 +sst_data_cache_cap = 10000 + +[sst_bench.predicate] +# start_time_ms = 0 +start_time_ms = 1632985200000 +# end_time_ms = 0 +end_time_ms = 1632985800000 + +[merge_sst_bench] +store_path = "/Users/chunshao.rcs/Desktop/work/gitlab/ceresdbx" +space_id = 1 +table_id = 1 +sst_file_ids = [ 34, 37 ] +runtime_thread_num = 1 +bench_measurement_time = "120s" +max_projections = 5 +read_batch_row_num = 500 + +[merge_sst_bench.predicate] +start_time_ms = 0 +# start_time_ms = 1632985200000 +end_time_ms = 0 +# end_time_ms = 1632985800000 + +[scan_memtable_bench] +store_path = "/Users/chunshao.rcs/Desktop/work/gitlab/ceresdbx/1/1" +sst_file_name = "37.sst" +runtime_thread_num = 1 +max_projections = 5 +arena_block_size = "64M" + +[wal_row_bench] +rows_num = 100_0000 +test_num = 3 \ No newline at end of file diff --git a/benchmarks/benches/bench.rs b/benchmarks/benches/bench.rs new file mode 100644 index 0000000000..26ee634424 --- /dev/null +++ b/benchmarks/benches/bench.rs @@ -0,0 +1,208 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Benchmarks + +use std::sync::Once; + +use benchmarks::{ + arrow2_bench::Arrow2Bench, + config::{self, BenchConfig}, + merge_memtable_bench::MergeMemTableBench, + merge_sst_bench::MergeSstBench, + parquet_bench::ParquetBench, + scan_memtable_bench::ScanMemTableBench, + sst_bench::SstBench, +}; +use criterion::*; + +static INIT_LOG: Once = Once::new(); + +pub fn init_bench() -> BenchConfig { + INIT_LOG.call_once(|| { + env_logger::init(); + }); + + config::bench_config_from_env() +} + +fn bench_read_sst_iter(b: &mut Bencher<'_>, bench: &SstBench) { + b.iter(|| { + bench.run_bench(); + }) +} + +fn bench_read_sst(c: &mut Criterion) { + let config = init_bench(); + + let mut group = c.benchmark_group("read_sst"); + group.measurement_time(config.sst_bench.bench_measurement_time.0); + group.sample_size(config.sst_bench.bench_sample_size); + + let mut bench = SstBench::new(config.sst_bench); + + for i in 0..bench.num_benches() { + bench.init_for_bench(i); + + group.bench_with_input( + BenchmarkId::new("read_sst", format!("{}/{}", bench.sst_file_name, i)), + &bench, + bench_read_sst_iter, + ); + } + + group.finish(); +} + +fn bench_merge_sst_iter(b: &mut Bencher<'_>, bench: &MergeSstBench) { + b.iter(|| bench.run_bench()) +} + +fn bench_merge_sst(c: &mut Criterion) { + let config = init_bench(); + + let mut group = c.benchmark_group("merge_sst"); + + group.measurement_time(config.merge_sst_bench.bench_measurement_time.0); + group.sample_size(config.sst_bench.bench_sample_size); + + let sst_file_ids = format!("{:?}", config.merge_sst_bench.sst_file_ids); + let mut bench = MergeSstBench::new(config.merge_sst_bench); + + for i in 0..bench.num_benches() { + bench.init_for_bench(i, true); + group.bench_with_input( + BenchmarkId::new("merge_sst", format!("{}/{}/dedup", sst_file_ids, i)), + &bench, + bench_merge_sst_iter, + ); + + bench.init_for_bench(i, false); + group.bench_with_input( + BenchmarkId::new("merge_sst", format!("{}/{}/no-dedup", sst_file_ids, i)), + &bench, + 
bench_merge_sst_iter, + ); + } + + group.finish(); +} + +fn bench_parquet_iter(b: &mut Bencher<'_>, bench: &ParquetBench) { + b.iter(|| bench.run_bench()) +} + +fn bench_parquet(c: &mut Criterion) { + let config = init_bench(); + + let mut group = c.benchmark_group("read_parquet"); + + group.measurement_time(config.sst_bench.bench_measurement_time.0); + group.sample_size(config.sst_bench.bench_sample_size); + + let mut bench = ParquetBench::new(config.sst_bench); + + for i in 0..bench.num_benches() { + bench.init_for_bench(i); + + group.bench_with_input( + BenchmarkId::new("read_parquet", format!("{}/{}", bench.sst_file_name, i)), + &bench, + bench_parquet_iter, + ); + } + + group.finish(); +} + +fn bench_scan_memtable_iter(b: &mut Bencher<'_>, bench: &ScanMemTableBench) { + b.iter(|| bench.run_bench()) +} + +fn bench_scan_memtable(c: &mut Criterion) { + let config = init_bench(); + + let mut group = c.benchmark_group("scan_memtable"); + + let mut bench = ScanMemTableBench::new(config.scan_memtable_bench); + + for i in 0..bench.num_benches() { + bench.init_for_bench(i); + + group.bench_with_input( + BenchmarkId::new("scan_memtable", i), + &bench, + bench_scan_memtable_iter, + ); + } + + group.finish(); +} + +fn bench_merge_memtable_iter(b: &mut Bencher<'_>, bench: &MergeMemTableBench) { + b.iter(|| bench.run_bench()) +} + +fn bench_merge_memtable(c: &mut Criterion) { + let config = init_bench(); + + let mut group = c.benchmark_group("merge_memtable"); + + let sst_file_ids = format!("{:?}", config.merge_memtable_bench.sst_file_ids); + let mut bench = MergeMemTableBench::new(config.merge_memtable_bench); + + for i in 0..bench.num_benches() { + bench.init_for_bench(i, true); + group.bench_with_input( + BenchmarkId::new("merge_memtable", format!("{}/{}/dedup", sst_file_ids, i)), + &bench, + bench_merge_memtable_iter, + ); + + bench.init_for_bench(i, false); + group.bench_with_input( + BenchmarkId::new("merge_memtable", format!("{}/{}/no-dedup", sst_file_ids, i)), + 
&bench, + bench_merge_memtable_iter, + ); + } + + group.finish(); +} + +fn bench_arrow2_iter(b: &mut Bencher<'_>, bench: &Arrow2Bench) { + b.iter(|| bench.run_bench()) +} + +fn bench_arrow2(c: &mut Criterion) { + let config = init_bench(); + + let mut group = c.benchmark_group("read_arrow2"); + + group.measurement_time(config.sst_bench.bench_measurement_time.0); + group.sample_size(config.sst_bench.bench_sample_size); + + let mut bench = Arrow2Bench::new(config.sst_bench); + + for i in 0..bench.num_benches() { + bench.init_for_bench(i); + + group.bench_with_input( + BenchmarkId::new("read_arrow2", format!("{}/{}", bench.sst_file_name, i)), + &bench, + bench_arrow2_iter, + ); + } + + group.finish(); +} + +criterion_group!( + benches, + bench_read_sst, + bench_merge_sst, + bench_parquet, + bench_scan_memtable, + bench_merge_memtable, + bench_arrow2, +); +criterion_main!(benches); diff --git a/benchmarks/config/bench.toml b/benchmarks/config/bench.toml new file mode 100644 index 0000000000..ba73090b77 --- /dev/null +++ b/benchmarks/config/bench.toml @@ -0,0 +1,50 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +[sst_bench] +store_path = "/path/to/data/1/1" +sst_file_name = "37.sst" +runtime_thread_num = 1 +bench_measurement_time = "30s" +bench_sample_size = 30 +max_projections = 5 +read_batch_row_num = 500 +reverse = false + +[sst_bench.predicate] +# start_time_ms = 0 +start_time_ms = 1632985200000 +# end_time_ms = 0 +end_time_ms = 1632985800000 + +[merge_sst_bench] +store_path = "/path/to/data" +space_id = 1 +table_id = 1 +sst_file_ids = [ 34, 37 ] +runtime_thread_num = 1 +bench_measurement_time = "30s" +bench_sample_size = 30 +max_projections = 5 +read_batch_row_num = 500 + +[merge_sst_bench.predicate] +start_time_ms = 0 +# start_time_ms = 1632985200000 +end_time_ms = 0 +# end_time_ms = 1632985800000 + +[scan_memtable_bench] +store_path = "/path/to/data/1/1" +sst_file_name = "37.sst" +runtime_thread_num = 1 +max_projections = 5 +arena_block_size = "64M" + +[merge_memtable_bench] +store_path = "/path/to/data" +space_id = 1 +table_id = 1 +sst_file_ids = [ 37 ] +runtime_thread_num = 1 +max_projections = 5 +arena_block_size = "64M" diff --git a/benchmarks/config/sst.toml b/benchmarks/config/sst.toml new file mode 100644 index 0000000000..5758df2459 --- /dev/null +++ b/benchmarks/config/sst.toml @@ -0,0 +1,33 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +runtime_thread_num = 4 + + [rebuild_sst] + store_path = "/Users/chunshao.rcs/Desktop/work/gitlab/ceresdb/neo/ceresdb/ceresdbx/benchmarks" + input_file_name = "898.sst" + # read_batch_row_num = 500 + # read_batch_row_num = 4096 + read_batch_row_num = 8192 +# read_batch_row_num = 16384 + output_file_name = "tt_t.sst" + num_rows_per_row_group = 8192 +compression = "SNAPPY" + + [rebuild_sst.predicate] + start_time_ms = 0 + end_time_ms = 0 + +#[merge_sst] +#store_path = "/Users/chunshao.rcs/Desktop/work/gitlab/ceresdb/neo/ceresdb/ceresdbx/benchmarks/2199023255564" +#space_id = 1 +#table_id = 1 +#sst_file_ids = [1, 17, 19, 24, 31, 37, 43, 45, 9, 14, 18, 21, 27, 34, 40, 44, 5] +#dedup = true +#read_batch_row_num = 16384 +#output_store_path = "/Users/yingwen.yyw/data/1/1" +#output_file_name = "16384-all.sst" +#num_rows_per_row_group = 16384 +# +#[merge_sst.predicate] +#start_time_ms = 0 +#end_time_ms = 0 diff --git a/benchmarks/src/arrow2_bench.rs b/benchmarks/src/arrow2_bench.rs new file mode 100644 index 0000000000..e51e96fe4d --- /dev/null +++ b/benchmarks/src/arrow2_bench.rs @@ -0,0 +1,81 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Arrow 2 bench. 
+ +use std::{fs::File, io::BufReader, path::Path, sync::Arc, time::Instant}; + +use arrow2::io::parquet::read; +use common_util::runtime::Runtime; +use log::info; + +use crate::{config::SstBenchConfig, util}; + +pub struct Arrow2Bench { + store_path: String, + pub sst_file_name: String, + max_projections: usize, + projection: Vec, + runtime: Arc, +} + +impl Arrow2Bench { + pub fn new(config: SstBenchConfig) -> Self { + let runtime = util::new_runtime(config.runtime_thread_num); + + Arrow2Bench { + store_path: config.store_path, + sst_file_name: config.sst_file_name, + max_projections: config.max_projections, + projection: Vec::new(), + runtime: Arc::new(runtime), + } + } + + pub fn num_benches(&self) -> usize { + // One test reads all columns and `max_projections` tests read with projection. + 1 + self.max_projections + } + + pub fn init_for_bench(&mut self, i: usize) { + let projection = if i < self.max_projections { + (0..i + 1).into_iter().collect() + } else { + Vec::new() + }; + + self.projection = projection; + } + + pub fn run_bench(&self) { + let sst_path = Path::new(&self.store_path).join(&self.sst_file_name); + + self.runtime.block_on(async { + let open_instant = Instant::now(); + let file = BufReader::new(File::open(sst_path).unwrap()); + + let record_reader = if self.projection.is_empty() { + read::RecordReader::try_new(file, None, None, None, None).unwrap() + } else { + read::RecordReader::try_new(file, Some(self.projection.clone()), None, None, None).unwrap() + }; + let open_cost = open_instant.elapsed(); + + let iter_begin_instant = Instant::now(); + let mut total_rows = 0; + let mut batch_num = 0; + for record_batch in record_reader { + let num_rows = record_batch.unwrap().num_rows(); + total_rows += num_rows; + batch_num += 1; + } + + info!( + "\nParquetBench total rows of sst: {}, total batch num: {}, open cost: {:?}, iter cost: {:?}", + total_rows, + batch_num, + open_cost, + iter_begin_instant.elapsed(), + ); + }); + } +} diff --git 
a/benchmarks/src/bin/sst-tools.rs b/benchmarks/src/bin/sst-tools.rs new file mode 100644 index 0000000000..ab1a6e91be --- /dev/null +++ b/benchmarks/src/bin/sst-tools.rs @@ -0,0 +1,70 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::sync::Arc; + +use benchmarks::{ + sst_tools::{self, MergeSstConfig, RebuildSstConfig}, + util, +}; +use clap::{App, Arg}; +use common_util::toml; +use log::info; +use serde_derive::Deserialize; + +#[derive(Debug, Deserialize)] +#[serde(default)] +struct Config { + runtime_thread_num: usize, + rebuild_sst: Option, + merge_sst: Option, +} + +impl Default for Config { + fn default() -> Config { + Self { + runtime_thread_num: 1, + rebuild_sst: None, + merge_sst: None, + } + } +} + +fn config_from_path(path: &str) -> Config { + let mut toml_buf = String::new(); + toml::parse_toml_from_path(path, &mut toml_buf).expect("Failed to parse config.") +} + +fn main() { + env_logger::init(); + + let matches = App::new("SST Tools") + .arg( + Arg::with_name("config") + .short("c") + .long("config") + .required(true) + .takes_value(true) + .help("Set configuration file, eg: \"/path/server.toml\""), + ) + .get_matches(); + + let config_path = matches + .value_of("config") + .expect("Config file is required."); + let config = config_from_path(config_path); + + info!("sst tools start, config:{:?}", config); + + let runtime = Arc::new(util::new_runtime(config.runtime_thread_num)); + + let rt = runtime.clone(); + runtime.block_on(async { + if let Some(rebuild_sst) = config.rebuild_sst { + sst_tools::rebuild_sst(rebuild_sst, rt.clone()).await; + } + + if let Some(merge_sst) = config.merge_sst { + sst_tools::merge_sst(merge_sst, rt).await; + } + }); +} diff --git a/benchmarks/src/config.rs b/benchmarks/src/config.rs new file mode 100644 index 0000000000..a66cfa1163 --- /dev/null +++ b/benchmarks/src/config.rs @@ -0,0 +1,123 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Benchmark configs. 
+ +use std::env; + +use analytic_engine::{space::SpaceId, sst::manager::FileId}; +use common_types::time::{TimeRange, Timestamp}; +use common_util::{ + config::{ReadableDuration, ReadableSize}, + toml, +}; +use serde_derive::Deserialize; +use table_engine::{predicate::Predicate, table::TableId}; + +const BENCH_CONFIG_PATH_KEY: &str = "ANALYTIC_BENCH_CONFIG_PATH"; + +#[derive(Deserialize)] +pub struct BenchConfig { + pub sst_bench: SstBenchConfig, + pub merge_sst_bench: MergeSstBenchConfig, + pub scan_memtable_bench: ScanMemTableBenchConfig, + pub merge_memtable_bench: MergeMemTableBenchConfig, +} + +// TODO(yingwen): Maybe we can use layze static to load config first. +pub fn bench_config_from_env() -> BenchConfig { + let path = match env::var(BENCH_CONFIG_PATH_KEY) { + Ok(v) => v, + Err(e) => panic!( + "Env {} is required to run benches, err:{}.", + BENCH_CONFIG_PATH_KEY, e + ), + }; + + let mut toml_buf = String::new(); + toml::parse_toml_from_path(&path, &mut toml_buf).expect("Failed to parse config.") +} + +#[derive(Deserialize)] +pub struct SstBenchConfig { + pub store_path: String, + pub sst_file_name: String, + pub runtime_thread_num: usize, + + pub bench_measurement_time: ReadableDuration, + pub bench_sample_size: usize, + + /// Max number of projection columns. + pub max_projections: usize, + pub read_batch_row_num: usize, + pub predicate: BenchPredicate, + pub sst_meta_cache_cap: Option, + pub sst_data_cache_cap: Option, + pub reverse: bool, +} + +#[derive(Deserialize)] +pub struct MergeSstBenchConfig { + pub store_path: String, + pub space_id: SpaceId, + pub table_id: TableId, + pub sst_file_ids: Vec, + pub runtime_thread_num: usize, + + pub bench_measurement_time: ReadableDuration, + pub bench_sample_size: usize, + + /// Max number of projection columns. 
+ pub max_projections: usize, + pub read_batch_row_num: usize, + pub predicate: BenchPredicate, +} + +#[derive(Deserialize)] +pub struct ScanMemTableBenchConfig { + pub store_path: String, + pub sst_file_name: String, + pub runtime_thread_num: usize, + + /// Max number of projection columns. + pub max_projections: usize, + + pub arena_block_size: ReadableSize, +} + +#[derive(Debug, Deserialize)] +pub struct BenchPredicate { + /// Inclusive start time in millis. + start_time_ms: i64, + /// Exclusive end time in millis. + /// + /// Set to current time millis if start_time_ms == end_time_ms. + end_time_ms: i64, +} + +impl BenchPredicate { + pub fn into_predicate(self) -> Predicate { + let start = Timestamp::new(self.start_time_ms); + let end = if self.start_time_ms == self.end_time_ms { + Timestamp::now() + } else { + Timestamp::new(self.end_time_ms) + }; + let time_range = TimeRange::new(start, end).unwrap(); + + Predicate::new(time_range) + } +} + +#[derive(Deserialize)] +pub struct MergeMemTableBenchConfig { + pub store_path: String, + pub space_id: SpaceId, + pub table_id: TableId, + pub sst_file_ids: Vec, + pub runtime_thread_num: usize, + + /// Max number of projection columns. + pub max_projections: usize, + + pub arena_block_size: ReadableSize, +} diff --git a/benchmarks/src/lib.rs b/benchmarks/src/lib.rs new file mode 100644 index 0000000000..526d028021 --- /dev/null +++ b/benchmarks/src/lib.rs @@ -0,0 +1,17 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Utilities for benchmarks. 
+ +use common_types::SequenceNumber; + +pub mod arrow2_bench; +pub mod config; +pub mod merge_memtable_bench; +pub mod merge_sst_bench; +pub mod parquet_bench; +pub mod scan_memtable_bench; +pub mod sst_bench; +pub mod sst_tools; +pub mod util; + +pub(crate) const INIT_SEQUENCE: SequenceNumber = 1; diff --git a/benchmarks/src/merge_memtable_bench.rs b/benchmarks/src/merge_memtable_bench.rs new file mode 100644 index 0000000000..7596576aa6 --- /dev/null +++ b/benchmarks/src/merge_memtable_bench.rs @@ -0,0 +1,209 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Merge memtable bench. + +use std::{cmp, sync::Arc, time::Instant}; + +use analytic_engine::{ + memtable::{ + factory::{Factory as MemTableFactory, Options}, + skiplist::factory::SkiplistMemTableFactory, + }, + row_iter::{ + dedup::DedupIterator, + merge::{MergeBuilder, MergeConfig}, + IterOptions, RecordBatchWithKeyIterator, + }, + space::SpaceId, + sst::factory::{FactoryImpl, SstReaderOptions, SstType}, + table::{ + sst_util, + version::{MemTableState, MemTableVec}, + }, +}; +use arena::NoopCollector; +use common_types::{ + projected_schema::ProjectedSchema, request_id::RequestId, schema::Schema, time::TimeRange, +}; +use common_util::runtime::Runtime; +use log::info; +use object_store::{disk::File, ObjectStore}; +use parquet::{DataCacheRef, MetaCacheRef}; +use table_engine::{predicate::Predicate, table::TableId}; + +use crate::{config::MergeMemTableBenchConfig, util}; + +pub struct MergeMemTableBench { + store: File, + memtables: MemTableVec, + max_projections: usize, + schema: Schema, + projected_schema: ProjectedSchema, + runtime: Arc, + space_id: SpaceId, + table_id: TableId, + dedup: bool, + sst_reader_options: SstReaderOptions, +} + +impl MergeMemTableBench { + pub fn new(config: MergeMemTableBenchConfig) -> Self { + assert!(!config.sst_file_ids.is_empty()); + + let store = File::new(config.store_path); + let runtime = 
Arc::new(util::new_runtime(config.runtime_thread_num)); + let space_id = config.space_id; + let table_id = config.table_id; + + let meta_cache: Option = None; + let data_cache: Option = None; + + // Use first sst's schema. + let mut sst_path = store.new_path(); + sst_util::set_sst_file_path(space_id, table_id, config.sst_file_ids[0], &mut sst_path); + let schema = runtime.block_on(util::schema_from_sst( + &store, + &sst_path, + &meta_cache, + &data_cache, + )); + + let projected_schema = ProjectedSchema::no_projection(schema.clone()); + let max_projections = cmp::min(config.max_projections, schema.num_columns()); + + let mut memtables = Vec::with_capacity(config.sst_file_ids.len()); + for id in &config.sst_file_ids { + let mut sst_path = store.new_path(); + sst_util::set_sst_file_path(space_id, table_id, *id, &mut sst_path); + + let memtable_factory = SkiplistMemTableFactory; + let memtable_opts = Options { + collector: Arc::new(NoopCollector {}), + schema: schema.clone(), + arena_block_size: config.arena_block_size.0 as u32, + creation_sequence: crate::INIT_SEQUENCE, + }; + let memtable = memtable_factory.create_memtable(memtable_opts).unwrap(); + + runtime.block_on(util::load_sst_to_memtable( + &store, + &sst_path, + &schema, + &memtable, + runtime.clone(), + )); + + info!( + "\nMergeMemTableBench memtable loaded, memory used: {}", + memtable.approximate_memory_usage() + ); + + memtables.push(MemTableState { + mem: memtable, + time_range: TimeRange::min_to_max(), + id: *id, + }); + } + let sst_reader_options = mock_sst_reader_options(projected_schema.clone(), runtime.clone()); + + MergeMemTableBench { + store, + memtables, + max_projections, + schema, + projected_schema, + runtime, + space_id, + table_id, + dedup: true, + sst_reader_options, + } + } + + pub fn num_benches(&self) -> usize { + // One test reads all columns and `max_projections` tests read with projection. 
+ 1 + self.max_projections + } + + pub fn init_for_bench(&mut self, i: usize, dedup: bool) { + let projected_schema = + util::projected_schema_by_number(&self.schema, i, self.max_projections); + + self.projected_schema = projected_schema; + self.dedup = dedup; + } + + // TODO(xikai): add benchmark for merge in reverse order. + pub fn run_bench(&self) { + let space_id = self.space_id; + let table_id = self.table_id; + let sequence = u64::MAX; + let iter_options = IterOptions::default(); + let projected_schema = self.projected_schema.clone(); + let sst_factory = FactoryImpl; + + let request_id = RequestId::next_id(); + let mut builder = MergeBuilder::new(MergeConfig { + request_id, + space_id, + table_id, + sequence, + projected_schema, + predicate: Arc::new(Predicate::empty()), + sst_factory, + sst_reader_options: self.sst_reader_options.clone(), + store: &self.store, + merge_iter_options: iter_options.clone(), + need_dedup: true, + reverse: false, + }); + + builder.mut_memtables().extend_from_slice(&self.memtables); + + self.runtime.block_on(async { + let begin_instant = Instant::now(); + + let mut merge_iter = builder.build().await.unwrap(); + let mut total_rows = 0; + let mut batch_num = 0; + + if self.dedup { + let mut dedup_iter = DedupIterator::new(request_id, merge_iter, iter_options); + while let Some(batch) = dedup_iter.next_batch().await.unwrap() { + let num_rows = batch.num_rows(); + total_rows += num_rows; + batch_num += 1; + } + } else { + while let Some(batch) = merge_iter.next_batch().await.unwrap() { + let num_rows = batch.num_rows(); + total_rows += num_rows; + batch_num += 1; + } + } + + info!( + "\nMergeMemTableBench total rows of sst: {}, total batch num: {}, cost: {:?}", + total_rows, + batch_num, + begin_instant.elapsed(), + ); + }); + } +} + +fn mock_sst_reader_options( + projected_schema: ProjectedSchema, + runtime: Arc, +) -> SstReaderOptions { + SstReaderOptions { + sst_type: SstType::Parquet, + read_batch_row_num: 500, + reverse: false, + 
projected_schema, + predicate: Arc::new(Predicate::new(TimeRange::min_to_max())), + meta_cache: None, + data_cache: None, + runtime, + } +} diff --git a/benchmarks/src/merge_sst_bench.rs b/benchmarks/src/merge_sst_bench.rs new file mode 100644 index 0000000000..a0ccab50d5 --- /dev/null +++ b/benchmarks/src/merge_sst_bench.rs @@ -0,0 +1,225 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Merge SST bench. + +use std::{cmp, sync::Arc, time::Instant}; + +use analytic_engine::{ + row_iter::{ + chain, + chain::ChainConfig, + dedup::DedupIterator, + merge::{MergeBuilder, MergeConfig}, + IterOptions, RecordBatchWithKeyIterator, + }, + space::SpaceId, + sst::{ + factory::{FactoryImpl, SstReaderOptions, SstType}, + file::{FileHandle, FilePurgeQueue, Request}, + }, + table::sst_util, +}; +use common_types::{projected_schema::ProjectedSchema, request_id::RequestId, schema::Schema}; +use common_util::runtime::Runtime; +use log::info; +use object_store::{disk::File, ObjectStore}; +use parquet::{DataCacheRef, MetaCacheRef}; +use table_engine::{predicate::Predicate, table::TableId}; +use tokio::sync::mpsc::{self, UnboundedReceiver}; + +use crate::{config::MergeSstBenchConfig, util}; + +pub struct MergeSstBench { + store: File, + max_projections: usize, + schema: Schema, + sst_reader_options: SstReaderOptions, + runtime: Arc, + space_id: SpaceId, + table_id: TableId, + file_handles: Vec, + _receiver: UnboundedReceiver, + dedup: bool, +} + +impl MergeSstBench { + pub fn new(config: MergeSstBenchConfig) -> Self { + assert!(!config.sst_file_ids.is_empty()); + + let store = File::new(config.store_path); + let runtime = Arc::new(util::new_runtime(config.runtime_thread_num)); + let space_id = config.space_id; + let table_id = config.table_id; + + let mut sst_path = store.new_path(); + sst_util::set_sst_file_path(space_id, table_id, config.sst_file_ids[0], &mut sst_path); + let meta_cache: Option = None; + let data_cache: Option = None; + + let schema = 
runtime.block_on(util::schema_from_sst( + &store, + &sst_path, + &meta_cache, + &data_cache, + )); + + let predicate = config.predicate.into_predicate(); + let projected_schema = ProjectedSchema::no_projection(schema.clone()); + let sst_reader_options = SstReaderOptions { + sst_type: SstType::Parquet, + read_batch_row_num: config.read_batch_row_num, + reverse: false, + projected_schema, + predicate: Arc::new(predicate), + meta_cache: meta_cache.clone(), + data_cache: data_cache.clone(), + runtime: runtime.clone(), + }; + let max_projections = cmp::min(config.max_projections, schema.num_columns()); + + let (tx, rx) = mpsc::unbounded_channel(); + let purge_queue = FilePurgeQueue::new(space_id, table_id, tx); + + let file_handles = runtime.block_on(util::file_handles_from_ssts( + &store, + space_id, + table_id, + &config.sst_file_ids, + purge_queue, + &meta_cache, + &data_cache, + )); + + MergeSstBench { + store, + max_projections, + schema, + sst_reader_options, + runtime, + space_id, + table_id, + file_handles, + _receiver: rx, + dedup: true, + } + } + + pub fn num_benches(&self) -> usize { + // One test reads all columns and `max_projections` tests read with projection. 
+ 1 + self.max_projections + } + + pub fn init_for_bench(&mut self, i: usize, dedup: bool) { + let projected_schema = + util::projected_schema_by_number(&self.schema, i, self.max_projections); + + self.sst_reader_options.projected_schema = projected_schema; + self.dedup = dedup; + } + + fn run_dedup_bench(&self) { + let space_id = self.space_id; + let table_id = self.table_id; + let sequence = u64::MAX; + let iter_options = IterOptions::default(); + let projected_schema = self.sst_reader_options.projected_schema.clone(); + let sst_factory = FactoryImpl; + + let request_id = RequestId::next_id(); + let mut builder = MergeBuilder::new(MergeConfig { + request_id, + space_id, + table_id, + sequence, + projected_schema, + predicate: Arc::new(Predicate::empty()), + sst_factory, + sst_reader_options: self.sst_reader_options.clone(), + store: &self.store, + merge_iter_options: iter_options.clone(), + need_dedup: true, + reverse: false, + }); + + builder + .mut_ssts_of_level(0) + .extend_from_slice(&self.file_handles); + + self.runtime.block_on(async { + let begin_instant = Instant::now(); + + let merge_iter = builder.build().await.unwrap(); + let mut dedup_iter = DedupIterator::new(request_id, merge_iter, iter_options); + let mut total_rows = 0; + let mut batch_num = 0; + + while let Some(batch) = dedup_iter.next_batch().await.unwrap() { + let num_rows = batch.num_rows(); + total_rows += num_rows; + batch_num += 1; + } + + info!( + "\nMergeSstBench total rows of sst: {}, total batch num: {}, cost: {:?}", + total_rows, + batch_num, + begin_instant.elapsed(), + ); + }); + } + + fn run_no_dedup_bench(&self) { + let space_id = self.space_id; + let table_id = self.table_id; + let projected_schema = self.sst_reader_options.projected_schema.clone(); + let sst_factory = FactoryImpl; + + let request_id = RequestId::next_id(); + let builder = chain::Builder::new(ChainConfig { + request_id, + space_id, + table_id, + projected_schema, + predicate: Arc::new(Predicate::empty()), + 
sst_factory, + sst_reader_options: self.sst_reader_options.clone(), + store: &self.store, + }) + .ssts(vec![self.file_handles.clone()]); + + self.runtime.block_on(async { + let begin_instant = Instant::now(); + + let mut chain_iter = builder.build().await.unwrap(); + let mut total_rows = 0; + let mut batch_num = 0; + + while let Some(batch) = chain_iter.next_batch().await.unwrap() { + let num_rows = batch.num_rows(); + total_rows += num_rows; + batch_num += 1; + } + + info!( + "\nMergeSstBench total rows of sst: {}, total batch num: {}, cost: {:?}", + total_rows, + batch_num, + begin_instant.elapsed(), + ); + }); + } + + pub fn run_bench(&self) { + if self.dedup { + self.run_dedup_bench(); + } else { + self.run_no_dedup_bench(); + } + } +} + +impl Drop for MergeSstBench { + fn drop(&mut self) { + self.file_handles.clear(); + } +} diff --git a/benchmarks/src/parquet_bench.rs b/benchmarks/src/parquet_bench.rs new file mode 100644 index 0000000000..b52c84f7e1 --- /dev/null +++ b/benchmarks/src/parquet_bench.rs @@ -0,0 +1,137 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Parquet bench. 
+ +use std::{sync::Arc, time::Instant}; + +use arrow_deps::parquet::{ + arrow::{ArrowReader, ParquetFileArrowReader}, + file::{ + metadata::RowGroupMetaData, reader::FileReader, serialized_reader::SerializedFileReader, + }, +}; +use common_types::schema::Schema; +use common_util::runtime::Runtime; +use log::info; +use object_store::{disk::File, path::ObjectStorePath, ObjectStore}; +use parquet::{DataCacheRef, MetaCacheRef}; +use table_engine::predicate::PredicateRef; + +use crate::{config::SstBenchConfig, util}; + +type RowGroupPredicate = Box bool + 'static>; + +pub struct ParquetBench { + store: File, + pub sst_file_name: String, + max_projections: usize, + projection: Vec, + schema: Schema, + predicate: PredicateRef, + batch_size: usize, + runtime: Arc, +} + +impl ParquetBench { + pub fn new(config: SstBenchConfig) -> Self { + let store = File::new(config.store_path); + + let runtime = util::new_runtime(config.runtime_thread_num); + + let mut sst_path = store.new_path(); + sst_path.set_file_name(&config.sst_file_name); + let meta_cache: Option = None; + let data_cache: Option = None; + + let schema = runtime.block_on(util::schema_from_sst( + &store, + &sst_path, + &meta_cache, + &data_cache, + )); + + let predicate = Arc::new(config.predicate.into_predicate()); + + ParquetBench { + store, + sst_file_name: config.sst_file_name, + max_projections: config.max_projections, + projection: Vec::new(), + schema, + predicate, + batch_size: config.read_batch_row_num, + runtime: Arc::new(runtime), + } + } + + pub fn num_benches(&self) -> usize { + // One test reads all columns and `max_projections` tests read with projection. 
+ 1 + self.max_projections + } + + pub fn init_for_bench(&mut self, i: usize) { + let projection = if i < self.max_projections { + (0..i + 1).into_iter().collect() + } else { + Vec::new() + }; + + self.projection = projection; + } + + pub fn run_bench(&self) { + let mut sst_path = self.store.new_path(); + sst_path.set_file_name(&self.sst_file_name); + + self.runtime.block_on(async { + let open_instant = Instant::now(); + let file = self.store.get(&sst_path).await.unwrap(); + let mut file_reader = SerializedFileReader::new(file).unwrap(); + let open_cost = open_instant.elapsed(); + + let filter_begin_instant = Instant::now(); + let row_group_predicate = self.build_row_group_predicate(&file_reader); + let mut arrow_reader = { + file_reader.filter_row_groups(&row_group_predicate); + ParquetFileArrowReader::new(Arc::new(file_reader)) + }; + let filter_cost = filter_begin_instant.elapsed(); + + let record_reader = if self.projection.is_empty() { + arrow_reader.get_record_reader(self.batch_size).unwrap() + } else { + arrow_reader + .get_record_reader_by_columns(self.projection.clone(), self.batch_size) + .unwrap() + }; + + let iter_begin_instant = Instant::now(); + let mut total_rows = 0; + let mut batch_num = 0; + for record_batch in record_reader { + let num_rows = record_batch.unwrap().num_rows(); + total_rows += num_rows; + batch_num += 1; + } + + info!( + "\nParquetBench total rows of sst: {}, total batch num: {}, open cost: {:?}, filter cost: {:?}, iter cost: {:?}", + total_rows, + batch_num, + open_cost, + filter_cost, + iter_begin_instant.elapsed(), + ); + }); + } + + fn build_row_group_predicate( + &self, + file_reader: &SerializedFileReader, + ) -> RowGroupPredicate { + let row_groups = file_reader.metadata().row_groups(); + let filter_results = self.predicate.filter_row_groups(&self.schema, row_groups); + + Box::new(move |_, idx: usize| filter_results[idx]) + } +} diff --git a/benchmarks/src/scan_memtable_bench.rs b/benchmarks/src/scan_memtable_bench.rs new 
file mode 100644 index 0000000000..424e1886e8 --- /dev/null +++ b/benchmarks/src/scan_memtable_bench.rs @@ -0,0 +1,111 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Scan memtable bench. + +use std::{collections::Bound, sync::Arc}; + +use analytic_engine::memtable::{ + factory::{Factory as MemTableFactory, Options}, + skiplist::factory::SkiplistMemTableFactory, + MemTableRef, ScanContext, ScanRequest, +}; +use arena::NoopCollector; +use common_types::projected_schema::ProjectedSchema; +use log::info; +use object_store::{disk::File, path::ObjectStorePath, ObjectStore}; +use parquet::{DataCacheRef, MetaCacheRef}; + +use crate::{config::ScanMemTableBenchConfig, util}; + +pub struct ScanMemTableBench { + memtable: MemTableRef, + projected_schema: ProjectedSchema, + max_projections: usize, +} + +impl ScanMemTableBench { + pub fn new(config: ScanMemTableBenchConfig) -> Self { + let store = File::new(config.store_path); + + let runtime = Arc::new(util::new_runtime(config.runtime_thread_num)); + let meta_cache: Option = None; + let data_cache: Option = None; + let mut sst_path = store.new_path(); + sst_path.set_file_name(&config.sst_file_name); + let schema = runtime.block_on(util::schema_from_sst( + &store, + &sst_path, + &meta_cache, + &data_cache, + )); + + let projected_schema = ProjectedSchema::no_projection(schema.clone()); + + let memtable_factory = SkiplistMemTableFactory; + let memtable_opts = Options { + collector: Arc::new(NoopCollector {}), + schema: schema.clone(), + arena_block_size: config.arena_block_size.0 as u32, + creation_sequence: crate::INIT_SEQUENCE, + }; + let memtable = memtable_factory.create_memtable(memtable_opts).unwrap(); + + runtime.block_on(util::load_sst_to_memtable( + &store, + &sst_path, + &schema, + &memtable, + runtime.clone(), + )); + + info!( + "\nScanMemTableBench memtable loaded, memory used: {}", + memtable.approximate_memory_usage() + ); + + Self { + memtable, + projected_schema, + 
max_projections: config.max_projections, + } + } + + pub fn num_benches(&self) -> usize { + // One test reads all columns and `max_projections` tests read with projection. + 1 + self.max_projections + } + + pub fn init_for_bench(&mut self, i: usize) { + let projected_schema = + util::projected_schema_by_number(self.memtable.schema(), i, self.max_projections); + + self.projected_schema = projected_schema; + } + + pub fn run_bench(&self) { + let scan_ctx = ScanContext::default(); + let scan_req = ScanRequest { + start_user_key: Bound::Unbounded, + end_user_key: Bound::Unbounded, + sequence: common_types::MAX_SEQUENCE_NUMBER, + projected_schema: self.projected_schema.clone(), + need_dedup: true, + reverse: false, + }; + + let iter = self.memtable.scan(scan_ctx, scan_req).unwrap(); + + let mut total_rows = 0; + let mut batch_num = 0; + for batch in iter { + let num_rows = batch.unwrap().num_rows(); + total_rows += num_rows; + batch_num += 1; + } + + info!( + "\nScanMemTableBench total rows of memtable: {}, total batch num: {}", + total_rows, batch_num, + ); + } +} diff --git a/benchmarks/src/sst_bench.rs b/benchmarks/src/sst_bench.rs new file mode 100644 index 0000000000..882e40b1fa --- /dev/null +++ b/benchmarks/src/sst_bench.rs @@ -0,0 +1,123 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! SST bench. 
+ +use std::{cmp, sync::Arc, time::Instant}; + +use analytic_engine::sst::factory::{Factory, FactoryImpl, SstReaderOptions, SstType}; +use common_types::{projected_schema::ProjectedSchema, schema::Schema}; +use common_util::runtime::Runtime; +use futures::stream::StreamExt; +use log::info; +use object_store::{disk::File, path::ObjectStorePath, ObjectStore}; +use parquet::{ + cache::{LruDataCache, LruMetaCache}, + DataCacheRef, MetaCacheRef, +}; + +use crate::{config::SstBenchConfig, util}; + +pub struct SstBench { + store: File, + pub sst_file_name: String, + max_projections: usize, + schema: Schema, + sst_reader_options: SstReaderOptions, + runtime: Arc, +} + +impl SstBench { + pub fn new(config: SstBenchConfig) -> Self { + let store = File::new(config.store_path); + + let runtime = Arc::new(util::new_runtime(config.runtime_thread_num)); + + let mut sst_path = store.new_path(); + sst_path.set_file_name(&config.sst_file_name); + let meta_cache: Option = + if let Some(sst_meta_cache_cap) = config.sst_meta_cache_cap { + Some(Arc::new(LruMetaCache::new(sst_meta_cache_cap))) + } else { + None + }; + + let data_cache: Option = + if let Some(sst_data_cache_cap) = config.sst_data_cache_cap { + Some(Arc::new(LruDataCache::new(sst_data_cache_cap))) + } else { + None + }; + + let schema = runtime.block_on(util::schema_from_sst( + &store, + &sst_path, + &meta_cache, + &data_cache, + )); + + let predicate = config.predicate.into_predicate(); + let projected_schema = ProjectedSchema::no_projection(schema.clone()); + let sst_reader_options = SstReaderOptions { + sst_type: SstType::Parquet, + read_batch_row_num: config.read_batch_row_num, + reverse: config.reverse, + projected_schema, + predicate: Arc::new(predicate), + meta_cache, + data_cache, + runtime: runtime.clone(), + }; + let max_projections = cmp::min(config.max_projections, schema.num_columns()); + + SstBench { + store, + sst_file_name: config.sst_file_name, + max_projections, + schema, + sst_reader_options, + runtime, 
+ } + } + + pub fn num_benches(&self) -> usize { + // One test reads all columns and `max_projections` tests read with projection. + 1 + self.max_projections + } + + pub fn init_for_bench(&mut self, i: usize) { + let projected_schema = + util::projected_schema_by_number(&self.schema, i, self.max_projections); + + self.sst_reader_options.projected_schema = projected_schema; + } + + pub fn run_bench(&self) { + let mut sst_path = self.store.new_path(); + sst_path.set_file_name(&self.sst_file_name); + + let sst_factory = FactoryImpl; + let mut sst_reader = sst_factory + .new_sst_reader(&self.sst_reader_options, &sst_path, &self.store) + .unwrap(); + + self.runtime.block_on(async { + let begin_instant = Instant::now(); + let mut sst_stream = sst_reader.read().await.unwrap(); + + let mut total_rows = 0; + let mut batch_num = 0; + while let Some(batch) = sst_stream.next().await { + let num_rows = batch.unwrap().num_rows(); + total_rows += num_rows; + batch_num += 1; + } + + info!( + "\nSstBench total rows of sst: {}, total batch num: {}, cost: {:?}", + total_rows, + batch_num, + begin_instant.elapsed(), + ); + }); + } +} diff --git a/benchmarks/src/sst_tools.rs b/benchmarks/src/sst_tools.rs new file mode 100644 index 0000000000..666722d91b --- /dev/null +++ b/benchmarks/src/sst_tools.rs @@ -0,0 +1,257 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Tools to generate SST. 
+ +use std::sync::Arc; + +use analytic_engine::{ + row_iter::{ + self, + dedup::DedupIterator, + merge::{MergeBuilder, MergeConfig}, + IterOptions, + }, + space::SpaceId, + sst::{ + builder::RecordBatchStream, + factory::{Factory, FactoryImpl, SstBuilderOptions, SstReaderOptions, SstType}, + file::{self, FilePurgeQueue, SstMetaData}, + manager::FileId, + }, + table::sst_util, + table_options::Compression, +}; +use common_types::{projected_schema::ProjectedSchema, request_id::RequestId}; +use common_util::runtime::Runtime; +use futures::TryStreamExt; +use log::info; +use object_store::{ + disk::File, + path::{file::FilePath, ObjectStorePath}, + ObjectStore, +}; +use serde_derive::Deserialize; +use table_engine::{predicate::Predicate, table::TableId}; +use tokio::sync::mpsc; + +use crate::{config::BenchPredicate, util}; + +#[derive(Debug)] +struct SstConfig { + sst_meta: SstMetaData, + store_path: String, + sst_file_name: String, + num_rows_per_row_group: usize, + compression: Compression, +} + +async fn create_sst_from_stream(config: SstConfig, record_batch_stream: RecordBatchStream) { + let sst_factory = FactoryImpl; + let sst_builder_options = SstBuilderOptions { + sst_type: SstType::Parquet, + num_rows_per_row_group: config.num_rows_per_row_group, + compression: config.compression, + }; + + info!( + "create sst from stream, config:{:?}, sst_builder_options:{:?}", + config, sst_builder_options + ); + + let store = File::new(config.store_path); + let mut sst_file_path = store.new_path(); + sst_file_path.set_file_name(&config.sst_file_name); + + let mut builder = sst_factory + .new_sst_builder(&sst_builder_options, &sst_file_path, &store) + .unwrap(); + builder + .build(RequestId::next_id(), &config.sst_meta, record_batch_stream) + .await + .unwrap(); +} + +#[derive(Debug, Deserialize)] +pub struct RebuildSstConfig { + store_path: String, + input_file_name: String, + read_batch_row_num: usize, + predicate: BenchPredicate, + + // Output sst config: + 
output_file_name: String, + num_rows_per_row_group: usize, + compression: Compression, +} + +pub async fn rebuild_sst(config: RebuildSstConfig, runtime: Arc) { + info!("Start rebuild sst, config:{:?}", config); + + let store = File::new(config.store_path.clone()); + + let mut input_path = store.new_path(); + input_path.set_file_name(&config.input_file_name); + + let sst_meta = util::meta_from_sst(&store, &input_path, &None, &None).await; + + let projected_schema = ProjectedSchema::no_projection(sst_meta.schema.clone()); + let sst_reader_options = SstReaderOptions { + sst_type: SstType::Parquet, + read_batch_row_num: config.read_batch_row_num, + reverse: false, + projected_schema, + predicate: Arc::new(config.predicate.into_predicate()), + meta_cache: None, + data_cache: None, + runtime, + }; + + let record_batch_stream = + sst_to_record_batch_stream(&sst_reader_options, &input_path, &store).await; + + let output_sst_config = SstConfig { + sst_meta, + store_path: config.store_path, + sst_file_name: config.output_file_name, + num_rows_per_row_group: config.num_rows_per_row_group, + compression: config.compression, + }; + + create_sst_from_stream(output_sst_config, record_batch_stream).await; + + info!("Start rebuild sst done"); +} + +async fn sst_to_record_batch_stream( + sst_reader_options: &SstReaderOptions, + input_path: &FilePath, + store: &File, +) -> RecordBatchStream { + let sst_factory = FactoryImpl; + let mut sst_reader = sst_factory + .new_sst_reader(sst_reader_options, input_path, store) + .unwrap(); + + let sst_stream = sst_reader.read().await.unwrap(); + + Box::new(sst_stream.map_err(|e| Box::new(e) as _)) +} + +#[derive(Debug, Deserialize)] +pub struct MergeSstConfig { + store_path: String, + space_id: SpaceId, + table_id: TableId, + sst_file_ids: Vec, + dedup: bool, + read_batch_row_num: usize, + predicate: BenchPredicate, + + // Output sst config: + output_store_path: String, + output_file_name: String, + num_rows_per_row_group: usize, + compression: 
Compression, +} + +pub async fn merge_sst(config: MergeSstConfig, runtime: Arc) { + if config.sst_file_ids.is_empty() { + info!("No input files to merge"); + return; + } + + info!("Merge sst begin, config:{:?}", config); + + let space_id = config.space_id; + let table_id = config.table_id; + let store = File::new(config.store_path.clone()); + let (tx, _rx) = mpsc::unbounded_channel(); + let purge_queue = FilePurgeQueue::new(space_id, table_id, tx); + + let file_handles = util::file_handles_from_ssts( + &store, + space_id, + table_id, + &config.sst_file_ids, + purge_queue, + &None, + &None, + ) + .await; + let max_sequence = file_handles + .iter() + .map(|file| file.max_sequence()) + .max() + .unwrap(); + + let mut first_sst_path = store.new_path(); + sst_util::set_sst_file_path( + space_id, + table_id, + config.sst_file_ids[0], + &mut first_sst_path, + ); + let schema = util::schema_from_sst(&store, &first_sst_path, &None, &None).await; + let iter_options = IterOptions { + batch_size: config.read_batch_row_num, + }; + + let request_id = RequestId::next_id(); + let iter = { + let space_id = config.space_id; + let table_id = config.table_id; + let sequence = max_sequence + 1; + let projected_schema = ProjectedSchema::no_projection(schema.clone()); + let sst_reader_options = SstReaderOptions { + sst_type: SstType::Parquet, + read_batch_row_num: config.read_batch_row_num, + reverse: false, + projected_schema: projected_schema.clone(), + predicate: Arc::new(config.predicate.into_predicate()), + meta_cache: None, + data_cache: None, + runtime: runtime.clone(), + }; + + let sst_factory = FactoryImpl; + let mut builder = MergeBuilder::new(MergeConfig { + request_id, + space_id, + table_id, + sequence, + projected_schema, + predicate: Arc::new(Predicate::empty()), + sst_factory, + sst_reader_options, + store: &store, + merge_iter_options: iter_options.clone(), + need_dedup: true, + reverse: false, + }); + builder + .mut_ssts_of_level(0) + .extend_from_slice(&file_handles); 
+ + builder.build().await.unwrap() + }; + + let record_batch_stream = if config.dedup { + let iter = DedupIterator::new(request_id, iter, iter_options); + row_iter::record_batch_with_key_iter_to_stream(iter, &runtime) + } else { + row_iter::record_batch_with_key_iter_to_stream(iter, &runtime) + }; + + let sst_meta = file::merge_sst_meta(&file_handles, schema); + let output_sst_config = SstConfig { + sst_meta, + store_path: config.output_store_path, + sst_file_name: config.output_file_name, + num_rows_per_row_group: config.num_rows_per_row_group, + compression: config.compression, + }; + + create_sst_from_stream(output_sst_config, record_batch_stream).await; + + info!("Merge sst done"); +} diff --git a/benchmarks/src/util.rs b/benchmarks/src/util.rs new file mode 100644 index 0000000000..639c3da19b --- /dev/null +++ b/benchmarks/src/util.rs @@ -0,0 +1,146 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Utilities. + +use std::sync::Arc; + +use analytic_engine::{ + memtable::{key::KeySequence, MemTableRef, PutContext}, + space::SpaceId, + sst::{ + factory::{Factory, FactoryImpl, SstReaderOptions, SstType}, + file::{FileHandle, FileMeta, FilePurgeQueue, SstMetaData}, + manager::FileId, + parquet::reader, + }, + table::sst_util, +}; +use common_types::{ + projected_schema::ProjectedSchema, + schema::{IndexInWriterSchema, Schema}, + time::TimeRange, +}; +use common_util::runtime::{self, Runtime}; +use futures::stream::StreamExt; +use object_store::{disk::File, path::file::FilePath, ObjectStore}; +use parquet::{DataCacheRef, MetaCacheRef}; +use table_engine::{predicate::Predicate, table::TableId}; + +pub fn new_runtime(thread_num: usize) -> Runtime { + runtime::Builder::default() + .thread_name("engine_bench") + .worker_threads(thread_num) + .enable_all() + .build() + .unwrap() +} + +pub async fn meta_from_sst( + store: &File, + sst_path: &FilePath, + meta_cache: &Option, + data_cache: &Option, +) -> SstMetaData { + let (_, sst_meta) = 
reader::read_sst_meta(store, sst_path, meta_cache, data_cache) + .await + .unwrap(); + + sst_meta +} + +pub async fn schema_from_sst( + store: &File, + sst_path: &FilePath, + meta_cache: &Option, + data_cache: &Option, +) -> Schema { + let sst_meta = meta_from_sst(store, sst_path, meta_cache, data_cache).await; + + sst_meta.schema +} + +pub fn projected_schema_by_number( + schema: &Schema, + num_columns: usize, + max_projections: usize, +) -> ProjectedSchema { + if num_columns < max_projections { + let projection = (0..num_columns + 1).into_iter().collect(); + + ProjectedSchema::new(schema.clone(), Some(projection)).unwrap() + } else { + ProjectedSchema::no_projection(schema.clone()) + } +} + +pub async fn load_sst_to_memtable( + store: &File, + sst_path: &FilePath, + schema: &Schema, + memtable: &MemTableRef, + runtime: Arc, +) { + let sst_reader_options = SstReaderOptions { + sst_type: SstType::Parquet, + read_batch_row_num: 500, + reverse: false, + projected_schema: ProjectedSchema::no_projection(schema.clone()), + predicate: Arc::new(Predicate::new(TimeRange::min_to_max())), + meta_cache: None, + data_cache: None, + runtime, + }; + let sst_factory = FactoryImpl; + let mut sst_reader = sst_factory + .new_sst_reader(&sst_reader_options, sst_path, store) + .unwrap(); + + let mut sst_stream = sst_reader.read().await.unwrap(); + let index_in_writer = IndexInWriterSchema::for_same_schema(schema.num_columns()); + let mut ctx = PutContext::new(index_in_writer); + + let mut sequence = crate::INIT_SEQUENCE; + + while let Some(batch) = sst_stream.next().await { + let batch = batch.unwrap(); + + for i in 0..batch.num_rows() { + let row = batch.clone_row_at(i); + + let key_seq = KeySequence::new(sequence, i as u32); + + memtable.put(&mut ctx, key_seq, &row, schema).unwrap(); + + sequence += 1; + } + } +} + +pub async fn file_handles_from_ssts( + store: &File, + space_id: SpaceId, + table_id: TableId, + sst_file_ids: &[FileId], + purge_queue: FilePurgeQueue, + meta_cache: 
&Option, + data_cache: &Option, +) -> Vec { + let mut file_handles = Vec::with_capacity(sst_file_ids.len()); + + for file_id in sst_file_ids.iter() { + let mut path = store.new_path(); + sst_util::set_sst_file_path(space_id, table_id, *file_id, &mut path); + + let sst_meta = meta_from_sst(store, &path, meta_cache, data_cache).await; + let file_meta = FileMeta { + id: *file_id, + meta: sst_meta, + }; + + let handle = FileHandle::new(file_meta, purge_queue.clone()); + + file_handles.push(handle); + } + + file_handles +} diff --git a/build.rs b/build.rs new file mode 100644 index 0000000000..ce2a0fb668 --- /dev/null +++ b/build.rs @@ -0,0 +1,26 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Build script + +use std::env; + +use vergen::{vergen, Config, ShaKind}; + +fn main() { + // Generate the default 'cargo:' instruction output + let mut config = Config::default(); + // Change the SHA output to the short variant + *config.git_mut().sha_kind_mut() = ShaKind::Short; + // Override git branch by env if provided. 
+ if let Some(branch) = env::var_os("GITBRANCH") { + let branch = branch + .into_string() + .expect("Convert git branch env to string"); + if !branch.is_empty() { + *config.git_mut().branch_mut() = false; + println!("cargo:rustc-env=VERGEN_GIT_BRANCH={}", branch); + } + } + + vergen(config).expect("Vergen failed to generate config"); +} diff --git a/catalog/Cargo.toml b/catalog/Cargo.toml new file mode 100644 index 0000000000..14e3eb5c67 --- /dev/null +++ b/catalog/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "catalog" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +# Workspace dependencies, in alphabetical order +async-trait = "0.1.41" +snafu = { version ="0.6.10", features = ["backtraces"]} +common_types = { path = "../common_types" } +common_util = { path = "../common_util" } +table_engine = { path = "../table_engine" } diff --git a/catalog/src/consts.rs b/catalog/src/consts.rs new file mode 100644 index 0000000000..ebac82873c --- /dev/null +++ b/catalog/src/consts.rs @@ -0,0 +1,12 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Catalog constants + +/// Default catalog name +pub const DEFAULT_CATALOG: &str = "ceresdb"; +/// Default schema name +pub const DEFAULT_SCHEMA: &str = "public"; +/// Catalog name of the sys catalog +pub const SYSTEM_CATALOG: &str = "system"; +/// Schema name of the sys catalog +pub const SYSTEM_CATALOG_SCHEMA: &str = "public"; diff --git a/catalog/src/lib.rs b/catalog/src/lib.rs new file mode 100644 index 0000000000..90799b9205 --- /dev/null +++ b/catalog/src/lib.rs @@ -0,0 +1,59 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Common traits and types about catalog (schema) + +#[macro_use] +extern crate common_util; + +pub mod consts; +pub mod manager; +pub mod schema; + +use std::sync::Arc; + +use async_trait::async_trait; +use snafu::{Backtrace, Snafu}; + +use crate::schema::{NameRef, SchemaRef}; + +#[derive(Debug, Snafu)] +#[snafu(visibility = "pub")] +pub enum Error { + #[snafu(display( + "Failed to create schema, catalog:{}, schema:{}, err:{}", + catalog, + schema, + source + ))] + CreateSchema { + catalog: String, + schema: String, + source: Box, + }, + + #[snafu(display("Unsupported method, msg:{}.\nBacktrace:\n{}", msg, backtrace))] + UnSupported { msg: String, backtrace: Backtrace }, +} + +define_result!(Error); + +/// Catalog manage schemas +// TODO(yingwen): Maybe use async trait? +// TODO(yingwen): Provide a context +// TODO(yingwen): Catalog id? +#[async_trait] +pub trait Catalog { + /// Get the catalog name + fn name(&self) -> NameRef; + + /// Find schema by name + fn schema_by_name(&self, name: NameRef) -> Result>; + + async fn create_schema<'a>(&'a self, name: NameRef<'a>) -> Result<()>; + + /// All schemas + fn all_schemas(&self) -> Result>; +} + +/// A reference counted catalog pointer +pub type CatalogRef = Arc; diff --git a/catalog/src/manager.rs b/catalog/src/manager.rs new file mode 100644 index 0000000000..fb10637750 --- /dev/null +++ b/catalog/src/manager.rs @@ -0,0 +1,32 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Catalog manager + +use snafu::Snafu; + +use crate::{schema::NameRef, CatalogRef}; + +#[derive(Debug, Snafu)] +pub struct Error; + +define_result!(Error); + +/// Catalog manager abstraction +/// +/// Tracks meta data of databases/tables +// TODO(yingwen): Maybe use async trait? 
+// TODO(yingwen): Provide a context + +pub trait Manager: Clone + Send + Sync { + /// Get the default catalog name + fn default_catalog_name(&self) -> NameRef; + + /// Get the default schema name + fn default_schema_name(&self) -> NameRef; + + /// Find the catalog by name + fn catalog_by_name(&self, name: NameRef) -> Result>; + + /// All catalogs + fn all_catalogs(&self) -> Result>; +} diff --git a/catalog/src/schema.rs b/catalog/src/schema.rs new file mode 100644 index 0000000000..49c2f6c462 --- /dev/null +++ b/catalog/src/schema.rs @@ -0,0 +1,169 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Schema contains one or more tables + +use std::sync::Arc; + +use async_trait::async_trait; +use common_types::column_schema::ColumnSchema; +use snafu::{Backtrace, Snafu}; +use table_engine::{ + engine::{CreateTableRequest, DropTableRequest, TableEngineRef}, + table::{TableId, TableRef}, +}; + +#[derive(Debug, Snafu)] +#[snafu(visibility(pub))] +pub enum Error { + #[snafu(display("Unsupported method, msg:{}.\nBacktrace:\n{}", msg, backtrace))] + UnSupported { msg: String, backtrace: Backtrace }, + + #[snafu(display("Failed to create table, err:{}", source))] + CreateTable { source: table_engine::engine::Error }, + + #[snafu(display( + "Failed to create table, table already exists, table:{}.\nBacktrace:\n{}", + table, + backtrace + ))] + CreateExistTable { table: String, backtrace: Backtrace }, + + #[snafu(display( + "Failed to create table, cannot persist meta, table:{}, err:{}", + table, + source + ))] + WriteTableMeta { + table: String, + source: Box, + }, + + #[snafu(display( + "Catalog mismatch, expect:{}, given:{}.\nBacktrace:\n{}", + expect, + given, + backtrace + ))] + CatalogMismatch { + expect: String, + given: String, + backtrace: Backtrace, + }, + + #[snafu(display( + "Schema mismatch, expect:{}, given:{}.\nBacktrace:\n{}", + expect, + given, + backtrace + ))] + SchemaMismatch { + expect: String, + given: String, + backtrace: 
Backtrace, + }, + + #[snafu(display( + "Invalid table id, msg:{}, table_id:{}.\nBacktrace:\n{}", + msg, + table_id, + backtrace + ))] + InvalidTableId { + msg: &'static str, + table_id: TableId, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to find table, table:{}.\nBacktrace:\n{}", table, backtrace))] + TableNotFound { table: String, backtrace: Backtrace }, + + #[snafu(display("Failed to alter table, err:{}", source))] + AlterTable { + source: Box, + }, + + #[snafu(display("Failed to drop table, err:{}", source))] + DropTable { source: table_engine::engine::Error }, + + #[snafu(display( + "Too many table, cannot create table, schema:{}, table:{}.\nBacktrace:\n{}", + schema, + table, + backtrace + ))] + TooManyTable { + schema: String, + table: String, + backtrace: Backtrace, + }, +} + +define_result!(Error); + +/// Create table options. +#[derive(Clone)] +pub struct CreateOptions { + /// Table engine + // FIXME(yingwen): We have engine type in create request, remove this + pub table_engine: TableEngineRef, + /// Create if not exists, if table already exists, wont return error + // TODO(yingwen): Maybe remove this? + pub create_if_not_exists: bool, +} + +/// Drop table options. +#[derive(Clone)] +pub struct DropOptions { + /// Table engine + pub table_engine: TableEngineRef, +} + +/// Alter table operations. +#[derive(Debug)] +pub enum AlterTableOperation { + /// Add column operation, the column id in [ColumnSchema] will be ignored. + /// Primary key column is not allowed to be added, so all columns will + /// be added as normal columns. + AddColumn(ColumnSchema), +} + +/// Alter table request. +#[derive(Debug)] +pub struct AlterTableRequest { + pub table_name: String, + pub operations: Vec, +} + +/// Schema manage tables. +#[async_trait] +pub trait Schema { + /// Get schema name. + fn name(&self) -> NameRef; + + /// Find table by name. + fn table_by_name(&self, name: NameRef) -> Result>; + + /// Allocate a table id for given table. 
+ fn alloc_table_id(&self, name: NameRef) -> Result; + + /// Create table according to `request`. + async fn create_table( + &self, + request: CreateTableRequest, + opts: CreateOptions, + ) -> Result; + + /// Drop table according to `request`. + /// + /// Returns true if the table is really dropped. + async fn drop_table(&self, request: DropTableRequest, opts: DropOptions) -> Result; + + /// All tables + fn all_tables(&self) -> Result>; +} + +/// A name reference +pub type NameRef<'a> = &'a str; +/// A reference counted schema pointer +// TODO(yingwen): This name is conflict with [table_engine::schema::SchemaRef]. +pub type SchemaRef = Arc; diff --git a/catalog_impls/Cargo.toml b/catalog_impls/Cargo.toml new file mode 100644 index 0000000000..ddcbdcdeec --- /dev/null +++ b/catalog_impls/Cargo.toml @@ -0,0 +1,23 @@ +[package] +name = "catalog_impls" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +# Workspace dependencies, in alphabetical order +async-trait = "0.1.41" +catalog = { path = "../catalog" } +common_types = { path = "../common_types" } +common_util = { path = "../common_util" } +log = "0.4" +snafu = { version ="0.6.10", features = ["backtraces"]} +system_catalog = { path = "../system_catalog" } +table_engine = { path = "../table_engine" } +tokio = { version = "1.0", features = ["sync"] } + +[dev-dependencies] +analytic_engine = { path = "../analytic_engine", features = ["test"] } +server = { path = "../server" } diff --git a/catalog_impls/src/lib.rs b/catalog_impls/src/lib.rs new file mode 100644 index 0000000000..6f4ca69947 --- /dev/null +++ b/catalog_impls/src/lib.rs @@ -0,0 +1,52 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +use std::sync::Arc; + +use catalog::{consts::SYSTEM_CATALOG, manager::Manager, schema::NameRef, CatalogRef}; +use system_catalog::{tables::Tables, SystemTableAdapter}; + +use crate::system_tables::{SystemTables, SystemTablesBuilder}; + +pub mod memory; +mod system_tables; +pub mod table_based; + +/// CatalogManagerImpl is a wrapper for system and user tables +#[derive(Clone)] +pub struct CatalogManagerImpl { + system_tables: SystemTables, + user_catalog_manager: M, +} + +impl CatalogManagerImpl { + pub fn new(manager: M) -> Self { + let mut system_tables_builder = SystemTablesBuilder::new(); + system_tables_builder = system_tables_builder + .insert_table(SystemTableAdapter::new(Tables::new(manager.clone()))); + Self { + system_tables: system_tables_builder.build(), + user_catalog_manager: manager, + } + } +} + +impl Manager for CatalogManagerImpl { + fn default_catalog_name(&self) -> NameRef { + self.user_catalog_manager.default_catalog_name() + } + + fn default_schema_name(&self) -> NameRef { + self.user_catalog_manager.default_schema_name() + } + + fn catalog_by_name(&self, name: NameRef) -> catalog::manager::Result> { + match name { + SYSTEM_CATALOG => Ok(Some(Arc::new(self.system_tables.clone()))), + _ => self.user_catalog_manager.catalog_by_name(name), + } + } + + fn all_catalogs(&self) -> catalog::manager::Result> { + self.user_catalog_manager.all_catalogs() + } +} diff --git a/catalog_impls/src/memory.rs b/catalog_impls/src/memory.rs new file mode 100644 index 0000000000..e8ab37bb26 --- /dev/null +++ b/catalog_impls/src/memory.rs @@ -0,0 +1,260 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! A memory catalog implementation +//! +//! 
Mainly for test + +use std::{ + collections::HashMap, + sync::{Arc, RwLock}, +}; + +use async_trait::async_trait; +use catalog::{ + self, consts, + manager::{self, Manager}, + schema::{ + self, CatalogMismatch, CreateOptions, CreateTable, DropOptions, NameRef, Schema, + SchemaMismatch, SchemaRef, TooManyTable, UnSupported, + }, + Catalog, CatalogRef, +}; +use log::info; +use snafu::{ensure, OptionExt, ResultExt}; +use table_engine::{ + engine::{CreateTableRequest, DropTableRequest}, + table::{SchemaId, SchemaIdGenerator, TableId, TableRef, TableSeqGenerator}, +}; + +struct ManagerImplInner { + catalogs: HashMap, +} + +/// In-memory catalog manager +#[derive(Clone)] +pub struct ManagerImpl { + inner: Arc, +} + +impl Default for ManagerImpl { + fn default() -> Self { + let schema_id_generator = SchemaIdGenerator::default(); + let schema_id = schema_id_generator.alloc_schema_id().unwrap(); + + // Register default schema + let default_schema: SchemaRef = Arc::new(SchemaImpl::new( + consts::DEFAULT_CATALOG.to_string(), + consts::DEFAULT_SCHEMA.to_string(), + schema_id, + )); + let mut schemas = HashMap::new(); + schemas.insert(consts::DEFAULT_SCHEMA.to_string(), default_schema); + + // Use above schemas to create a default catalog + let default_catalog: CatalogRef = Arc::new(CatalogImpl { + name: consts::DEFAULT_CATALOG.to_string(), + schemas: RwLock::new(schemas), + schema_id_generator: Arc::new(schema_id_generator), + }); + // Register default catalog + let mut catalogs = HashMap::new(); + catalogs.insert(consts::DEFAULT_CATALOG.to_string(), default_catalog); + + Self { + inner: Arc::new(ManagerImplInner { catalogs }), + } + } +} + +impl Manager for ManagerImpl { + fn default_catalog_name(&self) -> NameRef { + consts::DEFAULT_CATALOG + } + + fn default_schema_name(&self) -> NameRef { + consts::DEFAULT_SCHEMA + } + + fn catalog_by_name(&self, name: NameRef) -> manager::Result> { + let catalog = self.inner.catalogs.get(name).cloned(); + Ok(catalog) + } + + fn 
all_catalogs(&self) -> manager::Result> { + Ok(self.inner.catalogs.iter().map(|(_, v)| v.clone()).collect()) + } +} + +/// In-memory catalog +struct CatalogImpl { + /// Catalog name + name: String, + /// Schemas of catalog + schemas: RwLock>, + /// Global schema id generator, Each schema has a unique schema id. + schema_id_generator: Arc, +} + +#[async_trait] +impl Catalog for CatalogImpl { + fn name(&self) -> NameRef { + &self.name + } + + fn schema_by_name(&self, name: NameRef) -> catalog::Result> { + let schema = self.schemas.read().unwrap().get(name).cloned(); + Ok(schema) + } + + async fn create_schema<'a>(&'a self, name: NameRef<'a>) -> catalog::Result<()> { + let mut schemas = self.schemas.write().unwrap(); + + if schemas.get(name).is_some() { + return Ok(()); + } + + let schema_id = self.schema_id_generator.alloc_schema_id().unwrap(); + + let schema: SchemaRef = Arc::new(SchemaImpl::new( + self.name.to_string(), + name.to_string(), + schema_id, + )); + + schemas.insert(name.to_string(), schema); + info!( + "create schema success, catalog:{}, schema:{}", + &self.name, name + ); + Ok(()) + } + + fn all_schemas(&self) -> catalog::Result> { + Ok(self + .schemas + .read() + .unwrap() + .iter() + .map(|(_, v)| v.clone()) + .collect()) + } +} + +/// In-memory schema +struct SchemaImpl { + /// Catalog name + catalog_name: String, + /// Schema name + schema_name: String, + /// Tables of schema + tables: RwLock>, + schema_id: SchemaId, + table_seq_generator: TableSeqGenerator, +} + +impl SchemaImpl { + fn new(catalog_name: String, schema_name: String, schema_id: SchemaId) -> Self { + Self { + catalog_name, + schema_name, + tables: RwLock::new(HashMap::new()), + schema_id, + table_seq_generator: TableSeqGenerator::default(), + } + } +} + +#[async_trait] +impl Schema for SchemaImpl { + fn name(&self) -> NameRef { + &self.schema_name + } + + fn table_by_name(&self, name: NameRef) -> schema::Result> { + let table = self.tables.read().unwrap().get(name).cloned(); + 
Ok(table) + } + + fn alloc_table_id(&self, name: NameRef) -> schema::Result { + let table_seq = self + .table_seq_generator + .alloc_table_seq() + .context(TooManyTable { + schema: &self.schema_name, + table: name, + })?; + + Ok(TableId::new(self.schema_id, table_seq)) + } + + // In memory schema does not support persisting table info + async fn create_table( + &self, + request: CreateTableRequest, + opts: CreateOptions, + ) -> schema::Result { + ensure!( + self.catalog_name == request.catalog_name, + CatalogMismatch { + expect: &self.catalog_name, + given: request.catalog_name, + } + ); + ensure!( + self.schema_name == request.schema_name, + SchemaMismatch { + expect: &self.schema_name, + given: request.schema_name, + } + ); + + { + // Check table existence + let tables = self.tables.read().unwrap(); + if let Some(table) = tables.get(&request.table_name) { + return Ok(table.clone()); + } + } + + // Table engine handles duplicate table creation + let table_name = request.table_name.clone(); + let table = opts + .table_engine + .create_table(request) + .await + .context(CreateTable)?; + + { + // Now the table engine have create the table, but we may not be the + // creator thread + let mut tables = self.tables.write().unwrap(); + tables.entry(table_name).or_insert_with(|| table.clone()); + } + + Ok(table) + } + + async fn drop_table( + &self, + request: DropTableRequest, + _opts: DropOptions, + ) -> schema::Result { + UnSupported { + msg: format!( + "Dropping table is not supported by memory catalog, request:{:?}", + request + ), + } + .fail() + } + + fn all_tables(&self) -> schema::Result> { + Ok(self + .tables + .read() + .unwrap() + .iter() + .map(|(_, v)| v.clone()) + .collect()) + } +} diff --git a/catalog_impls/src/system_tables.rs b/catalog_impls/src/system_tables.rs new file mode 100644 index 0000000000..672f3fa8f6 --- /dev/null +++ b/catalog_impls/src/system_tables.rs @@ -0,0 +1,131 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +//! Contains System tables, such as system.public.tables + +use std::{collections::HashMap, sync::Arc}; + +use async_trait::async_trait; +use catalog::{ + consts::{SYSTEM_CATALOG, SYSTEM_CATALOG_SCHEMA}, + schema::{CreateOptions, DropOptions, NameRef, Schema, SchemaRef}, + Catalog, +}; +use system_catalog::SystemTableAdapter; +use table_engine::{ + engine::{CreateTableRequest, DropTableRequest}, + table::{Table, TableId, TableRef}, +}; + +const UNSUPPORTED_MSG: &str = "system tables not supported"; + +pub struct SystemTablesBuilder { + tables: HashMap>, +} + +impl SystemTablesBuilder { + pub fn new() -> Self { + Self { + tables: HashMap::new(), + } + } + + pub fn insert_table(mut self, table: SystemTableAdapter) -> Self { + self.tables + .insert(table.name().to_string(), Arc::new(table)); + self + } + + pub fn build(self) -> SystemTables { + SystemTables::new(self.tables) + } +} + +#[derive(Clone)] +pub struct SystemTables { + tables: Arc>>, +} + +impl SystemTables { + pub fn new(tables: HashMap>) -> Self { + Self { + tables: Arc::new(tables), + } + } +} + +#[async_trait] +impl Schema for SystemTables { + fn name(&self) -> NameRef { + SYSTEM_CATALOG_SCHEMA + } + + fn table_by_name(&self, name: NameRef) -> catalog::schema::Result> { + Ok(self.tables.get(name).map(|v| v.clone() as TableRef)) + } + + fn alloc_table_id(&self, _name: NameRef) -> catalog::schema::Result { + catalog::schema::UnSupported { + msg: UNSUPPORTED_MSG, + } + .fail() + } + + async fn create_table( + &self, + _request: CreateTableRequest, + _opts: CreateOptions, + ) -> catalog::schema::Result { + catalog::schema::UnSupported { + msg: UNSUPPORTED_MSG, + } + .fail() + } + + async fn drop_table( + &self, + _request: DropTableRequest, + _opts: DropOptions, + ) -> catalog::schema::Result { + catalog::schema::UnSupported { + msg: UNSUPPORTED_MSG, + } + .fail() + } + + fn all_tables(&self) -> catalog::schema::Result> { + Ok(self + .tables + .iter() + .map(|(_, v)| v.clone() as TableRef) + .collect()) 
+ } +} + +#[async_trait] +impl Catalog for SystemTables { + fn name(&self) -> NameRef { + SYSTEM_CATALOG + } + + fn schema_by_name(&self, name: NameRef) -> catalog::Result> { + if name == SYSTEM_CATALOG_SCHEMA { + Ok(Some(Arc::new(self.clone()))) + } else { + Ok(None) + } + } + + async fn create_schema<'a>(&'a self, _name: NameRef<'a>) -> catalog::Result<()> { + catalog::UnSupported { + msg: UNSUPPORTED_MSG, + } + .fail() + } + + fn all_schemas(&self) -> catalog::Result> { + catalog::UnSupported { + msg: UNSUPPORTED_MSG, + } + .fail() + } +} diff --git a/catalog_impls/src/table_based.rs b/catalog_impls/src/table_based.rs new file mode 100644 index 0000000000..60c578a530 --- /dev/null +++ b/catalog_impls/src/table_based.rs @@ -0,0 +1,1126 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Table based catalog implementation + +use std::{ + collections::HashMap, + sync::{Arc, RwLock}, +}; + +use async_trait::async_trait; +use catalog::{ + self, consts, + manager::{self, Manager}, + schema::{ + self, CatalogMismatch, CreateExistTable, CreateOptions, CreateTable, DropOptions, + DropTable, InvalidTableId, NameRef, Schema, SchemaMismatch, SchemaRef, TooManyTable, + WriteTableMeta, + }, + Catalog, CatalogRef, +}; +use common_util::define_result; +use log::{debug, error, info}; +use snafu::{ensure, Backtrace, OptionExt, ResultExt, Snafu}; +use system_catalog::sys_catalog_table::{ + self, CreateCatalogRequest, CreateSchemaRequest, SysCatalogTable, Visitor, + VisitorCatalogNotFound, VisitorOpenTable, VisitorSchemaNotFound, +}; +use table_engine::{ + engine::{ + CreateTableRequest, DropTableRequest, OpenTableRequest, TableEngine, TableEngineRef, + TableState, + }, + table::{ + ReadOptions, SchemaId, SchemaIdGenerator, TableId, TableInfo, TableRef, TableSeqGenerator, + }, +}; +use tokio::sync::Mutex; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to build sys catalog table, err:{}", source))] + BuildSysCatalog { + source: 
system_catalog::sys_catalog_table::Error, + }, + + #[snafu(display("Failed to visit sys catalog table, err:{}", source))] + VisitSysCatalog { + source: system_catalog::sys_catalog_table::Error, + }, + + #[snafu(display( + "Failed to find table to update, name:{}.\nBacktrace:\n{}", + name, + backtrace + ))] + UpdateTableNotFound { name: String, backtrace: Backtrace }, + + #[snafu(display("Failed to create catalog, catalog:{}, err:{}", catalog, source))] + CreateCatalog { + catalog: String, + source: system_catalog::sys_catalog_table::Error, + }, + + #[snafu(display( + "Failed to create schema, catalog:{}, schema:{}, err:{}", + catalog, + schema, + source + ))] + CreateSchema { + catalog: String, + schema: String, + source: system_catalog::sys_catalog_table::Error, + }, +} + +define_result!(Error); + +/// Table based catalog manager +#[derive(Clone)] +pub struct TableBasedManager { + inner: Arc, +} + +impl Manager for TableBasedManager { + fn default_catalog_name(&self) -> NameRef { + consts::DEFAULT_CATALOG + } + + fn default_schema_name(&self) -> NameRef { + consts::DEFAULT_SCHEMA + } + + fn catalog_by_name(&self, name: NameRef) -> manager::Result> { + let catalog = self.inner.catalogs.get(name).cloned().map(|v| v as _); + Ok(catalog) + } + + fn all_catalogs(&self) -> manager::Result> { + Ok(self + .inner + .catalogs + .iter() + .map(|(_, v)| v.clone() as _) + .collect()) + } +} + +impl TableBasedManager { + /// Create and init the TableBasedManager. + // TODO(yingwen): Define all constants in catalog crate. + pub async fn new(backend: &T, engine_proxy: TableEngineRef) -> Result { + // Create or open sys_catalog table, will also create a space (catalog + schema) + // for system catalog. 
+ let catalog_table = SysCatalogTable::new(backend) + .await + .context(BuildSysCatalog)?; + + let mut inner = Inner { + catalog_table: Arc::new(catalog_table), + catalogs: HashMap::new(), + engine_proxy, + schema_id_generator: Arc::new(SchemaIdGenerator::default()), + }; + + inner.init().await?; + + Ok(Self { + inner: Arc::new(inner), + }) + } + + #[cfg(test)] + pub fn get_engine_proxy(&self) -> TableEngineRef { + self.inner.engine_proxy.clone() + } +} + +type CatalogMap = HashMap>; + +/// Inner state of TableBasedManager +struct Inner { + /// Sys catalog table + catalog_table: Arc, + catalogs: CatalogMap, + /// Table engine proxy + engine_proxy: TableEngineRef, + /// Global schema id generator, Each schema has a unique schema id. + schema_id_generator: Arc, +} + +impl Inner { + /// Load all data from sys catalog table. + async fn init(&mut self) -> Result<()> { + // The system catalog and schema in it is not persisted, so we add it manually. + self.load_system_catalog(); + + let mut visitor = VisitorImpl { + catalog_table: self.catalog_table.clone(), + catalogs: &mut self.catalogs, + engine_proxy: self.engine_proxy.clone(), + schema_id_generator: self.schema_id_generator.clone(), + }; + + // Load all existent catalog/schema/tables from catalog_table. + let opts = ReadOptions::default(); + self.catalog_table + .visit(opts, &mut visitor) + .await + .context(VisitSysCatalog)?; + + // Create default catalog if it is not exists. + self.maybe_create_default_catalog().await?; + + Ok(()) + } + + fn load_system_catalog(&mut self) { + // Get the `sys_catalog` table and add it to tables. + let table = self.catalog_table.inner_table(); + let mut tables = SchemaTables::default(); + tables.insert(self.catalog_table.table_id(), table); + + // Use schema id of schema `system/public` as last schema id. + let schema_id = sys_catalog_table::SCHEMA_ID; + self.schema_id_generator.set_last_schema_id(schema_id); + + // Create the default schema in system catalog. 
+ let schema = Arc::new(SchemaImpl { + catalog_name: consts::SYSTEM_CATALOG.to_string(), + schema_name: consts::SYSTEM_CATALOG_SCHEMA.to_string(), + schema_id, + tables: RwLock::new(tables), + mutex: Mutex::new(()), + catalog_table: self.catalog_table.clone(), + table_seq_generator: TableSeqGenerator::default(), + }); + // Use table seq of `sys_catalog` table as last table seq. + schema + .table_seq_generator + .set_last_table_seq(sys_catalog_table::TABLE_SEQ); + + let mut schemas = HashMap::new(); + schemas.insert(schema.name().to_string(), schema); + + let schema_id_generator = self.schema_id_generator.clone(); + let catalog_table = self.catalog_table.clone(); + // Create the system catalog. + let catalog = Arc::new(CatalogImpl { + name: consts::SYSTEM_CATALOG.to_string(), + schemas: RwLock::new(schemas), + schema_id_generator, + catalog_table, + mutex: Mutex::new(()), + }); + + self.catalogs.insert(catalog.name().to_string(), catalog); + } + + async fn maybe_create_default_catalog(&mut self) -> Result<()> { + // Try to get default catalog, create it if not exists. + let catalog = match self.catalogs.get(consts::DEFAULT_CATALOG) { + Some(v) => v.clone(), + None => { + // Only system catalog should exists. + assert_eq!(1, self.catalogs.len()); + + // Default catalog is not exists, create and store it. + let default_catalog = self + .create_catalog(CreateCatalogRequest { + catalog_name: consts::DEFAULT_CATALOG.to_string(), + }) + .await?; + + default_catalog + } + }; + + // Create default schema if not exists. + if catalog.find_schema(consts::DEFAULT_SCHEMA).is_none() { + // Allocate schema id. 
+ let schema_id = self + .schema_id_generator + .alloc_schema_id() + .expect("Schema id of default catalog should be valid"); + + self.add_schema_to_catalog( + CreateSchemaRequest { + catalog_name: consts::DEFAULT_CATALOG.to_string(), + schema_name: consts::DEFAULT_SCHEMA.to_string(), + schema_id, + }, + &*catalog, + ) + .await?; + } + + Ok(()) + } + + async fn create_catalog(&mut self, request: CreateCatalogRequest) -> Result> { + let catalog_name = request.catalog_name.clone(); + + self.catalog_table + .create_catalog(request) + .await + .context(CreateCatalog { + catalog: &catalog_name, + })?; + + let schema_id_generator = self.schema_id_generator.clone(); + let catalog_table = self.catalog_table.clone(); + let catalog = Arc::new(CatalogImpl { + name: catalog_name.clone(), + schemas: RwLock::new(HashMap::new()), + schema_id_generator, + catalog_table, + mutex: Mutex::new(()), + }); + + self.catalogs.insert(catalog_name, catalog.clone()); + + Ok(catalog) + } + + async fn add_schema_to_catalog( + &mut self, + request: CreateSchemaRequest, + catalog: &CatalogImpl, + ) -> Result> { + let schema_name = request.schema_name.clone(); + let schema_id = request.schema_id; + + self.catalog_table + .create_schema(request) + .await + .context(CreateSchema { + catalog: &catalog.name, + schema: &schema_name, + })?; + + let schema = Arc::new(SchemaImpl::new( + &catalog.name, + &schema_name, + schema_id, + self.catalog_table.clone(), + )); + + catalog.insert_schema_into_memory(schema.clone()); + + Ok(schema) + } +} + +/// Sys catalog visitor implementation, used to load catalog info +struct VisitorImpl<'a> { + catalog_table: Arc, + catalogs: &'a mut CatalogMap, + engine_proxy: TableEngineRef, + schema_id_generator: Arc, +} + +#[async_trait] +impl<'a> Visitor for VisitorImpl<'a> { + fn visit_catalog(&mut self, request: CreateCatalogRequest) -> sys_catalog_table::Result<()> { + debug!("Visitor visit catalog, request:{:?}", request); + let schema_id_generator = 
self.schema_id_generator.clone(); + let catalog_table = self.catalog_table.clone(); + + let catalog = CatalogImpl { + name: request.catalog_name.to_string(), + schemas: RwLock::new(HashMap::new()), + schema_id_generator, + catalog_table, + mutex: Mutex::new(()), + }; + + // Register catalog. + self.catalogs + .insert(request.catalog_name, Arc::new(catalog)); + + Ok(()) + } + + fn visit_schema(&mut self, request: CreateSchemaRequest) -> sys_catalog_table::Result<()> { + debug!("Visitor visit schema, request:{:?}", request); + + let catalog = + self.catalogs + .get_mut(&request.catalog_name) + .context(VisitorCatalogNotFound { + catalog: &request.catalog_name, + })?; + + let schema_id = request.schema_id; + let schema = Arc::new(SchemaImpl::new( + &request.catalog_name, + &request.schema_name, + schema_id, + self.catalog_table.clone(), + )); + + // If schema exists, we overwrite it. + catalog.insert_schema_into_memory(schema); + + // Update last schema id. + if self.schema_id_generator.last_schema_id_u32() < schema_id.as_u32() { + self.schema_id_generator.set_last_schema_id(schema_id); + } + + Ok(()) + } + + async fn visit_tables(&mut self, table_info: TableInfo) -> sys_catalog_table::Result<()> { + debug!("Visitor visit tables, table_info:{:?}", table_info); + + let catalog = + self.catalogs + .get_mut(&table_info.catalog_name) + .context(VisitorCatalogNotFound { + catalog: &table_info.catalog_name, + })?; + let schema = + catalog + .find_schema(&table_info.schema_name) + .context(VisitorSchemaNotFound { + catalog: &table_info.catalog_name, + schema: &table_info.schema_name, + })?; + + // Update max table sequence of the schema. + let table_id = table_info.table_id; + let table_seq = table_id.table_seq(); + if table_seq.as_u64() >= schema.table_seq_generator.last_table_seq_u64() { + schema.table_seq_generator.set_last_table_seq(table_seq); + } + + // Only the stable/altering table can be opened. 
+ if !matches!(table_info.state, TableState::Stable) { + debug!( + "Visitor visit a unstable table, table_info:{:?}", + table_info + ); + return Ok(()); + } + + let open_request = OpenTableRequest::from(table_info); + let table_name = open_request.table_name.clone(); + let table_opt = self + .engine_proxy + .open_table(open_request) + .await + .context(VisitorOpenTable)?; + + match table_opt { + Some(table) => { + schema.insert_table_into_memory(table_id, table); + } + None => { + // Now we ignore the error that table not in engine but in catalog. + error!( + "Visitor found table not in engine, table_name:{:?}", + table_name + ); + } + } + + Ok(()) + } +} + +type SchemaMap = HashMap>; + +/// Table based catalog +struct CatalogImpl { + /// Catalog name + name: String, + /// Schemas of catalog + // Now the Schema trait does not support create schema, so we use impl type here + schemas: RwLock, + /// Global schema id generator, Each schema has a unique schema id. + schema_id_generator: Arc, + /// Sys catalog table + catalog_table: Arc, + /// Mutex + /// + /// Protects: + /// - create schema + /// - persist to default catalog + mutex: Mutex<()>, +} + +impl CatalogImpl { + /// Insert schema + fn insert_schema_into_memory(&self, schema: Arc) { + let mut schemas = self.schemas.write().unwrap(); + schemas.insert(schema.name().to_string(), schema); + } + + fn find_schema(&self, schema_name: &str) -> Option> { + let schemas = self.schemas.read().unwrap(); + schemas.get(schema_name).cloned() + } +} + +// TODO(yingwen): Support add schema (with options to control schema +// persistence) +#[async_trait] +impl Catalog for CatalogImpl { + fn name(&self) -> NameRef { + &self.name + } + + fn schema_by_name(&self, name: NameRef) -> catalog::Result> { + let schemas = self.schemas.read().unwrap(); + let schema = schemas.get(name).cloned().map(|v| v as _); + Ok(schema) + } + + async fn create_schema<'a>(&'a self, name: NameRef<'a>) -> catalog::Result<()> { + // Check schema existence + 
if self.schema_by_name(name)?.is_some() { + return Ok(()); + } + + // Lock schema and persist schema to default catalog + let _lock = self.mutex.lock().await; + // Check again + if self.schema_by_name(name)?.is_some() { + return Ok(()); + } + + // Allocate schema id. + let schema_id = self + .schema_id_generator + .alloc_schema_id() + .expect("Schema id of default catalog should be valid"); + + let request = CreateSchemaRequest { + catalog_name: self.name.to_string(), + schema_name: name.to_string(), + schema_id, + }; + + let schema_id = request.schema_id; + + self.catalog_table + .create_schema(request) + .await + .map_err(|e| Box::new(e) as _) + .context(catalog::CreateSchema { + catalog: &self.name, + schema: &name.to_string(), + })?; + + let schema = Arc::new(SchemaImpl::new( + &self.name, + name, + schema_id, + self.catalog_table.clone(), + )); + + self.insert_schema_into_memory(schema); + info!( + "create schema success, catalog:{}, schema:{}", + &self.name, name + ); + Ok(()) + } + + fn all_schemas(&self) -> catalog::Result> { + Ok(self + .schemas + .read() + .unwrap() + .iter() + .map(|(_, v)| v.clone() as _) + .collect()) + } +} + +/// Table based schema +struct SchemaImpl { + /// Catalog name + catalog_name: String, + /// Schema name + schema_name: String, + /// Schema id + schema_id: SchemaId, + /// Tables of schema + tables: RwLock, + /// Mutex + /// + /// Protects: + /// - add/drop/alter table + /// - persist to sys catalog table + mutex: Mutex<()>, + /// Sys catalog table + catalog_table: Arc, + table_seq_generator: TableSeqGenerator, +} + +impl SchemaImpl { + fn new( + catalog_name: &str, + schema_name: &str, + schema_id: SchemaId, + catalog_table: Arc, + ) -> Self { + Self { + catalog_name: catalog_name.to_string(), + schema_name: schema_name.to_string(), + schema_id, + tables: RwLock::new(SchemaTables::default()), + mutex: Mutex::new(()), + catalog_table, + table_seq_generator: TableSeqGenerator::default(), + } + } + + /// Insert table into memory, 
wont check existence + fn insert_table_into_memory(&self, table_id: TableId, table: TableRef) { + let mut tables = self.tables.write().unwrap(); + tables.insert(table_id, table); + } + + /// Check table existence in read lock + /// + /// If table exists: + /// - if create_if_not_exists is true, return Ok + /// - if create_if_not_exists is false, return Error + fn check_create_table_read( + &self, + request: &CreateTableRequest, + create_if_not_exists: bool, + ) -> schema::Result> { + let table_id = request.table_id; + ensure!( + self.schema_id == table_id.schema_id(), + InvalidTableId { + msg: "schema id unmatch", + table_id, + } + ); + + let tables = self.tables.read().unwrap(); + if let Some(table) = tables.tables_by_name.get(&request.table_name) { + // Already exists + if create_if_not_exists { + // Create if not exists is set + return Ok(Some(table.clone())); + } + // Create if not exists is not set, need to return error + return CreateExistTable { + table: &request.table_name, + } + .fail(); + } + + // Table is not exists, check whether table id is unique under this schema. 
+ let table_by_id = tables.tables_by_id.get(&request.table_id); + ensure!( + table_by_id.is_none(), + InvalidTableId { + msg: "table with given id already exists", + table_id, + } + ); + + Ok(None) + } + + fn find_table_by_name(&self, name: NameRef) -> Option { + self.tables + .read() + .unwrap() + .tables_by_name + .get(name) + .cloned() + } +} + +#[derive(Default)] +struct SchemaTables { + tables_by_name: HashMap, + tables_by_id: HashMap, +} + +impl SchemaTables { + fn insert(&mut self, table_id: TableId, table: TableRef) { + self.tables_by_name + .insert(table.name().to_string(), table.clone()); + self.tables_by_id.insert(table_id, table); + } + + fn remove(&mut self, name: NameRef) { + if let Some(table) = self.tables_by_name.remove(name) { + self.tables_by_id.remove(&table.id()); + } + } +} + +#[async_trait] +impl Schema for SchemaImpl { + fn name(&self) -> NameRef { + &self.schema_name + } + + fn table_by_name(&self, name: NameRef) -> schema::Result> { + let table = self + .tables + .read() + .unwrap() + .tables_by_name + .get(name) + .cloned(); + Ok(table) + } + + fn alloc_table_id(&self, name: NameRef) -> schema::Result { + let table_seq = self + .table_seq_generator + .alloc_table_seq() + .context(TooManyTable { + schema: &self.schema_name, + table: name, + })?; + + Ok(TableId::new(self.schema_id, table_seq)) + } + + // TODO(yingwen): Do not persist if engine is memory engine. + async fn create_table( + &self, + request: CreateTableRequest, + opts: CreateOptions, + ) -> schema::Result { + info!( + "Table based catalog manager create table, request:{:?}", + request + ); + + ensure!( + self.catalog_name == request.catalog_name, + CatalogMismatch { + expect: &self.catalog_name, + given: request.catalog_name, + } + ); + ensure!( + self.schema_name == request.schema_name, + SchemaMismatch { + expect: &self.schema_name, + given: request.schema_name, + } + ); + // TODO(yingwen): Validate table id is unique. 
+ + // Check table existence + if let Some(table) = self.check_create_table_read(&request, opts.create_if_not_exists)? { + return Ok(table); + } + + // Lock schema and persist table to sys catalog table + let _lock = self.mutex.lock().await; + // Check again + if let Some(table) = self.check_create_table_read(&request, opts.create_if_not_exists)? { + return Ok(table); + } + + // Create table + let table_name = request.table_name.clone(); + let table = opts + .table_engine + .create_table(request.clone()) + .await + .context(CreateTable)?; + assert_eq!(table_name, table.name()); + + self.catalog_table + .create_table(request.clone().into()) + .await + .map_err(|e| Box::new(e) as _) + .context(WriteTableMeta { + table: &request.table_name, + })?; + + { + // Insert into memory + let mut tables = self.tables.write().unwrap(); + tables.insert(request.table_id, table.clone()); + } + + Ok(table) + } + + async fn drop_table( + &self, + mut request: DropTableRequest, + opts: DropOptions, + ) -> schema::Result { + info!( + "Table based catalog manager drop table, request:{:?}", + request + ); + + if self.find_table_by_name(&request.table_name).is_none() { + return Ok(false); + }; + + let _lock = self.mutex.lock().await; + // double check whether the table to drop exists. + let table = match self.find_table_by_name(&request.table_name) { + Some(v) => v, + None => return Ok(false), + }; + + // Determine the real engine type of the table to drop. + // FIXME(xikai): the engine should not be part of the DropRequest. + request.engine = table.engine_type().to_string(); + + // Prepare to drop table info in the sys_catalog. 
+ self.catalog_table + .prepare_drop_table(request.clone()) + .await + .map_err(|e| Box::new(e) as _) + .context(WriteTableMeta { + table: &request.table_name, + })?; + + let dropped = opts + .table_engine + .drop_table(request.clone()) + .await + .context(DropTable)?; + + info!( + "Table engine drop table successfully, request:{:?}, dropped:{}", + request, dropped + ); + + // Update the drop table record into the sys_catalog_table. + self.catalog_table + .drop_table(request.clone()) + .await + .map_err(|e| Box::new(e) as _) + .context(WriteTableMeta { + table: &request.table_name, + })?; + + { + let mut tables = self.tables.write().unwrap(); + tables.remove(&request.table_name); + }; + + info!( + "Table based catalog manager drop table successfully, request:{:?}", + request + ); + + return Ok(true); + } + + fn all_tables(&self) -> schema::Result> { + Ok(self + .tables + .read() + .unwrap() + .tables_by_name + .iter() + .map(|(_, v)| v.clone()) + .collect()) + } +} + +#[cfg(any(test, feature = "test"))] +mod tests { + use std::{collections::HashMap, sync::Arc}; + + use analytic_engine::{tests::util::TestEnv, AnalyticTableEngine}; + use catalog::{ + consts::{DEFAULT_CATALOG, DEFAULT_SCHEMA}, + manager::Manager, + schema::{CreateOptions, DropOptions, SchemaRef}, + }; + use server::table_engine::{MemoryTableEngine, TableEngineProxy}; + use table_engine::{ + engine::{CreateTableRequest, DropTableRequest, TableState}, + ANALYTIC_ENGINE_TYPE, + }; + + use crate::table_based::TableBasedManager; + + async fn build_catalog_manager(analytic: AnalyticTableEngine) -> TableBasedManager { + // Create table engine proxy + let memory = MemoryTableEngine; + + let engine_proxy = Arc::new(TableEngineProxy { + memory, + analytic: analytic.clone(), + }); + + // Create catalog manager, use analytic table as backend + TableBasedManager::new(&analytic, engine_proxy.clone()) + .await + .unwrap_or_else(|e| { + panic!("Failed to create catalog manager, err:{}", e); + }) + } + + async fn 
build_default_schema_with_catalog(catalog_manager: &TableBasedManager) -> SchemaRef { + let catalog_name = catalog_manager.default_catalog_name(); + let schema_name = catalog_manager.default_schema_name(); + let catalog = catalog_manager.catalog_by_name(catalog_name); + assert!(catalog.is_ok()); + assert!(catalog.as_ref().unwrap().is_some()); + catalog + .as_ref() + .unwrap() + .as_ref() + .unwrap() + .schema_by_name(schema_name) + .unwrap() + .unwrap() + } + + async fn build_default_schema(analytic: AnalyticTableEngine) -> SchemaRef { + let catalog_manager = build_catalog_manager(analytic).await; + let catalog_name = catalog_manager.default_catalog_name(); + let schema_name = catalog_manager.default_schema_name(); + let catalog = catalog_manager.catalog_by_name(catalog_name); + assert!(catalog.is_ok()); + assert!(catalog.as_ref().unwrap().is_some()); + catalog + .as_ref() + .unwrap() + .as_ref() + .unwrap() + .schema_by_name(schema_name) + .unwrap() + .unwrap() + } + + fn build_create_table_req(table_name: &str, schema: SchemaRef) -> CreateTableRequest { + CreateTableRequest { + catalog_name: DEFAULT_CATALOG.to_string(), + schema_name: DEFAULT_SCHEMA.to_string(), + table_id: schema.alloc_table_id(table_name).unwrap(), + table_name: table_name.to_string(), + table_schema: common_types::tests::build_schema(), + partition_info: None, + engine: ANALYTIC_ENGINE_TYPE.to_string(), + options: HashMap::new(), + state: TableState::Stable, + } + } + + #[tokio::test] + async fn test_catalog_by_name_schema_by_name() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + test_ctx.open().await; + + let catalog_manager = build_catalog_manager(test_ctx.engine()).await; + let catalog_name = catalog_manager.default_catalog_name(); + let schema_name = catalog_manager.default_schema_name(); + let catalog = catalog_manager.catalog_by_name(catalog_name); + assert!(catalog.is_ok()); + assert!(catalog.as_ref().unwrap().is_some()); + let schema = catalog + 
.as_ref() + .unwrap() + .as_ref() + .unwrap() + .schema_by_name(schema_name); + assert!(schema.is_ok()); + assert!(schema.as_ref().unwrap().is_some()); + + let schema_name2 = "test"; + let schema = catalog + .as_ref() + .unwrap() + .as_ref() + .unwrap() + .schema_by_name(schema_name2); + assert!(schema.is_ok()); + assert!(schema.as_ref().unwrap().is_none()); + + let catalog_name2 = "test"; + let catalog = catalog_manager.catalog_by_name(catalog_name2); + assert!(catalog.is_ok()); + assert!(catalog.as_ref().unwrap().is_none()); + } + + #[tokio::test] + async fn test_maybe_create_schema_by_name() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + test_ctx.open().await; + + let catalog_manager = build_catalog_manager(test_ctx.engine()).await; + let catalog_name = catalog_manager.default_catalog_name(); + let catalog = catalog_manager.catalog_by_name(catalog_name); + assert!(catalog.is_ok()); + assert!(catalog.as_ref().unwrap().is_some()); + + let schema_name = "test"; + let catalog_ref = catalog.as_ref().unwrap().as_ref().unwrap(); + let mut schema = catalog_ref.schema_by_name(schema_name); + assert!(schema.is_ok()); + assert!(schema.as_ref().unwrap().is_none()); + + catalog_ref.create_schema(schema_name).await.unwrap(); + schema = catalog_ref.schema_by_name(schema_name); + assert!(schema.is_ok()); + assert!(schema.as_ref().unwrap().is_some()); + } + + #[tokio::test] + async fn test_alloc_table_id() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + test_ctx.open().await; + + let schema = build_default_schema(test_ctx.engine()).await; + let table_id = schema.alloc_table_id("test").unwrap(); + let expected_id = 2u64 << 40 | 1u64; + assert_eq!(table_id.as_u64(), expected_id); + } + + #[tokio::test] + async fn test_create_table() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + test_ctx.open().await; + + let catalog_manager = 
build_catalog_manager(test_ctx.engine()).await; + let schema = build_default_schema_with_catalog(&catalog_manager).await; + + let table_name = "test"; + let request = build_create_table_req(table_name, schema.clone()); + + let opts = CreateOptions { + table_engine: catalog_manager.get_engine_proxy(), + create_if_not_exists: true, + }; + + schema + .create_table(request.clone(), opts.clone()) + .await + .unwrap(); + assert!(schema.table_by_name(table_name).unwrap().is_some()); + + // create again + schema.create_table(request.clone(), opts).await.unwrap(); + assert!(schema.table_by_name(table_name).unwrap().is_some()); + + let opts2 = CreateOptions { + table_engine: catalog_manager.get_engine_proxy(), + create_if_not_exists: false, + }; + assert!(schema.create_table(request.clone(), opts2).await.is_err()); + } + + #[tokio::test] + async fn test_drop_table() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + test_ctx.open().await; + + let catalog_manager = build_catalog_manager(test_ctx.engine()).await; + let schema = build_default_schema_with_catalog(&catalog_manager).await; + + let table_name = "test"; + let engine_name = "test_engine"; + let drop_table_request = DropTableRequest { + catalog_name: DEFAULT_CATALOG.to_string(), + schema_name: DEFAULT_SCHEMA.to_string(), + table_name: table_name.to_string(), + engine: engine_name.to_string(), + }; + let drop_table_opts = DropOptions { + table_engine: catalog_manager.get_engine_proxy(), + }; + + assert!(!schema + .drop_table(drop_table_request.clone(), drop_table_opts.clone()) + .await + .unwrap()); + + let create_table_request = build_create_table_req(table_name, schema.clone()); + let create_table_opts = CreateOptions { + table_engine: catalog_manager.get_engine_proxy(), + create_if_not_exists: true, + }; + + // create table + { + schema + .create_table(create_table_request.clone(), create_table_opts.clone()) + .await + .unwrap(); + // check table exists + 
assert!(schema.table_by_name(table_name).unwrap().is_some()); + } + + // drop table + { + assert!(schema + .drop_table(drop_table_request.clone(), drop_table_opts.clone()) + .await + .unwrap()); + // check table not exists + assert!(schema.table_by_name(table_name).unwrap().is_none()); + } + + // create table again + { + schema + .create_table(create_table_request.clone(), create_table_opts.clone()) + .await + .unwrap(); + // check table exists + assert!(schema.table_by_name(table_name).unwrap().is_some()); + } + + // drop table again + { + assert!(schema + .drop_table(drop_table_request.clone(), drop_table_opts.clone()) + .await + .unwrap()); + // check table not exists + assert!(schema.table_by_name(table_name).unwrap().is_none()); + } + + // create two tables + { + let table_name2 = "test2"; + let create_table_request2 = build_create_table_req(table_name2, schema.clone()); + schema + .create_table(create_table_request2.clone(), create_table_opts.clone()) + .await + .unwrap(); + // check table exists + assert!(schema.table_by_name(table_name2).unwrap().is_some()); + + schema + .create_table(create_table_request, create_table_opts) + .await + .unwrap(); + // check table exists + assert!(schema.table_by_name(table_name).unwrap().is_some()); + } + + // drop table again + { + assert!(schema + .drop_table(drop_table_request, drop_table_opts) + .await + .unwrap()); + // check table not exists + assert!(schema.table_by_name(table_name).unwrap().is_none()); + } + } +} diff --git a/cluster/Cargo.toml b/cluster/Cargo.toml new file mode 100644 index 0000000000..d75d30a86d --- /dev/null +++ b/cluster/Cargo.toml @@ -0,0 +1,21 @@ +[package] +name = "cluster" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +analytic_engine = { path = "../analytic_engine" } +async-trait = "0.1.41" +catalog = { path = "../catalog" } +common_types = { path = "../common_types" } +common_util = 
{ path = "../common_util" } +log = "0.4" +meta_client_v2 = { path = "../meta_client_v2" } +rust-fsm = "0.6.0" +serde = "1.0" +serde_derive = "1.0" +serde_json = "1.0.60" +snafu = { version ="0.6.10", features = ["backtraces"]} +tokio = { version = "1.0", features = ["full"] } \ No newline at end of file diff --git a/cluster/src/config.rs b/cluster/src/config.rs new file mode 100644 index 0000000000..2afb0bee57 --- /dev/null +++ b/cluster/src/config.rs @@ -0,0 +1,18 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use meta_client_v2::MetaClientConfig; +use serde_derive::Deserialize; + +#[derive(Default, Clone, Deserialize, Debug)] +pub struct ClusterConfig { + /// Local ip address of this node, used as endpoint ip in meta. + pub node: String, + /// Grpc port of this node, also used as endpoint port in meta. + pub port: u16, + pub zone: String, + pub idc: String, + pub binary_version: String, + pub cmd_channel_buffer_size: usize, + + pub meta_client_config: MetaClientConfig, +} diff --git a/cluster/src/lib.rs b/cluster/src/lib.rs new file mode 100644 index 0000000000..9fe5916dc9 --- /dev/null +++ b/cluster/src/lib.rs @@ -0,0 +1,263 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +use std::{sync::Arc, time::Duration}; + +use async_trait::async_trait; +use catalog::manager::Manager; +use common_util::{define_result, runtime::Runtime}; +use log::{error, info}; +use meta_client_v2::{ + build_meta_client, ActionCmd, AllocSchemaIdRequest, AllocTableIdRequest, DropTableRequest, + GetTablesRequest, MetaClient, NodeMetaInfo, SchemaId, ShardId, ShardInfo, TableId, +}; +use snafu::{Backtrace, ResultExt, Snafu}; +use tokio::{ + sync::{mpsc::Receiver, RwLock}, + time, +}; + +use crate::{config::ClusterConfig, table_manager::TableManager}; + +pub mod config; +mod table_manager; + +#[derive(Debug, Snafu)] +#[snafu(visibility = "pub")] +pub enum Error { + #[snafu(display("Build meta client failed, err:{}.", source))] + BuildMetaClient { + source: Box, + }, + + #[snafu(display("Meta client start failed, err:{}.", source))] + StartMetaClient { + source: Box, + }, + + #[snafu(display("Meta client start failed, err:{}.", source))] + MetaClientFailure { + source: Box, + }, + + #[snafu(display( + "Shard not found in current node, shard_id:{}.\nBacktrace:\n{}", + shard_id, + backtrace + ))] + ShardNotFound { + shard_id: ShardId, + backtrace: Backtrace, + }, +} + +define_result!(Error); + +#[async_trait] +pub trait Cluster { + async fn alloc_schema_id(&self, _schema_name: String) -> Result; + + async fn alloc_table_id(&self, _schema_name: String, _table_name: String) -> Result; + + async fn drop_table(&self, _schema_name: String, _table_name: String) -> Result<()>; +} + +pub struct ClusterImpl { + inner: Arc>, + runtime: Arc, +} + +impl ClusterImpl { + pub fn new(config: ClusterConfig, catalog_manager: M, runtime: Arc) -> Result { + Ok(Self { + inner: Arc::new(ClusterImplInner::new( + config, + catalog_manager, + runtime.clone(), + )?), + runtime, + }) + } + + pub async fn start(&self) -> Result<()> { + let inner = self.inner.clone(); + inner + .meta_client + .start() + .await + .map_err(|e| Box::new(e) as _) + .context(StartMetaClient)?; + 
self.runtime.spawn(async move { + inner.start_heartbeat().await; + }); + + Ok(()) + } +} + +#[async_trait] +impl Cluster for ClusterImpl { + async fn alloc_schema_id(&self, schema_name: String) -> Result { + self.inner.alloc_schema_id(schema_name).await + } + + async fn alloc_table_id(&self, schema_name: String, table_name: String) -> Result { + self.inner.alloc_table_id(schema_name, table_name).await + } + + async fn drop_table(&self, schema_name: String, table_name: String) -> Result<()> { + self.inner.drop_table(schema_name, table_name).await + } +} + +struct ClusterImplInner { + meta_client: Arc, + catalog_manager: M, + table_manager: TableManager, + action_cmd_receiver: RwLock>, + + config: ClusterConfig, +} + +impl ClusterImplInner { + pub fn new(config: ClusterConfig, catalog_manager: M, runtime: Arc) -> Result { + let (sender, receiver) = tokio::sync::mpsc::channel(config.cmd_channel_buffer_size); + let node_meta_info = NodeMetaInfo { + node: config.node.clone(), + zone: config.zone.clone(), + idc: config.idc.clone(), + binary_version: config.binary_version.clone(), + }; + Ok(Self { + meta_client: build_meta_client( + config.meta_client_config.clone(), + node_meta_info, + runtime, + Some(sender), + ) + .map_err(|e| Box::new(e) as _) + .context(BuildMetaClient)?, + catalog_manager, + table_manager: TableManager::new(), + action_cmd_receiver: RwLock::new(receiver), + config: config, + }) + } + + // heartbeat + async fn start_heartbeat(&self) { + let mut interval = time::interval(self.heartbeat_interval()); + + loop { + let shards_info = self.get_shards_info(); + info!("Node heartbeat to meta, shards info:{:?}", shards_info); + let resp = self.meta_client.send_heartbeat(shards_info).await; + match resp { + Ok(()) => { + interval.tick().await; + } + Err(e) => { + error!("Node heartbeat to meta failed, error:{}", e); + time::sleep(self.error_wait_lease()).await; + } + } + } + } + + async fn start_node_action_cmd(&self) { + let action_cmd_receiver = &mut 
*self.action_cmd_receiver.write().await; + // todo: handle error + while let Some(action_cmd) = action_cmd_receiver.recv().await { + info!( + "Node action cmd from meta received, action_cmd:{:?}", + action_cmd + ); + match action_cmd { + ActionCmd::OpenCmd(open_cmd) => { + let ret = self + .meta_client + .get_tables(GetTablesRequest { + shard_ids: open_cmd.shard_ids, + }) + .await; + match ret { + Err(ref e) => error!("Get shard tables failed, ret:{:?}, err:{}", ret, e), + Ok(v) => { + self.table_manager.update_table_info(v.tables_map); + // todo: self.catalog_manager.open tables + } + } + } + // todo: other action cmd + _ => todo!(), + } + } + info!("Node action cmd receiver exit"); + } + + fn get_shards_info(&self) -> Vec { + self.table_manager.get_shards_info() + } + + // Register node every 2/3 lease + fn heartbeat_interval(&self) -> Duration { + Duration::from_secs(self.config.meta_client_config.lease.as_secs() * 2 / 3) + } + + fn error_wait_lease(&self) -> Duration { + Duration::from_secs(self.config.meta_client_config.lease.as_secs() / 2) + } + + async fn alloc_schema_id(&self, schema_name: String) -> Result { + if let Some(v) = self.table_manager.get_schema_id(&schema_name) { + Ok(v) + } else { + Ok(self + .meta_client + .alloc_schema_id(AllocSchemaIdRequest { + name: schema_name.clone(), + }) + .await + .map_err(|e| Box::new(e) as _) + .context(MetaClientFailure)? 
+ .id) + } + } + + async fn alloc_table_id(&self, schema_name: String, table_name: String) -> Result { + if let Some(v) = self.table_manager.get_table_id(&schema_name, &table_name) { + Ok(v) + } else { + let resp = self + .meta_client + .alloc_table_id(AllocTableIdRequest { + schema_name, + name: table_name, + }) + .await + .map_err(|e| Box::new(e) as _) + .context(MetaClientFailure)?; + self.table_manager.add_table( + resp.shard_id, + resp.schema_name, + resp.name, + resp.schema_id, + resp.id, + )?; + Ok(resp.id) + } + } + + async fn drop_table(&self, schema_name: String, table_name: String) -> Result<()> { + let _resp = self + .meta_client + .drop_table(DropTableRequest { + schema_name: schema_name.clone(), + name: table_name.clone(), + }) + .await + .map_err(|e| Box::new(e) as _) + .context(MetaClientFailure)?; + self.table_manager.drop_table(schema_name, table_name); + Ok(()) + } +} diff --git a/cluster/src/table_manager.rs b/cluster/src/table_manager.rs new file mode 100644 index 0000000000..738df85db6 --- /dev/null +++ b/cluster/src/table_manager.rs @@ -0,0 +1,163 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +use std::{ + collections::{BTreeMap, HashMap}, + sync::RwLock, +}; + +use meta_client_v2::{SchemaId, ShardId, ShardInfo, ShardTables, TableId, TableInfo}; + +use super::Result; +use crate::ShardNotFound; + +struct SchemaInfo { + name: String, + id: SchemaId, +} + +pub struct TableManager { + inner: RwLock, +} + +impl TableManager { + pub fn new() -> Self { + Self { + inner: RwLock::new(TableManagerInner { + shards_info: Vec::new(), + schemas_info: HashMap::new(), + tables: BTreeMap::new(), + }), + } + } + + pub fn get_shards_info(&self) -> Vec { + self.inner.read().unwrap().get_shards_info() + } + + pub fn add_table( + &self, + shard_id: ShardId, + schema_name: String, + table_name: String, + schema_id: SchemaId, + table_id: TableId, + ) -> Result<()> { + self.inner.write().unwrap().add_table( + shard_id, + schema_name, + table_name, + schema_id, + table_id, + ) + } + + pub fn drop_table(&self, schema_name: String, table_name: String) { + self.inner + .write() + .unwrap() + .drop_table(schema_name, table_name) + } + + pub fn update_table_info(&self, shard_table: HashMap) { + self.inner.write().unwrap().update_table_info(shard_table) + } + + pub fn get_schema_id(&self, schema_name: &str) -> Option { + self.inner.read().unwrap().get_schema_id(schema_name) + } + + pub fn get_table_id(&self, schema_name: &str, table_name: &str) -> Option { + self.inner + .read() + .unwrap() + .get_table_id(schema_name, table_name) + } +} + +struct TableManagerInner { + shards_info: Vec, + schemas_info: HashMap, + // schema_name -> table_name -> (shard_info, table_info) + tables: BTreeMap>, +} + +impl TableManagerInner { + fn get_shards_info(&self) -> Vec { + self.shards_info.clone() + } + + fn update_table_info(&mut self, shard_table: HashMap) { + for (shard_id, shard_tables) in shard_table { + let shard_info = ShardInfo { + shard_id, + role: shard_tables.role, + }; + for table in shard_tables.tables { + self.schemas_info + .entry(table.schema_name.clone()) + .or_insert(SchemaInfo { 
+ name: table.schema_name.clone(), + id: table.schema_id, + }); + self.tables + .entry(table.schema_name.clone()) + .or_insert_with(BTreeMap::new) + .insert(table.name.clone(), (shard_info.clone(), table)); + } + } + } + + fn add_table( + &mut self, + shard_id: ShardId, + schema_name: String, + table_name: String, + schema_id: SchemaId, + table_id: TableId, + ) -> Result<()> { + let mut shard_info = None; + for shard in &self.shards_info { + if shard.shard_id == shard_id { + shard_info = Some(shard.clone()); + break; + } + } + match shard_info { + None => ShardNotFound { shard_id }.fail(), + Some(v) => { + self.tables + .entry(schema_name.clone()) + .or_insert_with(BTreeMap::new) + .insert( + table_name.clone(), + ( + v, + TableInfo { + id: table_id, + name: table_name, + schema_id, + schema_name, + }, + ), + ); + Ok(()) + } + } + } + + fn drop_table(&mut self, schema_name: String, table_name: String) { + self.tables + .get_mut(&schema_name) + .map(|v| v.remove(&table_name)); + } + + fn get_schema_id(&self, schema_name: &str) -> Option { + self.schemas_info.get(schema_name).map(|v| v.id) + } + + fn get_table_id(&self, schema_name: &str, table_name: &str) -> Option { + self.tables + .get(schema_name) + .and_then(|schema| schema.get(table_name).map(|v| v.1.id)) + } +} diff --git a/cluster/src/util.rs b/cluster/src/util.rs new file mode 100644 index 0000000000..e69de29bb2 diff --git a/common_types/Cargo.toml b/common_types/Cargo.toml new file mode 100644 index 0000000000..1bb477e3f3 --- /dev/null +++ b/common_types/Cargo.toml @@ -0,0 +1,25 @@ +[package] +name = "common_types" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[features] +test = [] + +[dependencies] +# In alphabetical order +arrow_deps = { path = "../arrow_deps" } +byteorder = "1.2" +bytes = { path = "../components/bytes" } +chrono = "0.4" +murmur3 = "0.4.1" +paste = "1.0" +proto = { 
path = "../proto" } +snafu = { version ="0.6.10", features = ["backtraces"]} +# TODO(yingwen): Make sqlparser support a feature +sqlparser = "0.13.0" +serde = "1.0.81" +serde_derive = "1.0.81" diff --git a/common_types/src/bytes.rs b/common_types/src/bytes.rs new file mode 100644 index 0000000000..5a545d7b14 --- /dev/null +++ b/common_types/src/bytes.rs @@ -0,0 +1,5 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Bytes type. + +pub use bytes::*; diff --git a/common_types/src/column.rs b/common_types/src/column.rs new file mode 100644 index 0000000000..44908687bd --- /dev/null +++ b/common_types/src/column.rs @@ -0,0 +1,868 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Column +use std::sync::Arc; + +use arrow_deps::arrow::array::{ + Array, ArrayBuilder, ArrayRef, BinaryArray, BinaryBuilder, BooleanArray, BooleanBuilder, + Float32Array as FloatArray, Float32Builder as FloatBuilder, Float64Array as DoubleArray, + Float64Builder as DoubleBuilder, Int16Array, Int16Builder, Int32Array, Int32Builder, + Int64Array, Int64Builder, Int8Array, Int8Builder, NullArray, StringArray, StringBuilder, + TimestampMillisecondArray, TimestampMillisecondBuilder, UInt16Array, UInt16Builder, + UInt32Array, UInt32Builder, UInt64Array, UInt64Builder, UInt8Array, UInt8Builder, +}; +use paste::paste; +use snafu::{Backtrace, OptionExt, ResultExt, Snafu}; + +use crate::{ + bytes::Bytes, + datum::{Datum, DatumKind, DatumView}, + string::StringBytes, + time::Timestamp, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "Invalid array type, datum_kind:{:?}, data_type:{:?}.\nBacktrace:\n{}", + datum_kind, + data_type, + backtrace + ))] + InvalidArrayType { + datum_kind: DatumKind, + data_type: arrow_deps::arrow::datatypes::DataType, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to append value, err:{}.\nBacktrace:\n{}", source, backtrace))] + Append { + source: arrow_deps::arrow::error::ArrowError, + 
backtrace: Backtrace, + }, + + #[snafu(display( + "Data type conflict, expect:{:?}, given:{:?}.\nBacktrace:\n{}", + expect, + given, + backtrace + ))] + ConflictType { + expect: DatumKind, + given: DatumKind, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to convert arrow data type, data_type:{}.\nBacktrace:\n{}", + data_type, + backtrace + ))] + UnsupportedArray { + data_type: arrow_deps::arrow::datatypes::DataType, + backtrace: Backtrace, + }, +} + +pub type Result = std::result::Result; + +#[derive(Debug)] +pub struct NullColumn(NullArray); + +impl NullColumn { + fn new_null(rows: usize) -> Self { + Self(NullArray::new(rows)) + } + + /// Only the first datum of NullColumn is considered not duplicated. + #[inline] + pub fn dedup(&self, selected: &mut [bool]) { + if !self.0.is_empty() { + selected[0] = true; + } + } +} + +// TODO(yingwen): Builder for columns. + +macro_rules! define_numeric_column { + ($($Kind: ident), *) => { + $(paste! { + #[derive(Debug)] + pub struct [<$Kind Column>]([<$Kind Array>]); + + #[inline] + fn [](array: &[<$Kind Array>], index: usize) -> Datum { + let value = array.value(index); + Datum::$Kind(value) + } + + #[inline] + fn [](array: &[<$Kind Array>], index: usize) -> DatumView { + let value = array.value(index); + DatumView::$Kind(value) + } + })* + } +} + +define_numeric_column!( + Float, Double, UInt64, UInt32, UInt16, UInt8, Int64, Int32, Int16, Int8, Boolean +); + +#[derive(Debug)] +pub struct TimestampColumn(TimestampMillisecondArray); + +#[derive(Debug)] +pub struct VarbinaryColumn(BinaryArray); + +#[derive(Debug)] +pub struct StringColumn(StringArray); + +#[inline] +fn get_null_datum_view(_array: &NullArray, _index: usize) -> DatumView { + DatumView::Null +} + +#[inline] +fn get_timestamp_datum_view(array: &TimestampMillisecondArray, index: usize) -> DatumView { + let value = array.value(index); + DatumView::Timestamp(Timestamp::new(value)) +} + +#[inline] +fn get_varbinary_datum_view(array: &BinaryArray, index: 
usize) -> DatumView { + let value = array.value(index); + DatumView::Varbinary(value) +} + +#[inline] +fn get_string_datum_view(array: &StringArray, index: usize) -> DatumView { + let value = array.value(index); + DatumView::String(value) +} + +#[inline] +fn get_null_datum(_array: &NullArray, _index: usize) -> Datum { + Datum::Null +} + +#[inline] +fn get_timestamp_datum(array: &TimestampMillisecondArray, index: usize) -> Datum { + let value = array.value(index); + Datum::Timestamp(Timestamp::new(value)) +} + +// TODO(yingwen): Avoid clone of data. +// Require a clone. +#[inline] +fn get_varbinary_datum(array: &BinaryArray, index: usize) -> Datum { + let value = array.value(index); + Datum::Varbinary(Bytes::copy_from_slice(value)) +} + +// TODO(yingwen): Avoid clone of data. +// Require a clone. +#[inline] +fn get_string_datum(array: &StringArray, index: usize) -> Datum { + let value = array.value(index); + Datum::String(StringBytes::copy_from_str(value)) +} + +macro_rules! impl_column { + ($Column: ident, $get_datum: expr, $get_datum_view: expr) => { + impl $Column { + /// Get datum by index. + pub fn datum_opt(&self, index: usize) -> Option { + // Do bound check. + if index >= self.0.len() { + return None; + } + + Some(self.datum(index)) + } + + pub fn datum_view(&self, index: usize) -> DatumView { + // If this datum is null. + if self.0.is_null(index) { + return DatumView::Null; + } + + $get_datum_view(&self.0, index) + } + + pub fn datum(&self, index: usize) -> Datum { + // If this datum is null. + if self.0.is_null(index) { + return Datum::Null; + } + + $get_datum(&self.0, index) + } + + #[inline] + pub fn num_rows(&self) -> usize { + self.0.len() + } + + #[inline] + pub fn is_empty(&self) -> bool { + self.num_rows() == 0 + } + } + }; +} + +macro_rules! impl_dedup { + ($Column: ident) => { + impl $Column { + /// If datum i is not equal to previous datum i - 1, mark `selected[i]` to + /// true. + /// + /// The first datum is marked to true. 
+ /// + /// The size of selected must equal to the size of this column and + /// initialized to false. + #[allow(clippy::float_cmp)] + pub fn dedup(&self, selected: &mut [bool]) { + if self.0.is_empty() { + return; + } + + selected[0] = true; + for i in 1..self.0.len() { + let current = self.0.value(i); + let prev = self.0.value(i - 1); + + if current != prev { + selected[i] = true; + } + } + } + } + }; +} + +macro_rules! impl_new_null { + ($Column: ident, $Builder: ident) => { + impl $Column { + /// Create a column that all values are null. + fn new_null(num_rows: usize) -> Result { + let mut builder = $Builder::new(num_rows); + for _ in 0..num_rows { + builder.append_null().context(Append)?; + } + let array = builder.finish(); + + Ok(Self(array)) + } + } + }; +} + +macro_rules! impl_from_array_and_slice { + ($Column: ident, $ArrayType: ident) => { + impl From<$ArrayType> for $Column { + fn from(array: $ArrayType) -> Self { + Self(array) + } + } + + impl From<&$ArrayType> for $Column { + fn from(array_ref: &$ArrayType) -> Self { + // We need to clone the [arrow_deps::arrow::array::ArrayData], which clones + // the underlying vector of [arrow_deps::arrow::buffer::Buffer] and Bitmap (also + // holds a Buffer), thus require some allocation. However, the Buffer is + // managed by Arc, so cloning the buffer is not too expensive. + let array_data = array_ref.data().clone(); + let array = $ArrayType::from(array_data); + + Self(array) + } + } + + impl $Column { + fn to_arrow_array(&self) -> $ArrayType { + // Clone the array data. + let array_data = self.0.data().clone(); + $ArrayType::from(array_data) + } + + /// Returns a zero-copy slice of this array with the indicated offset and + /// length. + /// + /// Panics if offset with length is greater than column length. + fn slice(&self, offset: usize, length: usize) -> Self { + let array_slice = self.0.slice(offset, length); + // Clone the slice data. 
+ let array_data = array_slice.data().clone(); + let array = $ArrayType::from(array_data); + + Self(array) + } + } + }; +} + +macro_rules! impl_iter { + ($Column: ident, $Value: ident) => { + impl $Column { + /// Iter column values. + pub fn iter(&self) -> impl Iterator> + '_ { + self.0.iter() + } + } + }; +} + +macro_rules! impl_iter_map { + ($Column: ident, $Value: ident) => { + impl $Column { + /// Iter column values. + pub fn iter(&self) -> impl Iterator> + '_ { + self.0.iter().map(|v| v.map($Value::from)) + } + } + }; +} + +impl_column!(NullColumn, get_null_datum, get_null_datum_view); +impl_column!( + TimestampColumn, + get_timestamp_datum, + get_timestamp_datum_view +); +impl_column!( + VarbinaryColumn, + get_varbinary_datum, + get_varbinary_datum_view +); +impl_column!(StringColumn, get_string_datum, get_string_datum_view); + +impl_new_null!(TimestampColumn, TimestampMillisecondBuilder); +impl_new_null!(VarbinaryColumn, BinaryBuilder); +impl_new_null!(StringColumn, StringBuilder); + +impl_from_array_and_slice!(NullColumn, NullArray); +impl_from_array_and_slice!(TimestampColumn, TimestampMillisecondArray); +impl_from_array_and_slice!(VarbinaryColumn, BinaryArray); +impl_from_array_and_slice!(StringColumn, StringArray); + +impl_iter_map!(TimestampColumn, Timestamp); + +impl_dedup!(TimestampColumn); +impl_dedup!(VarbinaryColumn); +impl_dedup!(StringColumn); + +macro_rules! impl_numeric_column { + ($(($Kind: ident, $type: ty)), *) => { + $( + paste! { + impl_column!([<$Kind Column>], [], []); + impl_from_array_and_slice!([<$Kind Column>], [<$Kind Array>]); + impl_new_null!([<$Kind Column>], [<$Kind Builder>]); + impl_iter!([<$Kind Column>], $type); + impl_dedup!([<$Kind Column>]); + } + )* + } +} + +impl_numeric_column!( + (Double, f64), + (Float, f32), + (UInt64, u64), + (UInt32, u32), + (UInt16, u16), + (UInt8, u8), + (Int64, i64), + (Int32, i32), + (Int16, i16), + (Int8, i8), + (Boolean, bool) +); + +macro_rules! 
impl_numeric_value { + ($Column: ident, $Value: ident) => { + impl $Column { + /// Get value at index. + pub fn value(&self, index: usize) -> Option<$Value> { + if self.0.is_valid(index) { + unsafe { Some(self.0.value_unchecked(index)) } + } else { + None + } + } + } + }; +} + +macro_rules! batch_impl_numeric_value { + ($(($Kind: ident, $type: ty)), *) => { + $( + paste! { + impl_numeric_value!([<$Kind Column>], $type); + } + )* + } +} + +batch_impl_numeric_value!( + (Timestamp, i64), + (Double, f64), + (Float, f32), + (UInt64, u64), + (UInt32, u32), + (UInt16, u16), + (UInt8, u8), + (Int64, i64), + (Int32, i32), + (Int16, i16), + (Int8, i8), + (Boolean, bool) +); + +impl VarbinaryColumn { + pub fn iter(&self) -> impl Iterator> + '_ { + self.0.iter() + } + + pub fn value(&self, index: usize) -> Option<&[u8]> { + if self.0.is_valid(index) { + unsafe { Some(self.0.value_unchecked(index)) } + } else { + None + } + } +} + +impl StringColumn { + pub fn iter(&self) -> impl Iterator> + '_ { + self.0.iter() + } + + pub fn value(&self, index: usize) -> Option<&str> { + if self.0.is_valid(index) { + unsafe { Some(self.0.value_unchecked(index)) } + } else { + None + } + } +} + +macro_rules! impl_column_block { + ($($Kind: ident), *) => { + impl ColumnBlock { + pub fn datum_kind(&self) -> DatumKind { + match self { + $(ColumnBlock::$Kind(_) => DatumKind::$Kind,)* + } + } + + pub fn datum_opt(&self, index: usize) -> Option { + match self { + $(ColumnBlock::$Kind(col) => col.datum_opt(index),)* + } + } + + /// Panic if index is out fo bound. + pub fn datum_view(&self, index: usize) -> DatumView { + match self { + $(ColumnBlock::$Kind(col) => col.datum_view(index),)* + } + } + + /// Panic if index is out fo bound. 
+ pub fn datum(&self, index: usize) -> Datum { + match self { + $(ColumnBlock::$Kind(col) => col.datum(index),)* + } + } + + pub fn num_rows(&self) -> usize { + match self { + $(ColumnBlock::$Kind(col) => col.num_rows(),)* + } + } + + pub fn to_arrow_array_ref(&self) -> ArrayRef { + match self { + $(ColumnBlock::$Kind(col) => Arc::new(col.to_arrow_array()),)* + } + } + + /// If datum i is not equal to previous datum i - 1, mark `selected[i]` to true. + /// + /// The first datum is not marked to true. + pub fn dedup(&self, selected: &mut [bool]) { + match self { + $(ColumnBlock::$Kind(col) => col.dedup(selected),)* + } + } + + /// Returns a zero-copy slice of this array with the indicated offset and length. + /// + /// Panics if offset with length is greater than column length. + #[must_use] + pub fn slice(&self, offset: usize, length: usize) -> Self { + match self { + $(ColumnBlock::$Kind(col) => ColumnBlock::$Kind(col.slice(offset, length)),)* + } + } + } + + $(paste! { + impl From<[<$Kind Column>]> for ColumnBlock { + fn from(column: [<$Kind Column>]) -> Self { + Self::$Kind(column) + } + } + })* + }; +} + +// TODO(yingwen): We can add a unsafe function that don't do bound check. + +macro_rules! define_column_block { + ($($Kind: ident), *) => { + paste! 
{ + #[derive(Debug)] + pub enum ColumnBlock { + Null(NullColumn), + $( + $Kind([<$Kind Column>]), + )* + } + + impl ColumnBlock { + pub fn try_from_arrow_array_ref(datum_kind: &DatumKind, array: &ArrayRef) -> Result { + let column = match datum_kind { + DatumKind::Null => ColumnBlock::Null(NullColumn::new_null(array.len())), + $( + DatumKind::$Kind => { + let column = cast_array(datum_kind, array)?; + ColumnBlock::$Kind([<$Kind Column>]::from(column)) + } + )* + }; + Ok(column) + } + + pub fn new_null_with_type(kind: &DatumKind, rows: usize) -> Result { + let block = match kind { + DatumKind::Null => ColumnBlock::Null(NullColumn::new_null(rows)), + $( + DatumKind::$Kind => ColumnBlock::$Kind([<$Kind Column>]::new_null(rows)?), + )* + }; + + Ok(block) + } + } + } + } +} + +// Define column blocks, Null is defined explicitly in macro. +define_column_block!( + Timestamp, Double, Float, Varbinary, String, UInt64, UInt32, UInt16, UInt8, Int64, Int32, + Int16, Int8, Boolean +); + +impl ColumnBlock { + pub fn try_cast_arrow_array_ref(array: &ArrayRef) -> Result { + let datum_kind = + DatumKind::from_data_type(array.data_type()).with_context(|| UnsupportedArray { + data_type: array.data_type().clone(), + })?; + + Self::try_from_arrow_array_ref(&datum_kind, array) + } + + pub fn new_null(rows: usize) -> Self { + Self::Null(NullColumn::new_null(rows)) + } + + pub fn as_timestamp(&self) -> Option<&TimestampColumn> { + match self { + ColumnBlock::Timestamp(c) => Some(c), + _ => None, + } + } +} + +impl_column_block!( + Null, Timestamp, Double, Float, Varbinary, String, UInt64, UInt32, UInt16, UInt8, Int64, Int32, + Int16, Int8, Boolean +); + +fn cast_array<'a, T: 'static>(datum_kind: &DatumKind, array: &'a ArrayRef) -> Result<&'a T> { + array + .as_any() + .downcast_ref::() + .with_context(|| InvalidArrayType { + datum_kind: *datum_kind, + data_type: array.data_type().clone(), + }) +} + +macro_rules! 
append_datum { + ($Kind: ident, $builder: ident, $DatumType: ident, $datum: ident) => { + match $datum { + $DatumType::Null => $builder.append_null().context(Append), + $DatumType::$Kind(v) => $builder.append_value(v).context(Append), + _ => ConflictType { + expect: DatumKind::$Kind, + given: $datum.kind(), + } + .fail(), + } + }; +} + +macro_rules! append_datum_into { + ($Kind: ident, $builder: ident, $DatumType: ident, $datum: ident) => { + match $datum { + $DatumType::Null => $builder.append_null().context(Append), + $DatumType::$Kind(v) => $builder.append_value(v.into()).context(Append), + _ => ConflictType { + expect: DatumKind::$Kind, + given: $datum.kind(), + } + .fail(), + } + }; +} + +macro_rules! append_block { + ($Kind: ident, $builder: ident, $BlockType: ident, $block: ident, $start: ident, $len: ident) => { + match $block { + $BlockType::Null(v) => { + let end = std::cmp::min($start + $len, v.num_rows()); + for _ in $start..end { + $builder.append_null().context(Append)?; + } + Ok(()) + } + $BlockType::$Kind(v) => { + // There is no convenient api to copy a range of data from array to builder, so + // we still need to clone value one by one using a for loop. + let end = std::cmp::min($start + $len, v.num_rows()); + for i in $start..end { + let value_opt = v.value(i); + match value_opt { + Some(value) => { + $builder.append_value(value).context(Append)?; + } + None => { + $builder.append_null().context(Append)?; + } + } + } + Ok(()) + } + _ => ConflictType { + expect: DatumKind::$Kind, + given: $block.datum_kind(), + } + .fail(), + } + }; +} + +macro_rules! define_column_block_builder { + ($(($Kind: ident, $Builder: ident)), *) => { + paste! 
{ + #[derive(Debug)] + pub enum ColumnBlockBuilder { + Null { rows: usize }, + Timestamp(TimestampMillisecondBuilder), + $( + $Kind($Builder), + )* + } + + impl ColumnBlockBuilder { + /// Create by data type with initial capacity + pub fn with_capacity(data_type: &DatumKind, capacity: usize) -> Self { + match data_type { + DatumKind::Null => Self::Null { rows: 0 }, + DatumKind::Timestamp => Self::Timestamp(TimestampMillisecondBuilder::new(capacity)), + $( + DatumKind::$Kind => Self::$Kind($Builder::new(capacity)), + )* + } + } + + /// Append the datum into the builder, the datum should have same the data + /// type of builder + pub fn append(&mut self, datum: Datum) -> Result<()> { + let given = datum.kind(); + match self { + Self::Null { rows } => match datum { + Datum::Null => { + *rows += 1; + Ok(()) + } + _ => ConflictType { + expect: DatumKind::Null, + given, + } + .fail(), + }, + Self::Timestamp(builder) => append_datum_into!(Timestamp, builder, Datum, datum), + $( + Self::$Kind(builder) => append_datum!($Kind, builder, Datum, datum), + )* + } + } + + /// Append the [DatumView] into the builder, the datum view should have same the data + /// type of builder + pub fn append_view<'a>(&mut self, datum: DatumView<'a>) -> Result<()> { + let given = datum.kind(); + match self { + Self::Null { rows } => match datum { + DatumView::Null => { + *rows += 1; + Ok(()) + } + _ => ConflictType { + expect: DatumKind::Null, + given, + } + .fail(), + }, + Self::Timestamp(builder) => append_datum_into!(Timestamp, builder, DatumView, datum), + $( + Self::$Kind(builder) => append_datum!($Kind, builder, DatumView, datum), + )* + } + } + + /// Append rows in [start..start + len) from `block` to the builder. + /// + /// Returns rows actually appended. 
+ pub fn append_block_range(&mut self, block: &ColumnBlock, start: usize, len: usize) -> Result<()> { + match self { + Self::Null { rows } => { + if start + len >= block.num_rows() { + *rows += block.num_rows() - start; + } else { + *rows += len; + } + Ok(()) + }, + Self::Timestamp(builder) => append_block!(Timestamp, builder, ColumnBlock, block, start, len), + $( + Self::$Kind(builder) => append_block!($Kind, builder, ColumnBlock, block, start, len), + )* + } + } + + pub fn len(&self) -> usize { + match &self { + Self::Null { rows } => *rows, + Self::Timestamp(builder) => builder.len(), + $( + Self::$Kind(builder) => builder.len(), + )* + } + } + + // Build and reset the builder. + pub fn build(&mut self) -> ColumnBlock { + match self { + Self::Null { rows } => { + let block = ColumnBlock::new_null(*rows); + *rows = 0; + block + } + Self::Timestamp(builder) => TimestampColumn::from(builder.finish()).into(), + $( + Self::$Kind(builder) => [<$Kind Column>]::from(builder.finish()).into(), + )* + } + } + } + } + } +} + +// Define column block builders, Null and Timestamp are defined explicitly in +// macro. +define_column_block_builder!( + (Double, DoubleBuilder), + (Float, FloatBuilder), + (Varbinary, BinaryBuilder), + (String, StringBuilder), + (UInt64, UInt64Builder), + (UInt32, UInt32Builder), + (UInt16, UInt16Builder), + (UInt8, UInt8Builder), + (Int64, Int64Builder), + (Int32, Int32Builder), + (Int16, Int16Builder), + (Int8, Int8Builder), + (Boolean, BooleanBuilder) +); + +impl ColumnBlockBuilder { + /// Create by data type + pub fn new(data_type: &DatumKind) -> Self { + Self::with_capacity(data_type, 0) + } + + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Clear the builder by calling `build()` and drop the built result. 
+ pub fn clear(&mut self) { + let _ = self.build(); + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::tests::{build_rows, build_schema}; + + #[test] + fn test_column_block_builder() { + let schema = build_schema(); + let rows = build_rows(); + // DatumKind::Varbinary + let column = schema.column(0); + let mut builder = ColumnBlockBuilder::with_capacity(&column.data_type, 2); + + // append + builder.append(rows[0][0].clone()).unwrap(); + let ret = builder.append(rows[0][1].clone()); + assert!(ret.is_err()); + + // append_view + builder.append_view(rows[1][0].as_view()).unwrap(); + let ret = builder.append_view(rows[0][1].as_view()); + assert!(ret.is_err()); + + let column_block = builder.build(); + assert_eq!(column_block.num_rows(), 2); + let mut builder = ColumnBlockBuilder::with_capacity(&column.data_type, 2); + + // append_block_range + builder.append_block_range(&column_block, 0, 1).unwrap(); + builder.append_block_range(&column_block, 1, 1).unwrap(); + + let column_block = builder.build(); + assert_eq!(column_block.num_rows(), 2); + assert_eq!( + column_block.datum(0), + Datum::Varbinary(Bytes::copy_from_slice(b"binary key")) + ); + assert_eq!( + column_block.datum(1), + Datum::Varbinary(Bytes::copy_from_slice(b"binary key1")) + ); + } +} diff --git a/common_types/src/column_schema.rs b/common_types/src/column_schema.rs new file mode 100644 index 0000000000..eecf4303eb --- /dev/null +++ b/common_types/src/column_schema.rs @@ -0,0 +1,477 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Schema of column + +use std::{collections::BTreeMap, convert::TryFrom, str::FromStr}; + +use arrow_deps::arrow::datatypes::{DataType, Field}; +use proto::common as common_pb; +use snafu::{ensure, Backtrace, OptionExt, ResultExt, Snafu}; + +use crate::datum::DatumKind; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "Unsupported arrow data type, type:{}.\nBacktrace:\n{}", + data_type, + backtrace + ))] + UnsupportedDataType { + data_type: DataType, + backtrace: Backtrace, + }, + + #[snafu(display("Invalid tag type:{}.\nBacktrace:\n{}", data_type, backtrace))] + InvalidTagType { + data_type: DataType, + backtrace: Backtrace, + }, + + #[snafu(display( + "Arrow field meta data is missing, field name:{}.\nBacktrace:\n{}", + field_name, + backtrace + ))] + ArrowFieldMetaDataMissing { + field_name: String, + backtrace: Backtrace, + }, + + #[snafu(display( + "Arrow field meta key is not found, key:{:?}.\nBacktrace:\n{}", + key, + backtrace + ))] + ArrowFieldMetaKeyNotFound { + key: ArrowFieldMetaKey, + backtrace: Backtrace, + }, + + #[snafu(display( + "Arrow field meta value is invalid, key:{:?}, raw_value:{}, err:{}.\nBacktrace:\n{}", + key, + raw_value, + source, + backtrace + ))] + InvalidArrowFieldMetaValue { + key: ArrowFieldMetaKey, + raw_value: String, + source: Box, + backtrace: Backtrace, + }, +} + +pub type Result = std::result::Result; + +/// Error of compatibility check +#[derive(Debug, Snafu)] +pub enum CompatError { + #[snafu(display( + "Incompatible data type of column, name:{}, expect:{:?}, given:{:?}.\nBacktrace:\n{}", + name, + expect, + given, + backtrace, + ))] + IncompatDataType { + name: String, + expect: DatumKind, + given: DatumKind, + backtrace: Backtrace, + }, + + #[snafu(display("Column is not nullable, name:{}.\nBacktrace:\n{}", name, backtrace))] + NotNullable { name: String, backtrace: Backtrace }, +} + +/// Id of column +pub type ColumnId = u32; + +/// A ColumnId used to indicate that the column id is uninitialized +pub 
const COLUMN_ID_UNINIT: ColumnId = 0; + +/// Read operation of a column +#[derive(Debug)] +pub enum ReadOp { + /// Use the column exactly + Exact, + /// Fill the column by null + FillNull, +} + +/// Meta data of the arrow field. +#[derive(Clone, Debug, Default)] +struct ArrowFieldMeta { + id: u32, + is_tag: bool, + comment: String, +} + +#[derive(Copy, Clone, Debug)] +pub enum ArrowFieldMetaKey { + Id, + IsTag, + Comment, +} + +impl ArrowFieldMetaKey { + fn as_str(&self) -> &str { + match self { + ArrowFieldMetaKey::Id => "field::id", + ArrowFieldMetaKey::IsTag => "field::is_tag", + ArrowFieldMetaKey::Comment => "field::comment", + } + } +} + +impl ToString for ArrowFieldMetaKey { + fn to_string(&self) -> String { + self.as_str().to_string() + } +} + +/// Schema of column +#[derive(Debug, Clone, PartialEq)] +pub struct ColumnSchema { + /// Id of column + pub id: ColumnId, + /// Column name + pub name: String, + /// Data type of the column + pub data_type: DatumKind, + /// Is nullable + pub is_nullable: bool, + /// Is tag, tag is just a hint for a column, there is no restriction that a + /// tag column must be a part of primary key + pub is_tag: bool, + /// Comment of the column + pub comment: String, +} + +impl ColumnSchema { + /// Check whether a type is valid tag type. 
+ pub fn is_valid_tag_type(typ: DatumKind) -> bool { + match typ { + DatumKind::Null => false, + DatumKind::Timestamp => true, + DatumKind::Double => false, + DatumKind::Float => false, + DatumKind::Varbinary => true, + DatumKind::String => true, + DatumKind::UInt64 => true, + DatumKind::UInt32 => true, + DatumKind::UInt16 => true, + DatumKind::UInt8 => true, + DatumKind::Int64 => true, + DatumKind::Int32 => true, + DatumKind::Int16 => true, + DatumKind::Int8 => true, + DatumKind::Boolean => true, + } + } + + /// Convert `self` to [proto::common::ColumnSchema] + /// + /// The `is_key` is needed because it is maintained by + /// [crate::schema::Schema] + pub fn to_pb(&self) -> common_pb::ColumnSchema { + let mut column_schema = common_pb::ColumnSchema::new(); + column_schema.set_name(self.name.clone()); + column_schema.set_data_type(self.data_type.into()); + column_schema.set_is_nullable(self.is_nullable); + column_schema.set_id(self.id); + column_schema.set_is_tag(self.is_tag); + column_schema.set_comment(self.comment.clone()); + + column_schema + } + + /// Convert `self` to [arrow_deps::arrow::datatypes::Field] + pub fn to_arrow_field(&self) -> Field { + From::from(self) + } + + /// Returns Ok if column with `writer_schema` can write to column with the + /// same schema as `self`. 
+ pub fn compatible_for_write( + &self, + writer_schema: &ColumnSchema, + ) -> std::result::Result<(), CompatError> { + ensure!( + self.data_type == writer_schema.data_type, + IncompatDataType { + name: &self.name, + expect: writer_schema.data_type, + given: self.data_type, + } + ); + + // This column is not nullable but writer is nullable + ensure!( + self.is_nullable || !writer_schema.is_nullable, + NotNullable { name: &self.name } + ); + + Ok(()) + } + + /// Returns `Ok` if the source schema can read by this schema, now we won't + /// validate data type of column + pub fn compatible_for_read( + &self, + source_schema: &ColumnSchema, + ) -> std::result::Result { + if self.is_nullable { + // Column is nullable + if self.id == source_schema.id { + // Same column + Ok(ReadOp::Exact) + } else { + // Not the same column, maybe dropped, fill by null. + Ok(ReadOp::FillNull) + } + } else { + // Column is not null. We consider the old column was dropped if they have + // different column id and also try to fill by null, so we + // also check column id. + ensure!( + self.id == source_schema.id && !source_schema.is_nullable, + NotNullable { + name: &source_schema.name, + } + ); + + Ok(ReadOp::Exact) + } + } +} + +impl From for ColumnSchema { + fn from(column_schema: common_pb::ColumnSchema) -> Self { + Self { + id: column_schema.id, + name: column_schema.name, + data_type: DatumKind::from(column_schema.data_type), + is_nullable: column_schema.is_nullable, + is_tag: column_schema.is_tag, + comment: column_schema.comment, + } + } +} + +impl TryFrom<&Field> for ColumnSchema { + type Error = Error; + + fn try_from(field: &Field) -> Result { + let meta_data = field.metadata().as_ref(); + let ArrowFieldMeta { + id, + is_tag, + comment, + } = if let Some(meta_data) = meta_data { + decode_arrow_field_meta_data(meta_data)? 
+ } else { + // FIXME(xikai): Now we have to tolerate the decoding failure because of the bug + // of datafusion (fixed by: https://github.com/apache/arrow-datafusion/commit/1448d9752ab3a38f02732274f91136a6a6ad3db4). + // (The bug may cause the meta data of the field meta lost duration plan + // execution.) + ArrowFieldMeta::default() + }; + Ok(Self { + id, + name: field.name().clone(), + data_type: DatumKind::from_data_type(field.data_type()).context( + UnsupportedDataType { + data_type: field.data_type().clone(), + }, + )?, + is_nullable: field.is_nullable(), + is_tag, + comment, + }) + } +} + +impl From<&ColumnSchema> for Field { + fn from(col_schema: &ColumnSchema) -> Self { + let metadata = encode_arrow_field_meta_data(col_schema); + let mut field = Field::new( + &col_schema.name, + col_schema.data_type.into(), + col_schema.is_nullable, + ); + field.set_metadata(Some(metadata)); + + field + } +} + +fn parse_arrow_field_meta_value( + meta: &BTreeMap, + key: ArrowFieldMetaKey, +) -> Result +where + T: FromStr, + T::Err: std::error::Error + Send + Sync + 'static, +{ + let raw_value = meta + .get(key.as_str()) + .context(ArrowFieldMetaKeyNotFound { key })?; + T::from_str(raw_value.as_str()) + .map_err(|e| Box::new(e) as _) + .context(InvalidArrowFieldMetaValue { key, raw_value }) +} + +fn decode_arrow_field_meta_data(meta: &BTreeMap) -> Result { + Ok(ArrowFieldMeta { + id: parse_arrow_field_meta_value(meta, ArrowFieldMetaKey::Id)?, + is_tag: parse_arrow_field_meta_value(meta, ArrowFieldMetaKey::IsTag)?, + comment: parse_arrow_field_meta_value(meta, ArrowFieldMetaKey::Comment)?, + }) +} + +fn encode_arrow_field_meta_data(col_schema: &ColumnSchema) -> BTreeMap { + let mut meta = BTreeMap::new(); + + meta.insert(ArrowFieldMetaKey::Id.to_string(), col_schema.id.to_string()); + meta.insert( + ArrowFieldMetaKey::IsTag.to_string(), + col_schema.is_tag.to_string(), + ); + meta.insert( + ArrowFieldMetaKey::Comment.to_string(), + col_schema.comment.clone(), + ); + + meta +} 
+ +/// ColumnSchema builder +#[must_use] +pub struct Builder { + id: ColumnId, + name: String, + data_type: DatumKind, + is_nullable: bool, + is_tag: bool, + comment: String, +} + +impl Builder { + /// Create a new builder + pub fn new(name: String, data_type: DatumKind) -> Self { + Self { + id: COLUMN_ID_UNINIT, + name, + data_type, + is_nullable: false, + is_tag: false, + comment: String::new(), + } + } + + pub fn id(mut self, id: ColumnId) -> Self { + self.id = id; + self + } + + /// Set this column is nullable, default is true (not nullable). + pub fn is_nullable(mut self, is_nullable: bool) -> Self { + self.is_nullable = is_nullable; + self + } + + /// Set this column is tag, default is false (not a tag). + pub fn is_tag(mut self, is_tag: bool) -> Self { + self.is_tag = is_tag; + self + } + + pub fn comment(mut self, comment: String) -> Self { + self.comment = comment; + self + } + + pub fn validate(&self) -> Result<()> { + if self.is_tag { + ensure!( + ColumnSchema::is_valid_tag_type(self.data_type), + InvalidTagType { + data_type: self.data_type + } + ); + } + + Ok(()) + } + + pub fn build(self) -> Result { + self.validate()?; + + Ok(ColumnSchema { + id: self.id, + name: self.name, + data_type: self.data_type, + is_nullable: self.is_nullable, + is_tag: self.is_tag, + comment: self.comment, + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + /// Create a column schema for test, each field is filled with non-default + /// value + fn new_test_column_schema() -> ColumnSchema { + Builder::new("test_column_schema".to_string(), DatumKind::Boolean) + .id(18) + .is_nullable(true) + .is_tag(true) + .comment("Comment of this column".to_string()) + .build() + .expect("should succeed to build column schema") + } + + #[test] + fn test_builder() { + let lhs = new_test_column_schema(); + let rhs = ColumnSchema { + id: 18, + name: "test_column_schema".to_string(), + data_type: DatumKind::Boolean, + is_nullable: true, + is_tag: true, + comment: "Comment of this 
column".to_string(), + }; + + assert_eq!(&lhs, &rhs); + } + + #[test] + fn test_pb_convert() { + let column_schema = new_test_column_schema(); + let pb_schema = column_schema.to_pb(); + // Check pb specific fields + assert!(pb_schema.is_tag); + + let schema_from_pb = ColumnSchema::from(pb_schema); + assert_eq!(&schema_from_pb, &column_schema); + } + + #[test] + fn test_valid_tag_type() { + let invalid_tag_types = vec![DatumKind::Null, DatumKind::Float, DatumKind::Double]; + + for v in &DatumKind::VALUES { + assert_eq!( + ColumnSchema::is_valid_tag_type(*v), + !invalid_tag_types.contains(v) + ); + } + } +} diff --git a/common_types/src/datum.rs b/common_types/src/datum.rs new file mode 100644 index 0000000000..4ae6a8124b --- /dev/null +++ b/common_types/src/datum.rs @@ -0,0 +1,887 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Datum holds different kind of data + +use std::{convert::TryFrom, fmt, str}; + +use arrow_deps::{ + arrow::datatypes::{DataType, TimeUnit}, + datafusion::scalar::ScalarValue, +}; +use chrono::{Local, TimeZone}; +use proto::common::DataType as DataTypePb; +use serde::ser::{Serialize, Serializer}; +use snafu::{Backtrace, ResultExt, Snafu}; +use sqlparser::ast::{DataType as SqlDataType, Value}; + +use crate::{bytes::Bytes, hash::hash64, string::StringBytes, time::Timestamp}; +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "Unsupported SQL data type, type:{}.\nBacktrace:\n{}", + sql_type, + backtrace + ))] + UnsupportedDataType { + sql_type: SqlDataType, + backtrace: Backtrace, + }, + + #[snafu(display("Invalid double or float, err:{}.\nBacktrace:\n{}", source, backtrace))] + InvalidDouble { + source: std::num::ParseFloatError, + backtrace: Backtrace, + }, + + #[snafu(display( + "Invalid insert value, kind:{}, value:{:?}.\nBacktrace:\n{}", + kind, + value, + backtrace + ))] + InvalidValueType { + kind: DatumKind, + value: Value, + backtrace: Backtrace, + }, + #[snafu(display("Invalid timestamp, 
err:{}.\nBacktrace:\n{}", source, backtrace))] + InvalidTimestamp { + source: std::num::ParseIntError, + backtrace: Backtrace, + }, + + #[snafu(display("Invalid integer, err:{}.\nBacktrace:\n{}", source, backtrace))] + InvalidInt { + source: std::num::ParseIntError, + backtrace: Backtrace, + }, + + #[snafu(display("Invalid datum byte, byte:{}.\nBacktrace:\n{}", value, backtrace))] + InvalidDatumByte { value: u8, backtrace: Backtrace }, +} + +pub type Result = std::result::Result; + +// FIXME(yingwen): How to handle timezone? + +/// Data type of datum +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum DatumKind { + Null = 0, + Timestamp, + Double, + Float, + Varbinary, + String, + UInt64, + UInt32, + UInt16, + UInt8, + Int64, + Int32, + Int16, + Int8, + // DatumKind::Boolean as usize = 14 + Boolean, +} + +impl DatumKind { + pub const VALUES: [Self; 15] = [ + Self::Null, + Self::Timestamp, + Self::Double, + Self::Float, + Self::Varbinary, + Self::String, + Self::UInt64, + Self::UInt32, + Self::UInt16, + Self::UInt8, + Self::Int64, + Self::Int32, + Self::Int16, + Self::Int8, + Self::Boolean, + ]; + + /// Return true if this is DatumKind::Timestamp + pub fn is_timestamp(&self) -> bool { + matches!(self, DatumKind::Timestamp) + } + + pub fn is_f64_castable(&self) -> bool { + matches!( + self, + Self::Double + | Self::Float + | Self::UInt64 + | Self::UInt32 + | Self::UInt16 + | Self::UInt8 + | Self::Int64 + | Self::Int32 + | Self::Int16 + | Self::Int8 + ) + } + + /// Can column of this datum kind used as key column + pub fn is_key_kind(&self) -> bool { + matches!( + self, + DatumKind::Timestamp + | DatumKind::Varbinary + | DatumKind::String + | DatumKind::UInt64 + | DatumKind::UInt32 + | DatumKind::UInt16 + | DatumKind::UInt8 + | DatumKind::Int64 + | DatumKind::Int32 + | DatumKind::Int16 + | DatumKind::Int8 + | DatumKind::Boolean + ) + } + + pub fn unsign_kind(&self) -> Option { + match self { + Self::Int64 | Self::UInt64 => Some(Self::UInt64), + Self::Int32 | 
Self::UInt32 => Some(Self::UInt32), + Self::Int16 | Self::UInt16 => Some(Self::UInt16), + Self::Int8 | Self::UInt8 => Some(Self::UInt8), + _ => None, + } + } + + /// Create DatumKind from [arrow_deps::arrow::datatypes::DataType], if the + /// type is not supported, returns None + pub fn from_data_type(data_type: &DataType) -> Option { + match data_type { + DataType::Null => Some(Self::Null), + DataType::Timestamp(TimeUnit::Millisecond, None) => Some(Self::Timestamp), + DataType::Float64 => Some(Self::Double), + DataType::Float32 => Some(Self::Float), + DataType::Binary => Some(Self::Varbinary), + DataType::Utf8 => Some(Self::String), + DataType::UInt64 => Some(Self::UInt64), + DataType::UInt32 => Some(Self::UInt32), + DataType::UInt16 => Some(Self::UInt16), + DataType::UInt8 => Some(Self::UInt8), + DataType::Int64 => Some(Self::Int64), + DataType::Int32 => Some(Self::Int32), + DataType::Int16 => Some(Self::Int16), + DataType::Int8 => Some(Self::Int8), + DataType::Boolean => Some(Self::Boolean), + DataType::Float16 + | DataType::LargeUtf8 + | DataType::LargeBinary + | DataType::FixedSizeBinary(_) + | DataType::Struct(_) + | DataType::Union(_, _) + | DataType::List(_) + | DataType::LargeList(_) + | DataType::FixedSizeList(_, _) + | DataType::Time32(_) + | DataType::Time64(_) + | DataType::Timestamp(_, _) + | DataType::Date32 + | DataType::Date64 + | DataType::Interval(_) + | DataType::Duration(_) + | DataType::Dictionary(_, _) + | DataType::Decimal(_, _) + | DataType::Map(_, _) => None, + } + } + + /// Get name of this kind. 
+ fn as_str(&self) -> &str { + match self { + DatumKind::Null => "null", + DatumKind::Timestamp => "timestamp", + DatumKind::Double => "double", + DatumKind::Float => "float", + DatumKind::Varbinary => "varbinary", + DatumKind::String => "string", + DatumKind::UInt64 => "uint64", + DatumKind::UInt32 => "uint32", + DatumKind::UInt16 => "uint16", + DatumKind::UInt8 => "uint8", + DatumKind::Int64 => "bigint", + DatumKind::Int32 => "int", + DatumKind::Int16 => "smallint", + DatumKind::Int8 => "tinyint", + DatumKind::Boolean => "boolean", + } + } + + /// Convert into a byte. + #[inline] + pub fn into_u8(self) -> u8 { + self as u8 + } +} + +impl From for DataType { + fn from(kind: DatumKind) -> Self { + match kind { + DatumKind::Null => DataType::Null, + DatumKind::Timestamp => DataType::Timestamp(TimeUnit::Millisecond, None), + DatumKind::Double => DataType::Float64, + DatumKind::Float => DataType::Float32, + DatumKind::Varbinary => DataType::Binary, + DatumKind::String => DataType::Utf8, + DatumKind::UInt64 => DataType::UInt64, + DatumKind::UInt32 => DataType::UInt32, + DatumKind::UInt16 => DataType::UInt16, + DatumKind::UInt8 => DataType::UInt8, + DatumKind::Int64 => DataType::Int64, + DatumKind::Int32 => DataType::Int32, + DatumKind::Int16 => DataType::Int16, + DatumKind::Int8 => DataType::Int8, + DatumKind::Boolean => DataType::Boolean, + } + } +} + +impl TryFrom<&SqlDataType> for DatumKind { + type Error = Error; + + fn try_from(sql_type: &SqlDataType) -> Result { + match sql_type { + // TODO(yingwen): Consider timezone + SqlDataType::Timestamp => Ok(Self::Timestamp), + SqlDataType::Real | SqlDataType::Float(_) => Ok(Self::Float), + SqlDataType::Double => Ok(Self::Double), + SqlDataType::Boolean => Ok(Self::Boolean), + SqlDataType::BigInt(_) => Ok(Self::Int64), + SqlDataType::Int(_) => Ok(Self::Int32), + SqlDataType::SmallInt(_) => Ok(Self::Int16), + SqlDataType::String => Ok(Self::String), + SqlDataType::Custom(objects) if objects.0.len() == 1 => { + match 
objects.0[0].value.as_str() { + "UINT64" | "uint64" => Ok(Self::UInt64), + "UINT32" | "uint32" => Ok(Self::UInt32), + "UINT16" | "uint16" => Ok(Self::UInt16), + "UINT8" | "uint8" => Ok(Self::UInt8), + "INT64" | "int64" => Ok(Self::Int64), + "INT32" | "int32" => Ok(Self::Int32), + "INT16" | "int16" => Ok(Self::Int16), + "TINYINT" | "INT8" | "tinyint" | "int8" => Ok(Self::Int8), + "VARBINARY" | "varbinary" => Ok(Self::Varbinary), + _ => UnsupportedDataType { + sql_type: sql_type.clone(), + } + .fail(), + } + } + + // Unlike datafusion, Decimal is not supported now + _ => UnsupportedDataType { + sql_type: sql_type.clone(), + } + .fail(), + } + } +} + +impl TryFrom for DatumKind { + type Error = Error; + + fn try_from(v: u8) -> Result { + match v { + v if DatumKind::Null.into_u8() == v => Ok(DatumKind::Null), + v if DatumKind::Timestamp.into_u8() == v => Ok(DatumKind::Timestamp), + v if DatumKind::Double.into_u8() == v => Ok(DatumKind::Double), + v if DatumKind::Float.into_u8() == v => Ok(DatumKind::Float), + v if DatumKind::Varbinary.into_u8() == v => Ok(DatumKind::Varbinary), + v if DatumKind::String.into_u8() == v => Ok(DatumKind::String), + v if DatumKind::UInt64.into_u8() == v => Ok(DatumKind::UInt64), + v if DatumKind::UInt32.into_u8() == v => Ok(DatumKind::UInt32), + v if DatumKind::UInt16.into_u8() == v => Ok(DatumKind::UInt16), + v if DatumKind::UInt8.into_u8() == v => Ok(DatumKind::UInt8), + v if DatumKind::Int64.into_u8() == v => Ok(DatumKind::Int64), + v if DatumKind::Int32.into_u8() == v => Ok(DatumKind::Int32), + v if DatumKind::Int16.into_u8() == v => Ok(DatumKind::Int16), + v if DatumKind::Int8.into_u8() == v => Ok(DatumKind::Int8), + v if DatumKind::Boolean.into_u8() == v => Ok(DatumKind::Boolean), + _ => InvalidDatumByte { value: v }.fail(), + } + } +} + +impl From for DataTypePb { + fn from(kind: DatumKind) -> Self { + match kind { + DatumKind::Null => Self::NULL, + DatumKind::Timestamp => Self::TIMESTAMP, + DatumKind::Double => Self::DOUBLE, + 
DatumKind::Float => Self::FLOAT, + DatumKind::Varbinary => Self::VARBINARY, + DatumKind::String => Self::STRING, + DatumKind::UInt64 => Self::UINT64, + DatumKind::UInt32 => Self::UINT32, + DatumKind::UInt16 => Self::UINT16, + DatumKind::UInt8 => Self::UINT8, + DatumKind::Int64 => Self::INT64, + DatumKind::Int32 => Self::INT32, + DatumKind::Int16 => Self::INT16, + DatumKind::Int8 => Self::INT8, + DatumKind::Boolean => Self::BOOL, + } + } +} + +impl From for DatumKind { + fn from(data_type: DataTypePb) -> Self { + match data_type { + DataTypePb::NULL => DatumKind::Null, + DataTypePb::TIMESTAMP => DatumKind::Timestamp, + DataTypePb::DOUBLE => DatumKind::Double, + DataTypePb::FLOAT => DatumKind::Float, + DataTypePb::VARBINARY => DatumKind::Varbinary, + DataTypePb::STRING => DatumKind::String, + DataTypePb::UINT64 => DatumKind::UInt64, + DataTypePb::UINT32 => DatumKind::UInt32, + DataTypePb::UINT16 => DatumKind::UInt16, + DataTypePb::UINT8 => DatumKind::UInt8, + DataTypePb::INT64 => DatumKind::Int64, + DataTypePb::INT32 => DatumKind::Int32, + DataTypePb::INT16 => DatumKind::Int16, + DataTypePb::INT8 => DatumKind::Int8, + DataTypePb::BOOL => DatumKind::Boolean, + } + } +} + +impl fmt::Display for DatumKind { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", self.as_str()) + } +} + +// FIXME(yingwen): Validate the length of string and varbinary. 
+/// A data box holds different kind of data +#[derive(Debug, Clone, PartialEq, PartialOrd)] +pub enum Datum { + Null, + /// Millisecond precision + /// + /// Map to arrow::datatypes::DataType::Timestamp(TimeUnit::Millisecond, + /// None) + Timestamp(Timestamp), + /// Map to arrow::datatypes::DataType::Float64 + Double(f64), + /// Map to arrow::datatypes::DataType::Float32 + Float(f32), + /// Map to arrow::datatypes::DateType::Binary + /// + /// No more than 2G (size of i32) + Varbinary(Bytes), + /// Map to arrow::datatypes::DataType::String + /// + /// No more than 2G (size of i32) + String(StringBytes), + /// Map to arrow::datatypes::DataType::UInt64 + UInt64(u64), + UInt32(u32), + UInt16(u16), + UInt8(u8), + Int64(i64), + Int32(i32), + Int16(i16), + Int8(i8), + Boolean(bool), +} + +impl Datum { + /// Creates an empty datum by given datum kind + pub fn empty(kind: &DatumKind) -> Self { + match kind { + DatumKind::Null => Self::Null, + DatumKind::Timestamp => Self::Timestamp(Timestamp::new(0)), + DatumKind::Double => Self::Double(0.0), + DatumKind::Float => Self::Float(0.0), + DatumKind::Varbinary => Self::Varbinary(Bytes::new()), + DatumKind::String => Self::String(StringBytes::new()), + DatumKind::UInt64 => Self::UInt64(0), + DatumKind::UInt32 => Self::UInt32(0), + DatumKind::UInt16 => Self::UInt16(0), + DatumKind::UInt8 => Self::UInt8(0), + DatumKind::Int64 => Self::Int64(0), + DatumKind::Int32 => Self::Int32(0), + DatumKind::Int16 => Self::Int16(0), + DatumKind::Int8 => Self::Int8(0), + DatumKind::Boolean => Self::Boolean(false), + } + } + + /// Return the kind of datum + pub fn kind(&self) -> DatumKind { + match self { + Datum::Null => DatumKind::Null, + Datum::Timestamp(_) => DatumKind::Timestamp, + Datum::Double(_) => DatumKind::Double, + Datum::Float(_) => DatumKind::Float, + Datum::Varbinary(_) => DatumKind::Varbinary, + Datum::String(_) => DatumKind::String, + Datum::UInt64(_) => DatumKind::UInt64, + Datum::UInt32(_) => DatumKind::UInt32, + 
Datum::UInt16(_) => DatumKind::UInt16, + Datum::UInt8(_) => DatumKind::UInt8, + Datum::Int64(_) => DatumKind::Int64, + Datum::Int32(_) => DatumKind::Int32, + Datum::Int16(_) => DatumKind::Int16, + Datum::Int8(_) => DatumKind::Int8, + Datum::Boolean(_) => DatumKind::Boolean, + } + } + + // TODO: handle error + pub fn convert_to_uint64(&self) -> u64 { + match self { + Datum::Null => 0, + Datum::Timestamp(v) => v.as_i64() as u64, + Datum::Double(v) => *v as u64, + Datum::Float(v) => *v as u64, + Datum::Varbinary(v) => hash64(v), + Datum::String(v) => hash64(v.as_bytes()), + Datum::UInt64(v) => *v, + Datum::UInt32(v) => *v as u64, + Datum::UInt16(v) => *v as u64, + Datum::UInt8(v) => *v as u64, + Datum::Int64(v) => *v as u64, + Datum::Int32(v) => *v as u64, + Datum::Int16(v) => *v as u64, + Datum::Int8(v) => *v as u64, + Datum::Boolean(v) => *v as u64, + } + } + + pub fn is_null(&self) -> bool { + matches!(self, Datum::Null) + } + + /// Cast datum to timestamp. + pub fn as_timestamp(&self) -> Option { + match self { + Datum::Timestamp(v) => Some(*v), + _ => None, + } + } + + /// Cast datum to &str. + pub fn as_str(&self) -> Option<&str> { + match self { + Datum::String(v) => Some(v), + _ => None, + } + } + + /// Cast datum to uint64. + pub fn as_u64(&self) -> Option { + match self { + Datum::UInt64(v) => Some(*v), + Datum::UInt32(v) => Some(*v as u64), + Datum::UInt16(v) => Some(*v as u64), + Datum::UInt8(v) => Some(*v as u64), + Datum::Int64(v) => Some(*v as u64), + Datum::Int32(v) => Some(*v as u64), + Datum::Int16(v) => Some(*v as u64), + Datum::Int8(v) => Some(*v as u64), + Datum::Boolean(v) => Some(*v as u64), + _ => None, + } + } + + /// Cast datum to Bytes. 
+ pub fn as_varbinary(&self) -> Option<&Bytes> { + match self { + Datum::Varbinary(v) => Some(v), + _ => None, + } + } + + pub fn as_f32(&self) -> Option { + match self { + Datum::Float(v) => Some(*v), + _ => None, + } + } + + pub fn as_f64(&self) -> Option { + match self { + Datum::Double(v) => Some(*v), + Datum::Float(v) => Some(*v as f64), + Datum::UInt64(v) => Some(*v as f64), + Datum::UInt32(v) => Some(*v as f64), + Datum::UInt16(v) => Some(*v as f64), + Datum::UInt8(v) => Some(*v as f64), + Datum::Int64(v) => Some(*v as f64), + Datum::Int32(v) => Some(*v as f64), + Datum::Int16(v) => Some(*v as f64), + Datum::Int8(v) => Some(*v as f64), + Datum::Boolean(_) + | Datum::Null + | Datum::Timestamp(_) + | Datum::Varbinary(_) + | Datum::String(_) => None, + } + } + + pub fn display_string(&self) -> String { + match self { + Datum::Null => "null".to_string(), + Datum::Timestamp(v) => Local.timestamp_millis(v.as_i64()).to_rfc3339(), + Datum::Double(v) => v.to_string(), + Datum::Float(v) => v.to_string(), + Datum::Varbinary(v) => format!("{:?}", v), + Datum::String(v) => v.to_string(), + Datum::UInt64(v) => v.to_string(), + Datum::UInt32(v) => v.to_string(), + Datum::UInt16(v) => v.to_string(), + Datum::UInt8(v) => v.to_string(), + Datum::Int64(v) => v.to_string(), + Datum::Int32(v) => v.to_string(), + Datum::Int16(v) => v.to_string(), + Datum::Int8(v) => v.to_string(), + Datum::Boolean(v) => v.to_string(), + } + } + + pub fn try_from_sql_value(kind: &DatumKind, value: Value) -> Result { + match (kind, value) { + (DatumKind::Null, Value::Null) => Ok(Datum::Null), + (DatumKind::Timestamp, Value::Number(n, _long)) => { + let n = n.parse::().context(InvalidTimestamp)?; + Ok(Datum::Timestamp(Timestamp::new(n))) + } + (DatumKind::Double, Value::Number(n, _long)) => { + let n = n.parse::().context(InvalidDouble)?; + Ok(Datum::Double(n)) + } + (DatumKind::Float, Value::Number(n, _long)) => { + let n = n.parse::().context(InvalidDouble)?; + Ok(Datum::Float(n)) + } + // 
TODO(yingwen): Support hex string. + (DatumKind::Varbinary, Value::SingleQuotedString(s)) => { + Ok(Datum::Varbinary(Bytes::from(s))) + } + (DatumKind::String, Value::SingleQuotedString(s)) => { + Ok(Datum::String(StringBytes::from(s))) + } + (DatumKind::UInt64, Value::Number(n, _long)) => { + let n = n.parse::().context(InvalidInt)?; + Ok(Datum::UInt64(n)) + } + (DatumKind::UInt32, Value::Number(n, _long)) => { + let n = n.parse::().context(InvalidInt)?; + Ok(Datum::UInt32(n)) + } + (DatumKind::UInt16, Value::Number(n, _long)) => { + let n = n.parse::().context(InvalidInt)?; + Ok(Datum::UInt16(n)) + } + (DatumKind::UInt8, Value::Number(n, _long)) => { + let n = n.parse::().context(InvalidInt)?; + Ok(Datum::UInt8(n)) + } + (DatumKind::Int64, Value::Number(n, _long)) => { + let n = n.parse::().context(InvalidInt)?; + Ok(Datum::Int64(n)) + } + (DatumKind::Int32, Value::Number(n, _long)) => { + let n = n.parse::().context(InvalidInt)?; + Ok(Datum::Int32(n)) + } + (DatumKind::Int16, Value::Number(n, _long)) => { + let n = n.parse::().context(InvalidInt)?; + Ok(Datum::Int16(n)) + } + (DatumKind::Int8, Value::Number(n, _long)) => { + let n = n.parse::().context(InvalidInt)?; + Ok(Datum::Int8(n)) + } + (DatumKind::Boolean, Value::Boolean(b)) => Ok(Datum::Boolean(b)), + (_, value) => InvalidValueType { kind: *kind, value }.fail(), + } + } + + pub fn as_scalar_value(&self) -> Option { + match self { + Datum::Null => None, + Datum::Timestamp(v) => { + Some(ScalarValue::TimestampMillisecond(Some((*v).as_i64()), None)) + } + Datum::Double(v) => Some(ScalarValue::Float64(Some(*v))), + Datum::Float(v) => Some(ScalarValue::Float32(Some(*v))), + Datum::Varbinary(v) => Some(ScalarValue::Binary(Some(v.to_vec()))), + Datum::String(v) => Some(ScalarValue::Utf8(Some(v.to_string()))), + Datum::UInt64(v) => Some(ScalarValue::UInt64(Some(*v))), + Datum::UInt32(v) => Some(ScalarValue::UInt32(Some(*v))), + Datum::UInt16(v) => Some(ScalarValue::UInt16(Some(*v))), + Datum::UInt8(v) => 
Some(ScalarValue::UInt8(Some(*v))), + Datum::Int64(v) => Some(ScalarValue::Int64(Some(*v))), + Datum::Int32(v) => Some(ScalarValue::Int32(Some(*v))), + Datum::Int16(v) => Some(ScalarValue::Int16(Some(*v))), + Datum::Int8(v) => Some(ScalarValue::Int8(Some(*v))), + Datum::Boolean(v) => Some(ScalarValue::Boolean(Some(*v))), + } + } + + #[cfg(test)] + pub fn as_view(&self) -> DatumView { + match self { + Datum::Null => DatumView::Null, + Datum::Timestamp(v) => DatumView::Timestamp(*v), + Datum::Double(v) => DatumView::Double(*v), + Datum::Float(v) => DatumView::Float(*v), + Datum::Varbinary(v) => DatumView::Varbinary(v), + Datum::String(v) => DatumView::String(v), + Datum::UInt64(v) => DatumView::UInt64(*v), + Datum::UInt32(v) => DatumView::UInt32(*v), + Datum::UInt16(v) => DatumView::UInt16(*v), + Datum::UInt8(v) => DatumView::UInt8(*v), + Datum::Int64(v) => DatumView::Int64(*v), + Datum::Int32(v) => DatumView::Int32(*v), + Datum::Int16(v) => DatumView::Int16(*v), + Datum::Int8(v) => DatumView::Int8(*v), + Datum::Boolean(v) => DatumView::Boolean(*v), + } + } +} + +macro_rules! 
impl_from { + ($Kind: ident, $FromType: ident) => { + impl From<$FromType> for Datum { + fn from(value: $FromType) -> Self { + Self::$Kind(value) + } + } + + impl From> for Datum { + fn from(value_opt: Option<$FromType>) -> Self { + match value_opt { + Some(value) => Self::$Kind(value), + None => Self::Null, + } + } + } + }; +} + +impl_from!(Timestamp, Timestamp); +impl_from!(Double, f64); +impl_from!(Float, f32); +impl_from!(Varbinary, Bytes); +impl_from!(String, StringBytes); +impl_from!(UInt64, u64); +impl_from!(UInt32, u32); +impl_from!(UInt16, u16); +impl_from!(UInt8, u8); +impl_from!(Int64, i64); +impl_from!(Int32, i32); +impl_from!(Int16, i16); +impl_from!(Int8, i8); +impl_from!(Boolean, bool); + +impl From<&str> for Datum { + fn from(value: &str) -> Datum { + Datum::String(StringBytes::copy_from_str(value)) + } +} + +impl From> for Datum { + fn from(value_opt: Option<&str>) -> Datum { + match value_opt { + Some(value) => Datum::String(StringBytes::copy_from_str(value)), + None => Datum::Null, + } + } +} + +impl From<&[u8]> for Datum { + fn from(value: &[u8]) -> Datum { + Datum::Varbinary(Bytes::copy_from_slice(value)) + } +} + +impl From> for Datum { + fn from(value_opt: Option<&[u8]>) -> Datum { + match value_opt { + Some(value) => Datum::Varbinary(Bytes::copy_from_slice(value)), + None => Datum::Null, + } + } +} + +/// impl serde serialize for Datum +impl Serialize for Datum { + fn serialize(&self, serializer: S) -> std::result::Result + where + S: Serializer, + { + match self { + Datum::Null => serializer.serialize_none(), + Datum::Timestamp(v) => serializer.serialize_i64(v.as_i64()), + Datum::Double(v) => serializer.serialize_f64(*v), + Datum::Float(v) => serializer.serialize_f32(*v), + Datum::Varbinary(v) => serializer.serialize_bytes(v), + Datum::String(v) => serializer.serialize_str(v), + Datum::UInt64(v) => serializer.serialize_u64(*v), + Datum::UInt32(v) => serializer.serialize_u32(*v), + Datum::UInt16(v) => serializer.serialize_u16(*v), + 
Datum::UInt8(v) => serializer.serialize_u8(*v), + Datum::Int64(v) => serializer.serialize_i64(*v), + Datum::Int32(v) => serializer.serialize_i32(*v), + Datum::Int16(v) => serializer.serialize_i16(*v), + Datum::Int8(v) => serializer.serialize_i8(*v), + Datum::Boolean(v) => serializer.serialize_bool(*v), + } + } +} + +/// A view to a datum. +/// +/// Holds copy of integer like datum and reference of string like datum. +#[derive(Debug, PartialEq, PartialOrd)] +pub enum DatumView<'a> { + Null, + Timestamp(Timestamp), + Double(f64), + Float(f32), + Varbinary(&'a [u8]), + String(&'a str), + UInt64(u64), + UInt32(u32), + UInt16(u16), + UInt8(u8), + Int64(i64), + Int32(i32), + Int16(i16), + Int8(i8), + Boolean(bool), +} + +impl<'a> DatumView<'a> { + /// Return the kind of datum + pub fn kind(&self) -> DatumKind { + match self { + DatumView::Null => DatumKind::Null, + DatumView::Timestamp(_) => DatumKind::Timestamp, + DatumView::Double(_) => DatumKind::Double, + DatumView::Float(_) => DatumKind::Float, + DatumView::Varbinary(_) => DatumKind::Varbinary, + DatumView::String(_) => DatumKind::String, + DatumView::UInt64(_) => DatumKind::UInt64, + DatumView::UInt32(_) => DatumKind::UInt32, + DatumView::UInt16(_) => DatumKind::UInt16, + DatumView::UInt8(_) => DatumKind::UInt8, + DatumView::Int64(_) => DatumKind::Int64, + DatumView::Int32(_) => DatumKind::Int32, + DatumView::Int16(_) => DatumKind::Int16, + DatumView::Int8(_) => DatumKind::Int8, + DatumView::Boolean(_) => DatumKind::Boolean, + } + } + + pub fn from_scalar_value(val: &'a ScalarValue) -> Option { + match val { + ScalarValue::Boolean(v) => v.map(DatumView::Boolean), + ScalarValue::Float32(v) => v.map(DatumView::Float), + ScalarValue::Float64(v) => v.map(DatumView::Double), + ScalarValue::Int8(v) => v.map(DatumView::Int8), + ScalarValue::Int16(v) => v.map(DatumView::Int16), + ScalarValue::Int32(v) => v.map(DatumView::Int32), + ScalarValue::Int64(v) => v.map(DatumView::Int64), + ScalarValue::UInt8(v) => 
v.map(DatumView::UInt8), + ScalarValue::UInt16(v) => v.map(DatumView::UInt16), + ScalarValue::UInt32(v) => v.map(DatumView::UInt32), + ScalarValue::UInt64(v) => v.map(DatumView::UInt64), + ScalarValue::Utf8(v) | ScalarValue::LargeUtf8(v) => { + v.as_ref().map(|v| DatumView::String(v.as_str())) + } + ScalarValue::Binary(v) | ScalarValue::LargeBinary(v) => { + v.as_ref().map(|v| DatumView::Varbinary(v.as_slice())) + } + ScalarValue::TimestampMillisecond(v, _) => { + v.map(|v| DatumView::Timestamp(Timestamp::new(v))) + } + ScalarValue::List(_, _) + | ScalarValue::Date32(_) + | ScalarValue::Date64(_) + | ScalarValue::TimestampSecond(_, _) + | ScalarValue::TimestampMicrosecond(_, _) + | ScalarValue::TimestampNanosecond(_, _) + | ScalarValue::IntervalYearMonth(_) + | ScalarValue::IntervalDayTime(_) + | ScalarValue::Struct(_, _) + | ScalarValue::Decimal128(_, _, _) => None, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_is_key_kind() { + assert!(!DatumKind::Null.is_key_kind()); + assert!(DatumKind::Timestamp.is_key_kind()); + assert!(!DatumKind::Double.is_key_kind()); + assert!(!DatumKind::Float.is_key_kind()); + assert!(DatumKind::Varbinary.is_key_kind()); + assert!(DatumKind::String.is_key_kind()); + assert!(DatumKind::UInt64.is_key_kind()); + assert!(DatumKind::UInt32.is_key_kind()); + assert!(DatumKind::UInt16.is_key_kind()); + assert!(DatumKind::UInt8.is_key_kind()); + assert!(DatumKind::Int64.is_key_kind()); + assert!(DatumKind::Int32.is_key_kind()); + assert!(DatumKind::Int16.is_key_kind()); + assert!(DatumKind::Int8.is_key_kind()); + assert!(DatumKind::Boolean.is_key_kind()); + } + + #[test] + fn test_unsign_kind() { + assert_eq!(DatumKind::UInt64.unsign_kind(), Some(DatumKind::UInt64)); + assert_eq!(DatumKind::Int64.unsign_kind(), Some(DatumKind::UInt64)); + assert_eq!(DatumKind::UInt32.unsign_kind(), Some(DatumKind::UInt32)); + assert_eq!(DatumKind::Int32.unsign_kind(), Some(DatumKind::UInt32)); + 
assert_eq!(DatumKind::UInt16.unsign_kind(), Some(DatumKind::UInt16)); + assert_eq!(DatumKind::Int16.unsign_kind(), Some(DatumKind::UInt16)); + assert_eq!(DatumKind::UInt8.unsign_kind(), Some(DatumKind::UInt8)); + assert_eq!(DatumKind::Int8.unsign_kind(), Some(DatumKind::UInt8)); + + assert!(DatumKind::Null.unsign_kind().is_none()); + assert!(DatumKind::Timestamp.unsign_kind().is_none()); + assert!(DatumKind::String.unsign_kind().is_none()); + assert!(DatumKind::Boolean.unsign_kind().is_none()); + assert!(DatumKind::Varbinary.unsign_kind().is_none()); + assert!(DatumKind::Double.unsign_kind().is_none()); + assert!(DatumKind::Float.unsign_kind().is_none()); + } + + #[test] + fn test_into_u8() { + assert_eq!(0, DatumKind::Null.into_u8()); + assert_eq!(1, DatumKind::Timestamp.into_u8()); + assert_eq!(2, DatumKind::Double.into_u8()); + assert_eq!(3, DatumKind::Float.into_u8()); + assert_eq!(4, DatumKind::Varbinary.into_u8()); + assert_eq!(5, DatumKind::String.into_u8()); + assert_eq!(6, DatumKind::UInt64.into_u8()); + assert_eq!(7, DatumKind::UInt32.into_u8()); + assert_eq!(8, DatumKind::UInt16.into_u8()); + assert_eq!(9, DatumKind::UInt8.into_u8()); + assert_eq!(10, DatumKind::Int64.into_u8()); + assert_eq!(11, DatumKind::Int32.into_u8()); + assert_eq!(12, DatumKind::Int16.into_u8()); + assert_eq!(13, DatumKind::Int8.into_u8()); + assert_eq!(14, DatumKind::Boolean.into_u8()); + } +} diff --git a/common_types/src/hash.rs b/common_types/src/hash.rs new file mode 100644 index 0000000000..9edc8c69cb --- /dev/null +++ b/common_types/src/hash.rs @@ -0,0 +1,39 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +// custom hash mod +use byteorder::{ByteOrder, LittleEndian}; +use murmur3::murmur3_x64_128; + +pub fn hash64(mut bytes: &[u8]) -> u64 { + let mut out = [0; 16]; + murmur3_x64_128(&mut bytes, 0, &mut out); + // in most cases we run on little endian target + LittleEndian::read_u64(&out[0..8]) +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn empty_hash_test() { + let res1 = hash64(&[]); + let res2 = hash64(&[]); + assert_eq!(res1, res2); + } + + #[test] + fn hash_test() { + let test_bytes_1 = b"cse_engine_hash_mod_test_bytes1".to_vec(); + let test_bytes_2 = b"cse_engine_hash_mod_test_bytes2".to_vec(); + { + // hash64 testing + let res1 = hash64(&test_bytes_1); + let res1_1 = hash64(&test_bytes_1); + assert_eq!(res1, res1_1); + + let res2 = hash64(&test_bytes_2); + assert_ne!(res1, res2); + } + } +} diff --git a/common_types/src/lib.rs b/common_types/src/lib.rs new file mode 100644 index 0000000000..3da29b0a52 --- /dev/null +++ b/common_types/src/lib.rs @@ -0,0 +1,24 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Contains common types + +pub mod bytes; +pub mod column; +pub mod column_schema; +pub mod datum; +pub mod hash; +pub mod projected_schema; +pub mod record_batch; +pub mod request_id; +pub mod row; +pub mod schema; +pub mod string; +pub mod time; + +/// Sequence number +pub type SequenceNumber = u64; +pub const MAX_SEQUENCE_NUMBER: u64 = u64::MAX; +pub const MIN_SEQUENCE_NUMBER: u64 = 0; + +#[cfg(any(test, feature = "test"))] +pub mod tests; diff --git a/common_types/src/projected_schema.rs b/common_types/src/projected_schema.rs new file mode 100644 index 0000000000..8fa17f2848 --- /dev/null +++ b/common_types/src/projected_schema.rs @@ -0,0 +1,292 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Projected schema + +use std::{fmt, sync::Arc}; + +use snafu::{ensure, Backtrace, ResultExt, Snafu}; + +use crate::{ + column_schema::{ColumnSchema, ReadOp}, + datum::Datum, + row::Row, + schema::{ArrowSchemaRef, RecordSchema, RecordSchemaWithKey, Schema}, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "Invalid projection index, index:{}.\nBacktrace:\n{}", + index, + backtrace + ))] + InvalidProjectionIndex { index: usize, backtrace: Backtrace }, + + #[snafu(display("Incompatible column schema for read, err:{}", source))] + IncompatReadColumn { + source: crate::column_schema::CompatError, + }, + + #[snafu(display("Failed to build projected schema, err:{}", source))] + BuildProjectedSchema { source: crate::schema::Error }, + + #[snafu(display( + "Missing not null column for read, name:{}.\nBacktrace:\n{}", + name, + backtrace + ))] + MissingReadColumn { name: String, backtrace: Backtrace }, +} + +pub type Result = std::result::Result; + +#[derive(Debug)] +pub struct RowProjector { + schema_with_key: RecordSchemaWithKey, + source_schema: Schema, + /// The Vec stores the column index in source, and `None` means this column + /// is not in source but required by reader, and need to filled by null. + /// The length of Vec is the same as the number of columns reader intended + /// to read. + source_projection: Vec>, +} + +impl RowProjector { + /// The projected indexes of existed columns in the source schema. + pub fn existed_source_projection(&self) -> Vec { + self.source_projection + .iter() + .filter_map(|index| *index) + .collect() + } + + /// The projected indexes of all columns(existed and not exist) in the + /// source schema. + pub fn source_projection(&self) -> &[Option] { + &self.source_projection + } + + pub fn schema_with_key(&self) -> &RecordSchemaWithKey { + &self.schema_with_key + } + + /// Project the row. + /// + /// REQUIRE: The schema of row is the same as source schema. 
+ pub fn project_row(&self, row: &Row, mut datums_buffer: Vec) -> Row { + assert_eq!(self.source_schema.num_columns(), row.num_columns()); + + datums_buffer.reserve(self.schema_with_key.num_columns()); + + for p in &self.source_projection { + let datum = match p { + Some(index_in_source) => row[*index_in_source].clone(), + None => Datum::Null, + }; + + datums_buffer.push(datum); + } + + Row::from_datums(datums_buffer) + } +} + +#[derive(Clone)] +pub struct ProjectedSchema(Arc); + +impl fmt::Debug for ProjectedSchema { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("ProjectedSchema") + .field("original_schema", &self.0.original_schema) + .field("projection", &self.0.projection) + .finish() + } +} + +impl ProjectedSchema { + pub fn no_projection(schema: Schema) -> Self { + let inner = ProjectedSchemaInner::no_projection(schema); + Self(Arc::new(inner)) + } + + pub fn new(schema: Schema, projection: Option>) -> Result { + let inner = ProjectedSchemaInner::new(schema, projection)?; + Ok(Self(Arc::new(inner))) + } + + pub fn is_all_projection(&self) -> bool { + self.0.is_all_projection() + } + + /// Returns the [RowProjector] to project the rows with source schema to + /// rows with [RecordSchemaWithKey]. + /// + /// REQUIRE: The key columns are the same as this schema. + #[inline] + pub fn try_project_with_key(&self, source_schema: &Schema) -> Result { + self.0.try_project_with_key(source_schema) + } + + // Returns the record schema after projection with key. + pub fn to_record_schema_with_key(&self) -> RecordSchemaWithKey { + self.0.schema_with_key.clone() + } + + pub(crate) fn as_record_schema_with_key(&self) -> &RecordSchemaWithKey { + &self.0.schema_with_key + } + + // Returns the record schema after projection. + pub fn to_record_schema(&self) -> RecordSchema { + self.0.record_schema.clone() + } + + /// Returns the arrow schema after projection. 
+ pub fn to_projected_arrow_schema(&self) -> ArrowSchemaRef { + self.0.record_schema.to_arrow_schema_ref() + } +} + +/// Schema with projection informations +struct ProjectedSchemaInner { + /// The schema before projection that the reader intended to read, may + /// differ from current schema of the table. + original_schema: Schema, + /// Index of the projected columns in `self.schema`, `None` if + /// all columns are needed. + projection: Option>, + + /// The record schema from `self.schema` with key columns after projection. + schema_with_key: RecordSchemaWithKey, + /// The record schema from `self.schema` after projection. + record_schema: RecordSchema, +} + +impl ProjectedSchemaInner { + fn no_projection(schema: Schema) -> Self { + let schema_with_key = schema.to_record_schema_with_key(); + let record_schema = schema.to_record_schema(); + + Self { + original_schema: schema, + projection: None, + schema_with_key, + record_schema, + } + } + + fn new(schema: Schema, projection: Option>) -> Result { + if let Some(p) = &projection { + // Projection is provided, validate the projection is valid. This is necessary + // to avoid panic when creating RecordSchema and + // RecordSchemaWithKey. + if let Some(max_idx) = p.iter().max() { + ensure!( + *max_idx < schema.num_columns(), + InvalidProjectionIndex { index: *max_idx } + ); + } + + let schema_with_key = schema.project_record_schema_with_key(p); + let record_schema = schema.project_record_schema(p); + + Ok(Self { + original_schema: schema, + projection, + schema_with_key, + record_schema, + }) + } else { + Ok(Self::no_projection(schema)) + } + } + + /// Selecting all the columns is the all projection. + fn is_all_projection(&self) -> bool { + self.projection.is_none() + } + + // TODO(yingwen): We can fill missing not null column with default value instead + // of returning error. 
+ fn try_project_with_key(&self, source_schema: &Schema) -> Result { + debug_assert_eq!( + self.schema_with_key.key_columns(), + source_schema.key_columns() + ); + // We consider the two schema is equal if they have same version. + if self.original_schema.version() == source_schema.version() { + debug_assert_eq!(self.original_schema, *source_schema); + } + + let mut source_projection = Vec::with_capacity(self.schema_with_key.num_columns()); + // For each column in `schema_with_key` + for column_schema in self.schema_with_key.columns() { + self.try_project_column(column_schema, source_schema, &mut source_projection)?; + } + + Ok(RowProjector { + schema_with_key: self.schema_with_key.clone(), + source_schema: source_schema.clone(), + source_projection, + }) + } + + fn try_project_column( + &self, + column: &ColumnSchema, + source_schema: &Schema, + source_projection: &mut Vec>, + ) -> Result<()> { + match source_schema.index_of(&column.name) { + Some(source_idx) => { + // Column is in source + if self.original_schema.version() == source_schema.version() { + // Same version, just use that column in source + source_projection.push(Some(source_idx)); + } else { + // Different version, need to check column schema + let source_column = source_schema.column(source_idx); + // TODO(yingwen): Data type is not checked here because we do not support alter + // data type now. + match column + .compatible_for_read(source_column) + .context(IncompatReadColumn)? 
+ { + ReadOp::Exact => { + source_projection.push(Some(source_idx)); + } + ReadOp::FillNull => { + source_projection.push(None); + } + } + } + } + None => { + // Column is not in source + ensure!(column.is_nullable, MissingReadColumn { name: &column.name }); + // Column is nullable, fill this column by null + source_projection.push(None); + } + } + + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use crate::{projected_schema::ProjectedSchema, tests::build_schema}; + + #[test] + fn test_projected_schema() { + let schema = build_schema(); + assert!(schema.num_columns() > 1); + let projection: Vec = (0..schema.num_columns() - 1).collect(); + let projected_schema = ProjectedSchema::new(schema.clone(), Some(projection)).unwrap(); + assert_eq!( + projected_schema.0.schema_with_key.num_columns(), + schema.num_columns() - 1 + ); + assert!(!projected_schema.is_all_projection()); + } +} diff --git a/common_types/src/record_batch.rs b/common_types/src/record_batch.rs new file mode 100644 index 0000000000..1b7ca99d98 --- /dev/null +++ b/common_types/src/record_batch.rs @@ -0,0 +1,695 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Record batch + +use std::{cmp, convert::TryFrom, mem}; + +use arrow_deps::{ + arrow::{ + datatypes::SchemaRef as ArrowSchemaRef, record_batch::RecordBatch as ArrowRecordBatch, + }, + util, +}; +use snafu::{ensure, Backtrace, OptionExt, ResultExt, Snafu}; + +use crate::{ + column::{ColumnBlock, ColumnBlockBuilder}, + datum::DatumKind, + projected_schema::{ProjectedSchema, RowProjector}, + row::{ + contiguous::{ContiguousRow, ProjectedContiguousRow}, + Row, RowViewOnBatch, + }, + schema::{RecordSchema, RecordSchemaWithKey}, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Invalid schema len to build RecordBatch.\nBacktrace:\n{}", backtrace))] + SchemaLen { backtrace: Backtrace }, + + #[snafu(display("Failed to create column block, err:{}", source))] + CreateColumnBlock { source: crate::column::Error }, + + #[snafu(display( + "Failed to create arrow record batch, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + CreateArrow { + source: arrow_deps::arrow::error::ArrowError, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to iterate datum, err:{}", source))] + IterateDatum { source: crate::row::Error }, + + #[snafu(display("Failed to append datum, err:{}", source))] + AppendDatum { source: crate::column::Error }, + + #[snafu(display( + "Column not in schema with key, column_name:{}.\nBacktrace:\n{}", + name, + backtrace + ))] + ColumnNotInSchemaWithKey { name: String, backtrace: Backtrace }, + + #[snafu(display("Failed to convert arrow schema, err:{}", source))] + ConvertArrowSchema { source: crate::schema::Error }, + + #[snafu(display("Mismatch record schema to build RecordBatch, column_name:{}, schema_type:{:?}, column_type:{:?}.\nBacktrace:\n{}", column_name, schema_type, column_type, backtrace))] + MismatchRecordSchema { + column_name: String, + schema_type: DatumKind, + column_type: DatumKind, + backtrace: Backtrace, + }, + + #[snafu(display( + "Projection is out of the index, source_projection:{:?}, 
arrow_schema:{}.\nBacktrace:\n{}", + source_projection, + arrow_schema, + backtrace + ))] + OutOfIndexProjection { + source_projection: Vec>, + arrow_schema: ArrowSchemaRef, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to reverse record batch data, err:{:?}.\nBacktrace:\n{}", + source, + backtrace + ))] + ReverseRecordBatchData { + source: Box, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to select record batch data, err:{:?}.\nBacktrace:\n{}", + source, + backtrace + ))] + SelectRecordBatchData { + source: Box, + backtrace: Backtrace, + }, +} + +pub type Result = std::result::Result; + +#[derive(Debug)] +pub struct RecordBatchData { + arrow_record_batch: ArrowRecordBatch, + column_blocks: Vec, +} + +impl RecordBatchData { + fn new(arrow_schema: ArrowSchemaRef, column_blocks: Vec) -> Result { + let arrays = column_blocks + .iter() + .map(|column| column.to_arrow_array_ref()) + .collect(); + + let arrow_record_batch = + ArrowRecordBatch::try_new(arrow_schema, arrays).context(CreateArrow)?; + + Ok(RecordBatchData { + arrow_record_batch, + column_blocks, + }) + } + + fn num_rows(&self) -> usize { + self.column_blocks + .first() + .map(|column| column.num_rows()) + .unwrap_or(0) + } + + fn take_column_block(&mut self, index: usize) -> ColumnBlock { + let num_rows = self.num_rows(); + mem::replace( + &mut self.column_blocks[index], + ColumnBlock::new_null(num_rows), + ) + } + + /// Returns a zero-copy slice of this array with the indicated offset and + /// length. + /// + /// Panics if offset with length is greater than column length. 
+ fn slice(&self, offset: usize, length: usize) -> Self { + let column_blocks = self + .column_blocks + .iter() + .map(|col| col.slice(offset, length)) + .collect(); + + Self { + arrow_record_batch: self.arrow_record_batch.slice(offset, length), + column_blocks, + } + } +} + +fn build_column_blocks_from_arrow_record_batch( + arrow_record_batch: &ArrowRecordBatch, + record_schema: &RecordSchema, +) -> Result> { + let mut column_blocks = Vec::with_capacity(arrow_record_batch.num_columns()); + for (column_schema, array) in record_schema + .columns() + .iter() + .zip(arrow_record_batch.columns()) + { + let column = ColumnBlock::try_from_arrow_array_ref(&column_schema.data_type, array) + .context(CreateColumnBlock)?; + column_blocks.push(column); + } + + Ok(column_blocks) +} + +impl TryFrom for RecordBatchData { + type Error = Error; + + fn try_from(arrow_record_batch: ArrowRecordBatch) -> Result { + let record_schema = + RecordSchema::try_from(arrow_record_batch.schema()).context(ConvertArrowSchema)?; + let column_blocks = + build_column_blocks_from_arrow_record_batch(&arrow_record_batch, &record_schema)?; + Ok(Self { + arrow_record_batch, + column_blocks, + }) + } +} + +// TODO(yingwen): The schema in RecordBatch should be much simple because it may +// lack some information. +#[derive(Debug)] +pub struct RecordBatch { + schema: RecordSchema, + data: RecordBatchData, +} + +impl RecordBatch { + pub fn new_empty(schema: RecordSchema) -> Self { + let arrow_schema = schema.to_arrow_schema_ref(); + let arrow_record_batch = ArrowRecordBatch::new_empty(arrow_schema); + + Self { + schema, + data: RecordBatchData { + arrow_record_batch, + column_blocks: Vec::new(), + }, + } + } + + pub fn new(schema: RecordSchema, column_blocks: Vec) -> Result { + ensure!(schema.num_columns() == column_blocks.len(), SchemaLen); + + // Validate schema and column_blocks. 
+ for (column_schema, column_block) in schema.columns().iter().zip(column_blocks.iter()) { + ensure!( + column_schema.data_type == column_block.datum_kind(), + MismatchRecordSchema { + column_name: &column_schema.name, + schema_type: column_schema.data_type, + column_type: column_block.datum_kind(), + } + ); + } + + let arrow_schema = schema.to_arrow_schema_ref(); + let data = RecordBatchData::new(arrow_schema, column_blocks)?; + + Ok(Self { schema, data }) + } + + pub fn schema(&self) -> &RecordSchema { + &self.schema + } + + // REQUIRE: index is valid + pub fn column(&self, index: usize) -> &ColumnBlock { + &self.data.column_blocks[index] + } + + pub fn num_columns(&self) -> usize { + self.schema.num_columns() + } + + pub fn num_rows(&self) -> usize { + self.data.num_rows() + } + + pub fn into_arrow_record_batch(self) -> ArrowRecordBatch { + self.data.arrow_record_batch + } +} + +impl TryFrom for RecordBatch { + type Error = Error; + + fn try_from(arrow_record_batch: ArrowRecordBatch) -> Result { + let record_schema = + RecordSchema::try_from(arrow_record_batch.schema()).context(ConvertArrowSchema)?; + + let column_blocks = + build_column_blocks_from_arrow_record_batch(&arrow_record_batch, &record_schema)?; + + Ok(Self { + schema: record_schema, + data: RecordBatchData { + arrow_record_batch, + column_blocks, + }, + }) + } +} + +#[derive(Debug)] +pub struct RecordBatchWithKey { + schema_with_key: RecordSchemaWithKey, + data: RecordBatchData, +} + +impl RecordBatchWithKey { + pub fn num_rows(&self) -> usize { + self.data.num_rows() + } + + pub fn num_columns(&self) -> usize { + self.data.arrow_record_batch.num_columns() + } + + pub fn columns(&self) -> &[ColumnBlock] { + &self.data.column_blocks + } + + pub fn clone_row_at(&self, index: usize) -> Row { + let datums = self + .data + .column_blocks + .iter() + .map(|column_block| column_block.datum(index)) + .collect(); + + Row::from_datums(datums) + } + + /// Project the [RecordBatchWithKey] into a [RecordBatch] 
according to + /// [ProjectedSchema]. + /// + /// REQUIRE: The schema_with_key of the [RecordBatchWithKey] is the same as + /// the schema_with_key of [ProjectedSchema]. + pub fn try_project(mut self, projected_schema: &ProjectedSchema) -> Result { + debug_assert_eq!( + &self.schema_with_key, + projected_schema.as_record_schema_with_key() + ); + + // Get the schema after projection. + let record_schema = projected_schema.to_record_schema(); + let mut column_blocks = Vec::with_capacity(record_schema.num_columns()); + + for column_schema in record_schema.columns() { + let column_index = self.schema_with_key.index_of(&column_schema.name).context( + ColumnNotInSchemaWithKey { + name: &column_schema.name, + }, + )?; + + // Take the column block out. + let column_block = self.data.take_column_block(column_index); + column_blocks.push(column_block); + } + + let data = RecordBatchData::new(record_schema.to_arrow_schema_ref(), column_blocks)?; + + Ok(RecordBatch { + schema: record_schema, + data, + }) + } + + pub fn into_record_batch(self) -> RecordBatch { + RecordBatch { + schema: self.schema_with_key.into_record_schema(), + data: self.data, + } + } + + #[inline] + pub fn schema_with_key(&self) -> &RecordSchemaWithKey { + &self.schema_with_key + } + + #[inline] + pub fn column(&self, index: usize) -> &ColumnBlock { + &self.data.column_blocks[index] + } + + /// Reverse the rows in the data. + /// + /// The data retains intact if failed. + pub fn reverse_data(&mut self) -> Result<()> { + let reversed_record_batch = util::reverse_record_batch(&self.data.arrow_record_batch) + .map_err(|e| Box::new(e) as _) + .context(ReverseRecordBatchData)?; + + self.data = RecordBatchData::try_from(reversed_record_batch) + .map_err(|e| Box::new(e) as _) + .context(ReverseRecordBatchData)?; + + Ok(()) + } + + #[inline] + pub fn is_empty(&self) -> bool { + self.num_rows() == 0 + } + + /// Returns a zero-copy slice of this array with the indicated offset and + /// length. 
+ /// + /// Panics if offset with length is greater than column length. + #[must_use] + pub fn slice(&self, offset: usize, length: usize) -> Self { + Self { + schema_with_key: self.schema_with_key.clone(), + data: self.data.slice(offset, length), + } + } + + /// Select the rows according to the `selected_rows`. + /// + /// The data retains intact if failed. + pub fn select_data(&mut self, selected_rows: &[bool]) -> Result<()> { + assert_eq!(self.num_rows(), selected_rows.len()); + + let selected_record_batch = + util::select_record_batch(&self.data.arrow_record_batch, selected_rows) + .map_err(|e| Box::new(e) as _) + .context(SelectRecordBatchData)?; + let selected_data = RecordBatchData::try_from(selected_record_batch) + .map_err(|e| Box::new(e) as _) + .context(SelectRecordBatchData)?; + + self.data = selected_data; + + Ok(()) + } +} + +pub struct RecordBatchWithKeyBuilder { + schema_with_key: RecordSchemaWithKey, + builders: Vec, +} + +impl RecordBatchWithKeyBuilder { + pub fn new(schema_with_key: RecordSchemaWithKey) -> Self { + let builders = schema_with_key + .columns() + .iter() + .map(|column_schema| ColumnBlockBuilder::with_capacity(&column_schema.data_type, 0)) + .collect(); + Self { + schema_with_key, + builders, + } + } + + pub fn with_capacity(schema_with_key: RecordSchemaWithKey, capacity: usize) -> Self { + let builders = schema_with_key + .columns() + .iter() + .map(|column_schema| { + ColumnBlockBuilder::with_capacity(&column_schema.data_type, capacity) + }) + .collect(); + Self { + schema_with_key, + builders, + } + } + + /// Append row into builder. + /// + /// REQUIRE: The row and the builder must have the same schema. + pub fn append_row(&mut self, row: Row) -> Result<()> { + for (builder, datum) in self.builders.iter_mut().zip(row) { + builder.append(datum).context(AppendDatum)?; + } + + Ok(()) + } + + /// Append projected contiguous row into builder. 
+ /// + /// REQUIRE: + /// - The schema of `row` is the same as the source schema of the + /// `projector`. + /// - The projected schema (with key) is the same as the schema of the + /// builder. + pub fn append_projected_contiguous_row( + &mut self, + row: &ProjectedContiguousRow, + ) -> Result<()> { + assert_eq!(row.num_datum_views(), self.builders.len()); + + for (index, builder) in self.builders.iter_mut().enumerate() { + let datum_view = row.datum_view_at(index); + builder.append_view(datum_view).context(AppendDatum)?; + } + + Ok(()) + } + + /// Append the row from the [RowView] to the builder. + /// + /// REQUIRE: The `row_view` and the builder must have the same schema. + pub fn append_row_view(&mut self, row_view: &RowViewOnBatch) -> Result<()> { + for (builder, datum) in self.builders.iter_mut().zip(row_view.iter_columns()) { + let datum = datum.context(IterateDatum)?; + builder.append(datum).context(AppendDatum)?; + } + + Ok(()) + } + + /// Append `len` from `start` (inclusive) to this builder. + /// + /// REQUIRE: + /// - The `record_batch` and the builder must have the same schema. + pub fn append_batch_range( + &mut self, + record_batch: &RecordBatchWithKey, + start: usize, + len: usize, + ) -> Result { + let num_rows = record_batch.num_rows(); + if start >= num_rows { + return Ok(0); + } + + let added = cmp::min(num_rows - start, len); + + for (builder, column) in self.builders.iter_mut().zip(record_batch.columns().iter()) { + builder + .append_block_range(column, start, added) + .context(AppendDatum)?; + } + + Ok(added) + } + + /// The number of the appended rows. + pub fn len(&self) -> usize { + self.builders + .first() + .map(|builder| builder.len()) + .unwrap_or(0) + } + + /// Returns true if the builder is empty. + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Reset the builders for reuse. 
+ pub fn clear(&mut self) { + for builder in &mut self.builders { + builder.clear(); + } + } + + /// Build [RecordBatchWithKey] and reset the builder. + pub fn build(&mut self) -> Result { + let column_blocks: Vec<_> = self + .builders + .iter_mut() + .map(|builder| builder.build()) + .collect(); + let arrow_schema = self.schema_with_key.to_arrow_schema_ref(); + + Ok(RecordBatchWithKey { + schema_with_key: self.schema_with_key.clone(), + data: RecordBatchData::new(arrow_schema, column_blocks)?, + }) + } +} + +#[derive(Debug)] +pub struct ArrowRecordBatchProjector { + row_projector: RowProjector, +} + +impl From for ArrowRecordBatchProjector { + fn from(row_projector: RowProjector) -> Self { + Self { row_projector } + } +} + +impl ArrowRecordBatchProjector { + /// Project the [arrow::RecordBatch] to [RecordBatchWithKey] and these + /// things is to be done: + /// - Insert the null column if the projected column does not appear in the + /// source schema. + /// - Convert the [arrow::RecordBatch] to [RecordBatchWithKey]. + /// + /// REQUIRE: Schema of the `arrow_record_batch` is the same as the + /// projection of existing column in the source schema. 
+ pub fn project_to_record_batch_with_key( + &self, + arrow_record_batch: ArrowRecordBatch, + ) -> Result { + let schema_with_key = self.row_projector.schema_with_key().clone(); + let source_projection = self.row_projector.source_projection(); + let mut column_blocks = Vec::with_capacity(schema_with_key.num_columns()); + + let num_rows = arrow_record_batch.num_rows(); + // ensure next_arrow_column_idx < num_columns + let mut next_arrow_column_idx = 0; + let num_columns = arrow_record_batch.num_columns(); + + for (source_idx, column_schema) in source_projection.iter().zip(schema_with_key.columns()) { + match source_idx { + Some(_) => { + ensure!( + next_arrow_column_idx < num_columns, + OutOfIndexProjection { + source_projection, + arrow_schema: arrow_record_batch.schema() + } + ); + + let array = arrow_record_batch.column(next_arrow_column_idx); + next_arrow_column_idx += 1; + + let column_block = + ColumnBlock::try_from_arrow_array_ref(&column_schema.data_type, array) + .context(CreateColumnBlock)?; + + column_blocks.push(column_block); + } + None => { + // Need to push row with specific type. 
+ let null_block = + ColumnBlock::new_null_with_type(&column_schema.data_type, num_rows) + .context(CreateColumnBlock)?; + column_blocks.push(null_block); + } + } + } + + let data = RecordBatchData::new(schema_with_key.to_arrow_schema_ref(), column_blocks)?; + + Ok(RecordBatchWithKey { + schema_with_key, + data, + }) + } +} + +#[cfg(test)] +mod tests { + use crate::{ + record_batch::{RecordBatchWithKey, RecordBatchWithKeyBuilder}, + row::RowViewOnBatch, + tests::{ + build_projected_schema, build_record_batch_with_key_by_rows, build_rows, + check_record_batch_with_key_with_rows, + }, + }; + + fn build_record_batch_with_key() -> RecordBatchWithKey { + let rows = build_rows(); + build_record_batch_with_key_by_rows(rows) + } + + fn check_record_batch_with_key( + record_batch_with_key: RecordBatchWithKey, + row_num: usize, + column_num: usize, + ) -> bool { + let rows = build_rows(); + check_record_batch_with_key_with_rows(&record_batch_with_key, row_num, column_num, rows) + } + + #[test] + fn test_append_projected_contiguous_row() { + let record_batch_with_key = build_record_batch_with_key(); + assert_eq!(record_batch_with_key.num_rows(), 5); + assert_eq!(record_batch_with_key.num_columns(), 3); + + check_record_batch_with_key(record_batch_with_key, 5, 3); + } + + #[test] + fn test_append_row_view() { + let projected_schema = build_projected_schema(); + + let record_batch_with_key = build_record_batch_with_key(); + + let mut builder = RecordBatchWithKeyBuilder::with_capacity( + projected_schema.to_record_schema_with_key(), + 2, + ); + let view = RowViewOnBatch { + record_batch: &record_batch_with_key, + row_idx: 1, + }; + builder.append_row_view(&view).unwrap(); + let record_batch_with_key = builder.build().unwrap(); + assert_eq!(record_batch_with_key.num_rows(), 1); + assert_eq!(record_batch_with_key.num_columns(), 3); + + check_record_batch_with_key(record_batch_with_key, 1, 3); + } + + #[test] + fn test_append_batch_range() { + let projected_schema = 
// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0.

//! Request id.

use std::{
    fmt,
    sync::atomic::{AtomicU64, Ordering},
};

/// Process-wide unique identifier attached to a request.
///
/// Ids are handed out sequentially starting from 1; the counter is shared by
/// the whole process.
#[derive(Debug, Clone, Copy)]
pub struct RequestId(u64);

impl RequestId {
    /// Acquire next request id.
    pub fn next_id() -> Self {
        // Monotonic process-wide counter; relaxed ordering is enough because
        // only uniqueness matters, not ordering with other memory operations.
        static NEXT_ID: AtomicU64 = AtomicU64::new(1);

        Self(NEXT_ID.fetch_add(1, Ordering::Relaxed))
    }
}

impl fmt::Display for RequestId {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Render exactly like the underlying integer.
        self.0.fmt(f)
    }
}
+ +use std::{ + convert::{TryFrom, TryInto}, + fmt, mem, + ops::{Deref, DerefMut}, + str, +}; + +use snafu::{ensure, Backtrace, Snafu}; + +use crate::{ + datum::{Datum, DatumKind, DatumView}, + projected_schema::RowProjector, + row::Row, + schema::{IndexInWriterSchema, Schema}, + time::Timestamp, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "String is too long to encode into row (max is {}), len:{}.\nBacktrace:\n{}", + MAX_STRING_LEN, + len, + backtrace + ))] + StringTooLong { len: usize, backtrace: Backtrace }, +} + +pub type Result = std::result::Result; + +/// Size to store the offset of string buffer. +type OffsetSize = usize; + +/// Max allowed string length of datum to store in a contiguous row (16 MB). +const MAX_STRING_LEN: usize = 1024 * 1024 * 16; + +/// Row encoded in a contiguous buffer. +pub trait ContiguousRow { + /// Returns the number of datums. + fn num_datum_views(&self) -> usize; + + /// Returns [DatumView] of column in given index, and returns null if the + /// datum kind is unknown. + /// + /// Panic if index or buffer is out of bound. + fn datum_view_at(&self, index: usize) -> DatumView; +} + +pub struct ContiguousRowReader<'a, T> { + inner: &'a T, + byte_offsets: &'a [usize], + string_buffer_offset: usize, +} + +impl<'a, T> ContiguousRowReader<'a, T> { + pub fn with_schema(inner: &'a T, schema: &'a Schema) -> Self { + Self { + inner, + byte_offsets: schema.byte_offsets(), + string_buffer_offset: schema.string_buffer_offset(), + } + } +} + +impl<'a, T: Deref> ContiguousRow for ContiguousRowReader<'a, T> { + fn num_datum_views(&self) -> usize { + self.byte_offsets.len() + } + + fn datum_view_at(&self, index: usize) -> DatumView<'a> { + let offset = self.byte_offsets[index]; + let buf = &self.inner[offset..]; + + // Get datum kind, if the datum kind is unknown, returns null. 
+ let datum_kind = match DatumKind::try_from(buf[0]) { + Ok(v) => v, + Err(_) => return DatumView::Null, + }; + + // Advance 1 byte to skip the header byte. + let datum_buf = &buf[1..]; + // If no string column in this schema, the string buffer offset should + // equal to the buffer len, and string buf is an empty slice. + let string_buf = &self.inner[self.string_buffer_offset..]; + + must_read_view(&datum_kind, datum_buf, string_buf) + } +} + +/// Contiguous row with projection information. +/// +/// The caller must ensure the source schema of projector is the same as the +/// schema of source row. +pub struct ProjectedContiguousRow<'a, T> { + source_row: T, + projector: &'a RowProjector, +} + +impl<'a, T: ContiguousRow> ProjectedContiguousRow<'a, T> { + pub fn new(source_row: T, projector: &'a RowProjector) -> Self { + Self { + source_row, + projector, + } + } +} + +impl<'a, T: ContiguousRow> ContiguousRow for ProjectedContiguousRow<'a, T> { + fn num_datum_views(&self) -> usize { + self.projector.source_projection().len() + } + + fn datum_view_at(&self, index: usize) -> DatumView { + let p = self.projector.source_projection()[index]; + + match p { + Some(index_in_source) => self.source_row.datum_view_at(index_in_source), + None => DatumView::Null, + } + } +} + +impl<'a, T: ContiguousRow> fmt::Debug for ProjectedContiguousRow<'a, T> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let mut list = f.debug_list(); + for i in 0..self.num_datum_views() { + let view = self.datum_view_at(i); + list.entry(&view); + } + list.finish() + } +} + +/// In memory buffer to hold data of a contiguous row. +pub trait RowBuffer: DerefMut { + /// Clear and resize the buffer size to `new_len` with given `value`. + fn reset(&mut self, new_len: usize, value: u8); + + /// Append slice into the buffer, resize the buffer automatically. + fn append_slice(&mut self, src: &[u8]); +} + +/// A writer to build a contiguous row. 
+pub struct ContiguousRowWriter<'a, T> { + inner: &'a mut T, + /// The schema the row group need to be encoded into, the schema + /// of the row need to be write compatible for the table schema. + table_schema: &'a Schema, + /// The index mapping from table schema to column in the + /// schema of row group. + index_in_writer: &'a IndexInWriterSchema, +} + +// TODO(yingwen): Try to replace usage of row by contiguous row. +impl<'a, T: RowBuffer + 'a> ContiguousRowWriter<'a, T> { + pub fn new( + inner: &'a mut T, + table_schema: &'a Schema, + index_in_writer: &'a IndexInWriterSchema, + ) -> Self { + Self { + inner, + table_schema, + index_in_writer, + } + } + + fn write_datum( + inner: &mut T, + datum: &Datum, + byte_offset: usize, + next_string_offset: &mut usize, + ) -> Result<()> { + let datum_offset = byte_offset + 1; + + match datum { + // Already filled by null, nothing to do. + Datum::Null => (), + Datum::Timestamp(v) => { + Self::write_byte_to_offset(inner, byte_offset, DatumKind::Timestamp.into_u8()); + let value_buf = v.as_i64().to_ne_bytes(); + Self::write_slice_to_offset(inner, datum_offset, &value_buf); + } + Datum::Double(v) => { + Self::write_byte_to_offset(inner, byte_offset, DatumKind::Double.into_u8()); + let value_buf = v.to_ne_bytes(); + Self::write_slice_to_offset(inner, datum_offset, &value_buf); + } + Datum::Float(v) => { + Self::write_byte_to_offset(inner, byte_offset, DatumKind::Float.into_u8()); + let value_buf = v.to_ne_bytes(); + Self::write_slice_to_offset(inner, datum_offset, &value_buf); + } + Datum::Varbinary(v) => { + Self::write_byte_to_offset(inner, byte_offset, DatumKind::Varbinary.into_u8()); + let value_buf = next_string_offset.to_ne_bytes(); + Self::write_slice_to_offset(inner, datum_offset, &value_buf); + // Use u32 to store length of string. 
+ *next_string_offset += mem::size_of::() + v.len(); + + ensure!(v.len() <= MAX_STRING_LEN, StringTooLong { len: v.len() }); + + let string_len = v.len() as u32; + inner.append_slice(&string_len.to_ne_bytes()); + inner.append_slice(v); + } + Datum::String(v) => { + Self::write_byte_to_offset(inner, byte_offset, DatumKind::String.into_u8()); + let value_buf = next_string_offset.to_ne_bytes(); + Self::write_slice_to_offset(inner, datum_offset, &value_buf); + // Use u32 to store length of string. + *next_string_offset += mem::size_of::() + v.len(); + + ensure!(v.len() <= MAX_STRING_LEN, StringTooLong { len: v.len() }); + + let string_len = v.len() as u32; + inner.append_slice(&string_len.to_ne_bytes()); + inner.append_slice(v.as_bytes()); + } + Datum::UInt64(v) => { + Self::write_byte_to_offset(inner, byte_offset, DatumKind::UInt64.into_u8()); + let value_buf = v.to_ne_bytes(); + Self::write_slice_to_offset(inner, datum_offset, &value_buf); + } + Datum::UInt32(v) => { + Self::write_byte_to_offset(inner, byte_offset, DatumKind::UInt32.into_u8()); + let value_buf = v.to_ne_bytes(); + Self::write_slice_to_offset(inner, datum_offset, &value_buf); + } + Datum::UInt16(v) => { + Self::write_byte_to_offset(inner, byte_offset, DatumKind::UInt16.into_u8()); + let value_buf = v.to_ne_bytes(); + Self::write_slice_to_offset(inner, datum_offset, &value_buf); + } + Datum::UInt8(v) => { + Self::write_byte_to_offset(inner, byte_offset, DatumKind::UInt8.into_u8()); + Self::write_slice_to_offset(inner, datum_offset, &[*v]); + } + Datum::Int64(v) => { + Self::write_byte_to_offset(inner, byte_offset, DatumKind::Int64.into_u8()); + let value_buf = v.to_ne_bytes(); + Self::write_slice_to_offset(inner, datum_offset, &value_buf); + } + Datum::Int32(v) => { + Self::write_byte_to_offset(inner, byte_offset, DatumKind::Int32.into_u8()); + let value_buf = v.to_ne_bytes(); + Self::write_slice_to_offset(inner, datum_offset, &value_buf); + } + Datum::Int16(v) => { + Self::write_byte_to_offset(inner, 
byte_offset, DatumKind::Int16.into_u8()); + let value_buf = v.to_ne_bytes(); + Self::write_slice_to_offset(inner, datum_offset, &value_buf); + } + Datum::Int8(v) => { + Self::write_byte_to_offset(inner, byte_offset, DatumKind::Int8.into_u8()); + Self::write_slice_to_offset(inner, datum_offset, &[*v as u8]); + } + Datum::Boolean(v) => { + Self::write_byte_to_offset(inner, byte_offset, DatumKind::Boolean.into_u8()); + Self::write_slice_to_offset(inner, datum_offset, &[*v as u8]); + } + } + + Ok(()) + } + + /// Write a row to the buffer, the buffer will be reset first. + pub fn write_row(&mut self, row: &Row) -> Result<()> { + let datum_buffer_len = self.table_schema.string_buffer_offset(); + // Reset the buffer and fill the buffer by null, now new slice will be + // appended to the string buffer. + self.inner + .reset(datum_buffer_len, DatumKind::Null.into_u8()); + + assert_eq!(row.num_columns(), self.table_schema.num_columns()); + + // Offset to next string in string buffer. + let mut next_string_offset: OffsetSize = 0; + for index_in_table in 0..self.table_schema.num_columns() { + if let Some(writer_index) = self.index_in_writer.column_index_in_writer(index_in_table) + { + let datum = &row[writer_index]; + let byte_offset = self.table_schema.byte_offset(index_in_table); + + // Write datum bytes to the buffer. + Self::write_datum(self.inner, datum, byte_offset, &mut next_string_offset)?; + } + // Column not in row is already filled by null. + } + + Ok(()) + } + + #[inline] + fn write_byte_to_offset(inner: &mut T, offset: usize, value: u8) { + inner[offset] = value; + } + + #[inline] + fn write_slice_to_offset(inner: &mut T, offset: usize, value_buf: &[u8]) { + let dst = &mut inner[offset..offset + value_buf.len()]; + dst.copy_from_slice(value_buf); + } +} + +/// The byte size to encode the datum of this kind in memory. +/// +/// Returns the (datum size + 1) for header. For integer types, the datum +/// size is the memory size of the interger type. 
For string types, the +/// datum size is the memory size to hold the offset. +pub(crate) fn byte_size_of_datum(kind: &DatumKind) -> usize { + let datum_size = match kind { + DatumKind::Null => 1, + DatumKind::Timestamp => mem::size_of::(), + DatumKind::Double => mem::size_of::(), + DatumKind::Float => mem::size_of::(), + // The size of offset. + DatumKind::Varbinary | DatumKind::String => mem::size_of::(), + DatumKind::UInt64 => mem::size_of::(), + DatumKind::UInt32 => mem::size_of::(), + DatumKind::UInt16 => mem::size_of::(), + DatumKind::UInt8 => mem::size_of::(), + DatumKind::Int64 => mem::size_of::(), + DatumKind::Int32 => mem::size_of::(), + DatumKind::Int16 => mem::size_of::(), + DatumKind::Int8 => mem::size_of::(), + DatumKind::Boolean => mem::size_of::(), + }; + + datum_size + 1 +} + +/// Read datum view from given datum buf, and may reference the string in +/// `string_buf`. +/// +/// Panic if out of bound. +/// +/// ## Safety +/// The string in buffer must be valid utf8. +fn must_read_view<'a>( + datum_kind: &DatumKind, + datum_buf: &'a [u8], + string_buf: &'a [u8], +) -> DatumView<'a> { + match datum_kind { + DatumKind::Null => DatumView::Null, + DatumKind::Timestamp => { + let value_buf = datum_buf[..mem::size_of::()].try_into().unwrap(); + let ts = Timestamp::new(i64::from_ne_bytes(value_buf)); + DatumView::Timestamp(ts) + } + DatumKind::Double => { + let value_buf = datum_buf[..mem::size_of::()].try_into().unwrap(); + let v = f64::from_ne_bytes(value_buf); + DatumView::Double(v) + } + DatumKind::Float => { + let value_buf = datum_buf[..mem::size_of::()].try_into().unwrap(); + let v = f32::from_ne_bytes(value_buf); + DatumView::Float(v) + } + DatumKind::Varbinary => { + let bytes = must_read_bytes(datum_buf, string_buf); + DatumView::Varbinary(bytes) + } + DatumKind::String => { + let bytes = must_read_bytes(datum_buf, string_buf); + let v = unsafe { str::from_utf8_unchecked(bytes) }; + DatumView::String(v) + } + DatumKind::UInt64 => { + let value_buf 
= datum_buf[..mem::size_of::()].try_into().unwrap(); + let v = u64::from_ne_bytes(value_buf); + DatumView::UInt64(v) + } + DatumKind::UInt32 => { + let value_buf = datum_buf[..mem::size_of::()].try_into().unwrap(); + let v = u32::from_ne_bytes(value_buf); + DatumView::UInt32(v) + } + DatumKind::UInt16 => { + let value_buf = datum_buf[..mem::size_of::()].try_into().unwrap(); + let v = u16::from_ne_bytes(value_buf); + DatumView::UInt16(v) + } + DatumKind::UInt8 => DatumView::UInt8(datum_buf[0]), + DatumKind::Int64 => { + let value_buf = datum_buf[..mem::size_of::()].try_into().unwrap(); + let v = i64::from_ne_bytes(value_buf); + DatumView::Int64(v) + } + DatumKind::Int32 => { + let value_buf = datum_buf[..mem::size_of::()].try_into().unwrap(); + let v = i32::from_ne_bytes(value_buf); + DatumView::Int32(v) + } + DatumKind::Int16 => { + let value_buf = datum_buf[..mem::size_of::()].try_into().unwrap(); + let v = i16::from_ne_bytes(value_buf); + DatumView::Int16(v) + } + DatumKind::Int8 => DatumView::Int8(datum_buf[0] as i8), + DatumKind::Boolean => DatumView::Boolean(datum_buf[0] != 0), + } +} + +fn must_read_bytes<'a>(datum_buf: &'a [u8], string_buf: &'a [u8]) -> &'a [u8] { + // Read offset of string in string buf. + let value_buf = datum_buf[..mem::size_of::()] + .try_into() + .unwrap(); + let offset = OffsetSize::from_ne_bytes(value_buf); + let string_buf = &string_buf[offset..]; + + // Read len of the string. + let len_buf = string_buf[..mem::size_of::()].try_into().unwrap(); + let string_len = u32::from_ne_bytes(len_buf) as usize; + let string_buf = &string_buf[mem::size_of::()..]; + + // Read string. 
+ &string_buf[..string_len] +} + +impl RowBuffer for Vec { + fn reset(&mut self, new_len: usize, value: u8) { + self.clear(); + + self.resize(new_len, value); + } + + fn append_slice(&mut self, src: &[u8]) { + self.extend_from_slice(src); + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::{ + projected_schema::ProjectedSchema, + tests::{build_rows, build_schema}, + }; + + fn check_contiguous_row(row: &Row, reader: impl ContiguousRow, projection: Option>) { + let range = if let Some(projection) = projection { + projection + } else { + (0..reader.num_datum_views()).collect() + }; + for i in range { + let datum = &row[i]; + let view = reader.datum_view_at(i); + + assert_eq!(datum.as_view(), view); + } + } + + #[test] + fn test_contiguous_read_write() { + let schema = build_schema(); + let rows = build_rows(); + let index_in_writer = IndexInWriterSchema::for_same_schema(schema.num_columns()); + + let mut buf = Vec::new(); + for row in rows { + let mut writer = ContiguousRowWriter::new(&mut buf, &schema, &index_in_writer); + + writer.write_row(&row).unwrap(); + + let reader = ContiguousRowReader::with_schema(&buf, &schema); + check_contiguous_row(&row, reader, None); + } + } + + #[test] + fn test_project_contiguous_read_write() { + let schema = build_schema(); + assert!(schema.num_columns() > 1); + let projection: Vec = (0..schema.num_columns() - 1).collect(); + let projected_schema = + ProjectedSchema::new(schema.clone(), Some(projection.clone())).unwrap(); + let row_projected_schema = projected_schema.try_project_with_key(&schema).unwrap(); + let rows = build_rows(); + let index_in_writer = IndexInWriterSchema::for_same_schema(schema.num_columns()); + + let mut buf = Vec::new(); + for row in rows { + let mut writer = ContiguousRowWriter::new(&mut buf, &schema, &index_in_writer); + + writer.write_row(&row).unwrap(); + + let source_row = ContiguousRowReader::with_schema(&buf, &schema); + let projected_row = ProjectedContiguousRow::new(source_row, 
&row_projected_schema); + check_contiguous_row(&row, projected_row, Some(projection.clone())); + } + } +} diff --git a/common_types/src/row/mod.rs b/common_types/src/row/mod.rs new file mode 100644 index 0000000000..600052cfcc --- /dev/null +++ b/common_types/src/row/mod.rs @@ -0,0 +1,590 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Row type + +use std::{ + cmp, + ops::{Index, IndexMut}, +}; + +use snafu::{ensure, Backtrace, OptionExt, Snafu}; + +use crate::{ + column_schema::ColumnSchema, + datum::{Datum, DatumKind}, + record_batch::RecordBatchWithKey, + schema::{RecordSchemaWithKey, Schema}, + time::Timestamp, +}; + +pub mod contiguous; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "Column out of bound, len:{}, given:{}.\nBacktrace:\n{}", + len, + given, + backtrace + ))] + ColumnOutOfBound { + len: usize, + given: usize, + backtrace: Backtrace, + }, + + #[snafu(display( + "Invalid column num of row, expect:{}, given:{}.\nBacktrace:\n{}", + expect, + given, + backtrace + ))] + InvalidColumnNum { + expect: usize, + given: usize, + backtrace: Backtrace, + }, + + #[snafu(display("Column cannot be null, name:{}.\nBacktrace:\n{}", column, backtrace))] + NullColumn { + column: String, + backtrace: Backtrace, + }, + + #[snafu(display( + "Column type mismatch, name:{}, expect:{:?}, given:{:?}.\nBacktrace:\n{}", + column, + expect, + given, + backtrace + ))] + TypeMismatch { + column: String, + expect: DatumKind, + given: DatumKind, + backtrace: Backtrace, + }, + + #[snafu(display("Missing columns to build row.\nBacktrace:\n{}", backtrace))] + MissingColumns { backtrace: Backtrace }, + + #[snafu(display("Convert column failed, column:{}, err:{}", column, source))] + ConvertColumn { + column: String, + source: crate::datum::Error, + }, + + #[snafu(display("Column in the schema is not found, column_name:{}", column,))] + ColumnNameNotFound { column: String }, + + #[snafu(display( + "Column in the schema is not found, 
column_name:{}.\nBacktrace:\n{}", + column, + backtrace + ))] + ColumnNotFoundInSchema { + column: String, + backtrace: Backtrace, + }, +} + +// Do not depend on common_util crates +pub type Result = std::result::Result; + +// TODO(yingwen): +// - Memory pooling (or Arena) and statistics +// - Custom Debug format +// - Add a type RowWithSchema so we can ensure the row always matches the schema +// - Maybe add a type RowOperation like kudu + +/// Row contains multiple columns, each column is represented by a datum +/// The internal representation of row is not specific +#[derive(Debug, Clone, PartialEq)] +pub struct Row { + cols: Vec, +} + +impl Row { + /// Convert vec of Datum into Row + pub fn from_datums(cols: Vec) -> Self { + Self { cols } + } + + /// Returns the column num + pub fn num_columns(&self) -> usize { + self.cols.len() + } + + /// Iterate all datums + pub fn iter(&self) -> IterDatum { + IterDatum { + iter: self.cols.iter(), + } + } + + /// Get the timestamp column + pub fn timestamp(&self, schema: &Schema) -> Option { + let timestamp_index = schema.timestamp_index(); + + self.cols[timestamp_index].as_timestamp() + } +} + +#[derive(Debug)] +pub struct IterDatum<'a> { + iter: std::slice::Iter<'a, Datum>, +} + +impl<'a> Iterator for IterDatum<'a> { + type Item = &'a Datum; + + fn next(&mut self) -> Option { + self.iter.next() + } +} + +impl Index for Row { + type Output = Datum; + + fn index(&self, index: usize) -> &Self::Output { + &self.cols[index] + } +} + +impl IndexMut for Row { + fn index_mut(&mut self, index: usize) -> &mut Self::Output { + &mut self.cols[index] + } +} + +impl<'a> IntoIterator for &'a Row { + type IntoIter = std::slice::Iter<'a, Datum>; + type Item = &'a Datum; + + fn into_iter(self) -> Self::IntoIter { + self.cols.iter() + } +} + +impl IntoIterator for Row { + type IntoIter = std::vec::IntoIter; + type Item = Datum; + + fn into_iter(self) -> Self::IntoIter { + self.cols.into_iter() + } +} + +/// Check whether the schema of the 
row equals to given `schema` +pub fn check_row_schema(row: &Row, schema: &Schema) -> Result<()> { + ensure!( + schema.num_columns() == row.num_columns(), + InvalidColumnNum { + expect: schema.num_columns(), + given: row.num_columns(), + } + ); + + for (index, datum) in row.iter().enumerate() { + let column = schema.column(index); + check_datum_type(datum, column)?; + } + + Ok(()) +} + +// TODO(yingwen): For multiple rows that share the same schema, no need to store +// Datum for each row element, we can store the whole row as a binary and +// provide more efficent way to convert rows into columns +/// RowGroup +/// +/// The min/max timestamp of an empty RowGroup is 0. +/// +/// Rows in the RowGroup have the same schema. The internal representation of +/// rows is not specific. +#[derive(Debug)] +pub struct RowGroup { + /// Schema of the row group, all rows in the row group should have same + /// schema + schema: Schema, + /// Rows in the row group + rows: Vec, + // TODO(yingwen): Maybe remove min/max timestamp + /// Min timestamp of all the rows + min_timestamp: Timestamp, + /// Max timestamp of all the rows + max_timestamp: Timestamp, +} + +impl RowGroup { + /// Returns true if the row group is empty + #[inline] + pub fn is_empty(&self) -> bool { + self.rows.is_empty() + } + + /// Returns number of rows in the row group + #[inline] + pub fn num_rows(&self) -> usize { + self.rows.len() + } + + /// Returns the idx-th row in the row group + #[inline] + pub fn get_row(&self, idx: usize) -> Option<&Row> { + self.rows.get(idx) + } + + /// Returns the idx-th mutable row in the row group + #[inline] + pub fn get_row_mut(&mut self, idx: usize) -> Option<&mut Row> { + self.rows.get_mut(idx) + } + + /// Iter all datum of the column + /// + /// Will panic if col_index is out of bound + pub fn iter_column(&self, col_index: usize) -> IterCol { + IterCol { + rows: &self.rows, + row_index: 0, + col_index, + } + } + + /// The schema of the row group + #[inline] + pub fn 
schema(&self) -> &Schema { + &self.schema + } + + /// Iter the row group by rows + // TODO(yingwen): Add a iter_with_schema + pub fn iter(&self) -> IterRow { + IterRow { + iter: self.rows.iter(), + } + } + + /// Get the min timestamp of rows + #[inline] + pub fn min_timestamp(&self) -> Timestamp { + self.min_timestamp + } + + /// Get the max timestamp of rows + #[inline] + pub fn max_timestmap(&self) -> Timestamp { + self.max_timestamp + } +} + +impl<'a> IntoIterator for &'a RowGroup { + type IntoIter = std::slice::Iter<'a, Row>; + type Item = &'a Row; + + fn into_iter(self) -> Self::IntoIter { + self.rows.iter() + } +} + +impl IntoIterator for RowGroup { + type IntoIter = std::vec::IntoIter; + type Item = Row; + + fn into_iter(self) -> Self::IntoIter { + self.rows.into_iter() + } +} + +#[derive(Debug)] +pub struct IterRow<'a> { + iter: std::slice::Iter<'a, Row>, +} + +impl<'a> Iterator for IterRow<'a> { + type Item = &'a Row; + + fn next(&mut self) -> Option { + self.iter.next() + } +} + +#[derive(Debug)] +pub struct IterCol<'a> { + rows: &'a Vec, + row_index: usize, + col_index: usize, +} + +impl<'a> Iterator for IterCol<'a> { + type Item = &'a Datum; + + fn next(&mut self) -> Option { + if self.rows.is_empty() { + return None; + } + + if self.row_index >= self.rows.len() { + return None; + } + + let row = &self.rows[self.row_index]; + self.row_index += 1; + + Some(&row[self.col_index]) + } + + fn size_hint(&self) -> (usize, Option) { + let remaining = self.rows.len() - self.row_index; + (remaining, Some(remaining)) + } +} + +/// RowGroup builder +#[derive(Debug)] +pub struct RowGroupBuilder { + schema: Schema, + rows: Vec, + min_timestamp: Option, + max_timestmap: Timestamp, +} + +impl RowGroupBuilder { + /// Create a new builder + pub fn new(schema: Schema) -> Self { + Self::with_capacity(schema, 0) + } + + /// Create a new builder with given capacity + pub fn with_capacity(schema: Schema, capacity: usize) -> Self { + Self { + schema, + rows: 
Vec::with_capacity(capacity), + min_timestamp: None, + max_timestmap: Timestamp::new(0), + } + } + + /// Create a new builder with schema and rows + /// + /// Return error if the `rows` do not matched the `schema` + pub fn with_rows(schema: Schema, rows: Vec) -> Result { + let mut row_group = Self::new(schema); + + // Check schema and update min/max timestamp + for row in &rows { + check_row_schema(row, &row_group.schema)?; + row_group.update_timestamps(row); + } + + row_group.rows = rows; + + Ok(row_group) + } + + /// Add a schema checked row + /// + /// REQUIRE: Caller should ensure the schema of row must equal to the schema + /// of this builder + pub fn push_checked_row(&mut self, row: Row) { + self.update_timestamps(&row); + + self.rows.push(row); + } + + /// Acquire builder to build next row of the row group + pub fn row_builder(&mut self) -> RowBuilder { + RowBuilder { + // schema: &self.schema, + cols: Vec::with_capacity(self.schema.num_columns()), + // rows: &mut self.rows, + group_builder: self, + } + } + + /// Build the row group + pub fn build(self) -> RowGroup { + RowGroup { + schema: self.schema, + rows: self.rows, + min_timestamp: self.min_timestamp.unwrap_or_else(|| Timestamp::new(0)), + max_timestamp: self.max_timestmap, + } + } + + /// Update min/max timestamp of the row group + fn update_timestamps(&mut self, row: &Row) { + // check_row_schema() ensures this datum is a timestamp, so we just unwrap here + let row_timestamp = row.timestamp(&self.schema).unwrap(); + + self.min_timestamp = match self.min_timestamp { + Some(min_timestamp) => Some(cmp::min(min_timestamp, row_timestamp)), + None => Some(row_timestamp), + }; + self.max_timestmap = cmp::max(self.max_timestmap, row_timestamp); + } +} + +/// Check whether the datum kind matches the column schema +pub fn check_datum_type(datum: &Datum, column_schema: &ColumnSchema) -> Result<()> { + // Check null datum + if let Datum::Null = datum { + ensure!( + column_schema.is_nullable, + NullColumn { + 
column: &column_schema.name, + } + ); + } else { + ensure!( + datum.kind() == column_schema.data_type, + TypeMismatch { + column: &column_schema.name, + expect: column_schema.data_type, + given: datum.kind(), + } + ); + } + + Ok(()) +} + +// TODO(yingwen): This builder is used to build RowGroup, need to provide a +// builder to build one row +/// Row builder for the row group +#[derive(Debug)] +pub struct RowBuilder<'a> { + group_builder: &'a mut RowGroupBuilder, + cols: Vec, +} + +impl<'a> RowBuilder<'a> { + /// Append a datum into the row + pub fn append_datum(mut self, datum: Datum) -> Result { + self.check_datum(&datum)?; + + self.cols.push(datum); + + Ok(self) + } + + /// Check whether the datum is valid + fn check_datum(&self, datum: &Datum) -> Result<()> { + let index = self.cols.len(); + let schema = &self.group_builder.schema; + ensure!( + index < schema.num_columns(), + ColumnOutOfBound { + len: schema.num_columns(), + given: index, + } + ); + + let column = schema.column(index); + check_datum_type(datum, column) + } + + /// Finish building this row and append this row into the row group + pub fn finish(self) -> Result<()> { + ensure!( + self.cols.len() == self.group_builder.schema.num_columns(), + MissingColumns + ); + + self.group_builder.push_checked_row(Row { cols: self.cols }); + Ok(()) + } +} + +pub trait RowView { + fn try_get_column_by_name(&self, column_name: &str) -> Result>; + + fn column_by_idx(&self, column_idx: usize) -> Datum; +} + +// TODO(yingwen): Add a method to get row view on RecordBatchWithKey. +/// A row view on the [RecordBatchWithKey]. +/// +/// `row_idx < record_batch.num_rows()` is ensured. 
+#[derive(Debug)] +pub struct RowViewOnBatch<'a> { + pub record_batch: &'a RecordBatchWithKey, + pub row_idx: usize, +} + +impl<'a> RowViewOnBatch<'a> { + pub fn iter_columns(&self) -> RowViewOnBatchColumnIter { + RowViewOnBatchColumnIter { + next_column_idx: 0, + row_idx: self.row_idx, + record_batch: self.record_batch, + } + } +} + +pub struct RowViewOnBatchColumnIter<'a> { + next_column_idx: usize, + row_idx: usize, + record_batch: &'a RecordBatchWithKey, +} + +impl<'a> RowView for RowViewOnBatch<'a> { + fn try_get_column_by_name(&self, column_name: &str) -> Result> { + let column_idx = self + .record_batch + .schema_with_key() + .index_of(column_name) + .context(ColumnNameNotFound { + column: column_name, + })?; + Ok(Some(self.column_by_idx(column_idx))) + } + + #[inline] + fn column_by_idx(&self, column_idx: usize) -> Datum { + let column = self.record_batch.column(column_idx); + column.datum(self.row_idx) + } +} + +impl<'a> Iterator for RowViewOnBatchColumnIter<'a> { + type Item = Result; + + fn next(&mut self) -> Option { + if self.next_column_idx >= self.record_batch.num_columns() { + return None; + } + + let curr_column_idx = self.next_column_idx; + let column = self.record_batch.column(curr_column_idx); + let datum = column.datum_opt(self.row_idx).map(Ok); + + self.next_column_idx += 1; + + datum + } +} + +#[derive(Debug, Clone)] +pub struct RowWithMeta<'a> { + pub row: &'a Row, + pub schema: &'a RecordSchemaWithKey, +} + +impl<'a> RowView for RowWithMeta<'a> { + fn try_get_column_by_name(&self, column_name: &str) -> Result> { + let idx = self + .schema + .index_of(column_name) + .context(ColumnNotFoundInSchema { + column: column_name, + })?; + Ok(Some(self.column_by_idx(idx))) + } + + #[inline] + fn column_by_idx(&self, column_idx: usize) -> Datum { + self.row.cols[column_idx].clone() + } +} diff --git a/common_types/src/schema.rs b/common_types/src/schema.rs new file mode 100644 index 0000000000..4172886057 --- /dev/null +++ b/common_types/src/schema.rs 
@@ -0,0 +1,1554 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Schema of table + +use std::{ + cmp::{self, Ordering}, + collections::{HashMap, HashSet}, + convert::TryFrom, + fmt, + str::FromStr, + sync::Arc, +}; + +// Just re-use arrow's types +// TODO(yingwen): No need to support all schema that arrow supports, we can +// use a new type pattern to wrap Schema/SchemaRef and not allow to use +// the data type we not supported +pub use arrow_deps::arrow::datatypes::{ + DataType, Field, Schema as ArrowSchema, SchemaRef as ArrowSchemaRef, +}; +use proto::common as common_pb; +use snafu::{ensure, Backtrace, OptionExt, ResultExt, Snafu}; + +use crate::{ + column_schema::{self, ColumnId, ColumnSchema}, + datum::DatumKind, + row::{contiguous, RowView}, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "Projection too long, max:{}, given:{}.\nBacktrace:\n{}", + max, + given, + backtrace + ))] + ProjectionTooLong { + max: usize, + given: usize, + backtrace: Backtrace, + }, + + #[snafu(display( + "Invalid projection index, max:{}, given:{}.\nBacktrace:\n{}", + max, + given, + backtrace + ))] + InvalidProjectionIndex { + max: usize, + given: usize, + backtrace: Backtrace, + }, + + #[snafu(display("Projection must have timestamp column.\nBacktrace:\n{}", backtrace))] + ProjectionMissTimestamp { backtrace: Backtrace }, + + #[snafu(display( + "Column name already exists, name:{}.\nBacktrace:\n{}", + name, + backtrace + ))] + ColumnNameExists { name: String, backtrace: Backtrace }, + + #[snafu(display( + "Column id already exists, name:{}, id:{}.\nBacktrace:\n{}", + name, + id, + backtrace + ))] + ColumnIdExists { + name: String, + id: ColumnId, + backtrace: Backtrace, + }, + + #[snafu(display( + "Unsupported key column type, name:{}, type:{:?}.\nBacktrace:\n{}", + name, + kind, + backtrace + ))] + KeyColumnType { + name: String, + kind: DatumKind, + backtrace: Backtrace, + }, + + #[snafu(display( + "Timestamp key column 
already exists, timestamp_column:{}, given:{}.\nBacktrace:\n{}", + timestamp_column, + given_column, + backtrace + ))] + TimestampKeyExists { + timestamp_column: String, + given_column: String, + backtrace: Backtrace, + }, + + #[snafu(display("Timestamp key not exists.\nBacktrace:\n{}", backtrace))] + MissingTimestampKey { backtrace: Backtrace }, + + #[snafu(display( + "Key column cannot be nullable, name:{}.\nBacktrace:\n{}", + name, + backtrace + ))] + NullKeyColumn { name: String, backtrace: Backtrace }, + + #[snafu(display( + "Invalid arrow field, field_name:{}, arrow_schema:{:?}, err:{}", + field_name, + arrow_schema, + source + ))] + InvalidArrowField { + field_name: String, + arrow_schema: ArrowSchemaRef, + source: crate::column_schema::Error, + }, + + #[snafu(display( + "Invalid schema to generate tsid primary key.\nBacktrace:\n{}", + backtrace + ))] + InvalidTsidSchema { backtrace: Backtrace }, + + #[snafu(display( + "Invalid arrow schema key, key:{:?}, raw_value:{}, err:{:?}.\nBacktrace:\n{}", + key, + raw_value, + source, + backtrace + ))] + InvalidArrowSchemaMetaValue { + key: ArrowSchemaMetaKey, + raw_value: String, + source: Box, + backtrace: Backtrace, + }, + + #[snafu(display( + "Arrow schema meta key not found, key:{:?}.\nBacktrace:\n{}", + key, + backtrace + ))] + ArrowSchemaMetaKeyNotFound { + key: ArrowSchemaMetaKey, + backtrace: Backtrace, + }, +} + +// TODO(boyan) make these constants configurable +pub const TSID_COLUMN: &str = "tsid"; +pub const TIMESTAMP_COLUMN: &str = "timestamp"; + +pub type Result = std::result::Result; + +const DEFAULT_SCHEMA_VERSION: Version = 1; + +#[derive(Debug, Snafu)] +pub enum CompatError { + #[snafu(display("Incompatible column schema for write, err:{}", source))] + IncompatWriteColumn { + source: crate::column_schema::CompatError, + }, + + #[snafu(display("Missing column, name:{}", name))] + MissingWriteColumn { name: String }, + + #[snafu(display("Columns to write not found in table, names:{:?}", names))] + 
WriteMoreColumn { names: Vec }, +} + +/// Meta data of the arrow schema +struct ArrowSchemaMeta { + num_key_columns: usize, + timestamp_index: usize, + enable_tsid_primary_key: bool, + version: u32, +} + +#[derive(Copy, Clone, Debug)] +pub enum ArrowSchemaMetaKey { + NumKeyColumns, + TimestampIndex, + EnableTsidPrimaryKey, + Version, +} + +impl ArrowSchemaMetaKey { + fn as_str(&self) -> &str { + match self { + ArrowSchemaMetaKey::NumKeyColumns => "schema:num_key_columns", + ArrowSchemaMetaKey::TimestampIndex => "schema::timestamp_index", + ArrowSchemaMetaKey::EnableTsidPrimaryKey => "schema::enable_tsid_primary_key", + ArrowSchemaMetaKey::Version => "schema::version", + } + } +} + +impl ToString for ArrowSchemaMetaKey { + fn to_string(&self) -> String { + self.as_str().to_string() + } +} + +/// Schema version +pub type Version = u32; + +/// Mapping column index in table schema to column index in writer schema +#[derive(Default)] +pub struct IndexInWriterSchema(Vec>); + +impl IndexInWriterSchema { + /// Create a index mapping for same schema with `num_columns` columns. + pub fn for_same_schema(num_columns: usize) -> Self { + let indexes = (0..num_columns).into_iter().map(Some).collect(); + Self(indexes) + } + + /// Returns the column index in writer schema of the column with index + /// `index_in_table` in the table schema where the writer prepared to + /// write to. + /// + /// If the column is not in writer schema, returns None, which means that + /// this column should be filled by null. + /// + /// Panic if the index_in_table is out of bound + pub fn column_index_in_writer(&self, index_in_table: usize) -> Option { + self.0[index_in_table] + } +} + +// TODO(yingwen): No need to compare all elements in ColumnSchemas, Schema, +// RecordSchema, custom PartialEq for them. 
+ +/// Data of column schemas +#[derive(PartialEq)] +pub(crate) struct ColumnSchemas { + /// Column schemas + columns: Vec, + /// Column name to index of that column schema in `columns`, the index is + /// guaranteed to be valid + name_to_index: HashMap, + /// Byte offsets of each column in contiguous row. + byte_offsets: Vec, + /// String buffer offset in contiguous row. + string_buffer_offset: usize, +} + +impl ColumnSchemas { + fn new(columns: Vec) -> Self { + let name_to_index = columns + .iter() + .enumerate() + .map(|(idx, c)| (c.name.to_string(), idx)) + .collect(); + + let mut current_offset = 0; + let mut byte_offsets = Vec::with_capacity(columns.len()); + for column_schema in &columns { + byte_offsets.push(current_offset); + current_offset += contiguous::byte_size_of_datum(&column_schema.data_type); + } + + Self { + columns, + name_to_index, + byte_offsets, + string_buffer_offset: current_offset, + } + } +} + +impl ColumnSchemas { + pub fn num_columns(&self) -> usize { + self.columns().len() + } + + pub fn columns(&self) -> &[ColumnSchema] { + &self.columns + } + + pub fn column(&self, i: usize) -> &ColumnSchema { + &self.columns[i] + } + + pub fn index_of(&self, name: &str) -> Option { + self.name_to_index.get(name).copied() + } +} + +impl fmt::Debug for ColumnSchemas { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("ColumnSchemas") + // name_to_index is ignored. + .field("columns", &self.columns) + .finish() + } +} + +/// Schema of [crate::record_batch::RecordBatch] +/// +/// Should be cheap to clone. +/// +/// Note: Only `name`, `data_type`, `is_nullable` is valid after converting from +/// arrow's schema, the additional fields like `id`/`is_tag`/`comment` is always +/// unset. Now we only convert arrow's schema into our record before we output +/// the final query result, where the additional fields is never used. 
+#[derive(Debug, Clone, PartialEq)] +pub struct RecordSchema { + arrow_schema: ArrowSchemaRef, + column_schemas: Arc, +} + +impl RecordSchema { + fn from_column_schemas(column_schemas: ColumnSchemas) -> Self { + // Convert to arrow fields. + let fields = column_schemas + .columns + .iter() + .map(|col| col.to_arrow_field()) + .collect(); + // Build arrow schema. + let arrow_schema = Arc::new(ArrowSchema::new(fields)); + + Self { + arrow_schema, + column_schemas: Arc::new(column_schemas), + } + } + + pub fn num_columns(&self) -> usize { + self.column_schemas.num_columns() + } + + pub fn columns(&self) -> &[ColumnSchema] { + self.column_schemas.columns() + } + + pub fn index_of(&self, name: &str) -> Option { + self.column_schemas.index_of(name) + } + + pub fn column(&self, i: usize) -> &ColumnSchema { + self.column_schemas.column(i) + } + + pub fn to_arrow_schema_ref(&self) -> ArrowSchemaRef { + self.arrow_schema.clone() + } +} + +impl TryFrom for RecordSchema { + type Error = Error; + + fn try_from(arrow_schema: ArrowSchemaRef) -> Result { + let fields = arrow_schema.fields(); + let mut columns = Vec::with_capacity(fields.len()); + + for field in fields { + let column_schema = + ColumnSchema::try_from(field).with_context(|| InvalidArrowField { + arrow_schema: arrow_schema.clone(), + field_name: field.name(), + })?; + columns.push(column_schema); + } + + let column_schemas = ColumnSchemas::new(columns); + + Ok(Self::from_column_schemas(column_schemas)) + } +} + +#[derive(Debug, Clone, PartialEq)] +pub struct RecordSchemaWithKey { + record_schema: RecordSchema, + num_key_columns: usize, +} + +impl RecordSchemaWithKey { + pub fn num_columns(&self) -> usize { + self.record_schema.num_columns() + } + + pub fn compare_row(&self, lhs: &LR, rhs: &RR) -> Ordering { + compare_row(self.num_key_columns, lhs, rhs) + } + + pub fn index_of(&self, name: &str) -> Option { + self.record_schema.index_of(name) + } + + pub fn columns(&self) -> &[ColumnSchema] { + 
self.record_schema.columns() + } + + /// Returns an immutable reference of the key column vector. + pub fn key_columns(&self) -> &[ColumnSchema] { + &self.columns()[..self.num_key_columns] + } + + pub(crate) fn into_record_schema(self) -> RecordSchema { + self.record_schema + } + + pub(crate) fn to_arrow_schema_ref(&self) -> ArrowSchemaRef { + self.record_schema.to_arrow_schema_ref() + } + + #[inline] + pub fn num_key_columns(&self) -> usize { + self.num_key_columns + } +} + +/// Compare the two rows. +/// +/// REQUIRES: the two rows must have the same number of key columns as +/// `num_key_columns`. +pub fn compare_row( + num_key_columns: usize, + lhs: &LR, + rhs: &RR, +) -> Ordering { + for column_idx in 0..num_key_columns { + // caller should ensure the row view is valid. + // TODO(xikai): unwrap may not a good way to handle the error. + let left_datum = lhs.column_by_idx(column_idx); + let right_datum = rhs.column_by_idx(column_idx); + // the two datums must be of the same kind type. + match left_datum.partial_cmp(&right_datum).unwrap() { + Ordering::Equal => continue, + v @ Ordering::Less | v @ Ordering::Greater => return v, + } + } + + Ordering::Equal +} + +// TODO(yingwen): Maybe rename to TableSchema. 
+/// Schema of a table +/// +/// - Should be immutable +/// - Each schema must have a timestamp column +/// - Should be immutable and cheap to clone, though passing by reference is +/// preferred +/// - The prefix of columns makes up the primary key (similar to kudu's schema) +/// - The Schema should built by builder +#[derive(Clone, PartialEq)] +pub struct Schema { + /// The underlying arrow schema, data type of fields must be supported by + /// datum + arrow_schema: ArrowSchemaRef, + /// The number of primary key columns + num_key_columns: usize, + /// Index of timestamp key column + // TODO(yingwen): Maybe we can remove the restriction that timestamp column must exists in + // schema (mainly for projected schema) + timestamp_index: usize, + /// Index of tsid key column and None denotes the `enable_tsid_primary_key` + /// is not set. + tsid_index: Option, + /// Control whether to generate tsid as primary key + enable_tsid_primary_key: bool, + /// Column schemas, only holds arc pointer so the Schema can be cloned + /// without much overhead. + column_schemas: Arc, + /// Version of the schema, schemas with same version should be identical. + version: Version, +} + +impl fmt::Debug for Schema { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("Schema") + // arrow_schema is ignored. 
+ .field("num_key_columns", &self.num_key_columns) + .field("timestamp_index", &self.timestamp_index) + .field("tsid_index", &self.tsid_index) + .field("enable_tsid_primary_key", &self.enable_tsid_primary_key) + .field("column_schemas", &self.column_schemas) + .field("version", &self.version) + .finish() + } +} + +impl TryFrom for Schema { + type Error = Error; + + fn try_from(arrow_schema: ArrowSchemaRef) -> Result { + Builder::build_from_arrow_schema(arrow_schema) + } +} + +impl TryFrom for Schema { + type Error = Error; + + fn try_from(record_schema: RecordSchema) -> Result { + Builder::build_from_arrow_schema(record_schema.to_arrow_schema_ref()) + } +} + +impl Schema { + /// Returns an immutable reference of the vector of [ColumnSchema]. + pub fn columns(&self) -> &[ColumnSchema] { + self.column_schemas.columns() + } + + /// Returns an immutable reference of the key column vector. + pub fn key_columns(&self) -> &[ColumnSchema] { + &self.columns()[..self.num_key_columns] + } + + /// Returns an immutable reference of the normal column vector. + pub fn normal_columns(&self) -> &[ColumnSchema] { + &self.columns()[self.num_key_columns..] + } + + /// Returns index of the tsid column. + pub fn index_of_tsid(&self) -> Option { + self.tsid_index + } + + /// Returns tsid column index and immutable reference of tsid column + pub fn tsid_column(&self) -> Option<&ColumnSchema> { + if let Some(idx) = self.index_of_tsid() { + Some(&self.column_schemas.columns[idx]) + } else { + None + } + } + + /// Returns total number of columns + pub fn num_columns(&self) -> usize { + self.column_schemas.num_columns() + } + + /// Returns an immutable reference of a specific [ColumnSchema] selected by + /// name. 
+ pub fn column_with_name(&self, name: &str) -> Option<&ColumnSchema> { + let index = self.column_schemas.name_to_index.get(name)?; + Some(&self.column_schemas.columns[*index]) + } + + /// Returns an immutable reference of a specific [ColumnSchema] selected + /// using an offset within the internal vector. + /// + /// Panic if i is out of bound + pub fn column(&self, i: usize) -> &ColumnSchema { + self.column_schemas.column(i) + } + + /// Return the ref to [arrow_deps::arrow::datatypes::SchemaRef] + pub fn as_arrow_schema_ref(&self) -> &ArrowSchemaRef { + &self.arrow_schema + } + + /// Return the cloned [arrow_deps::arrow::datatypes::SchemaRef] + pub fn to_arrow_schema_ref(&self) -> ArrowSchemaRef { + self.arrow_schema.clone() + } + + /// Into [arrow_deps::arrow::datatypes::SchemaRef] + pub fn into_arrow_schema_ref(self) -> ArrowSchemaRef { + self.arrow_schema + } + + /// Find the index of the column with the given name. + pub fn index_of(&self, name: &str) -> Option { + self.column_schemas.index_of(name) + } + + /// Returns the number of columns in primary key + #[inline] + pub fn num_key_columns(&self) -> usize { + self.num_key_columns + } + + /// Get the name of the timestamp column + #[inline] + pub fn timestamp_name(&self) -> &str { + &self.column(self.timestamp_index()).name + } + + /// Get the index of the timestamp column + #[inline] + pub fn timestamp_index(&self) -> usize { + self.timestamp_index + } + + /// Get the version of this schema + #[inline] + pub fn version(&self) -> Version { + self.version + } + + /// Compare the two rows. + /// + /// REQUIRES: the two rows must have the key columns defined by the schema. + pub fn compare_row(&self, lhs: &R, rhs: &R) -> Ordering { + compare_row(self.num_key_columns, lhs, rhs) + } + + /// Returns `Ok` if rows with `writer_schema` can write to table with the + /// same schema as `self`. 
+ pub fn compatible_for_write( + &self, + writer_schema: &Schema, + index_in_writer: &mut IndexInWriterSchema, + ) -> std::result::Result<(), CompatError> { + index_in_writer.0.reserve(self.num_columns()); + + let mut num_col_in_writer = 0; + for column in self.columns() { + // Find column in schema of writer. + match writer_schema.index_of(&column.name) { + Some(writer_index) => { + let writer_column = writer_schema.column(writer_index); + + // Column is found in writer + num_col_in_writer += 1; + + // Column with same name, but not compatible + column + .compatible_for_write(writer_column) + .context(IncompatWriteColumn)?; + + // Column is compatible, push index mapping + index_in_writer.0.push(Some(writer_index)); + } + None => { + // Column is not found in writer, then the column should be nullable. + ensure!( + column.is_nullable, + MissingWriteColumn { name: &column.name } + ); + + // Column is nullable, push index mapping + index_in_writer.0.push(None); + } + } + } + // All columns of this schema have been checked + + // If the writer have columns not in this schema, then we consider it + // incompatible + ensure!( + num_col_in_writer == writer_schema.num_columns(), + WriteMoreColumn { + names: writer_schema + .columns() + .iter() + .filter_map(|c| if self.column_with_name(&c.name).is_none() { + Some(c.name.clone()) + } else { + None + }) + .collect::>(), + } + ); + + Ok(()) + } + + pub fn to_record_schema(&self) -> RecordSchema { + RecordSchema { + arrow_schema: self.arrow_schema.clone(), + column_schemas: self.column_schemas.clone(), + } + } + + pub fn to_record_schema_with_key(&self) -> RecordSchemaWithKey { + RecordSchemaWithKey { + record_schema: self.to_record_schema(), + num_key_columns: self.num_key_columns, + } + } + + /// Panic if projection is invalid. 
+ pub(crate) fn project_record_schema_with_key( + &self, + projection: &[usize], + ) -> RecordSchemaWithKey { + let mut columns = Vec::with_capacity(self.num_key_columns); + // Keep all key columns in order. + for key_column in self.key_columns() { + columns.push(key_column.clone()); + } + + // Collect normal columns needed by the projection. + for p in projection { + if *p >= self.num_key_columns { + // A normal column + let normal_column = &self.columns()[*p]; + columns.push(normal_column.clone()); + } + } + + let record_schema = RecordSchema::from_column_schemas(ColumnSchemas::new(columns)); + + RecordSchemaWithKey { + record_schema, + num_key_columns: self.num_key_columns, + } + } + + /// Panic if projection is invalid. + pub(crate) fn project_record_schema(&self, projection: &[usize]) -> RecordSchema { + let mut columns = Vec::with_capacity(projection.len()); + + // Collect all columns needed by the projection. + for p in projection { + let column_schema = &self.columns()[*p]; + // Insert the index in projected schema of the column + columns.push(column_schema.clone()); + } + + RecordSchema::from_column_schemas(ColumnSchemas::new(columns)) + } + + /// Returns byte offsets in contiguous row. + #[inline] + pub fn byte_offsets(&self) -> &[usize] { + &self.column_schemas.byte_offsets + } + + /// Returns byte offset in contiguous row of given column. + /// + /// Panic if out of bound. + #[inline] + pub fn byte_offset(&self, index: usize) -> usize { + self.column_schemas.byte_offsets[index] + } + + /// Returns string buffer offset in contiguous row. 
+ #[inline] + pub fn string_buffer_offset(&self) -> usize { + self.column_schemas.string_buffer_offset + } +} + +impl TryFrom for Schema { + type Error = Error; + + fn try_from(schema: common_pb::TableSchema) -> Result { + let mut builder = Builder::with_capacity(schema.columns.len()) + .version(schema.version) + .enable_tsid_primary_key(schema.enable_tsid_primary_key); + + for (i, column_schema_pb) in schema.columns.into_iter().enumerate() { + let column = ColumnSchema::from(column_schema_pb); + + if i < schema.num_key_columns as usize { + builder = builder.add_key_column(column)?; + } else { + builder = builder.add_normal_column(column)?; + } + } + + builder.build() + } +} + +impl From for common_pb::TableSchema { + fn from(schema: Schema) -> Self { + let mut table_schema = common_pb::TableSchema::new(); + + for column in schema.columns() { + // Convert schema of each column + let column_schema = column.to_pb(); + table_schema.columns.push(column_schema); + } + + table_schema.num_key_columns = schema.num_key_columns as u32; + table_schema.timestamp_index = schema.timestamp_index as u32; + table_schema.enable_tsid_primary_key = schema.enable_tsid_primary_key; + table_schema.version = schema.version; + + table_schema + } +} + +/// Schema builder +#[must_use] +pub struct Builder { + columns: Vec, + /// The number of primary key columns + num_key_columns: usize, + /// Timestamp column index + timestamp_index: Option, + column_names: HashSet, + column_ids: HashSet, + /// Version of the schema + version: Version, + /// Auto increment the column id if the id of the input ColumnSchema is + /// [crate::column_schema::COLUMN_ID_UNINIT]. 
+ auto_increment_column_id: bool, + max_column_id: ColumnId, + enable_tsid_primary_key: bool, +} + +impl Default for Builder { + fn default() -> Self { + Self::new() + } +} + +impl Builder { + /// Create a new builder + pub fn new() -> Self { + Self::with_capacity(0) + } + + /// Create a builder with given capacity + pub fn with_capacity(capacity: usize) -> Self { + Self { + columns: Vec::with_capacity(capacity), + num_key_columns: 0, + timestamp_index: None, + column_names: HashSet::with_capacity(capacity), + column_ids: HashSet::with_capacity(capacity), + version: DEFAULT_SCHEMA_VERSION, + auto_increment_column_id: false, + max_column_id: column_schema::COLUMN_ID_UNINIT, + enable_tsid_primary_key: false, + } + } + + /// Add a key column + pub fn add_key_column(mut self, mut column: ColumnSchema) -> Result { + self.may_alloc_column_id(&mut column); + self.validate_column(&column, true)?; + + ensure!(!column.is_nullable, NullKeyColumn { name: column.name }); + + // FIXME(xikai): it seems not reasonable to decide the timestamp column in this + // way. + let is_timestamp = DatumKind::Timestamp == column.data_type; + if is_timestamp { + ensure!( + self.timestamp_index.is_none(), + TimestampKeyExists { + timestamp_column: &self.columns[self.timestamp_index.unwrap()].name, + given_column: column.name, + } + ); + self.timestamp_index = Some(self.num_key_columns); + } + + self.insert_new_key_column(column); + + Ok(self) + } + + /// Add a normal (non key) column + pub fn add_normal_column(mut self, mut column: ColumnSchema) -> Result { + self.may_alloc_column_id(&mut column); + self.validate_column(&column, false)?; + + self.insert_new_normal_column(column); + + Ok(self) + } + + /// Set version of the schema + pub fn version(mut self, version: Version) -> Self { + self.version = version; + self + } + + /// When auto increment is true, assign the column schema an auto + /// incremented id if its id is [crate::column_schema::COLUMN_ID_UNINIT]. 
+ /// + /// Default is false + pub fn auto_increment_column_id(mut self, auto_increment: bool) -> Self { + self.auto_increment_column_id = auto_increment; + self + } + + /// Enable tsid as primary key. + pub fn enable_tsid_primary_key(mut self, enable_tsid_primary_key: bool) -> Self { + self.enable_tsid_primary_key = enable_tsid_primary_key; + self + } + + fn may_alloc_column_id(&mut self, column: &mut ColumnSchema) { + // Assign this column an id + if self.auto_increment_column_id && column.id == column_schema::COLUMN_ID_UNINIT { + column.id = self.max_column_id + 1; + } + + self.max_column_id = cmp::max(self.max_column_id, column.id); + } + + // TODO(yingwen): Do we need to support null data type? + fn validate_column(&self, column: &ColumnSchema, is_key: bool) -> Result<()> { + ensure!( + !self.column_names.contains(&column.name), + ColumnNameExists { name: &column.name } + ); + + // Check datum kind if this is a key column + if is_key { + ensure!( + column.data_type.is_key_kind(), + KeyColumnType { + name: &column.name, + kind: column.data_type, + } + ); + } + + ensure!( + !self.column_ids.contains(&column.id), + ColumnIdExists { + name: &column.name, + id: column.id, + } + ); + + Ok(()) + } + + fn insert_new_key_column(&mut self, column: ColumnSchema) { + self.column_names.insert(column.name.clone()); + self.column_ids.insert(column.id); + + self.columns.insert(self.num_key_columns, column); + self.num_key_columns += 1; + } + + fn insert_new_normal_column(&mut self, column: ColumnSchema) { + self.column_names.insert(column.name.clone()); + self.column_ids.insert(column.id); + + self.columns.push(column); + } + + fn build_from_arrow_schema(arrow_schema: ArrowSchemaRef) -> Result { + let fields = arrow_schema.fields(); + let mut columns = Vec::with_capacity(fields.len()); + + for field in fields { + let column_schema = + ColumnSchema::try_from(field).with_context(|| InvalidArrowField { + arrow_schema: arrow_schema.clone(), + field_name: field.name(), + })?; + 
columns.push(column_schema); + } + + // FIXME(xikai): Now we have to tolerate the decoding failure because of the bug + // of datafusion (fixed by: https://github.com/apache/arrow-datafusion/commit/1448d9752ab3a38f02732274f91136a6a6ad3db4). + // (The bug may cause the meta data of the schema meta lost duration plan + // execution.) + let ArrowSchemaMeta { + num_key_columns, + timestamp_index, + enable_tsid_primary_key, + version, + } = Self::parse_arrow_schema_meta_or_default(arrow_schema.metadata())?; + let tsid_index = Self::find_tsid_index(enable_tsid_primary_key, &columns)?; + + let column_schemas = Arc::new(ColumnSchemas::new(columns)); + + Ok(Schema { + arrow_schema, + num_key_columns, + timestamp_index, + tsid_index, + enable_tsid_primary_key, + column_schemas, + version, + }) + } + + fn parse_arrow_schema_meta_value( + meta: &HashMap, + key: ArrowSchemaMetaKey, + ) -> Result + where + T: FromStr, + T::Err: std::error::Error + Send + Sync + 'static, + { + let raw_value = meta + .get(key.as_str()) + .context(ArrowSchemaMetaKeyNotFound { key })?; + T::from_str(raw_value.as_str()) + .map_err(|e| Box::new(e) as _) + .context(InvalidArrowSchemaMetaValue { key, raw_value }) + } + + /// Parse the necessary meta information from the arrow schema's meta data. + fn parse_arrow_schema_meta_or_default( + meta: &HashMap, + ) -> Result { + match Self::parse_arrow_schema_meta(meta) { + Ok(v) => Ok(v), + Err(Error::ArrowSchemaMetaKeyNotFound { .. }) => Ok(ArrowSchemaMeta { + num_key_columns: 0, + timestamp_index: 0, + enable_tsid_primary_key: false, + version: 0, + }), + Err(e) => Err(e), + } + } + + /// Parse the necessary meta information from the arrow schema's meta data. 
+ fn parse_arrow_schema_meta(meta: &HashMap) -> Result { + Ok(ArrowSchemaMeta { + num_key_columns: Self::parse_arrow_schema_meta_value( + meta, + ArrowSchemaMetaKey::NumKeyColumns, + )?, + timestamp_index: Self::parse_arrow_schema_meta_value( + meta, + ArrowSchemaMetaKey::TimestampIndex, + )?, + enable_tsid_primary_key: Self::parse_arrow_schema_meta_value( + meta, + ArrowSchemaMetaKey::EnableTsidPrimaryKey, + )?, + version: Self::parse_arrow_schema_meta_value(meta, ArrowSchemaMetaKey::Version)?, + }) + } + + /// Build arrow schema meta data. + /// + /// Requires: the timestamp index is not None. + fn build_arrow_schema_meta(&self) -> HashMap { + let mut meta = HashMap::with_capacity(4); + meta.insert( + ArrowSchemaMetaKey::NumKeyColumns.to_string(), + self.num_key_columns.to_string(), + ); + meta.insert( + ArrowSchemaMetaKey::TimestampIndex.to_string(), + self.timestamp_index.unwrap().to_string(), + ); + meta.insert( + ArrowSchemaMetaKey::Version.to_string(), + self.version.to_string(), + ); + meta.insert( + ArrowSchemaMetaKey::EnableTsidPrimaryKey.to_string(), + self.enable_tsid_primary_key.to_string(), + ); + + meta + } + + fn find_tsid_index( + enable_tsid_primary_key: bool, + columns: &[ColumnSchema], + ) -> Result> { + if !enable_tsid_primary_key { + return Ok(None); + } + + let idx = columns + .iter() + .enumerate() + .find_map(|(idx, col_schema)| { + if col_schema.name == TSID_COLUMN { + Some(idx) + } else { + None + } + }) + .context(InvalidTsidSchema)?; + + Ok(Some(idx)) + } + + /// Build the schema + pub fn build(self) -> Result { + let timestamp_index = self.timestamp_index.context(MissingTimestampKey)?; + // Timestamp key column is exists, so key columns should not be zero + assert!(self.num_key_columns > 0); + if self.enable_tsid_primary_key { + ensure!(self.num_key_columns == 2, InvalidTsidSchema); + } + + let tsid_index = Self::find_tsid_index(self.enable_tsid_primary_key, &self.columns)?; + + let fields = self.columns.iter().map(|c| 
c.to_arrow_field()).collect(); + let meta = self.build_arrow_schema_meta(); + + Ok(Schema { + arrow_schema: Arc::new(ArrowSchema::new_with_metadata(fields, meta)), + num_key_columns: self.num_key_columns, + timestamp_index, + tsid_index, + enable_tsid_primary_key: self.enable_tsid_primary_key, + column_schemas: Arc::new(ColumnSchemas::new(self.columns)), + version: self.version, + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::{ + bytes::Bytes, + datum::Datum, + row::{Row, RowWithMeta}, + time::Timestamp, + }; + + #[test] + fn test_schema() { + let schema = Builder::new() + .auto_increment_column_id(true) + .add_key_column( + column_schema::Builder::new("key1".to_string(), DatumKind::Varbinary) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .add_key_column( + column_schema::Builder::new("timestamp".to_string(), DatumKind::Timestamp) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("field1".to_string(), DatumKind::Double) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("field2".to_string(), DatumKind::Double) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .build() + .unwrap(); + + // Length related test + assert_eq!(4, schema.columns().len()); + assert_eq!(4, schema.num_columns()); + assert_eq!(2, schema.num_key_columns()); + assert_eq!(1, schema.timestamp_index()); + + // Test key columns + assert_eq!(2, schema.key_columns().len()); + assert_eq!("key1", &schema.key_columns()[0].name); + assert_eq!("timestamp", &schema.key_columns()[1].name); + + // Test normal columns + assert_eq!(2, schema.normal_columns().len()); + assert_eq!("field1", &schema.normal_columns()[0].name); + assert_eq!("field2", &schema.normal_columns()[1].name); + + // Test column_with_name() + let field1 = schema.column_with_name("field1").unwrap(); + 
assert_eq!(3, field1.id); + assert_eq!("field1", field1.name); + assert!(schema.column_with_name("not exists").is_none()); + + // Test column() + assert_eq!(field1, schema.column(2)); + + // Test arrow schema + let arrow_schema = schema.as_arrow_schema_ref(); + let key1 = arrow_schema.field(0); + assert_eq!("key1", key1.name()); + let field2 = arrow_schema.field(3); + assert_eq!("field2", field2.name()); + + // Test index_of() + assert_eq!(1, schema.index_of("timestamp").unwrap()); + assert!(schema.index_of("not exists").is_none()); + + // Test pb convert + let schema_pb = common_pb::TableSchema::from(schema.clone()); + let schema_from_pb = Schema::try_from(schema_pb).unwrap(); + assert_eq!(schema, schema_from_pb); + } + + #[test] + fn test_build_unordered() { + let schema = Builder::new() + .auto_increment_column_id(true) + .add_normal_column( + column_schema::Builder::new("field1".to_string(), DatumKind::Double) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .add_key_column( + column_schema::Builder::new("key1".to_string(), DatumKind::Timestamp) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .add_key_column( + column_schema::Builder::new("key2".to_string(), DatumKind::Varbinary) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("field2".to_string(), DatumKind::Double) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .build() + .unwrap(); + + let columns = schema.columns(); + assert_eq!(2, columns[0].id); + assert_eq!("key1", columns[0].name); + assert_eq!(3, columns[1].id); + assert_eq!("key2", columns[1].name); + assert_eq!(1, columns[2].id); + assert_eq!("field1", columns[2].name); + assert_eq!(4, columns[3].id); + assert_eq!("field2", columns[3].name); + } + + #[test] + fn test_name_exists() { + let builder = Builder::new() + .auto_increment_column_id(true) + .add_normal_column( + 
column_schema::Builder::new("field1".to_string(), DatumKind::Double) + .build() + .expect("should succeed build column schema"), + ) + .unwrap(); + assert!(builder + .add_normal_column( + column_schema::Builder::new("field1".to_string(), DatumKind::Double) + .build() + .expect("should succeed build column schema"), + ) + .is_err()); + } + + #[test] + fn test_id_exists() { + let builder = Builder::new() + .add_normal_column( + column_schema::Builder::new("field1".to_string(), DatumKind::Double) + .id(1) + .build() + .expect("should succeed build column schema"), + ) + .unwrap(); + assert!(builder + .add_normal_column( + column_schema::Builder::new("field2".to_string(), DatumKind::Double) + .id(1) + .build() + .expect("should succeed build column schema") + ) + .is_err()); + } + + #[test] + fn test_key_column_type() { + assert!(Builder::new() + .add_key_column( + column_schema::Builder::new("key".to_string(), DatumKind::Double) + .id(1) + .build() + .expect("should succeed build column schema") + ) + .is_err()); + } + + #[test] + fn test_timestamp_key_exists() { + let builder = Builder::new() + .auto_increment_column_id(true) + .add_key_column( + column_schema::Builder::new("key1".to_string(), DatumKind::Timestamp) + .build() + .expect("should succeed build column schema"), + ) + .unwrap(); + assert!(builder + .add_key_column( + column_schema::Builder::new("key2".to_string(), DatumKind::Timestamp) + .build() + .expect("should succeed build column schema") + ) + .is_err()); + } + + #[test] + fn test_mulitple_timestamp() { + Builder::new() + .auto_increment_column_id(true) + .add_key_column( + column_schema::Builder::new("key1".to_string(), DatumKind::Timestamp) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("field1".to_string(), DatumKind::Timestamp) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .build() + .unwrap(); + } + + #[test] + fn 
test_missing_timestamp_key() { + let builder = Builder::new() + .auto_increment_column_id(true) + .add_key_column( + column_schema::Builder::new("key1".to_string(), DatumKind::Varbinary) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("field1".to_string(), DatumKind::Double) + .build() + .expect("should succeed build column schema"), + ) + .unwrap(); + assert!(builder.build().is_err()); + } + + #[test] + fn test_null_key() { + assert!(Builder::new() + .add_key_column( + column_schema::Builder::new("key1".to_string(), DatumKind::Varbinary) + .id(1) + .is_nullable(true) + .build() + .expect("should succeed build column schema") + ) + .is_err()); + } + + #[test] + fn test_max_column_id() { + let builder = Builder::new() + .add_key_column( + column_schema::Builder::new("key1".to_string(), DatumKind::Varbinary) + .id(2) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("field1".to_string(), DatumKind::Timestamp) + .id(5) + .build() + .expect("should succeed build column schema"), + ) + .unwrap(); + + let schema = builder + .auto_increment_column_id(true) + .add_key_column( + column_schema::Builder::new("key2".to_string(), DatumKind::Timestamp) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("field2".to_string(), DatumKind::Timestamp) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .build() + .unwrap(); + + let columns = schema.columns(); + // Check key1 + assert_eq!("key1", &columns[0].name); + assert_eq!(2, columns[0].id); + // Check key2 + assert_eq!("key2", &columns[1].name); + assert_eq!(6, columns[1].id); + // Check field1 + assert_eq!("field1", &columns[2].name); + assert_eq!(5, columns[2].id); + // Check field2 + assert_eq!("field2", &columns[3].name); + assert_eq!(7, columns[3].id); + } + + fn 
assert_row_compare(ordering: Ordering, schema: &Schema, row1: &Row, row2: &Row) { + let schema_with_key = schema.to_record_schema_with_key(); + let lhs = RowWithMeta { + row: row1, + schema: &schema_with_key, + }; + let rhs = RowWithMeta { + row: row2, + schema: &schema_with_key, + }; + assert_eq!(ordering, schema.compare_row(&lhs, &rhs)); + } + + #[test] + fn test_compare_row() { + let schema = Builder::new() + .auto_increment_column_id(true) + .add_key_column( + column_schema::Builder::new("key1".to_string(), DatumKind::Varbinary) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .add_key_column( + column_schema::Builder::new("key2".to_string(), DatumKind::Timestamp) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("field1".to_string(), DatumKind::Double) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .build() + .unwrap(); + + // Test equal + { + let row1 = Row::from_datums(vec![ + Datum::Varbinary(Bytes::from_static(b"key1")), + Datum::Timestamp(Timestamp::new(1005)), + Datum::Double(12.5), + ]); + let row2 = Row::from_datums(vec![ + Datum::Varbinary(Bytes::from_static(b"key1")), + Datum::Timestamp(Timestamp::new(1005)), + Datum::Double(15.5), + ]); + + assert_row_compare(Ordering::Equal, &schema, &row1, &row2); + } + + // Test first key column less + { + let row1 = Row::from_datums(vec![ + Datum::Varbinary(Bytes::from_static(b"key2")), + Datum::Timestamp(Timestamp::new(1005)), + Datum::Double(17.5), + ]); + let row2 = Row::from_datums(vec![ + Datum::Varbinary(Bytes::from_static(b"key5")), + Datum::Timestamp(Timestamp::new(1005)), + Datum::Double(17.5), + ]); + + assert_row_compare(Ordering::Less, &schema, &row1, &row2); + } + + // Test second key column less + { + let row1 = Row::from_datums(vec![ + Datum::Varbinary(Bytes::from_static(b"key2")), + Datum::Timestamp(Timestamp::new(1002)), + Datum::Double(17.5), + ]); + let row2 
= Row::from_datums(vec![ + Datum::Varbinary(Bytes::from_static(b"key2")), + Datum::Timestamp(Timestamp::new(1005)), + Datum::Double(17.5), + ]); + + assert_row_compare(Ordering::Less, &schema, &row1, &row2); + } + + // Test first key column greater + { + let row1 = Row::from_datums(vec![ + Datum::Varbinary(Bytes::from_static(b"key7")), + Datum::Timestamp(Timestamp::new(1005)), + Datum::Double(17.5), + ]); + let row2 = Row::from_datums(vec![ + Datum::Varbinary(Bytes::from_static(b"key5")), + Datum::Timestamp(Timestamp::new(1005)), + Datum::Double(17.5), + ]); + + assert_row_compare(Ordering::Greater, &schema, &row1, &row2); + } + + // Test second key column greater + { + let row1 = Row::from_datums(vec![ + Datum::Varbinary(Bytes::from_static(b"key2")), + Datum::Timestamp(Timestamp::new(1007)), + Datum::Double(17.5), + ]); + let row2 = Row::from_datums(vec![ + Datum::Varbinary(Bytes::from_static(b"key2")), + Datum::Timestamp(Timestamp::new(1005)), + Datum::Double(17.5), + ]); + + assert_row_compare(Ordering::Greater, &schema, &row1, &row2); + } + } + + #[test] + fn test_build_from_arrow_schema() { + let schema = Builder::new() + .auto_increment_column_id(true) + .enable_tsid_primary_key(true) + .add_key_column( + column_schema::Builder::new(TSID_COLUMN.to_string(), DatumKind::UInt64) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .add_key_column( + column_schema::Builder::new("timestamp".to_string(), DatumKind::Timestamp) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("value".to_string(), DatumKind::Double) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .build() + .expect("should succeed to build schema"); + + let arrow_schema = schema.clone().into_arrow_schema_ref(); + let new_schema = Builder::build_from_arrow_schema(arrow_schema) + .expect("should succeed to build new schema"); + + assert_eq!(schema, new_schema); + } +} 
diff --git a/common_types/src/string.rs b/common_types/src/string.rs new file mode 100644 index 0000000000..be41c82702 --- /dev/null +++ b/common_types/src/string.rs @@ -0,0 +1,107 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Bytes that can safely cast to str/string. + +use std::{convert::TryFrom, fmt, ops, str}; + +use snafu::{Backtrace, ResultExt, Snafu}; + +use crate::bytes::Bytes; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Bytes are not valid utf8, err:{}.\nBacktrace:\n{}", source, backtrace))] + FromBytes { + source: std::str::Utf8Error, + backtrace: Backtrace, + }, +} + +pub type Result = std::result::Result; + +/// String using [crate::bytes::Bytes] as storage so it can be cast into `Bytes` +/// and clone like `Bytes`. +#[derive(Debug, Clone, PartialEq, PartialOrd)] +pub struct StringBytes(Bytes); + +impl StringBytes { + pub fn new() -> StringBytes { + StringBytes(Bytes::new()) + } + + pub const fn from_static(src: &'static str) -> StringBytes { + StringBytes(Bytes::from_static(src.as_bytes())) + } + + pub fn copy_from_str(src: &str) -> StringBytes { + StringBytes(Bytes::copy_from_slice(src.as_bytes())) + } + + /// Create a [StringBytes] from a valid utf bytes. + /// + /// # Safety + /// The caller must ensure `bytes` is valid utf string. 
+ pub unsafe fn from_bytes_unchecked(bytes: Bytes) -> StringBytes { + StringBytes(bytes) + } + + #[inline] + pub fn as_bytes(&self) -> &[u8] { + &self.0 + } + + #[inline] + pub fn as_str(&self) -> &str { + unsafe { str::from_utf8_unchecked(self.as_bytes()) } + } +} + +impl Default for StringBytes { + fn default() -> Self { + Self::new() + } +} + +impl ops::Deref for StringBytes { + type Target = str; + + #[inline] + fn deref(&self) -> &str { + self.as_str() + } +} + +impl AsRef for StringBytes { + #[inline] + fn as_ref(&self) -> &str { + self.as_str() + } +} + +impl fmt::Display for StringBytes { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.write_str(self.as_str()) + } +} + +impl TryFrom for StringBytes { + type Error = Error; + + fn try_from(bytes: Bytes) -> Result { + str::from_utf8(&bytes).context(FromBytes)?; + + Ok(StringBytes(bytes)) + } +} + +impl From for StringBytes { + fn from(src: String) -> Self { + Self(Bytes::from(src)) + } +} + +impl From<&str> for StringBytes { + fn from(src: &str) -> Self { + Self::copy_from_str(src) + } +} diff --git a/common_types/src/tests.rs b/common_types/src/tests.rs new file mode 100644 index 0000000000..e20313ce1c --- /dev/null +++ b/common_types/src/tests.rs @@ -0,0 +1,139 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +use bytes::Bytes; + +use crate::{ + column_schema, + datum::{Datum, DatumKind}, + projected_schema::ProjectedSchema, + record_batch::{RecordBatchWithKey, RecordBatchWithKeyBuilder}, + row::{ + contiguous::{ContiguousRowReader, ContiguousRowWriter, ProjectedContiguousRow}, + Row, + }, + schema, + schema::{IndexInWriterSchema, Schema}, + string::StringBytes, + time::Timestamp, +}; + +fn base_schema_builder() -> schema::Builder { + schema::Builder::new() + .auto_increment_column_id(true) + .add_key_column( + column_schema::Builder::new("key1".to_string(), DatumKind::Varbinary) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .add_key_column( + column_schema::Builder::new("key2".to_string(), DatumKind::Timestamp) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("field1".to_string(), DatumKind::Double) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("field2".to_string(), DatumKind::String) + .build() + .expect("should succeed build column schema"), + ) + .unwrap() +} + +/// Build a schema for testing: +/// (key1(varbinary), key2(timestamp), field1(double), field2(string)) +pub fn build_schema() -> Schema { + base_schema_builder().build().unwrap() +} + +pub fn build_projected_schema() -> ProjectedSchema { + let schema = build_schema(); + assert!(schema.num_columns() > 1); + let projection: Vec = (0..schema.num_columns() - 1).collect(); + ProjectedSchema::new(schema, Some(projection)).unwrap() +} + +pub fn build_row(key1: &[u8], key2: i64, field1: f64, field2: &str) -> Row { + let datums = vec![ + Datum::Varbinary(Bytes::copy_from_slice(key1)), + Datum::Timestamp(Timestamp::new(key2)), + Datum::Double(field1), + Datum::String(StringBytes::from(field2)), + ]; + + Row::from_datums(datums) +} + +pub fn build_row_opt(key1: &[u8], key2: i64, field1: Option, field2: Option<&str>) -> Row { + 
let datums = vec![ + Datum::Varbinary(Bytes::copy_from_slice(key1)), + Datum::Timestamp(Timestamp::new(key2)), + field1.map(Datum::Double).unwrap_or(Datum::Null), + field2 + .map(|v| Datum::String(StringBytes::from(v))) + .unwrap_or(Datum::Null), + ]; + + Row::from_datums(datums) +} + +pub fn build_rows() -> Vec { + vec![ + build_row(b"binary key", 1000000, 10.0, "string value"), + build_row(b"binary key1", 1000001, 11.0, "string value 1"), + build_row_opt(b"binary key2", 1000002, None, Some("string value 2")), + build_row_opt(b"binary key3", 1000003, Some(13.0), None), + build_row_opt(b"binary key4", 1000004, None, None), + ] +} + +pub fn build_record_batch_with_key_by_rows(rows: Vec) -> RecordBatchWithKey { + let schema = build_schema(); + assert!(schema.num_columns() > 1); + let projection: Vec = (0..schema.num_columns() - 1).collect(); + let projected_schema = ProjectedSchema::new(schema.clone(), Some(projection)).unwrap(); + let row_projected_schema = projected_schema.try_project_with_key(&schema).unwrap(); + + let mut builder = + RecordBatchWithKeyBuilder::with_capacity(projected_schema.to_record_schema_with_key(), 2); + let index_in_writer = IndexInWriterSchema::for_same_schema(schema.num_columns()); + + let mut buf = Vec::new(); + for row in rows { + let mut writer = ContiguousRowWriter::new(&mut buf, &schema, &index_in_writer); + + writer.write_row(&row).unwrap(); + + let source_row = ContiguousRowReader::with_schema(&buf, &schema); + let projected_row = ProjectedContiguousRow::new(source_row, &row_projected_schema); + builder + .append_projected_contiguous_row(&projected_row) + .unwrap(); + } + builder.build().unwrap() +} + +pub fn check_record_batch_with_key_with_rows( + record_batch_with_key: &RecordBatchWithKey, + row_num: usize, + column_num: usize, + rows: Vec, +) -> bool { + for (i, row) in rows.iter().enumerate().take(row_num) { + for j in 0..column_num { + let datum = &row[j]; + let datum2 = record_batch_with_key.column(j).datum(i); + + if *datum 
!= datum2 { + return false; + } + } + } + true +} diff --git a/common_types/src/time.rs b/common_types/src/time.rs new file mode 100644 index 0000000000..27ff8802c0 --- /dev/null +++ b/common_types/src/time.rs @@ -0,0 +1,363 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Time types + +// TODO(yingwen): Support timezone + +use std::{ + convert::{TryFrom, TryInto}, + time::{self, Duration, SystemTime}, +}; + +use proto::common::TimeRange as TimeRangePb; +use snafu::{Backtrace, OptionExt, Snafu}; + +/// Error of time module. +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Invalid time range, start:{}, end:{}", start, end))] + InvalidTimeRange { + start: i64, + end: i64, + backtrace: Backtrace, + }, +} + +/// Unix timestamp type in millis +// Use i64 so we can store timestamp before 1970-01-01 +#[derive(Clone, Copy, Debug, Default, Eq, Ord, PartialEq, PartialOrd, Hash)] +pub struct Timestamp(i64); + +impl Timestamp { + pub const MAX: Timestamp = Timestamp(i64::MAX); + pub const MIN: Timestamp = Timestamp(i64::MIN); + pub const ZERO: Timestamp = Timestamp(0); + + pub const fn new(ts: i64) -> Self { + Self(ts) + } + + /// Return current (non-negative) unix timestamp in millis. + pub fn now() -> Self { + SystemTime::now() + .duration_since(time::UNIX_EPOCH) + .map(|duration| { + duration + .as_millis() + .try_into() + .map(Timestamp) + .unwrap_or(Timestamp::MAX) + }) + .unwrap_or(Timestamp::ZERO) + } + + /// Returns the earliest expired timestamp. + #[inline] + pub fn expire_time(ttl: Duration) -> Timestamp { + Timestamp::now().sub_duration_or_min(ttl) + } + + #[inline] + pub fn as_i64(&self) -> i64 { + self.0 + } + + /// Truncate the value of this timestamp by given duration, return that + /// value and keeps current timestamp unchanged. + /// + /// This function won't do overflow check. 
+ #[must_use] + pub fn truncate_by(&self, duration: Duration) -> Self { + let duration_millis = duration.as_millis() as i64; + Timestamp::new(self.0 / duration_millis * duration_millis) + } + + /// Floor the timestamp by the `duration_ms` (in millisecond) and return a + /// new Timestamp instance or None if overflow occurred. + /// + /// The `duration_ms` must be positive + #[inline] + fn checked_floor_by_i64(&self, duration_ms: i64) -> Option { + assert!(duration_ms > 0); + let normalized_ts = if self.0 >= 0 { + // self / duration_ms * duration_ms + self.0 + } else { + // (self - (duration_ms - 1)) / duration_ms * duration_ms + self.0.checked_sub(duration_ms - 1)? + }; + + normalized_ts + .checked_div(duration_ms) + .and_then(|v| v.checked_mul(duration_ms)) + .map(Timestamp) + } + + /// Returns the result of this `timestamp + offset_ms`, or None if overflow + /// occurred. + /// + /// The `offset_ms` is in millis resolution + pub fn checked_add_i64(&self, offset_ms: i64) -> Option { + self.0.checked_add(offset_ms).map(Timestamp) + } + + pub fn checked_add(&self, other: Self) -> Option { + self.0.checked_add(other.0).map(Timestamp) + } + + pub fn checked_sub(&self, other: Self) -> Option { + self.0.checked_sub(other.0).map(Timestamp) + } + + /// Returns the result of this `timestamp` - `duration`, or None if overflow + /// occurred. + pub fn checked_sub_duration(&self, duration: Duration) -> Option { + let duration_millis = duration.as_millis().try_into().ok()?; + self.0.checked_sub(duration_millis).map(Timestamp) + } + + /// Return true if the time is expired + pub fn is_expired(&self, expired_time: Timestamp) -> bool { + *self < expired_time + } + + /// Returns the result of this `timestamp` - `duration`, or MIN if overflow + /// occurred. 
+ #[must_use] + pub fn sub_duration_or_min(&self, duration: Duration) -> Timestamp { + self.checked_sub_duration(duration) + .unwrap_or(Timestamp::MIN) + } +} + +impl From for i64 { + fn from(timestamp: Timestamp) -> Self { + timestamp.0 + } +} + +impl From for Timestamp { + fn from(ts: i64) -> Self { + Self::new(ts) + } +} + +impl From<&i64> for Timestamp { + fn from(ts: &i64) -> Self { + Self::new(*ts) + } +} + +/// Unix timestamp range in millis +/// +/// The start time is inclusive and the end time is exclusive: [start, end). +/// The range is empty if start equals end. +#[derive(Default, Debug, Clone, Copy, PartialEq, Eq)] +pub struct TimeRange { + /// The start timestamp (inclusive) + inclusive_start: Timestamp, + /// The end timestamp (exclusive) + exclusive_end: Timestamp, +} + +impl TimeRange { + /// Create a new time range, returns None if the start/end is invalid + pub fn new(inclusive_start: Timestamp, exclusive_end: Timestamp) -> Option { + if inclusive_start <= exclusive_end { + Some(Self { + inclusive_start, + exclusive_end, + }) + } else { + None + } + } + + /// Create a new time range, panic if the start/end is invalid. + pub fn new_unchecked(inclusive_start: Timestamp, exclusive_end: Timestamp) -> Self { + Self::new(inclusive_start, exclusive_end).unwrap() + } + + #[cfg(any(test, feature = "test"))] + pub fn new_unchecked_for_test(inclusive_start: i64, exclusive_end: i64) -> Self { + Self::new( + Timestamp::new(inclusive_start), + Timestamp::new(exclusive_end), + ) + .unwrap() + } + + /// Create a time range only including the single timestamp. + pub fn from_timestamp(t: Timestamp) -> Self { + // FIXME(xikai): now the time range can not express the `exclusive_end` as + // infinite. 
+ let end = t.checked_add_i64(1).unwrap_or(t); + Self::new(t, end).unwrap() + } + + /// Create a new time range of [0, max) + pub fn min_to_max() -> Self { + Self { + inclusive_start: Timestamp::MIN, + exclusive_end: Timestamp::MAX, + } + } + + /// Create a empty time range. + pub fn empty() -> Self { + Self { + inclusive_start: Timestamp::ZERO, + exclusive_end: Timestamp::ZERO, + } + } + + /// The inclusive start timestamp + #[inline] + pub fn inclusive_start(&self) -> Timestamp { + self.inclusive_start + } + + /// The exclusive end timestamp + #[inline] + pub fn exclusive_end(&self) -> Timestamp { + self.exclusive_end + } + + /// Return the reference to the exclusive end timestamp. + #[inline] + pub fn exclusive_end_ref(&self) -> &Timestamp { + &self.exclusive_end + } + + /// Returns true if the time range contains the given `ts` + #[inline] + pub fn contains(&self, ts: Timestamp) -> bool { + self.inclusive_start <= ts && ts < self.exclusive_end + } + + /// Returns a time bucket with fixed bucket size that the timestamp belongs + /// to. Returns None if overflow occurred, the bucket_duration is greater + /// than [i64::MAX] or not positive. + pub fn bucket_of(timestamp: Timestamp, bucket_duration: Duration) -> Option { + let bucket_duration_ms: i64 = bucket_duration.as_millis().try_into().ok()?; + if bucket_duration_ms <= 0 { + return None; + } + + let inclusive_start = timestamp.checked_floor_by_i64(bucket_duration_ms)?; + // end = start + bucket_duration + let exclusive_end = inclusive_start.checked_add_i64(bucket_duration_ms)?; + + Some(Self { + inclusive_start, + exclusive_end, + }) + } + + /// Returns true if this time range intersect with `other` + pub fn intersect_with(&self, other: TimeRange) -> bool { + !self.not_intersecting(other) + } + + /// Return true if the time range is expired (`exclusive_end_time` < + /// `expire_time`). 
+ pub fn is_expired(&self, expire_time: Option) -> bool { + expire_time.is_some() && self.exclusive_end() <= expire_time.unwrap() + } + + #[inline] + fn not_intersecting(&self, other: TimeRange) -> bool { + other.exclusive_end <= self.inclusive_start || other.inclusive_start >= self.exclusive_end + } + + pub fn intersected_range(&self, other: TimeRange) -> Option { + TimeRange::new( + self.inclusive_start.max(other.inclusive_start), + self.exclusive_end.min(other.exclusive_end), + ) + } +} + +impl From for TimeRangePb { + fn from(src: TimeRange) -> Self { + let mut target = TimeRangePb::default(); + target.set_start(src.inclusive_start.as_i64()); + target.set_end(src.exclusive_end.as_i64()); + target + } +} + +impl TryFrom for TimeRange { + type Error = Error; + + fn try_from(src: TimeRangePb) -> Result { + Self::new(Timestamp::new(src.start), Timestamp::new(src.end)).context(InvalidTimeRange { + start: src.start, + end: src.end, + }) + } +} + +#[cfg(test)] +mod test { + use std::time::Duration; + + use crate::time::{TimeRange, Timestamp}; + + #[test] + fn test_timestamp() { + // 1637723901000: 2021-11-24 11:18:21 + let timestamp = Timestamp::new(1637723901000); + // 1d + let ttl = Duration::from_secs(24 * 3600); + assert_eq!( + timestamp.sub_duration_or_min(ttl), + Timestamp::new(1637637501000) + ); + assert_eq!(timestamp.truncate_by(ttl), Timestamp::new(1637712000000)); + assert_eq!( + timestamp.checked_floor_by_i64(2000), + Some(Timestamp::new(1637723900000)) + ); + assert_eq!( + timestamp.checked_add_i64(2000), + Some(Timestamp::new(1637723903000)) + ); + assert_eq!( + timestamp.checked_sub_duration(ttl), + Some(Timestamp::new(1637637501000)) + ); + } + + #[test] + fn test_time_range() { + // [100,200) + let time_range = TimeRange::new_unchecked_for_test(100, 200); + assert!(time_range.contains(Timestamp::new(150))); + assert!(time_range.contains(Timestamp::new(100))); + assert!(!time_range.contains(Timestamp::new(200))); + + 
assert!(!time_range.is_expired(Some(Timestamp::new(50)))); + assert!(time_range.is_expired(Some(Timestamp::new(200)))); + + assert_eq!( + TimeRange::bucket_of(Timestamp::new(100), Duration::from_millis(2)), + Some(TimeRange::new_unchecked_for_test(100, 102)) + ); + + let time_range2 = TimeRange::new_unchecked_for_test(200, 300); + assert!(!time_range.intersect_with(time_range2)); + let time_range3 = TimeRange::new_unchecked_for_test(50, 200); + assert!(time_range.intersect_with(time_range3)); + + assert!(time_range.not_intersecting(time_range2)); + assert!(!time_range.not_intersecting(time_range3)); + } + + #[test] + fn test_bucket_of_negative_timestamp() { + let ts = Timestamp::new(-126316800000); + let range = TimeRange::bucket_of(ts, Duration::from_millis(25920000000)).unwrap(); + assert!(range.contains(ts), "range:{:?}", range); + } +} diff --git a/common_util/Cargo.toml b/common_util/Cargo.toml new file mode 100644 index 0000000000..884b13236b --- /dev/null +++ b/common_util/Cargo.toml @@ -0,0 +1,44 @@ +[package] +name = "common_util" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[features] +test = ["env_logger"] + +[dependencies] +# In alphabetical order +backtrace = "0.3.9" +common_types = { path = "../common_types", features = ["test"] } +chrono = "0.4" +crossbeam-utils = "0.8" +env_logger = { version = "0.6", optional = true } +lazy_static = "1.4.0" +libc = "0.2" +log = "0.4" +logger = { path = "../components/logger"} +snafu = { version ="0.6.10", features = ["backtraces"]} +serde = {version = "1.0.81", features = ["derive"]} +serde_derive = "1.0.81" +pin-project-lite = "0.2" +prometheus = "0.12" +proto = { path = "../proto" } +time = "0.1" +tokio = { version = "1.15", features = ["full"] } +toml = "0.5" + +[dev-dependencies] +env_logger = "0.6" +gag = "1.0" +nix = "0.19" +slog = "2.7" +tempfile = "3.1.0" +tokio-test = "0.4.2" + 
+[dev-dependencies.slog-global] +version = "0.1" +git = "https://github.com/breezewish/slog-global.git" +rev = "0e23a5baff302a9d7bccd85f8f31e43339c2f2c1" diff --git a/common_util/src/alloc_tracker.rs b/common_util/src/alloc_tracker.rs new file mode 100644 index 0000000000..7e0979cb0f --- /dev/null +++ b/common_util/src/alloc_tracker.rs @@ -0,0 +1,159 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Alloc tracker + +use std::sync::atomic::{AtomicUsize, Ordering}; + +/// Collect memory usage from tracker, useful for extending the tracker +pub trait Collector { + /// Called when `bytes` bytes memory is allocated and tracked by the tracker + fn on_allocate(&self, bytes: usize); + + /// Called when `bytes` bytes memory is freed and tracked by the tracker + fn on_free(&self, bytes: usize); +} + +/// A tracker to track memory in used +// TODO(yingwen): Impl a thread local or local tracker that are not thread safe, +// and collect statistics into the thread safe one for better performance +pub struct Tracker { + collector: T, + bytes_allocated: AtomicUsize, +} + +impl Tracker { + pub fn new(collector: T) -> Self { + Self { + collector, + bytes_allocated: AtomicUsize::new(0), + } + } + + /// Increase consumption of this tracker by bytes + pub fn consume(&self, bytes: usize) { + self.bytes_allocated.fetch_add(bytes, Ordering::Relaxed); + self.collector.on_allocate(bytes); + } + + /// Decrease consumption of this tracker by bytes + /// + /// The caller should guarantee the released bytes wont larger than bytes + /// already consumed + pub fn release(&self, bytes: usize) { + self.bytes_allocated.fetch_sub(bytes, Ordering::Relaxed); + self.collector.on_free(bytes); + } + + /// Bytes allocated + pub fn bytes_allocated(&self) -> usize { + self.bytes_allocated.load(Ordering::Relaxed) + } +} + +impl Drop for Tracker { + fn drop(&mut self) { + let bytes = *self.bytes_allocated.get_mut(); + self.collector.on_free(bytes); + } +} + +/// The noop 
collector does nothing on alloc and free +struct NoopCollector; + +impl Collector for NoopCollector { + fn on_allocate(&self, _bytes: usize) {} + + fn on_free(&self, _bytes: usize) {} +} + +/// A simple tracker hides the collector api +pub struct SimpleTracker(Tracker); + +impl Default for SimpleTracker { + fn default() -> Self { + Self(Tracker::new(NoopCollector)) + } +} + +impl SimpleTracker { + /// Increase consumption of this tracker by bytes + #[inline] + pub fn consume(&self, bytes: usize) { + self.0.consume(bytes); + } + + /// Decrease consumption of this tracker by bytes + /// + /// The caller should guarantee the released bytes wont larger than bytes + /// already consumed + #[inline] + pub fn release(&self, bytes: usize) { + self.0.release(bytes); + } + + /// Bytes allocated + pub fn bytes_allocated(&self) -> usize { + self.0.bytes_allocated() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_simple_tracker() { + let tracker = SimpleTracker::default(); + tracker.consume(256); + assert_eq!(256, tracker.bytes_allocated()); + + tracker.release(100); + assert_eq!(156, tracker.bytes_allocated()); + } + + #[test] + fn test_collector() { + use std::sync::atomic::AtomicBool; + + struct MockCollector { + allocated: AtomicBool, + freed: AtomicBool, + } + + impl MockCollector { + fn new() -> Self { + Self { + allocated: AtomicBool::new(false), + freed: AtomicBool::new(false), + } + } + } + + impl Drop for MockCollector { + fn drop(&mut self) { + assert!(*self.allocated.get_mut()); + assert!(*self.freed.get_mut()); + } + } + + impl Collector for MockCollector { + fn on_allocate(&self, bytes: usize) { + assert_eq!(800, bytes); + self.allocated.store(true, Ordering::Relaxed); + } + + fn on_free(&self, bytes: usize) { + if self.freed.load(Ordering::Relaxed) { + assert_eq!(440, bytes); + } else { + assert_eq!(360, bytes); + } + self.freed.store(true, Ordering::Relaxed); + } + } + + let tracker = Tracker::new(MockCollector::new()); + 
tracker.consume(800); + tracker.release(360); + } +} diff --git a/common_util/src/codec/compact/bytes.rs b/common_util/src/codec/compact/bytes.rs new file mode 100644 index 0000000000..aeeff7739d --- /dev/null +++ b/common_util/src/codec/compact/bytes.rs @@ -0,0 +1,130 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Bytes format + +use std::convert::TryFrom; + +use common_types::bytes::{Bytes, BytesMut, MemBuf, MemBufMut}; +use snafu::{ensure, ResultExt}; + +use crate::codec::{ + compact::{ + DecodeEmptyValue, DecodeValue, DecodeVarint, EncodeValue, EncodeVarint, Error, + MemCompactDecoder, MemCompactEncoder, Result, TryIntoUsize, + }, + consts, varint, DecodeTo, Encoder, +}; + +impl Encoder<[u8]> for MemCompactEncoder { + type Error = Error; + + // EncodeCompactBytes joins bytes with its length into a byte slice. It is more + // efficient in both space and time compare to EncodeBytes. Note that the + // encoded result is not memcomparable. + fn encode(&self, buf: &mut B, value: &[u8]) -> Result<()> { + varint::encode_varint(buf, value.len() as i64).context(EncodeVarint)?; + buf.write_slice(value).context(EncodeValue)?; + Ok(()) + } + + fn estimate_encoded_size(&self, value: &[u8]) -> usize { + consts::MAX_VARINT_BYTES + value.len() + } +} + +impl Encoder for MemCompactEncoder { + type Error = Error; + + fn encode(&self, buf: &mut B, value: &Bytes) -> Result<()> { + self.encode(buf, &value[..]) + } + + fn estimate_encoded_size(&self, value: &Bytes) -> usize { + self.estimate_encoded_size(&value[..]) + } +} + +impl DecodeTo for MemCompactDecoder { + type Error = Error; + + fn decode_to(&self, buf: &mut B, value: &mut BytesMut) -> Result<()> { + let v = usize::try_from(varint::decode_varint(buf).context(DecodeVarint)?) 
+ .context(TryIntoUsize)?; + ensure!(buf.remaining_slice().len() >= v, DecodeEmptyValue); + value + .write_slice(&buf.remaining_slice()[..v]) + .context(DecodeValue)?; + buf.must_advance(v); + Ok(()) + } +} + +#[cfg(test)] +mod test { + use super::*; + + struct BytesTest { + data: Bytes, + estimate_encoded_size: usize, + } + + #[test] + fn test_compact_bytes_codec() { + let data = vec![ + BytesTest { + data: Bytes::from_static(b""), + estimate_encoded_size: 10, + }, + BytesTest { + data: Bytes::from_static(b"hello1"), + estimate_encoded_size: 16, + }, + BytesTest { + data: Bytes::from_static(b"hello2"), + estimate_encoded_size: 16, + }, + BytesTest { + data: Bytes::from_static(b"hello3"), + estimate_encoded_size: 16, + }, + BytesTest { + data: Bytes::from_static(&[0x00, 0x01]), + estimate_encoded_size: 12, + }, + BytesTest { + data: Bytes::from_static(&[0xff, 0xff]), + estimate_encoded_size: 12, + }, + BytesTest { + data: Bytes::from_static(&[0x01, 0x00]), + estimate_encoded_size: 12, + }, + BytesTest { + data: Bytes::from_static(b"abc"), + estimate_encoded_size: 13, + }, + BytesTest { + data: Bytes::from_static(b"hello world"), + estimate_encoded_size: 21, + }, + ]; + + let encoder = MemCompactEncoder; + let mut buf = vec![]; + for x in &data { + encoder.encode(&mut buf, &x.data).unwrap(); + assert_eq!( + x.estimate_encoded_size, + encoder.estimate_encoded_size(&x.data) + ); + } + + let decoder = MemCompactDecoder; + let mut buf = &buf[..]; + for x in &data { + let mut d = BytesMut::new(); + decoder.decode_to(&mut buf, &mut d).unwrap(); + assert_eq!(d, x.data); + } + } +} diff --git a/common_util/src/codec/compact/datum.rs b/common_util/src/codec/compact/datum.rs new file mode 100644 index 0000000000..0d80088e06 --- /dev/null +++ b/common_util/src/codec/compact/datum.rs @@ -0,0 +1,264 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Datum compact codec + +use common_types::{ + bytes::{BytesMut, MemBuf, MemBufMut}, + datum::Datum, + string::StringBytes, + time::Timestamp, +}; +use snafu::ResultExt; + +use crate::codec::{ + compact::{EncodeKey, Error, MemCompactDecoder, MemCompactEncoder, Result}, + consts, DecodeTo, Encoder, +}; + +// For float points, we use same encoding as mem comparable encoder +impl Encoder for MemCompactEncoder { + type Error = Error; + + fn encode(&self, buf: &mut B, value: &Datum) -> Result<()> { + match value { + Datum::Null => buf.write_u8(consts::NULL_FLAG).context(EncodeKey), + Datum::Timestamp(ts) => { + buf.write_u8(consts::VARINT_FLAG).context(EncodeKey)?; + self.encode(buf, &ts.as_i64()) + } + Datum::Double(v) => { + buf.write_u8(consts::FLOAT_FLAG).context(EncodeKey)?; + self.encode(buf, v) + } + Datum::Float(v) => { + buf.write_u8(consts::FLOAT_FLAG).context(EncodeKey)?; + self.encode(buf, v) + } + Datum::Varbinary(v) => { + buf.write_u8(consts::COMPACT_BYTES_FLAG) + .context(EncodeKey)?; + self.encode(buf, v) + } + // For string, just encode/decode like bytes. 
+ Datum::String(v) => { + buf.write_u8(consts::COMPACT_BYTES_FLAG) + .context(EncodeKey)?; + self.encode(buf, v.as_bytes()) + } + Datum::UInt64(v) => { + buf.write_u8(consts::UVARINT_FLAG).context(EncodeKey)?; + self.encode(buf, v) + } + Datum::UInt32(v) => { + buf.write_u8(consts::UVARINT_FLAG).context(EncodeKey)?; + self.encode(buf, &(u64::from(*v))) + } + Datum::UInt16(v) => { + buf.write_u8(consts::UVARINT_FLAG).context(EncodeKey)?; + self.encode(buf, &(u64::from(*v))) + } + Datum::UInt8(v) => { + buf.write_u8(consts::UVARINT_FLAG).context(EncodeKey)?; + self.encode(buf, &(u64::from(*v))) + } + Datum::Int64(v) => { + buf.write_u8(consts::VARINT_FLAG).context(EncodeKey)?; + self.encode(buf, v) + } + Datum::Int32(v) => { + buf.write_u8(consts::VARINT_FLAG).context(EncodeKey)?; + self.encode(buf, &(i64::from(*v))) + } + Datum::Int16(v) => { + buf.write_u8(consts::VARINT_FLAG).context(EncodeKey)?; + self.encode(buf, &(i64::from(*v))) + } + Datum::Int8(v) => { + buf.write_u8(consts::VARINT_FLAG).context(EncodeKey)?; + self.encode(buf, &(i64::from(*v))) + } + Datum::Boolean(v) => { + buf.write_u8(consts::UVARINT_FLAG).context(EncodeKey)?; + self.encode(buf, &(u64::from(*v))) + } + } + } + + fn estimate_encoded_size(&self, value: &Datum) -> usize { + match value { + // Null takes 1 byte + Datum::Null => 1, + Datum::Timestamp(ts) => self.estimate_encoded_size(&ts.as_i64()), + Datum::Double(v) => self.estimate_encoded_size(v), + Datum::Float(v) => self.estimate_encoded_size(v), + Datum::Varbinary(v) => self.estimate_encoded_size(v), + Datum::String(v) => self.estimate_encoded_size(v.as_bytes()), + Datum::UInt64(v) => self.estimate_encoded_size(v), + Datum::UInt32(v) => self.estimate_encoded_size(&(u64::from(*v))), + Datum::UInt16(v) => self.estimate_encoded_size(&(u64::from(*v))), + Datum::UInt8(v) => self.estimate_encoded_size(&(u64::from(*v))), + Datum::Int64(v) => self.estimate_encoded_size(v), + Datum::Int32(v) => self.estimate_encoded_size(&(i64::from(*v))), + 
Datum::Int16(v) => self.estimate_encoded_size(&(i64::from(*v))), + Datum::Int8(v) => self.estimate_encoded_size(&(i64::from(*v))), + Datum::Boolean(v) => self.estimate_encoded_size(&(u64::from(*v))), + } + } +} + +macro_rules! decode_var_u64_into { + ($self: ident, $v: ident, $actual: ident, $buf: ident, $type: ty) => {{ + Self::ensure_flag(consts::UVARINT_FLAG, $actual)?; + let mut data = 0u64; + $self.decode_to($buf, &mut data)?; + *$v = data as $type; + }}; +} + +macro_rules! decode_var_u64_into_bool { + ($self: ident, $v: ident, $actual: ident, $buf: ident) => {{ + Self::ensure_flag(consts::UVARINT_FLAG, $actual)?; + let mut data = 0u64; + $self.decode_to($buf, &mut data)?; + *$v = data != 0; + }}; +} + +macro_rules! decode_var_i64_into { + ($self: ident, $v: ident, $actual: ident, $buf: ident, $type: ty) => {{ + Self::ensure_flag(consts::VARINT_FLAG, $actual)?; + let mut data = 0i64; + $self.decode_to($buf, &mut data)?; + *$v = data as $type; + }}; +} + +impl DecodeTo for MemCompactDecoder { + type Error = Error; + + /// REQUIRE: The datum type should match the type in buf + /// + /// For string datum, the utf8 check will be skipped. + fn decode_to(&self, buf: &mut B, value: &mut Datum) -> Result<()> { + let actual = match self.maybe_read_null(buf)? 
{ + Some(v) => v, + None => { + *value = Datum::Null; + return Ok(()); + } + }; + + match value { + Datum::Null => { + Self::ensure_flag(consts::NULL_FLAG, actual)?; + } + Datum::Timestamp(ts) => { + Self::ensure_flag(consts::VARINT_FLAG, actual)?; + let mut data = 0; + self.decode_to(buf, &mut data)?; + *ts = Timestamp::new(data); + } + Datum::Double(v) => { + Self::ensure_flag(consts::FLOAT_FLAG, actual)?; + self.decode_to(buf, v)?; + } + Datum::Float(v) => { + Self::ensure_flag(consts::FLOAT_FLAG, actual)?; + self.decode_to(buf, v)?; + } + Datum::Varbinary(v) => { + Self::ensure_flag(consts::COMPACT_BYTES_FLAG, actual)?; + let mut data = BytesMut::new(); + self.decode_to(buf, &mut data)?; + *v = data.freeze(); + } + Datum::String(v) => { + Self::ensure_flag(consts::COMPACT_BYTES_FLAG, actual)?; + let mut data = BytesMut::new(); + self.decode_to(buf, &mut data)?; + // For string datum, we won't validate whether the bytes is a valid utf string + // during decoding to improve decode performance. The encoder + // should already done the utf8 check. + unsafe { + *v = StringBytes::from_bytes_unchecked(data.freeze()); + } + } + Datum::UInt64(v) => { + Self::ensure_flag(consts::UVARINT_FLAG, actual)?; + self.decode_to(buf, v)?; + } + Datum::UInt32(v) => decode_var_u64_into!(self, v, actual, buf, u32), + Datum::UInt16(v) => decode_var_u64_into!(self, v, actual, buf, u16), + Datum::UInt8(v) => decode_var_u64_into!(self, v, actual, buf, u8), + Datum::Int64(v) => { + Self::ensure_flag(consts::VARINT_FLAG, actual)?; + self.decode_to(buf, v)?; + } + Datum::Int32(v) => decode_var_i64_into!(self, v, actual, buf, i32), + Datum::Int16(v) => decode_var_i64_into!(self, v, actual, buf, i16), + Datum::Int8(v) => decode_var_i64_into!(self, v, actual, buf, i8), + Datum::Boolean(v) => decode_var_u64_into_bool!(self, v, actual, buf), + } + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use common_types::bytes::Bytes; + + use super::*; + + // TODO(yingwen): Test nullable. 
+ #[test] + fn test_datum_codec() { + let data = vec![ + // (datum to encode, estimate_encoded_size) + (Datum::Null, 1), + (Datum::Timestamp(Timestamp::new(12345)), 10), + (Datum::Double(10.5), 8), + (Datum::Float(1.99), 4), + (Datum::Varbinary(Bytes::from_static(b"hello world")), 21), + (Datum::String(StringBytes::from_static("hello world")), 21), + (Datum::UInt64(12345), 10), + (Datum::UInt32(1000), 10), + (Datum::UInt16(65000), 10), + (Datum::UInt8(150), 10), + (Datum::Int64(-100209), 10), + (Datum::Int32(-10020), 10), + (Datum::Int16(32500), 10), + (Datum::Int8(-120), 10), + (Datum::Boolean(true), 10), + (Datum::Boolean(false), 10), + ]; + let mut decoded = vec![ + Datum::Null, + Datum::Timestamp(Timestamp::new(0)), + Datum::Double(0.0), + Datum::Float(0.0), + Datum::Varbinary(Bytes::new()), + Datum::String(StringBytes::new()), + Datum::UInt64(0), + Datum::UInt32(0), + Datum::UInt16(0), + Datum::UInt8(0), + Datum::Int64(0), + Datum::Int32(0), + Datum::Int16(0), + Datum::Int8(0), + Datum::Boolean(false), + Datum::Boolean(false), + ]; + let encoder = MemCompactEncoder; + let decoder = MemCompactDecoder; + for (index, x) in data.iter().enumerate() { + let mut buf = vec![]; + encoder.encode(&mut buf, &x.0).unwrap(); + assert_eq!(x.1, encoder.estimate_encoded_size(&x.0)); + decoder + .decode_to(&mut buf.as_slice(), &mut decoded[index]) + .unwrap(); + assert_eq!(decoded[index], data[index].0); + } + } +} diff --git a/common_util/src/codec/compact/float.rs b/common_util/src/codec/compact/float.rs new file mode 100644 index 0000000000..867ff3282b --- /dev/null +++ b/common_util/src/codec/compact/float.rs @@ -0,0 +1,101 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +use std::mem; + +use common_types::bytes::{MemBuf, MemBufMut}; +use snafu::ResultExt; + +use crate::codec::{ + compact::{DecodeValue, EncodeValue, Error, MemCompactDecoder, MemCompactEncoder, Result}, + DecodeTo, Encoder, +}; + +impl Encoder for MemCompactEncoder { + type Error = Error; + + fn encode(&self, buf: &mut B, value: &f64) -> Result<()> { + buf.write_f64(*value).context(EncodeValue)?; + Ok(()) + } + + fn estimate_encoded_size(&self, _value: &f64) -> usize { + mem::size_of::() + } +} + +impl DecodeTo for MemCompactDecoder { + type Error = Error; + + fn decode_to(&self, buf: &mut B, value: &mut f64) -> Result<()> { + *value = buf.read_f64().context(DecodeValue)?; + Ok(()) + } +} + +impl Encoder for MemCompactEncoder { + type Error = Error; + + fn encode(&self, buf: &mut B, value: &f32) -> Result<()> { + buf.write_f32(*value).context(EncodeValue)?; + Ok(()) + } + + fn estimate_encoded_size(&self, _value: &f32) -> usize { + mem::size_of::() + } +} + +impl DecodeTo for MemCompactDecoder { + type Error = Error; + + fn decode_to(&self, buf: &mut B, value: &mut f32) -> Result<()> { + *value = buf.read_f32().context(DecodeValue)?; + Ok(()) + } +} + +#[cfg(test)] +mod test { + use super::*; + + struct TestF64 { + data: f64, + estimate_encoded_size: usize, + } + + #[test] + fn test_compact_f64_codec() { + let data = vec![ + TestF64 { + data: 162132470.5, + estimate_encoded_size: 8, + }, + TestF64 { + data: f64::MIN, + estimate_encoded_size: 8, + }, + TestF64 { + data: f64::MAX, + estimate_encoded_size: 8, + }, + ]; + + let encoder = MemCompactEncoder; + let mut buf = vec![]; + for x in &data { + encoder.encode(&mut buf, &x.data).unwrap(); + assert_eq!( + x.estimate_encoded_size, + encoder.estimate_encoded_size(&x.data) + ); + } + + let decoder = MemCompactDecoder; + let mut buf = &buf[..]; + for x in &data { + let mut d = 0.0; + decoder.decode_to(&mut buf, &mut d).unwrap(); + assert!((d - x.data).abs() < f64::EPSILON); + } + } +} diff --git 
a/common_util/src/codec/compact/mod.rs b/common_util/src/codec/compact/mod.rs new file mode 100644 index 0000000000..1327e05929 --- /dev/null +++ b/common_util/src/codec/compact/mod.rs @@ -0,0 +1,92 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Mem compact format codec + +// Implementation reference: +// https://github.com/pingcap/tidb/blob/bd011d3c9567c506d8d4343ade03edf77fcd5b56/util/codec/codec.go +mod bytes; +mod datum; +mod float; +mod number; + +use common_types::bytes::MemBuf; +use snafu::{ensure, Backtrace, ResultExt, Snafu}; + +use crate::codec::consts; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to encode flag, err:{}", source))] + EncodeKey { source: common_types::bytes::Error }, + + #[snafu(display("Failed to encode value, err:{}", source))] + EncodeValue { source: common_types::bytes::Error }, + + #[snafu(display("Failed to encode varint, err:{}", source))] + EncodeVarint { source: crate::codec::varint::Error }, + + #[snafu(display("Failed to decode varint, err:{}", source))] + DecodeVarint { source: crate::codec::varint::Error }, + + #[snafu(display("Failed to decode key, err:{}", source))] + DecodeKey { source: common_types::bytes::Error }, + + #[snafu(display("Insufficient bytes to decode value.\nBacktrace:\n{}", backtrace))] + DecodeEmptyValue { backtrace: Backtrace }, + + #[snafu(display( + "Invalid flag, expect:{}, actual:{}.\nBacktrace:\n{}", + expect, + actual, + backtrace + ))] + InvalidKeyFlag { + expect: u8, + actual: u8, + backtrace: Backtrace, + }, + + #[snafu(display("Insufficient bytes to decode value, err:{}", source))] + DecodeValue { source: common_types::bytes::Error }, + + #[snafu(display("Try into usize error:{}.\nBacktrace:\n{}", source, backtrace))] + TryIntoUsize { + source: std::num::TryFromIntError, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to decode string, err:{}", source))] + DecodeString { source: common_types::string::Error }, + + 
#[snafu(display("Datum cannot be null.\nBacktrace:\n{}", backtrace))] + NullDatum { backtrace: Backtrace }, +} + +define_result!(Error); + +/// Mem compact encoder +pub struct MemCompactEncoder; + +/// Mem compact decoder +pub struct MemCompactDecoder; + +impl MemCompactDecoder { + /// Returns None if we need to return null datum, otherwise return the flag. + fn maybe_read_null(&self, buf: &mut B) -> Result> { + let actual = buf.read_u8().context(DecodeKey)?; + // If actual flag is null, need to check whether this datum is nullable. + if actual == consts::NULL_FLAG { + // The decoder need to return null datum. + return Ok(None); + } + + Ok(Some(actual)) + } + + #[inline] + fn ensure_flag(expect: u8, actual: u8) -> Result<()> { + // Actual flag is not null. + ensure!(expect == actual, InvalidKeyFlag { expect, actual }); + Ok(()) + } +} diff --git a/common_util/src/codec/compact/number.rs b/common_util/src/codec/compact/number.rs new file mode 100644 index 0000000000..56aa76504f --- /dev/null +++ b/common_util/src/codec/compact/number.rs @@ -0,0 +1,160 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Number format + +use common_types::bytes::{MemBuf, MemBufMut}; +use snafu::ResultExt; + +use crate::codec::{ + compact::{DecodeVarint, EncodeVarint, Error, MemCompactDecoder, MemCompactEncoder, Result}, + consts, varint, DecodeTo, Encoder, +}; + +impl Encoder for MemCompactEncoder { + type Error = Error; + + fn encode(&self, buf: &mut B, value: &i64) -> Result<()> { + varint::encode_varint(buf, *value).context(EncodeVarint)?; + Ok(()) + } + + fn estimate_encoded_size(&self, _value: &i64) -> usize { + consts::MAX_VARINT_BYTES + } +} + +impl DecodeTo for MemCompactDecoder { + type Error = Error; + + fn decode_to(&self, buf: &mut B, value: &mut i64) -> Result<()> { + *value = varint::decode_varint(buf).context(DecodeVarint)?; + Ok(()) + } +} + +impl Encoder for MemCompactEncoder { + type Error = Error; + + fn encode(&self, buf: &mut B, value: &u64) -> Result<()> { + varint::encode_uvarint(buf, *value).context(EncodeVarint)?; + Ok(()) + } + + fn estimate_encoded_size(&self, _value: &u64) -> usize { + consts::MAX_UVARINT_BYTES + } +} + +impl DecodeTo for MemCompactDecoder { + type Error = Error; + + fn decode_to(&self, buf: &mut B, value: &mut u64) -> Result<()> { + *value = varint::decode_uvarint(buf).context(DecodeVarint)?; + Ok(()) + } +} + +#[cfg(test)] +mod test { + use super::*; + + struct TestI64 { + data: i64, + estimate_encoded_size: usize, + } + #[test] + fn test_compact_i64_codec() { + let data = vec![ + TestI64 { + data: 1621324705, + estimate_encoded_size: 10, + }, + TestI64 { + data: 1621324705000, + estimate_encoded_size: 10, + }, + TestI64 { + data: 1521324705, + estimate_encoded_size: 10, + }, + TestI64 { + data: 1621324705123, + estimate_encoded_size: 10, + }, + TestI64 { + data: i64::MIN, + estimate_encoded_size: 10, + }, + TestI64 { + data: i64::MIN + 1, + estimate_encoded_size: 10, + }, + TestI64 { + data: 0, + estimate_encoded_size: 10, + }, + TestI64 { + data: i64::MAX, + estimate_encoded_size: 10, + }, + TestI64 { + data: (1 << 47) - 1, + 
estimate_encoded_size: 10, + }, + TestI64 { + data: -1 << 47, + estimate_encoded_size: 10, + }, + TestI64 { + data: (1 << 23) - 1, + estimate_encoded_size: 10, + }, + TestI64 { + data: -1 << 23, + estimate_encoded_size: 10, + }, + TestI64 { + data: (1 << 33) - 1, + estimate_encoded_size: 10, + }, + TestI64 { + data: -1 << 33, + estimate_encoded_size: 10, + }, + TestI64 { + data: (1 << 55) - 1, + estimate_encoded_size: 10, + }, + TestI64 { + data: -1 << 55, + estimate_encoded_size: 10, + }, + TestI64 { + data: 1, + estimate_encoded_size: 10, + }, + TestI64 { + data: -1, + estimate_encoded_size: 10, + }, + ]; + + let encoder = MemCompactEncoder; + let mut buf = vec![]; + for x in &data { + encoder.encode(&mut buf, &x.data).unwrap(); + assert_eq!( + x.estimate_encoded_size, + encoder.estimate_encoded_size(&x.data) + ); + } + + let decoder = MemCompactDecoder; + let mut buf = &buf[..]; + for x in &data { + let mut d = -1; + decoder.decode_to(&mut buf, &mut d).unwrap(); + assert_eq!(d, x.data); + } + } +} diff --git a/common_util/src/codec/consts.rs b/common_util/src/codec/consts.rs new file mode 100644 index 0000000000..843985eec6 --- /dev/null +++ b/common_util/src/codec/consts.rs @@ -0,0 +1,21 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Common constants used in codec + +// First byte in the encoded value which specifies the encoding type. +// TODO(yingwen): Replace flags by datum kind. (Incompatible with old format). 
+pub const NULL_FLAG: u8 = 0; +pub const BYTES_FLAG: u8 = 1; +pub const COMPACT_BYTES_FLAG: u8 = 2; +pub const INT_FLAG: u8 = 3; +pub const UINT_FLAG: u8 = 4; +pub const FLOAT_FLAG: u8 = 5; +pub const VARINT_FLAG: u8 = 8; +pub const UVARINT_FLAG: u8 = 9; + +/// Max bytes varint can use +pub const MAX_VARINT_BYTES: usize = 10; +/// Max bytes uvarint can be use +pub const MAX_UVARINT_BYTES: usize = 10; +/// Sign mask for u64/i64 conversion +pub const SIGN_MASK: u64 = 0x8000000000000000; diff --git a/common_util/src/codec/memcomparable/bytes.rs b/common_util/src/codec/memcomparable/bytes.rs new file mode 100644 index 0000000000..878ad9c051 --- /dev/null +++ b/common_util/src/codec/memcomparable/bytes.rs @@ -0,0 +1,279 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Bytes format + +use common_types::bytes::{Bytes, BytesMut, MemBuf, MemBufMut}; +use snafu::{ensure, ResultExt}; + +use crate::codec::{ + memcomparable::{ + DecodeValueGroup, DecodeValueMarker, DecodeValuePadding, EncodeValue, Error, MemComparable, + Result, + }, + DecodeTo, Encoder, +}; + +const ENC_GROUP_SIZE: usize = 8; +const ENC_MARKER: u8 = 0xFF; +const ENC_PAD: u8 = 0x0; +const PADS: [u8; ENC_GROUP_SIZE] = [0; ENC_GROUP_SIZE]; + +impl Encoder<[u8]> for MemComparable { + type Error = Error; + + // encode Bytes guarantees the encoded value is in ascending order for + // comparison, encoding with the following rule: + // [group1][marker1]...[groupN][markerN] + // group is 8 bytes slice which is padding with 0. 
+ // marker is `0xFF - padding 0 count` + // For example: + // + // ``` + // [] -> [0, 0, 0, 0, 0, 0, 0, 0, 247] + // [1, 2, 3] -> [1, 2, 3, 0, 0, 0, 0, 0, 250] + // [1, 2, 3, 0] -> [1, 2, 3, 0, 0, 0, 0, 0, 251] + // [1, 2, 3, 4, 5, 6, 7, 8] -> [1, 2, 3, 4, 5, 6, 7, 8, 255, 0, 0, 0, 0, 0, 0, 0, 0, 247] + // ``` + // + // Refer: https://github.com/facebook/mysql-5.6/wiki/MyRocks-record-format#memcomparable-format + fn encode(&self, buf: &mut B, value: &[u8]) -> Result<()> { + let value_len = value.len(); + for idx in (0..=value_len).step_by(ENC_GROUP_SIZE) { + let remain = value_len - idx; + let mut pad_count = 0; + if remain >= ENC_GROUP_SIZE { + buf.write_slice(&value[idx..idx + ENC_GROUP_SIZE]) + .context(EncodeValue)?; + } else { + pad_count = ENC_GROUP_SIZE - remain; + buf.write_slice(&value[idx..]).context(EncodeValue)?; + buf.write_slice(&PADS[..pad_count]).context(EncodeValue)?; + } + let marker = ENC_MARKER - pad_count as u8; + buf.write_u8(marker).context(EncodeValue)?; + } + Ok(()) + } + + // Allocate more space to avoid unnecessary slice growing. + // Assume that the byte slice size is about `(len(data) / encGroupSize + 1) * + // (encGroupSize + 1)` bytes, that is `(len(data) / 8 + 1) * 9` in our + // implement. + fn estimate_encoded_size(&self, value: &[u8]) -> usize { + (value.len() / ENC_GROUP_SIZE + 1) * (ENC_GROUP_SIZE + 1) + } +} + +impl Encoder for MemComparable { + type Error = Error; + + fn encode(&self, buf: &mut B, value: &Bytes) -> Result<()> { + self.encode(buf, &value[..]) + } + + fn estimate_encoded_size(&self, value: &Bytes) -> usize { + self.estimate_encoded_size(&value[..]) + } +} + +impl DecodeTo for MemComparable { + type Error = Error; + + // decode Bytes which is encoded by encode Bytes before, + // returns the leftover bytes and decoded value if no error. 
+ fn decode_to(&self, buf: &mut B, value: &mut BytesMut) -> Result<()> { + loop { + let b = buf.remaining_slice(); + ensure!(b.len() > ENC_GROUP_SIZE, DecodeValueGroup); + + let group_bytes = &b[..ENC_GROUP_SIZE + 1]; + let group = &group_bytes[..ENC_GROUP_SIZE]; + let marker = group_bytes[ENC_GROUP_SIZE]; + let pad_count = usize::from(ENC_MARKER - marker); + ensure!( + pad_count <= ENC_GROUP_SIZE, + DecodeValueMarker { group_bytes } + ); + + let real_group_size = ENC_GROUP_SIZE - pad_count; + value + .write_slice(&group[..real_group_size]) + .context(EncodeValue)?; + + if pad_count != 0 { + // Check validity of padding bytes. + for v in &group[real_group_size..] { + ensure!(*v == ENC_PAD, DecodeValuePadding { group_bytes }); + } + buf.must_advance(ENC_GROUP_SIZE + 1); + + break; + } + buf.must_advance(ENC_GROUP_SIZE + 1); + } + Ok(()) + } +} + +#[cfg(test)] +mod test { + use core::cmp::Ordering; + + use super::*; + + struct BytesTest { + data: Bytes, + estimate_encoded_size: usize, + } + + #[test] + fn test_bytes_codec() { + let data = vec![ + BytesTest { + data: Bytes::from_static(b""), + estimate_encoded_size: 9, + }, + BytesTest { + data: Bytes::from_static(b"hello1"), + estimate_encoded_size: 9, + }, + BytesTest { + data: Bytes::from_static(b"hello2"), + estimate_encoded_size: 9, + }, + BytesTest { + data: Bytes::from_static(b"hello3"), + estimate_encoded_size: 9, + }, + BytesTest { + data: Bytes::from_static(&[0x00, 0x01]), + estimate_encoded_size: 9, + }, + BytesTest { + data: Bytes::from_static(&[0xff, 0xff]), + estimate_encoded_size: 9, + }, + BytesTest { + data: Bytes::from_static(&[0x01, 0x00]), + estimate_encoded_size: 9, + }, + BytesTest { + data: Bytes::from_static(b"abc"), + estimate_encoded_size: 9, + }, + BytesTest { + data: Bytes::from_static(b"hello world"), + estimate_encoded_size: 18, + }, + ]; + + let c = MemComparable; + let mut buf = vec![]; + for x in &data { + c.encode(&mut buf, &x.data).unwrap(); + assert_eq!(x.estimate_encoded_size, 
c.estimate_encoded_size(&x.data)); + } + + let mut buf = &buf[..]; + for x in &data { + let mut d = BytesMut::new(); + c.decode_to(&mut buf, &mut d).unwrap(); + assert_eq!(d, x.data); + } + } + + struct TbBytes { + arg1: Bytes, + arg2: Bytes, + ret: Ordering, + } + + #[test] + fn test_bytes_order() { + let data = vec![ + TbBytes { + arg1: Bytes::new(), + arg2: Bytes::from_static(&[0x00]), + ret: Ordering::Less, + }, + TbBytes { + arg1: Bytes::from_static(&[0x00]), + arg2: Bytes::from_static(&[0x00]), + ret: Ordering::Equal, + }, + TbBytes { + arg1: Bytes::from_static(&[0xFF]), + arg2: Bytes::from_static(&[0x00]), + ret: Ordering::Greater, + }, + TbBytes { + arg1: Bytes::from_static(&[0xFF]), + arg2: Bytes::from_static(&[0xFF, 0x00]), + ret: Ordering::Less, + }, + TbBytes { + arg1: Bytes::from_static(b"a"), + arg2: Bytes::from_static(b"b"), + ret: Ordering::Less, + }, + TbBytes { + arg1: Bytes::from_static(b"a"), + arg2: Bytes::from_static(&[0x00]), + ret: Ordering::Greater, + }, + TbBytes { + arg1: Bytes::from_static(&[0x00]), + arg2: Bytes::from_static(&[0x01]), + ret: Ordering::Less, + }, + TbBytes { + arg1: Bytes::from_static(&[0x00, 0x01]), + arg2: Bytes::from_static(&[0x00, 0x00]), + ret: Ordering::Greater, + }, + TbBytes { + arg1: Bytes::from_static(&[0x00, 0x00, 0x00]), + arg2: Bytes::from_static(&[0x00, 0x00]), + ret: Ordering::Greater, + }, + TbBytes { + arg1: Bytes::from_static(&[0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00]), + arg2: Bytes::from_static(&[0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00]), + ret: Ordering::Less, + }, + TbBytes { + arg1: Bytes::from_static(&[0x01, 0x02, 0x03, 0x00]), + arg2: Bytes::from_static(&[0x01, 0x02, 0x03]), + ret: Ordering::Greater, + }, + TbBytes { + arg1: Bytes::from_static(&[0x01, 0x03, 0x03, 0x04]), + arg2: Bytes::from_static(&[0x01, 0x03, 0x03, 0x05]), + ret: Ordering::Less, + }, + TbBytes { + arg1: Bytes::from_static(&[0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07]), + arg2: Bytes::from_static(&[0x01, 0x02, 
0x03, 0x04, 0x05, 0x06, 0x07, 0x08]), + ret: Ordering::Less, + }, + TbBytes { + arg1: Bytes::from_static(&[0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09]), + arg2: Bytes::from_static(&[0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08]), + ret: Ordering::Greater, + }, + TbBytes { + arg1: Bytes::from_static(&[0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x00]), + arg2: Bytes::from_static(&[0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08]), + ret: Ordering::Greater, + }, + ]; + let c = MemComparable; + for x in &data { + let mut buf1 = vec![]; + let mut buf2 = vec![]; + c.encode(&mut buf1, &x.arg1).unwrap(); + c.encode(&mut buf2, &x.arg2).unwrap(); + assert_eq!(x.ret, buf1.as_slice().cmp(buf2.as_slice())); + } + } +} diff --git a/common_util/src/codec/memcomparable/datum.rs b/common_util/src/codec/memcomparable/datum.rs new file mode 100644 index 0000000000..3af3d5f474 --- /dev/null +++ b/common_util/src/codec/memcomparable/datum.rs @@ -0,0 +1,290 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Datum comparable codec + +use common_types::{ + bytes::{BytesMut, MemBuf, MemBufMut}, + datum::{Datum, DatumKind}, + string::StringBytes, + time::Timestamp, +}; +use snafu::ResultExt; + +use crate::codec::{ + consts, + memcomparable::{EncodeKey, Error, MemComparable, Result, UnsupportedKind}, + DecodeTo, Encoder, +}; + +// TODO(yingwen): Consider collate for string. +impl Encoder for MemComparable { + type Error = Error; + + fn encode(&self, buf: &mut B, value: &Datum) -> Result<()> { + match value { + Datum::Null => buf.write_u8(consts::NULL_FLAG).context(EncodeKey), + Datum::Timestamp(ts) => { + buf.write_u8(consts::INT_FLAG).context(EncodeKey)?; + self.encode(buf, &ts.as_i64()) + } + Datum::Varbinary(v) => { + buf.write_u8(consts::BYTES_FLAG).context(EncodeKey)?; + self.encode(buf, v) + } + // For string, we just use same encoding method as bytes now. 
+ Datum::String(v) => { + buf.write_u8(consts::BYTES_FLAG).context(EncodeKey)?; + self.encode(buf, v.as_bytes()) + } + Datum::UInt64(v) => { + buf.write_u8(consts::UINT_FLAG).context(EncodeKey)?; + self.encode(buf, v) + } + Datum::UInt32(v) => { + buf.write_u8(consts::UINT_FLAG).context(EncodeKey)?; + self.encode(buf, &(u64::from(*v))) + } + Datum::UInt16(v) => { + buf.write_u8(consts::UINT_FLAG).context(EncodeKey)?; + self.encode(buf, &(u64::from(*v))) + } + Datum::UInt8(v) => { + buf.write_u8(consts::UINT_FLAG).context(EncodeKey)?; + self.encode(buf, &(u64::from(*v))) + } + Datum::Int64(v) => { + buf.write_u8(consts::INT_FLAG).context(EncodeKey)?; + self.encode(buf, v) + } + Datum::Int32(v) => { + buf.write_u8(consts::INT_FLAG).context(EncodeKey)?; + self.encode(buf, &(i64::from(*v))) + } + Datum::Int16(v) => { + buf.write_u8(consts::INT_FLAG).context(EncodeKey)?; + self.encode(buf, &(i64::from(*v))) + } + Datum::Int8(v) => { + buf.write_u8(consts::INT_FLAG).context(EncodeKey)?; + self.encode(buf, &(i64::from(*v))) + } + Datum::Boolean(v) => { + buf.write_u8(consts::UINT_FLAG).context(EncodeKey)?; + self.encode(buf, &(u64::from(*v))) + } + Datum::Double(_) => UnsupportedKind { + kind: DatumKind::Double, + } + .fail(), + Datum::Float(_) => UnsupportedKind { + kind: DatumKind::Float, + } + .fail(), + } + } + + fn estimate_encoded_size(&self, value: &Datum) -> usize { + match value { + // Null takes 1 byte + Datum::Null => 1, + Datum::Timestamp(ts) => self.estimate_encoded_size(&ts.as_i64()), + Datum::Varbinary(v) => self.estimate_encoded_size(v), + Datum::String(v) => self.estimate_encoded_size(v.as_bytes()), + Datum::UInt64(v) => self.estimate_encoded_size(v), + Datum::UInt32(v) => self.estimate_encoded_size(&(u64::from(*v))), + Datum::UInt16(v) => self.estimate_encoded_size(&(u64::from(*v))), + Datum::UInt8(v) => self.estimate_encoded_size(&(u64::from(*v))), + Datum::Int64(v) => self.estimate_encoded_size(v), + Datum::Int32(v) => 
self.estimate_encoded_size(&(i64::from(*v))), + Datum::Int16(v) => self.estimate_encoded_size(&(i64::from(*v))), + Datum::Int8(v) => self.estimate_encoded_size(&(i64::from(*v))), + Datum::Boolean(v) => self.estimate_encoded_size(&(u64::from(*v))), + // Unsupported kind, but we return 1 + Datum::Double(_) | Datum::Float(_) => 1, + } + } +} + +macro_rules! decode_u64_into { + ($self: ident, $v: ident, $buf: ident, $type: ty) => {{ + Self::ensure_flag($buf, consts::UINT_FLAG)?; + let mut data = 0u64; + $self.decode_to($buf, &mut data)?; + *$v = data as $type; + }}; +} + +macro_rules! decode_u64_into_bool { + ($self: ident, $v: ident, $buf: ident) => {{ + Self::ensure_flag($buf, consts::UINT_FLAG)?; + let mut data = 0u64; + $self.decode_to($buf, &mut data)?; + *$v = data != 0; + }}; +} + +macro_rules! decode_i64_into { + ($self: ident, $v: ident, $buf: ident, $type: ty) => {{ + Self::ensure_flag($buf, consts::INT_FLAG)?; + let mut data = 0i64; + $self.decode_to($buf, &mut data)?; + *$v = data as $type; + }}; +} + +impl DecodeTo for MemComparable { + type Error = Error; + + /// REQUIRE: The datum type should match the type in buf + /// + /// For string datum, the utf8 check will be skipped. + fn decode_to(&self, buf: &mut B, value: &mut Datum) -> Result<()> { + match value { + Datum::Null => { + Self::ensure_flag(buf, consts::NULL_FLAG)?; + } + Datum::Timestamp(ts) => { + Self::ensure_flag(buf, consts::INT_FLAG)?; + let mut data = 0; + self.decode_to(buf, &mut data)?; + *ts = Timestamp::new(data); + } + Datum::Varbinary(v) => { + Self::ensure_flag(buf, consts::BYTES_FLAG)?; + let mut data = BytesMut::new(); + self.decode_to(buf, &mut data)?; + *v = data.freeze(); + } + Datum::String(v) => { + Self::ensure_flag(buf, consts::BYTES_FLAG)?; + let mut data = BytesMut::new(); + self.decode_to(buf, &mut data)?; + // For string datum, we won't validate whether the bytes is a valid utf string + // during decoding to improve decode performance. 
The encoder + // should already done the utf8 check. + unsafe { + *v = StringBytes::from_bytes_unchecked(data.freeze()); + } + } + Datum::UInt64(v) => { + Self::ensure_flag(buf, consts::UINT_FLAG)?; + self.decode_to(buf, v)?; + } + Datum::UInt32(v) => decode_u64_into!(self, v, buf, u32), + Datum::UInt16(v) => decode_u64_into!(self, v, buf, u16), + Datum::UInt8(v) => decode_u64_into!(self, v, buf, u8), + Datum::Int64(v) => { + Self::ensure_flag(buf, consts::INT_FLAG)?; + self.decode_to(buf, v)?; + } + Datum::Int32(v) => decode_i64_into!(self, v, buf, i32), + Datum::Int16(v) => decode_i64_into!(self, v, buf, i16), + Datum::Int8(v) => decode_i64_into!(self, v, buf, i8), + Datum::Boolean(v) => decode_u64_into_bool!(self, v, buf), + Datum::Double(_) => { + return UnsupportedKind { + kind: DatumKind::Double, + } + .fail(); + } + Datum::Float(_) => { + return UnsupportedKind { + kind: DatumKind::Float, + } + .fail(); + } + } + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use core::cmp::Ordering; + + use common_types::bytes::Bytes; + + use super::*; + + #[test] + fn test_datum_codec() { + let data = vec![ + // (datum to encode, estimate_encoded_size) + (Datum::Null, 1), + (Datum::Timestamp(Timestamp::new(12345)), 9), + (Datum::Varbinary(Bytes::from_static(b"hello world")), 18), + (Datum::String(StringBytes::from_static("hello world")), 18), + (Datum::UInt64(100209), 9), + (Datum::UInt32(10020), 9), + (Datum::UInt16(65000), 9), + (Datum::UInt8(150), 9), + (Datum::Int64(-100209), 9), + (Datum::Int32(-10020), 9), + (Datum::Int16(32500), 9), + (Datum::Int8(-120), 9), + (Datum::Boolean(true), 9), + (Datum::Boolean(false), 9), + ]; + let mut decoded = vec![ + Datum::Null, + Datum::Timestamp(Timestamp::new(0)), + Datum::Varbinary(Bytes::new()), + Datum::String(StringBytes::new()), + Datum::UInt64(0), + Datum::UInt32(0), + Datum::UInt16(0), + Datum::UInt8(0), + Datum::Int64(0), + Datum::Int32(0), + Datum::Int16(0), + Datum::Int8(0), + Datum::Boolean(false), + 
Datum::Boolean(false), + ]; + let c = MemComparable; + for (index, x) in data.iter().enumerate() { + let mut buf = vec![]; + c.encode(&mut buf, &x.0).unwrap(); + assert_eq!(x.1, c.estimate_encoded_size(&x.0)); + c.decode_to(&mut buf.as_slice(), &mut decoded[index]) + .unwrap(); + assert_eq!(decoded[index], data[index].0); + } + } + + #[test] + fn test_datum_order() { + let data = vec![ + // (arg1, arg2, cmp order of arg1 and arg2) + (Datum::Null, Datum::Null, Ordering::Equal), + ( + Datum::Timestamp(Timestamp::new(12345)), + Datum::Timestamp(Timestamp::new(123456)), + Ordering::Less, + ), + ( + Datum::Varbinary(Bytes::from_static(&[ + 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + ])), + Datum::Varbinary(Bytes::from_static(&[ + 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, + ])), + Ordering::Less, + ), + ( + Datum::String(StringBytes::from_static("abce123")), + Datum::String(StringBytes::from_static("abce1234")), + Ordering::Less, + ), + (Datum::UInt64(888), Datum::UInt64(889), Ordering::Less), + ]; + let c = MemComparable; + for x in &data { + let mut buf1 = vec![]; + let mut buf2 = vec![]; + c.encode(&mut buf1, &x.0).unwrap(); + c.encode(&mut buf2, &x.1).unwrap(); + assert_eq!(x.2, buf1.as_slice().cmp(buf2.as_slice())); + } + } +} diff --git a/common_util/src/codec/memcomparable/mod.rs b/common_util/src/codec/memcomparable/mod.rs new file mode 100644 index 0000000000..1321fffdab --- /dev/null +++ b/common_util/src/codec/memcomparable/mod.rs @@ -0,0 +1,98 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Mem comparable format codec + +// Implementation reference: +// https://github.com/pingcap/tidb/blob/bd011d3c9567c506d8d4343ade03edf77fcd5b56/util/codec/codec.go + +mod bytes; +mod datum; +mod number; + +use common_types::{ + bytes::{BytesMut, MemBuf}, + datum::DatumKind, +}; +use snafu::{ensure, Backtrace, ResultExt, Snafu}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to encode flag, err:{}", source))] + EncodeKey { source: common_types::bytes::Error }, + + #[snafu(display("Failed to encode value, err:{}", source))] + EncodeValue { source: common_types::bytes::Error }, + + #[snafu(display("Failed to decode key, err:{}", source))] + DecodeKey { source: common_types::bytes::Error }, + + #[snafu(display( + "Invalid flag, expect:{}, actual:{}.\nBacktrace:\n{}", + expect, + actual, + backtrace + ))] + InvalidKeyFlag { + expect: u8, + actual: u8, + backtrace: Backtrace, + }, + + #[snafu(display( + "Unsupported datum kind to compare in mem, kind :{}.\nBacktrace:\n{}", + kind, + backtrace + ))] + UnsupportedKind { + kind: DatumKind, + backtrace: Backtrace, + }, + + #[snafu(display("Insufficient bytes to decode value, err:{}", source))] + DecodeValue { source: common_types::bytes::Error }, + + #[snafu(display("Insufficient bytes to decode value group.\nBacktrace:\n{}", backtrace))] + DecodeValueGroup { backtrace: Backtrace }, + + #[snafu(display( + "Invalid marker byte, group bytes: {:?}.\nBacktrace:\n{}", + group_bytes, + backtrace + ))] + DecodeValueMarker { + group_bytes: BytesMut, + backtrace: Backtrace, + }, + + #[snafu(display( + "Invalid padding byte, group bytes: {:?}.\nBacktrace:\n{}", + group_bytes, + backtrace + ))] + DecodeValuePadding { + group_bytes: BytesMut, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to decode string, err:{}", source))] + DecodeString { source: common_types::string::Error }, +} + +define_result!(Error); + +/// Mem comparable codec +pub struct MemComparable; + +impl MemComparable { + fn 
ensure_flag(buf: &mut B, flag: u8) -> Result<()> { + let actual = buf.read_u8().context(DecodeKey)?; + ensure!( + flag == actual, + InvalidKeyFlag { + expect: flag, + actual + } + ); + Ok(()) + } +} diff --git a/common_util/src/codec/memcomparable/number.rs b/common_util/src/codec/memcomparable/number.rs new file mode 100644 index 0000000000..70cb36b03d --- /dev/null +++ b/common_util/src/codec/memcomparable/number.rs @@ -0,0 +1,333 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Number format + +use common_types::bytes::{MemBuf, MemBufMut}; +use snafu::ResultExt; + +use crate::codec::{ + consts, + memcomparable::{DecodeValue, EncodeValue, Error, MemComparable, Result}, + DecodeTo, Encoder, +}; + +impl Encoder for MemComparable { + type Error = Error; + + fn encode(&self, buf: &mut B, value: &i64) -> Result<()> { + buf.write_u64(encode_int_to_cmp_uint(*value)) + .context(EncodeValue)?; + Ok(()) + } + + fn estimate_encoded_size(&self, _value: &i64) -> usize { + // flag + u64 + 9 + } +} + +impl DecodeTo for MemComparable { + type Error = Error; + + fn decode_to(&self, buf: &mut B, value: &mut i64) -> Result<()> { + *value = decode_cmp_uint_to_int(buf.read_u64().context(DecodeValue)?); + Ok(()) + } +} + +// encode_int_to_cmp_uint make int v to comparable uint type +fn encode_int_to_cmp_uint(v: i64) -> u64 { + (v as u64) ^ consts::SIGN_MASK +} + +// decode_cmp_uint_to_int decodes the u that encoded by encode_int_to_cmp_uint +fn decode_cmp_uint_to_int(u: u64) -> i64 { + (u ^ consts::SIGN_MASK) as i64 +} + +impl Encoder for MemComparable { + type Error = Error; + + fn encode(&self, buf: &mut B, value: &u64) -> Result<()> { + buf.write_u64(*value).context(EncodeValue)?; + Ok(()) + } + + fn estimate_encoded_size(&self, _value: &u64) -> usize { + // flag + u64 + 9 + } +} + +impl DecodeTo for MemComparable { + type Error = Error; + + fn decode_to(&self, buf: &mut B, value: &mut u64) -> Result<()> { + *value = 
buf.read_u64().context(DecodeValue)?; + Ok(()) + } +} + +#[cfg(test)] +mod test { + use core::cmp::Ordering; + + use super::*; + + struct TestI64 { + data: i64, + estimate_encoded_size: usize, + } + + impl TestI64 { + fn new(data: i64) -> Self { + Self { + data, + estimate_encoded_size: 9, + } + } + } + + #[test] + fn test_i64_codec() { + let data = vec![ + TestI64::new(1621324705), + TestI64::new(1621324705000), + TestI64::new(1521324705), + TestI64::new(1621324705123), + TestI64::new(i64::MIN), + TestI64::new(i64::MIN + 1), + TestI64::new(0), + TestI64::new(i64::MAX), + TestI64::new((1 << 47) - 1), + TestI64::new(-1 << 47), + TestI64::new((1 << 23) - 1), + TestI64::new(-1 << 23), + TestI64::new((1 << 33) - 1), + TestI64::new(-1 << 33), + TestI64::new((1 << 55) - 1), + TestI64::new(-1 << 55), + TestI64::new(1), + TestI64::new(-1), + ]; + let c = MemComparable; + let mut buf = vec![]; + for x in &data { + c.encode(&mut buf, &x.data).unwrap(); + assert_eq!(x.estimate_encoded_size, c.estimate_encoded_size(&x.data)); + } + + let mut buf = &buf[..]; + for x in &data { + let mut d = -1; + c.decode_to(&mut buf, &mut d).unwrap(); + assert_eq!(d, x.data); + } + } + + struct TestU64 { + data: u64, + estimate_encoded_size: usize, + } + + impl TestU64 { + fn new(data: u64) -> Self { + Self { + data, + estimate_encoded_size: 9, + } + } + } + + #[test] + fn test_u64_codec() { + let data = vec![ + TestU64::new(0), + TestU64::new(u64::from(u8::MAX)), + TestU64::new(u64::from(u16::MAX)), + TestU64::new(u64::from(u32::MAX)), + TestU64::new(u64::MAX), + TestU64::new((1 << 24) - 1), + TestU64::new((1 << 48) - 1), + TestU64::new((1 << 56) - 1), + TestU64::new(1), + TestU64::new(i8::MAX as u64), + TestU64::new(i16::MAX as u64), + TestU64::new(i32::MAX as u64), + TestU64::new(i64::MAX as u64), + ]; + let c = MemComparable; + let mut buf = vec![]; + for x in &data { + c.encode(&mut buf, &x.data).unwrap(); + assert_eq!(x.estimate_encoded_size, c.estimate_encoded_size(&x.data)); + } + + 
let mut buf = &buf[..]; + for x in &data { + let mut d = 0; + c.decode_to(&mut buf, &mut d).unwrap(); + assert_eq!(d, x.data); + } + } + + struct TblI64 { + arg1: i64, + arg2: i64, + ret: Ordering, + } + + #[test] + fn test_i64_order() { + let data = vec![ + TblI64 { + arg1: -1, + arg2: 1, + ret: Ordering::Less, + }, + TblI64 { + arg1: i64::MAX, + arg2: i64::MIN, + ret: Ordering::Greater, + }, + TblI64 { + arg1: i64::MAX, + arg2: i32::MAX as i64, + ret: Ordering::Greater, + }, + TblI64 { + arg1: i32::MIN as i64, + arg2: i16::MAX as i64, + ret: Ordering::Less, + }, + TblI64 { + arg1: i64::MIN, + arg2: i8::MAX as i64, + ret: Ordering::Less, + }, + TblI64 { + arg1: 0, + arg2: i8::MAX as i64, + ret: Ordering::Less, + }, + TblI64 { + arg1: i8::MIN as i64, + arg2: 0, + ret: Ordering::Less, + }, + TblI64 { + arg1: i16::MIN as i64, + arg2: i16::MAX as i64, + ret: Ordering::Less, + }, + TblI64 { + arg1: 1, + arg2: -1, + ret: Ordering::Greater, + }, + TblI64 { + arg1: 1, + arg2: 0, + ret: Ordering::Greater, + }, + TblI64 { + arg1: -1, + arg2: 0, + ret: Ordering::Less, + }, + TblI64 { + arg1: 0, + arg2: 0, + ret: Ordering::Equal, + }, + TblI64 { + arg1: i16::MAX as i64, + arg2: i16::MAX as i64, + ret: Ordering::Equal, + }, + ]; + let c = MemComparable; + for x in &data { + let mut buf1 = vec![]; + let mut buf2 = vec![]; + c.encode(&mut buf1, &x.arg1).unwrap(); + c.encode(&mut buf2, &x.arg2).unwrap(); + assert_eq!(x.ret, buf1.as_slice().cmp(buf2.as_slice())); + } + } + + struct TblU64 { + arg1: u64, + arg2: u64, + ret: Ordering, + } + + #[test] + fn test_u64_order() { + let data = vec![ + TblU64 { + arg1: 0, + arg2: 0, + ret: Ordering::Equal, + }, + TblU64 { + arg1: 1, + arg2: 0, + ret: Ordering::Greater, + }, + TblU64 { + arg1: 0, + arg2: 1, + ret: Ordering::Less, + }, + TblU64 { + arg1: i8::MAX as u64, + arg2: i16::MAX as u64, + ret: Ordering::Less, + }, + TblU64 { + arg1: u32::MAX as u64, + arg2: i32::MAX as u64, + ret: Ordering::Greater, + }, + TblU64 { + arg1: u8::MAX as 
u64, + arg2: i8::MAX as u64, + ret: Ordering::Greater, + }, + TblU64 { + arg1: u16::MAX as u64, + arg2: i32::MAX as u64, + ret: Ordering::Less, + }, + TblU64 { + arg1: u64::MAX as u64, + arg2: i64::MAX as u64, + ret: Ordering::Greater, + }, + TblU64 { + arg1: i64::MAX as u64, + arg2: u32::MAX as u64, + ret: Ordering::Greater, + }, + TblU64 { + arg1: u64::MAX, + arg2: 0, + ret: Ordering::Greater, + }, + TblU64 { + arg1: 0, + arg2: u64::MAX, + ret: Ordering::Less, + }, + ]; + let c = MemComparable; + for x in &data { + let mut buf1 = vec![]; + let mut buf2 = vec![]; + c.encode(&mut buf1, &x.arg1).unwrap(); + c.encode(&mut buf2, &x.arg2).unwrap(); + assert_eq!(x.ret, buf1.as_slice().cmp(buf2.as_slice())); + } + } +} diff --git a/common_util/src/codec/mod.rs b/common_util/src/codec/mod.rs new file mode 100644 index 0000000000..0a9825f355 --- /dev/null +++ b/common_util/src/codec/mod.rs @@ -0,0 +1,42 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Data encoding + +// TODO(yingwen): Buf use generic type to avoid cost of vtable call per +// encode/decode + +pub mod compact; +mod consts; +pub mod memcomparable; +pub mod row; +mod varint; + +use common_types::bytes::{MemBuf, MemBufMut}; + +// encoder/decoder +/// Data encode abstraction +pub trait Encoder { + type Error; + + /// Encode value into buf + fn encode(&self, buf: &mut B, value: &T) -> Result<(), Self::Error>; + + /// Estimate the value size after encoded + fn estimate_encoded_size(&self, value: &T) -> usize; +} + +/// Data decode to target +pub trait DecodeTo { + type Error; + + /// Decode from `buf` to `value` + fn decode_to(&self, buf: &mut B, value: &mut T) -> Result<(), Self::Error>; +} + +/// Data decode abstraction +pub trait Decoder { + type Error; + + /// Decode `value` from `buf` + fn decode(&self, buf: &mut B) -> Result; +} diff --git a/common_util/src/codec/row.rs b/common_util/src/codec/row.rs new file mode 100644 index 0000000000..54c1b8ccbe --- /dev/null +++ 
b/common_util/src/codec/row.rs @@ -0,0 +1,234 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Row encoding utils +//! +//! Notice: The encoding method is used both in wal and memtable. Be careful for +//! data compatibility + +use std::convert::TryFrom; + +use common_types::{ + bytes::{BufMut, ByteVec, BytesMut, MemBuf, MemBufMut}, + datum::Datum, + row::{Row, RowGroup}, + schema::{IndexInWriterSchema, Schema}, +}; +use snafu::{ResultExt, Snafu}; + +use crate::codec::{ + compact::{MemCompactDecoder, MemCompactEncoder}, + DecodeTo, Decoder, Encoder, +}; + +#[derive(Debug, Snafu)] +#[snafu(visibility(pub(crate)))] +pub enum Error { + #[snafu(display("Failed to encode row datum, err:{}", source))] + EncodeRowDatum { + source: crate::codec::compact::Error, + }, + + #[snafu(display("Failed to decode row datum, err:{}", source))] + DecodeRowDatum { + source: crate::codec::compact::Error, + }, +} + +define_result!(Error); + +/// Compact row encoder for wal. +struct WalRowEncoder<'a> { + /// Schema of table + table_schema: &'a Schema, + /// Index of table column in writer + index_in_writer: &'a IndexInWriterSchema, +} + +impl<'a> Encoder for WalRowEncoder<'a> { + type Error = Error; + + fn encode(&self, buf: &mut B, value: &Row) -> Result<()> { + let encoder = MemCompactEncoder; + for index_in_table in 0..self.table_schema.num_columns() { + match self.index_in_writer.column_index_in_writer(index_in_table) { + Some(writer_index) => { + // Column in writer + encoder + .encode(buf, &value[writer_index]) + .context(EncodeRowDatum)?; + } + None => { + // Column not in writer + encoder.encode(buf, &Datum::Null).context(EncodeRowDatum)?; + } + } + } + + Ok(()) + } + + fn estimate_encoded_size(&self, value: &Row) -> usize { + let encoder = MemCompactEncoder; + let mut total_len = 0; + for index_in_table in 0..self.table_schema.num_columns() { + match self.index_in_writer.column_index_in_writer(index_in_table) { + Some(writer_index) => { + // Column 
in writer + total_len += encoder.estimate_encoded_size(&value[writer_index]); + } + None => { + // Column not in writer + total_len += encoder.estimate_encoded_size(&Datum::Null); + } + } + } + + total_len + } +} + +/// Compact row decoder for wal, supports projection. +#[derive(Debug)] +pub struct WalRowDecoder<'a> { + /// Schema of row to decode + schema: &'a Schema, +} + +impl<'a> WalRowDecoder<'a> { + /// Create a decoder with given `schema`, the caller should ensure the + /// schema matches the row to be decoded. + pub fn new(schema: &'a Schema) -> Self { + Self { schema } + } +} + +impl<'a> Decoder for WalRowDecoder<'a> { + type Error = Error; + + fn decode(&self, buf: &mut B) -> Result { + let num_columns = self.schema.num_columns(); + let mut datums = Vec::with_capacity(num_columns); + + for idx in 0..num_columns { + let column_schema = &self.schema.column(idx); + let datum_kind = &column_schema.data_type; + let decoder = MemCompactDecoder; + + // Decode each column + let mut datum = Datum::empty(datum_kind); + decoder.decode_to(buf, &mut datum).context(DecodeRowDatum)?; + + datums.push(datum); + } + + Ok(Row::from_datums(datums)) + } +} + +/// Encode the row group in the format that can write to wal. +/// +/// Arguments +/// - row_group: The rows to be encoded and wrote to. +/// - table_schema: The schema the row group need to be encoded into, the schema +/// of the row group need to be write compatible for the table schema. +/// - index_in_writer: The index mapping from table schema to column in the +/// schema of row group. +/// - encoded_rows: The Vec to store bytes of each encoded row. 
+pub fn encode_row_group_for_wal( + row_group: &RowGroup, + table_schema: &Schema, + index_in_writer: &IndexInWriterSchema, + encoded_rows: &mut Vec, +) -> Result<()> { + let row_encoder = WalRowEncoder { + table_schema, + index_in_writer, + }; + + // Use estimated size of first row to avoid compute all + let row_estimated_size = match row_group.get_row(0) { + Some(first_row) => row_encoder.estimate_encoded_size(first_row), + // The row group is empty + None => return Ok(()), + }; + + encoded_rows.reserve(row_group.num_rows()); + + // Each row is constructed in writer schema, we need to encode it in + // `table_schema` + for row in row_group { + let mut buf = Vec::with_capacity(row_estimated_size); + row_encoder.encode(&mut buf, row)?; + + encoded_rows.push(buf); + } + + Ok(()) +} + +/// Return the next prefix key +/// +/// Assume there are keys like: +/// +/// ```text +/// rowkey1 +/// rowkey1_column1 +/// rowkey1_column2 +/// rowKey2 +/// ``` +/// +/// If we seek 'rowkey1' Next, we will get 'rowkey1_column1'. +/// If we seek 'rowkey1' PrefixNext, we will get 'rowkey2'. 
+/// +/// Ported from +/// +/// REQUIRE: The key should be memory comparable +// TODO(yingwen): Maybe add scratch param +// TODO(yingwen): Move to another mod +pub fn key_prefix_next(key: &[u8]) -> BytesMut { + let mut buf = BytesMut::from(key); + // isize should be enough to represent the key len + let mut idx = isize::try_from(key.len() - 1).unwrap(); + while idx >= 0 { + let i = idx as usize; + buf[i] += 1; + if buf[i] != 0 { + break; + } + + idx -= 1; + } + if idx == -1 { + buf.copy_from_slice(key); + buf.put_u8(0); + } + + buf +} +#[cfg(test)] +mod test { + use common_types::schema::IndexInWriterSchema; + + use crate::codec::{ + row::{WalRowDecoder, WalRowEncoder}, + Decoder, Encoder, + }; + + #[test] + fn test_wal_encode_decode() { + let schema = common_types::tests::build_schema(); + let rows = common_types::tests::build_rows(); + let index_in_writer = IndexInWriterSchema::for_same_schema(schema.num_columns()); + let wal_encoder = WalRowEncoder { + table_schema: &schema, + index_in_writer: &index_in_writer, + }; + let wal_decoder = WalRowDecoder::new(&schema); + for row in rows { + let mut buf = Vec::new(); + wal_encoder.encode(&mut buf, &row).unwrap(); + let row_decoded = wal_decoder.decode(&mut buf.as_slice()).unwrap(); + assert_eq!(row_decoded, row); + } + } +} diff --git a/common_util/src/codec/varint.rs b/common_util/src/codec/varint.rs new file mode 100644 index 0000000000..eb5616b692 --- /dev/null +++ b/common_util/src/codec/varint.rs @@ -0,0 +1,209 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Varint for codec whose test is covered by compact/number.rs +use common_types::bytes::{MemBuf, MemBufMut}; +use snafu::{Backtrace, ResultExt, Snafu}; + +#[derive(Debug, Snafu)] +#[snafu(visibility(pub(crate)))] +pub enum Error { + #[snafu(display("Failed to encode varint, err:{}", source))] + EncodeVarint { source: common_types::bytes::Error }, + + #[snafu(display("Insufficient bytes to decode value.\nBacktrace:\n{}", backtrace))] + DecodeEmptyValue { backtrace: Backtrace }, + + #[snafu(display("Insufficient bytes to decode value, err:{}", source))] + DecodeValue { source: common_types::bytes::Error }, + + #[snafu(display("Value larger than 64 bits (overflow).\nBacktrace:\n{}", backtrace))] + UvarintOverflow { backtrace: Backtrace }, +} + +define_result!(Error); + +// from https://golang.org/src/encoding/binary/varint.go?s=2506:2545#L68 +// PutVarint encodes an int64 into buf and returns the number of bytes written. +// If the buffer is too small, PutVarint will panic. +// +// ```go +// func PutVarint(buf []byte, x int64) int { +// ux := uint64(x) << 1 +// if x < 0 { +// ux = ^ux +// } +// return PutUvarint(buf, ux) +// } +// ``` +pub fn encode_varint(buf: &mut B, value: i64) -> Result<()> { + let mut x = (value as u64) << 1; + if value < 0 { + x = !x; + } + encode_uvarint(buf, x) +} + +// +// from https://golang.org/src/encoding/binary/varint.go?s=1611:1652#L31 +// +// ```go +// func PutUvarint(buf []byte, x uint64) int { +// i := 0 +// for x >= 0x80 { +// buf[i] = byte(x) | 0x80 +// x >>= 7 +// i++ +// } +// buf[i] = byte(x) +// return i + 1 +// } +// ``` +pub fn encode_uvarint(buf: &mut B, mut x: u64) -> Result<()> { + while x >= 0x80 { + buf.write_u8(x as u8 | 0x80).context(EncodeVarint)?; + x >>= 7; + } + buf.write_u8(x as u8).context(EncodeVarint)?; + Ok(()) +} + +// from https://golang.org/src/encoding/binary/varint.go?s=2955:2991#L84 +// Varint decodes an int64 from buf and returns that value and the +// number of bytes read (> 0). 
If an error occurred, the value is 0 +// and the number of bytes n is <= 0 with the following meaning: +// +// n == 0: buf too small +// n < 0: value larger than 64 bits (overflow) +// and -n is the number of bytes read +// +// ```go +// func Varint(buf []byte) (int64, int) { +// ux, n := Uvarint(buf) // ok to continue in presence of error +// x := int64(ux >> 1) +// if ux&1 != 0 { +// x = ^x +// } +// return x, n +// } +// ``` +pub fn decode_varint(buf: &mut B) -> Result { + let ux = decode_uvarint(buf)?; + let mut x = (ux >> 1) as i64; + if ux & 1 != 0 { + x = !x; + } + Ok(x) +} + +// from https://golang.org/src/encoding/binary/varint.go?s=2070:2108#L50 +// Uvarint decodes a uint64 from buf and returns that value and the +// number of bytes read (> 0). If an error occurred, the value is 0 +// and the number of bytes n is <= 0 meaning: +// +// n == 0: buf too small +// n < 0: value larger than 64 bits (overflow) +// and -n is the number of bytes read +// +// ```go +// func Uvarint(buf []byte) (uint64, int) { +// var x uint64 +// var s uint +// for i, b := range buf { +// if b < 0x80 { +// if i > 9 || i == 9 && b > 1 { +// return 0, -(i + 1) // overflow +// } +// return x | uint64(b)<(buf: &mut B) -> Result { + let mut x: u64 = 0; + let mut s: usize = 0; + let len = buf.remaining_slice().len(); + for i in 0..len { + let b = buf.read_u8().context(DecodeValue)?; + if b < 0x80 { + if i > 9 || i == 9 && b > 1 { + return UvarintOverflow.fail(); // overflow + } + return Ok(x | u64::from(b) << s); + } + x |= u64::from(b & 0x7f) << s; + s += 7; + } + DecodeEmptyValue.fail() +} + +#[cfg(test)] +mod tests { + use common_types::bytes::BytesMut; + + use super::*; + + #[test] + fn test_encode_decode_varint() { + let nums: Vec<(i64, usize)> = vec![ + (i64::MIN, 10), + (-1000000000000000, 8), + (-100000000000, 6), + (-1000000000, 5), + (-100000, 3), + (-65535, 3), + (-1000, 2), + (-125, 2), + (-32, 1), + (0, 1), + (64, 2), + (125, 2), + (1000, 2), + (65535, 3), + (10000, 3), + 
(1000000000, 5), + (100000000000, 6), + (10000000000000, 7), + (1000000000000000, 8), + (i64::MAX, 10), + ]; + + for (i, size) in nums { + let mut buf = BytesMut::with_capacity(8); + assert!(encode_varint(&mut buf, i).is_ok()); + assert_eq!(size, buf.len()); + let d = decode_varint(&mut buf); + assert!(d.is_ok()); + assert_eq!(i, d.unwrap()); + } + } + + #[test] + fn test_encode_decode_uvarint() { + let nums: Vec<(u64, usize)> = vec![ + (0, 1), + (64, 1), + (125, 1), + (1000, 2), + (65535, 3), + (10000, 2), + (1000000000, 5), + (100000000000, 6), + (10000000000000, 7), + (1000000000000000, 8), + (u64::MAX, 10), + ]; + + for (i, size) in nums { + let mut buf = BytesMut::with_capacity(8); + assert!(encode_uvarint(&mut buf, i).is_ok()); + assert_eq!(size, buf.len()); + let d = decode_uvarint(&mut buf); + assert!(d.is_ok()); + assert_eq!(i, d.unwrap()); + } + } +} diff --git a/common_util/src/config.rs b/common_util/src/config.rs new file mode 100644 index 0000000000..ac7232767f --- /dev/null +++ b/common_util/src/config.rs @@ -0,0 +1,711 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +// Copyright 2016 TiKV Project Authors. Licensed under Apache-2.0. + +//! Configure utils + +//This module is forked from tikv and remove unnessary code. 
+//https://github.com/tikv/tikv/blob/HEAD/src/util/config.rs +use std::{ + fmt::{self, Write}, + ops::{Add, AddAssign, Div, DivAssign, Mul, MulAssign, Sub, SubAssign}, + str::{self, FromStr}, + time::Duration, +}; + +use proto::analytic_common; +use serde::{ + de::{self, Unexpected, Visitor}, + Deserialize, Deserializer, Serialize, Serializer, +}; + +const UNIT: u64 = 1; + +const BINARY_DATA_MAGNITUDE: u64 = 1024; +pub const B: u64 = UNIT; +pub const KIB: u64 = UNIT * BINARY_DATA_MAGNITUDE; +pub const MIB: u64 = KIB * BINARY_DATA_MAGNITUDE; +pub const GIB: u64 = MIB * BINARY_DATA_MAGNITUDE; +pub const TIB: u64 = GIB * BINARY_DATA_MAGNITUDE; +pub const PIB: u64 = TIB * BINARY_DATA_MAGNITUDE; + +const TIME_MAGNITUDE_1: u64 = 1000; +const TIME_MAGNITUDE_2: u64 = 60; +const TIME_MAGNITUDE_3: u64 = 24; +const MS: u64 = UNIT; +const SECOND: u64 = MS * TIME_MAGNITUDE_1; +const MINUTE: u64 = SECOND * TIME_MAGNITUDE_2; +const HOUR: u64 = MINUTE * TIME_MAGNITUDE_2; +const DAY: u64 = HOUR * TIME_MAGNITUDE_3; + +/// Convert Duration to milliseconds. +/// +/// Panic if overflow. Mainly used by `ReadableDuration`. +#[inline] +fn duration_to_ms(d: Duration) -> u64 { + let nanos = u64::from(d.subsec_nanos()); + // Most of case, we can't have so large Duration, so here just panic if overflow + // now. 
+ d.as_secs() * 1_000 + (nanos / 1_000_000) +} + +#[derive(Clone, Debug, Copy, PartialEq, PartialOrd, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum TimeUnit { + Nanoseconds, + Microseconds, + Milliseconds, + Seconds, + Minutes, + Hours, + Days, +} + +impl From for analytic_common::TimeUnit { + fn from(unit: TimeUnit) -> Self { + match unit { + TimeUnit::Nanoseconds => analytic_common::TimeUnit::NANOSECONDS, + TimeUnit::Microseconds => analytic_common::TimeUnit::MICROSECONDS, + TimeUnit::Milliseconds => analytic_common::TimeUnit::MILLISECONDS, + TimeUnit::Seconds => analytic_common::TimeUnit::SECONDS, + TimeUnit::Minutes => analytic_common::TimeUnit::MINUTES, + TimeUnit::Hours => analytic_common::TimeUnit::HOURS, + TimeUnit::Days => analytic_common::TimeUnit::DAYS, + } + } +} + +impl From for TimeUnit { + fn from(unit: analytic_common::TimeUnit) -> Self { + match unit { + analytic_common::TimeUnit::NANOSECONDS => TimeUnit::Nanoseconds, + analytic_common::TimeUnit::MICROSECONDS => TimeUnit::Microseconds, + analytic_common::TimeUnit::MILLISECONDS => TimeUnit::Milliseconds, + analytic_common::TimeUnit::SECONDS => TimeUnit::Seconds, + analytic_common::TimeUnit::MINUTES => TimeUnit::Minutes, + analytic_common::TimeUnit::HOURS => TimeUnit::Hours, + analytic_common::TimeUnit::DAYS => TimeUnit::Days, + } + } +} + +impl FromStr for TimeUnit { + type Err = String; + + fn from_str(tu_str: &str) -> Result { + let tu_str = tu_str.trim(); + if !tu_str.is_ascii() { + return Err(format!("unexpect ascii string: {}", tu_str)); + } + + match tu_str.to_lowercase().as_str() { + "nanoseconds" => Ok(TimeUnit::Nanoseconds), + "microseconds" => Ok(TimeUnit::Microseconds), + "milliseconds" => Ok(TimeUnit::Milliseconds), + "seconds" => Ok(TimeUnit::Seconds), + "minutes" => Ok(TimeUnit::Minutes), + "hours" => Ok(TimeUnit::Hours), + "days" => Ok(TimeUnit::Days), + _ => Err(format!("unexpect TimeUnit: {}", tu_str)), + } + } +} + +impl fmt::Display for TimeUnit { + fn 
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let s = match self { + TimeUnit::Nanoseconds => "nanoseconds", + TimeUnit::Microseconds => "microseconds", + TimeUnit::Milliseconds => "milliseconds", + TimeUnit::Seconds => "seconds", + TimeUnit::Minutes => "minutes", + TimeUnit::Hours => "hours", + TimeUnit::Days => "days", + }; + write!(f, "{}", s) + } +} + +#[derive(Clone, Debug, Copy, PartialEq, PartialOrd)] +pub struct ReadableSize(pub u64); + +impl ReadableSize { + pub const fn kb(count: u64) -> ReadableSize { + ReadableSize(count * KIB) + } + + pub const fn mb(count: u64) -> ReadableSize { + ReadableSize(count * MIB) + } + + pub const fn gb(count: u64) -> ReadableSize { + ReadableSize(count * GIB) + } + + pub const fn as_mb(self) -> u64 { + self.0 / MIB + } + + pub const fn as_bytes(self) -> u64 { + self.0 + } +} + +impl Div for ReadableSize { + type Output = ReadableSize; + + fn div(self, rhs: u64) -> ReadableSize { + ReadableSize(self.0 / rhs) + } +} + +impl Div for ReadableSize { + type Output = u64; + + fn div(self, rhs: ReadableSize) -> u64 { + self.0 / rhs.0 + } +} + +impl Mul for ReadableSize { + type Output = ReadableSize; + + fn mul(self, rhs: u64) -> ReadableSize { + ReadableSize(self.0 * rhs) + } +} + +impl Serialize for ReadableSize { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + let size = self.0; + let mut buffer = String::new(); + if size == 0 { + write!(buffer, "{}KiB", size).unwrap(); + } else if size % PIB == 0 { + write!(buffer, "{}PiB", size / PIB).unwrap(); + } else if size % TIB == 0 { + write!(buffer, "{}TiB", size / TIB).unwrap(); + } else if size % GIB as u64 == 0 { + write!(buffer, "{}GiB", size / GIB).unwrap(); + } else if size % MIB as u64 == 0 { + write!(buffer, "{}MiB", size / MIB).unwrap(); + } else if size % KIB as u64 == 0 { + write!(buffer, "{}KiB", size / KIB).unwrap(); + } else { + return serializer.serialize_u64(size); + } + serializer.serialize_str(&buffer) + } +} + +impl FromStr 
for ReadableSize { + type Err = String; + + // This method parses value in binary unit. + fn from_str(s: &str) -> Result { + let size_str = s.trim(); + if size_str.is_empty() { + return Err(format!("{:?} is not a valid size.", s)); + } + + if !size_str.is_ascii() { + return Err(format!("ASCII string is expected, but got {:?}", s)); + } + + // size: digits and '.' as decimal separator + let size_len = size_str + .to_string() + .chars() + .take_while(|c| char::is_ascii_digit(c) || ['.', 'e', 'E', '-', '+'].contains(c)) + .count(); + + // unit: alphabetic characters + let (size, unit) = size_str.split_at(size_len); + + let unit = match unit.trim() { + "K" | "KB" | "KiB" => KIB, + "M" | "MB" | "MiB" => MIB, + "G" | "GB" | "GiB" => GIB, + "T" | "TB" | "TiB" => TIB, + "P" | "PB" | "PiB" => PIB, + "B" | "" => UNIT, + _ => { + return Err(format!( + "only B, KB, KiB, MB, MiB, GB, GiB, TB, TiB, PB, and PiB are supported: {:?}", + s + )); + } + }; + + match size.parse::() { + Ok(n) => Ok(ReadableSize((n * unit as f64) as u64)), + Err(_) => Err(format!("invalid size string: {:?}", s)), + } + } +} + +impl<'de> Deserialize<'de> for ReadableSize { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + struct SizeVisitor; + + impl<'de> Visitor<'de> for SizeVisitor { + type Value = ReadableSize; + + fn expecting(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { + formatter.write_str("valid size") + } + + fn visit_i64(self, size: i64) -> Result + where + E: de::Error, + { + if size >= 0 { + self.visit_u64(size as u64) + } else { + Err(E::invalid_value(Unexpected::Signed(size), &self)) + } + } + + fn visit_u64(self, size: u64) -> Result + where + E: de::Error, + { + Ok(ReadableSize(size)) + } + + fn visit_str(self, size_str: &str) -> Result + where + E: de::Error, + { + size_str.parse().map_err(E::custom) + } + } + + deserializer.deserialize_any(SizeVisitor) + } +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, Ord, PartialOrd)] +pub struct 
ReadableDuration(pub Duration); + +impl Add for ReadableDuration { + type Output = ReadableDuration; + + fn add(self, rhs: ReadableDuration) -> ReadableDuration { + Self(self.0 + rhs.0) + } +} + +impl AddAssign for ReadableDuration { + fn add_assign(&mut self, rhs: ReadableDuration) { + *self = *self + rhs; + } +} + +impl Sub for ReadableDuration { + type Output = ReadableDuration; + + fn sub(self, rhs: ReadableDuration) -> ReadableDuration { + Self(self.0 - rhs.0) + } +} + +impl SubAssign for ReadableDuration { + fn sub_assign(&mut self, rhs: ReadableDuration) { + *self = *self - rhs; + } +} + +impl Mul for ReadableDuration { + type Output = ReadableDuration; + + fn mul(self, rhs: u32) -> Self::Output { + Self(self.0 * rhs) + } +} + +impl MulAssign for ReadableDuration { + fn mul_assign(&mut self, rhs: u32) { + *self = *self * rhs; + } +} + +impl Div for ReadableDuration { + type Output = ReadableDuration; + + fn div(self, rhs: u32) -> ReadableDuration { + Self(self.0 / rhs) + } +} + +impl DivAssign for ReadableDuration { + fn div_assign(&mut self, rhs: u32) { + *self = *self / rhs; + } +} + +impl From for Duration { + fn from(readable: ReadableDuration) -> Duration { + readable.0 + } +} + +// yingwen: Support From. 
+impl From for ReadableDuration { + fn from(t: Duration) -> ReadableDuration { + ReadableDuration(t) + } +} + +impl FromStr for ReadableDuration { + type Err = String; + + fn from_str(dur_str: &str) -> Result { + let dur_str = dur_str.trim(); + if !dur_str.is_ascii() { + return Err(format!("unexpect ascii string: {}", dur_str)); + } + let err_msg = "valid duration, only d, h, m, s, ms are supported.".to_owned(); + let mut left = dur_str.as_bytes(); + let mut last_unit = DAY + 1; + let mut dur = 0f64; + while let Some(idx) = left.iter().position(|c| b"dhms".contains(c)) { + let (first, second) = left.split_at(idx); + let unit = if second.starts_with(b"ms") { + left = &left[idx + 2..]; + MS + } else { + let u = match second[0] { + b'd' => DAY, + b'h' => HOUR, + b'm' => MINUTE, + b's' => SECOND, + _ => return Err(err_msg), + }; + left = &left[idx + 1..]; + u + }; + if unit >= last_unit { + return Err("d, h, m, s, ms should occur in given order.".to_owned()); + } + // do we need to check 12h360m? 
+ let number_str = unsafe { str::from_utf8_unchecked(first) }; + dur += match number_str.trim().parse::() { + Ok(n) => n * unit as f64, + Err(_) => return Err(err_msg), + }; + last_unit = unit; + } + if !left.is_empty() { + return Err(err_msg); + } + if dur.is_sign_negative() { + return Err("duration should be positive.".to_owned()); + } + let secs = dur as u64 / SECOND as u64; + let millis = (dur as u64 % SECOND as u64) as u32 * 1_000_000; + Ok(ReadableDuration(Duration::new(secs, millis))) + } +} + +impl ReadableDuration { + pub const fn secs(secs: u64) -> ReadableDuration { + ReadableDuration(Duration::from_secs(secs)) + } + + pub const fn millis(millis: u64) -> ReadableDuration { + ReadableDuration(Duration::from_millis(millis)) + } + + pub const fn minutes(minutes: u64) -> ReadableDuration { + ReadableDuration::secs(minutes * 60) + } + + pub const fn hours(hours: u64) -> ReadableDuration { + ReadableDuration::minutes(hours * 60) + } + + pub const fn days(days: u64) -> ReadableDuration { + ReadableDuration::hours(days * 24) + } + + pub fn as_secs(&self) -> u64 { + self.0.as_secs() + } + + pub fn as_millis(&self) -> u64 { + duration_to_ms(self.0) + } + + pub fn is_zero(&self) -> bool { + self.0.as_nanos() == 0 + } +} + +impl fmt::Display for ReadableDuration { + #[inline] + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let mut dur = duration_to_ms(self.0); + let mut written = false; + if dur >= DAY { + written = true; + write!(f, "{}d", dur / DAY)?; + dur %= DAY; + } + if dur >= HOUR { + written = true; + write!(f, "{}h", dur / HOUR)?; + dur %= HOUR; + } + if dur >= MINUTE { + written = true; + write!(f, "{}m", dur / MINUTE)?; + dur %= MINUTE; + } + if dur >= SECOND { + written = true; + write!(f, "{}s", dur / SECOND)?; + dur %= SECOND; + } + if dur > 0 { + written = true; + write!(f, "{}ms", dur)?; + } + if !written { + write!(f, "0s")?; + } + Ok(()) + } +} + +impl Serialize for ReadableDuration { + fn serialize(&self, serializer: S) -> Result + 
where + S: Serializer, + { + let mut buffer = String::new(); + write!(buffer, "{}", self).unwrap(); + serializer.serialize_str(&buffer) + } +} + +impl<'de> Deserialize<'de> for ReadableDuration { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + struct DurVisitor; + + impl<'de> Visitor<'de> for DurVisitor { + type Value = ReadableDuration; + + fn expecting(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { + formatter.write_str("valid duration") + } + + fn visit_str(self, dur_str: &str) -> Result + where + E: de::Error, + { + dur_str.parse().map_err(E::custom) + } + } + + deserializer.deserialize_str(DurVisitor) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_readable_size() { + let s = ReadableSize::kb(2); + assert_eq!(s.0, 2048); + assert_eq!(s.as_mb(), 0); + let s = ReadableSize::mb(2); + assert_eq!(s.0, 2 * 1024 * 1024); + assert_eq!(s.as_mb(), 2); + let s = ReadableSize::gb(2); + assert_eq!(s.0, 2 * 1024 * 1024 * 1024); + assert_eq!(s.as_mb(), 2048); + + assert_eq!((ReadableSize::mb(2) / 2).0, MIB); + assert_eq!((ReadableSize::mb(1) / 2).0, 512 * KIB); + assert_eq!(ReadableSize::mb(2) / ReadableSize::kb(1), 2048); + } + + #[test] + fn test_parse_readable_size() { + #[derive(Serialize, Deserialize)] + struct SizeHolder { + s: ReadableSize, + } + + let legal_cases = vec![ + (0, "0KiB"), + (2 * KIB, "2KiB"), + (4 * MIB, "4MiB"), + (5 * GIB, "5GiB"), + (7 * TIB, "7TiB"), + (11 * PIB, "11PiB"), + ]; + for (size, exp) in legal_cases { + let c = SizeHolder { + s: ReadableSize(size), + }; + let res_str = toml::to_string(&c).unwrap(); + let exp_str = format!("s = {:?}\n", exp); + assert_eq!(res_str, exp_str); + let res_size: SizeHolder = toml::from_str(&exp_str).unwrap(); + assert_eq!(res_size.s.0, size); + } + + let c = SizeHolder { + s: ReadableSize(512), + }; + let res_str = toml::to_string(&c).unwrap(); + assert_eq!(res_str, "s = 512\n"); + let res_size: SizeHolder = 
toml::from_str(&res_str).unwrap(); + assert_eq!(res_size.s.0, c.s.0); + + let decode_cases = vec![ + (" 0.5 PB", PIB / 2), + ("0.5 TB", TIB / 2), + ("0.5GB ", GIB / 2), + ("0.5MB", MIB / 2), + ("0.5KB", KIB / 2), + ("0.5P", PIB / 2), + ("0.5T", TIB / 2), + ("0.5G", GIB / 2), + ("0.5M", MIB / 2), + ("0.5K", KIB / 2), + ("23", 23), + ("1", 1), + ("1024B", KIB), + // units with binary prefixes + (" 0.5 PiB", PIB / 2), + ("1PiB", PIB), + ("0.5 TiB", TIB / 2), + ("2 TiB", TIB * 2), + ("0.5GiB ", GIB / 2), + ("787GiB ", GIB * 787), + ("0.5MiB", MIB / 2), + ("3MiB", MIB * 3), + ("0.5KiB", KIB / 2), + ("1 KiB", KIB), + // scientific notation + ("0.5e6 B", B * 500000), + ("0.5E6 B", B * 500000), + ("1e6B", B * 1000000), + ("8E6B", B * 8000000), + ("8e7", B * 80000000), + ("1e-1MB", MIB / 10), + ("1e+1MB", MIB * 10), + ("0e+10MB", 0), + ]; + for (src, exp) in decode_cases { + let src = format!("s = {:?}", src); + let res: SizeHolder = toml::from_str(&src).unwrap(); + assert_eq!(res.s.0, exp); + } + + let illegal_cases = vec![ + "0.5kb", "0.5kB", "0.5Kb", "0.5k", "0.5g", "b", "gb", "1b", "B", "1K24B", " 5_KB", + "4B7", "5M_", + ]; + for src in illegal_cases { + let src_str = format!("s = {:?}", src); + assert!(toml::from_str::(&src_str).is_err(), "{}", src); + } + } + + #[test] + fn test_duration_construction() { + let mut dur = ReadableDuration::secs(1); + assert_eq!(dur.0, Duration::new(1, 0)); + assert_eq!(dur.as_secs(), 1); + assert_eq!(dur.as_millis(), 1000); + dur = ReadableDuration::millis(1001); + assert_eq!(dur.0, Duration::new(1, 1_000_000)); + assert_eq!(dur.as_secs(), 1); + assert_eq!(dur.as_millis(), 1001); + dur = ReadableDuration::minutes(2); + assert_eq!(dur.0, Duration::new(2 * 60, 0)); + assert_eq!(dur.as_secs(), 120); + assert_eq!(dur.as_millis(), 120000); + dur = ReadableDuration::hours(2); + assert_eq!(dur.0, Duration::new(2 * 3600, 0)); + assert_eq!(dur.as_secs(), 7200); + assert_eq!(dur.as_millis(), 7200000); + } + + #[test] + fn 
test_parse_readable_duration() { + #[derive(Serialize, Deserialize)] + struct DurHolder { + d: ReadableDuration, + } + + let legal_cases = vec![ + (0, 0, "0s"), + (0, 1, "1ms"), + (2, 0, "2s"), + (24 * 3600, 0, "1d"), + (2 * 24 * 3600, 10, "2d10ms"), + (4 * 60, 0, "4m"), + (5 * 3600, 0, "5h"), + (3600 + 2 * 60, 0, "1h2m"), + (5 * 24 * 3600 + 3600 + 2 * 60, 0, "5d1h2m"), + (3600 + 2, 5, "1h2s5ms"), + (3 * 24 * 3600 + 7 * 3600 + 2, 5, "3d7h2s5ms"), + ]; + for (secs, ms, exp) in legal_cases { + let d = DurHolder { + d: ReadableDuration(Duration::new(secs, ms * 1_000_000)), + }; + let res_str = toml::to_string(&d).unwrap(); + let exp_str = format!("d = {:?}\n", exp); + assert_eq!(res_str, exp_str); + let res_dur: DurHolder = toml::from_str(&exp_str).unwrap(); + assert_eq!(res_dur.d.0, d.d.0); + } + + let decode_cases = vec![(" 0.5 h2m ", 3600 / 2 + 2 * 60, 0)]; + for (src, secs, ms) in decode_cases { + let src = format!("d = {:?}", src); + let res: DurHolder = toml::from_str(&src).unwrap(); + assert_eq!(res.d.0, Duration::new(secs, ms * 1_000_000)); + } + + let illegal_cases = vec!["1H", "1M", "1S", "1MS", "1h1h", "h"]; + for src in illegal_cases { + let src_str = format!("d = {:?}", src); + assert!(toml::from_str::(&src_str).is_err(), "{}", src); + } + assert!(toml::from_str::("d = 23").is_err()); + } + + #[test] + fn test_parse_timeunit() { + let s = "milliseconds"; + assert_eq!(TimeUnit::Milliseconds, s.parse::().unwrap()); + let s = "seconds"; + assert_eq!(TimeUnit::Seconds, s.parse::().unwrap()); + let s = "minutes"; + assert_eq!(TimeUnit::Minutes, s.parse::().unwrap()); + let s = "hours"; + assert_eq!(TimeUnit::Hours, s.parse::().unwrap()); + let s = "days"; + assert_eq!(TimeUnit::Days, s.parse::().unwrap()); + let s = "microseconds"; + assert_eq!(TimeUnit::Microseconds, s.parse::().unwrap()); + } +} diff --git a/common_util/src/lib.rs b/common_util/src/lib.rs new file mode 100644 index 0000000000..f7c2c11e31 --- /dev/null +++ b/common_util/src/lib.rs @@ -0,0 
+1,31 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Common utils shared by the whole project + +// We need to define mod with macro_use before other mod so that other +// mods in this crate can use the macros +#[macro_use] +pub mod macros; + +// TODO(yingwen): Move some mod into components as a crate +pub mod alloc_tracker; +pub mod codec; +pub mod config; +pub mod metric; +pub mod panic; +pub mod runtime; +pub mod time; +pub mod toml; + +#[cfg(any(test, feature = "test"))] +pub mod tests { + use std::sync::Once; + + static INIT_LOG: Once = Once::new(); + + pub fn init_log_for_test() { + INIT_LOG.call_once(|| { + env_logger::init(); + }); + } +} diff --git a/common_util/src/macros.rs b/common_util/src/macros.rs new file mode 100644 index 0000000000..5ac5b6f1c8 --- /dev/null +++ b/common_util/src/macros.rs @@ -0,0 +1,25 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Contains all needed macros + +/// Define result for given Error type +#[macro_export] +macro_rules! define_result { + ($t:ty) => { + pub type Result = std::result::Result; + }; +} + +#[cfg(test)] +mod tests { + #[test] + fn test_define_result() { + define_result!(i32); + + fn return_i32_error() -> Result<()> { + Err(18) + } + + assert_eq!(Err(18), return_i32_error()); + } +} diff --git a/common_util/src/metric.rs b/common_util/src/metric.rs new file mode 100644 index 0000000000..3219a3c757 --- /dev/null +++ b/common_util/src/metric.rs @@ -0,0 +1,267 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +/// Copied from https://github.com/sunng87/metriki/blob/master/metriki-core/src/metrics/meter.rs +/// But supports 1 hour and 2 hour rate. 
+use std::sync::atomic::{AtomicU64, Ordering}; +use std::time::{Duration, Instant, SystemTime}; + +use crossbeam_utils::atomic::AtomicCell; +#[cfg(feature = "ser")] +use serde::ser::SerializeMap; +#[cfg(feature = "ser")] +use serde::{Serialize, Serializer}; + +use crate::time; + +/// Meters are used to calculate rate of an event. +#[derive(Debug)] +pub struct Meter { + moving_avarages: ExponentiallyWeightedMovingAverages, + count: AtomicU64, + start_time: SystemTime, +} + +impl Default for Meter { + fn default() -> Self { + Self::new() + } +} + +impl Meter { + pub fn new() -> Meter { + Meter { + moving_avarages: ExponentiallyWeightedMovingAverages::new(), + count: AtomicU64::from(0), + start_time: SystemTime::now(), + } + } + + pub fn mark(&self) { + self.mark_n(1) + } + + pub fn mark_n(&self, n: u64) { + self.count.fetch_add(n, Ordering::Relaxed); + self.moving_avarages.tick_if_needed(); + self.moving_avarages.update(n); + } + + pub fn h1_rate(&self) -> f64 { + self.moving_avarages.tick_if_needed(); + self.moving_avarages.h1_rate() + } + + pub fn h2_rate(&self) -> f64 { + self.moving_avarages.tick_if_needed(); + self.moving_avarages.h2_rate() + } + + pub fn m15_rate(&self) -> f64 { + self.moving_avarages.tick_if_needed(); + self.moving_avarages.m15_rate() + } + + pub fn count(&self) -> u64 { + self.count.load(Ordering::Relaxed) + } + + pub fn mean_rate(&self) -> f64 { + let count = self.count(); + if count > 0 { + if let Ok(elapsed) = SystemTime::now() + .duration_since(self.start_time) + .map(|d| d.as_secs() as f64) + { + count as f64 / elapsed + } else { + 0f64 + } + } else { + 0f64 + } + } +} + +#[derive(Debug)] +struct ExponentiallyWeightedMovingAverage { + alpha: f64, + interval_nanos: u64, + + uncounted: AtomicCell, + rate: AtomicCell>, +} + +impl ExponentiallyWeightedMovingAverage { + fn new(alpha: f64, interval_secs: u64) -> ExponentiallyWeightedMovingAverage { + ExponentiallyWeightedMovingAverage { + alpha, + interval_nanos: 
time::secs_to_nanos(interval_secs), + + uncounted: AtomicCell::new(0), + rate: AtomicCell::new(None), + } + } + + fn update(&self, n: u64) { + self.uncounted.fetch_add(n); + } + + fn tick(&self) { + let count = self.uncounted.swap(0); + let instant_rate = count as f64 / self.interval_nanos as f64; + + if let Some(prev_rate) = self.rate.load() { + let new_rate = prev_rate + (self.alpha * (instant_rate - prev_rate)); + self.rate.store(Some(new_rate)); + } else { + self.rate.store(Some(instant_rate)); + } + } + + fn get_rate(&self) -> f64 { + if let Some(rate) = self.rate.load() { + rate * time::secs_to_nanos(1) as f64 + } else { + 0f64 + } + } +} + +#[derive(Debug)] +struct ExponentiallyWeightedMovingAverages { + h1: ExponentiallyWeightedMovingAverage, + h2: ExponentiallyWeightedMovingAverage, + m15: ExponentiallyWeightedMovingAverage, + + last_tick: AtomicCell, +} + +#[inline] +fn alpha(interval_secs: u64, minutes: u64) -> f64 { + 1.0 - (-(interval_secs as f64) / 60.0 / minutes as f64).exp() +} + +const DEFAULT_INTERVAL_SECS: u64 = 5; +const DEFAULT_INTERVAL_MILLIS: u64 = DEFAULT_INTERVAL_SECS * 1000; + +impl ExponentiallyWeightedMovingAverages { + fn new() -> ExponentiallyWeightedMovingAverages { + ExponentiallyWeightedMovingAverages { + h1: ExponentiallyWeightedMovingAverage::new( + alpha(DEFAULT_INTERVAL_SECS, 60), + DEFAULT_INTERVAL_SECS, + ), + + h2: ExponentiallyWeightedMovingAverage::new( + alpha(DEFAULT_INTERVAL_SECS, 120), + DEFAULT_INTERVAL_SECS, + ), + + m15: ExponentiallyWeightedMovingAverage::new( + alpha(DEFAULT_INTERVAL_SECS, 15), + DEFAULT_INTERVAL_SECS, + ), + + last_tick: AtomicCell::new(Instant::now()), + } + } + + fn update(&self, n: u64) { + self.h1.update(n); + self.h2.update(n); + self.m15.update(n); + } + + fn tick_if_needed(&self) { + let previous_tick = self.last_tick.load(); + let current_tick = Instant::now(); + + let tick_age = (current_tick - previous_tick).as_millis() as u64; + + if tick_age > DEFAULT_INTERVAL_MILLIS { + let 
latest_tick = + current_tick - Duration::from_millis(tick_age % DEFAULT_INTERVAL_MILLIS); + if self + .last_tick + .compare_exchange(previous_tick, latest_tick) + .is_ok() + { + let required_ticks = tick_age / DEFAULT_INTERVAL_MILLIS; + for _ in 0..required_ticks { + self.h1.tick(); + self.h2.tick(); + self.m15.tick(); + } + } + } + } + + fn h1_rate(&self) -> f64 { + self.h1.get_rate() + } + + fn h2_rate(&self) -> f64 { + self.h2.get_rate() + } + + fn m15_rate(&self) -> f64 { + self.m15.get_rate() + } +} + +#[cfg(feature = "ser")] +impl Serialize for Meter { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + let mut map = serializer.serialize_map(Some(4))?; + + map.serialize_entry("count", &self.count())?; + map.serialize_entry("h1_rate", &self.h1_rate())?; + map.serialize_entry("h2_rate", &self.h2_rate())?; + map.serialize_entry("m15_rate", &self.m15_rate())?; + + map.end() + } +} + +#[cfg(test)] +mod tests { + use std::{thread, time}; + + use super::*; + + macro_rules! assert_float_eq { + ($left:expr, $right:expr) => {{ + match (&$left, &$right) { + (left_val, right_val) => { + let diff = (left_val - right_val).abs(); + + if diff > f64::EPSILON { + panic!( + "assertion failed: `(left == right)`\n left: `{:?}`,\n right: `{:?}`", + &*left_val, &*right_val + ) + } + } + } + }}; + } + + #[test] + fn test_meter() { + let m = Meter::new(); + + for _ in 0..10 { + m.mark(); + } + + thread::sleep(time::Duration::from_millis(DEFAULT_INTERVAL_MILLIS + 10)); + + assert_eq!(10, m.count()); + assert_float_eq!(2.0, m.m15_rate()); + assert_float_eq!(2.0, m.h1_rate()); + assert_float_eq!(2.0, m.h2_rate()); + } +} diff --git a/common_util/src/panic.rs b/common_util/src/panic.rs new file mode 100644 index 0000000000..5b0a9f5713 --- /dev/null +++ b/common_util/src/panic.rs @@ -0,0 +1,159 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +use std::thread; + +use log::error; + +/// fork from https://github.com/tikv/tikv/blob/83d173a2c0058246631f0e71de74238ccff670fd/components/tikv_util/src/lib.rs#L429 +/// Exit the whole process when panic. +pub fn set_panic_hook(panic_abort: bool) { + use std::{panic, process}; + + // HACK! New a backtrace ahead for caching necessary elf sections of this + // tikv-server, in case it can not open more files during panicking + // which leads to no stack info (0x5648bdfe4ff2 - ). + // + // Crate backtrace caches debug info in a static variable `STATE`, + // and the `STATE` lives forever once it has been created. + // See more: https://github.com/alexcrichton/backtrace-rs/blob/\ + // 597ad44b131132f17ed76bf94ac489274dd16c7f/\ + // src/symbolize/libbacktrace.rs#L126-L159 + // Caching is slow, spawn it in another thread to speed up. + thread::Builder::new() + .name("backtrace-loader".to_owned()) + .spawn(::backtrace::Backtrace::new) + .unwrap(); + + panic::set_hook(Box::new(move |info: &panic::PanicInfo<'_>| { + let msg = match info.payload().downcast_ref::<&'static str>() { + Some(s) => *s, + None => match info.payload().downcast_ref::() { + Some(s) => &s[..], + None => "Box", + }, + }; + + let thread = thread::current(); + let name = thread.name().unwrap_or(""); + let loc = info + .location() + .map(|l| format!("{}:{}", l.file(), l.line())); + let bt = backtrace::Backtrace::new(); + error!( + "thread '{}' panicked '{}' at {:?}\n{:?}", + name, + msg, + loc.unwrap_or_else(|| "".to_owned()), + bt + ); + + // There might be remaining logs in the async logger. + // To collect remaining logs and also collect future logs, replace the old one + // with a terminal logger. + // When the old global async logger is replaced, the old async guard will be + // taken and dropped. In the drop() the async guard, it waits for the + // finish of the remaining logs in the async logger. 
+ if let Some(level) = ::log::max_level().to_level() { + let drainer = logger::term_drainer(); + let _ = logger::init_log( + drainer, + logger::convert_log_level_to_slog_level(level), + false, // Use sync logger to avoid an unnecessary log thread. + 0, + false, // It is initialized already. + ); + } + + if panic_abort { + process::abort(); + } else { + unsafe { + // Calling process::exit would trigger global static to destroy, like C++ + // static variables of RocksDB, which may cause other threads encounter + // pure virtual method call. So calling libc::_exit() instead to skip the + // cleanup process. + libc::_exit(1); + } + } + })) +} + +#[cfg(test)] +mod tests { + use std::{io::Read, time::Duration}; + + use nix::{ + sys::wait::{wait, WaitStatus}, + unistd::{fork, ForkResult}, + }; + use slog::{self, Drain, Level, OwnedKVList, Record}; + + use crate::panic::set_panic_hook; + + /// Create a child process and wait to get its exit code. + fn run_and_wait_child_process(child: impl Fn()) -> Result { + match unsafe { fork() } { + Ok(ForkResult::Parent { .. 
}) => match wait().unwrap() { + WaitStatus::Exited(_, status) => Ok(status), + v => Err(format!("{:?}", v)), + }, + Ok(ForkResult::Child) => { + child(); + std::process::exit(0); + } + Err(e) => Err(format!("Fork failed: {}", e)), + } + } + + #[test] + fn test_panic_hook() { + use gag::BufferRedirect; + + struct DelayDrain(D); + + impl Drain for DelayDrain + where + D: Drain, + ::Err: std::fmt::Display, + { + type Err = ::Err; + type Ok = ::Ok; + + fn log( + &self, + record: &Record<'_>, + values: &OwnedKVList, + ) -> Result { + std::thread::sleep(Duration::from_millis(100)); + self.0.log(record, values) + } + } + + let mut stdout = BufferRedirect::stdout().unwrap(); + let status = run_and_wait_child_process(|| { + set_panic_hook(false); + let drainer = logger::term_drainer(); + let _ = logger::init_log( + drainer, + Level::Debug, + true, // use async drainer + 0, + true, // init std log + ); + + let _ = std::thread::spawn(|| { + // let the global logger is held by the other thread, so the + // drop() of the async drain is not called in time. + let _guard = slog_global::borrow_global(); + std::thread::sleep(Duration::from_secs(1)); + }); + panic!("test"); + }) + .unwrap(); + + assert_eq!(status, 1); + let mut panic = String::new(); + stdout.read_to_string(&mut panic).unwrap(); + assert!(!panic.is_empty()); + } +} diff --git a/common_util/src/runtime/metrics.rs b/common_util/src/runtime/metrics.rs new file mode 100644 index 0000000000..4f82494093 --- /dev/null +++ b/common_util/src/runtime/metrics.rs @@ -0,0 +1,57 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use lazy_static::lazy_static; +use prometheus::{register_int_gauge_vec, IntGauge, IntGaugeVec}; + +lazy_static! 
{ + // Gauges: + static ref RUNTIME_THREAD_ALIVE_GAUGE: IntGaugeVec = register_int_gauge_vec!( + "runtime_thread_alive_gauge", + "alive thread number for runtime", + &["name"] + ) + .unwrap(); + static ref RUNTIME_THREAD_IDLE_GAUGE: IntGaugeVec = register_int_gauge_vec!( + "runtime_thread_idle_gauge", + "idle thread number for runtime", + &["name"] + ) + .unwrap(); +} + +/// Runtime metrics. +#[derive(Debug)] +pub struct Metrics { + // Gauges: + pub thread_alive_gauge: IntGauge, + pub thread_idle_gauge: IntGauge, +} + +impl Metrics { + pub fn new(name: &str) -> Self { + Self { + thread_alive_gauge: RUNTIME_THREAD_ALIVE_GAUGE.with_label_values(&[name]), + thread_idle_gauge: RUNTIME_THREAD_IDLE_GAUGE.with_label_values(&[name]), + } + } + + #[inline] + pub fn on_thread_start(&self) { + self.thread_alive_gauge.inc(); + } + + #[inline] + pub fn on_thread_stop(&self) { + self.thread_alive_gauge.dec(); + } + + #[inline] + pub fn on_thread_park(&self) { + self.thread_idle_gauge.inc(); + } + + #[inline] + pub fn on_thread_unpark(&self) { + self.thread_idle_gauge.dec(); + } +} diff --git a/common_util/src/runtime/mod.rs b/common_util/src/runtime/mod.rs new file mode 100644 index 0000000000..70494d6b6f --- /dev/null +++ b/common_util/src/runtime/mod.rs @@ -0,0 +1,277 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
A multi-threaded runtime that supports running Futures +use std::{ + future::Future, + pin::Pin, + sync::Arc, + task::{Context, Poll}, +}; + +use pin_project_lite::pin_project; +use snafu::{Backtrace, GenerateBacktrace, ResultExt, Snafu}; +use tokio::{ + runtime::{Builder as RuntimeBuilder, Runtime as TokioRuntime}, + task::{JoinError, JoinHandle as TokioJoinHandle}, +}; +mod metrics; +use metrics::Metrics; + +// TODO(yingwen): Use opaque error type +#[derive(Debug, Snafu)] +#[snafu(visibility(pub))] +pub enum Error { + #[snafu(display( + "Runtime Failed to build runtime, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + BuildRuntime { + source: std::io::Error, + backtrace: Backtrace, + }, + + #[snafu(display( + "Runtime Failed to join task, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + JoinTask { + source: JoinError, + backtrace: Backtrace, + }, +} + +define_result!(Error); + +/// A runtime to run future tasks +#[derive(Debug)] +pub struct Runtime { + rt: TokioRuntime, + metrics: Arc, +} + +impl Runtime { + /// Spawn a future and execute it in this thread pool + /// + /// Similar to tokio::runtime::Runtime::spawn() + pub fn spawn(&self, future: F) -> JoinHandle + where + F: Future + Send + 'static, + F::Output: Send + 'static, + { + JoinHandle { + inner: self.rt.spawn(future), + } + } + + /// Run the provided function on an executor dedicated to blocking + /// operations. + pub fn spawn_blocking(&self, func: F) -> JoinHandle + where + F: FnOnce() -> R + Send + 'static, + R: Send + 'static, + { + JoinHandle { + inner: self.rt.spawn_blocking(func), + } + } + + /// Run a future to complete, this is the runtime's entry point + pub fn block_on(&self, future: F) -> F::Output { + self.rt.block_on(future) + } + + /// Returns the runtime stats + pub fn stats(&self) -> RuntimeStats { + RuntimeStats { + alive_thread_num: self.metrics.thread_alive_gauge.get(), + idle_thread_num: self.metrics.thread_idle_gauge.get(), + } + } +} + +pin_project! 
{ + #[derive(Debug)] + pub struct JoinHandle { + #[pin] + inner: TokioJoinHandle, + } +} + +impl Future for JoinHandle { + type Output = Result; + + fn poll(self: Pin<&mut Self>, ctx: &mut Context<'_>) -> Poll { + let this = self.project(); + this.inner.poll(ctx).map_err(|source| Error::JoinTask { + source, + backtrace: Backtrace::generate(), + }) + } +} + +/// Runtime statistics +pub struct RuntimeStats { + pub alive_thread_num: i64, + pub idle_thread_num: i64, +} + +pub struct Builder { + thread_name: String, + builder: RuntimeBuilder, +} + +impl Default for Builder { + fn default() -> Self { + Self { + thread_name: "cse-runtime-worker".to_string(), + builder: RuntimeBuilder::new_multi_thread(), + } + } +} + +fn with_metrics(metrics: &Arc, f: F) -> impl Fn() +where + F: Fn(&Arc) + 'static, +{ + let m = metrics.clone(); + move || { + f(&m); + } +} + +impl Builder { + /// Sets the number of worker threads the Runtime will use. + /// + /// This can be any number above 0 + pub fn worker_threads(&mut self, val: usize) -> &mut Self { + self.builder.worker_threads(val); + self + } + + /// Sets name of threads spawned by the Runtime thread pool + pub fn thread_name(&mut self, val: impl Into) -> &mut Self { + self.thread_name = val.into(); + self + } + + /// Enable all feature of the underlying runtime + pub fn enable_all(&mut self) -> &mut Self { + self.builder.enable_all(); + self + } + + pub fn build(&mut self) -> Result { + let metrics = Arc::new(Metrics::new(&self.thread_name)); + + let rt = self + .builder + .thread_name(self.thread_name.clone()) + .on_thread_start(with_metrics(&metrics, |m| { + m.on_thread_start(); + })) + .on_thread_stop(with_metrics(&metrics, |m| { + m.on_thread_stop(); + })) + .on_thread_park(with_metrics(&metrics, |m| { + m.on_thread_park(); + })) + .on_thread_unpark(with_metrics(&metrics, |m| { + m.on_thread_unpark(); + })) + .build() + .context(BuildRuntime)?; + + Ok(Runtime { rt, metrics }) + } +} + +#[cfg(test)] +mod tests { + use 
std::{sync::Arc, thread, time::Duration}; + + use tokio::sync::oneshot; + use tokio_test::assert_ok; + + use super::*; + + fn rt() -> Arc { + let rt = Builder::default() + .worker_threads(2) + .thread_name("test_spawn_join") + .enable_all() + .build(); + assert!(rt.is_ok()); + Arc::new(rt.unwrap()) + } + + #[test] + fn test_stats() { + let rt = Builder::default() + .worker_threads(5) + .thread_name("test_stats") + .enable_all() + .build(); + assert!(rt.is_ok()); + let rt = Arc::new(rt.unwrap()); + // wait threads created + thread::sleep(Duration::from_millis(50)); + + let s = rt.stats(); + assert_eq!(5, s.alive_thread_num); + assert_eq!(5, s.idle_thread_num); + + rt.spawn(async { + thread::sleep(Duration::from_millis(50)); + }); + + thread::sleep(Duration::from_millis(10)); + let s = rt.stats(); + assert_eq!(5, s.alive_thread_num); + assert_eq!(4, s.idle_thread_num); + } + + #[test] + fn block_on_async() { + let rt = rt(); + + let out = rt.block_on(async { + let (tx, rx) = oneshot::channel(); + + thread::spawn(move || { + thread::sleep(Duration::from_millis(50)); + tx.send("ZOMG").unwrap(); + }); + + assert_ok!(rx.await) + }); + + assert_eq!(out, "ZOMG"); + } + + #[test] + fn spawn_from_blocking() { + let rt = rt(); + let rt1 = rt.clone(); + let out = rt.block_on(async move { + let rt2 = rt1.clone(); + let inner = assert_ok!( + rt1.spawn_blocking(move || { rt2.spawn(async move { "hello" }) }) + .await + ); + + assert_ok!(inner.await) + }); + + assert_eq!(out, "hello") + } + + #[test] + fn test_spawn_join() { + let rt = rt(); + let handle = rt.spawn(async { 1 + 1 }); + + assert_eq!(2, rt.block_on(handle).unwrap()); + } +} diff --git a/common_util/src/time.rs b/common_util/src/time.rs new file mode 100644 index 0000000000..1a44f98402 --- /dev/null +++ b/common_util/src/time.rs @@ -0,0 +1,68 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Time utilities + +// TODO(yingwen): Move to common_types ? 
+ +use std::{ + convert::TryInto, + time::{Duration, Instant}, +}; + +pub trait DurationExt { + /// Convert into u64. + /// + /// Returns u64::MAX if overflow + fn as_millis_u64(&self) -> u64; +} + +impl DurationExt for Duration { + #[inline] + fn as_millis_u64(&self) -> u64 { + match self.as_millis().try_into() { + Ok(v) => v, + Err(_) => u64::MAX, + } + } +} + +pub trait InstantExt { + fn saturating_elapsed(&self) -> Duration; +} + +impl InstantExt for Instant { + fn saturating_elapsed(&self) -> Duration { + Instant::now().saturating_duration_since(*self) + } +} + +#[inline] +pub fn secs_to_nanos(s: u64) -> u64 { + s * 1000000000 +} + +#[cfg(test)] +mod tests { + use std::thread; + + use super::*; + #[test] + fn test_as_mills_u64() { + let d = Duration::from_millis(100); + assert_eq!(100, d.as_millis_u64()); + + let d = Duration::from_secs(100); + assert_eq!(100000, d.as_millis_u64()); + } + + #[test] + fn test_saturating_elapsed() { + let ins = Instant::now(); + let one_hundred_mills = Duration::from_millis(100); + let error = 10; + thread::sleep(one_hundred_mills); + assert!(ins.saturating_elapsed().as_millis_u64() - 100 < error); + thread::sleep(one_hundred_mills); + assert!(ins.saturating_elapsed().as_millis_u64() - 200 < 2 * error); + } +} diff --git a/common_util/src/toml.rs b/common_util/src/toml.rs new file mode 100644 index 0000000000..58332dc4c2 --- /dev/null +++ b/common_util/src/toml.rs @@ -0,0 +1,104 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Toml config utilities. 
+ +use std::{fs::File, io::Read}; + +use serde::de; +use snafu::{Backtrace, ResultExt, Snafu}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "Failed to open file, path:{}, err:{}.\nBacktrace:\n{}", + path, + source, + backtrace + ))] + OpenFile { + path: String, + source: std::io::Error, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to read toml, path:{}, err:{}.\nBacktrace:\n{}", + path, + source, + backtrace + ))] + ReadToml { + path: String, + source: std::io::Error, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to parse toml, path:{}, err:{}.\nBacktrace:\n{}", + path, + source, + backtrace + ))] + ParseToml { + path: String, + source: toml::de::Error, + backtrace: Backtrace, + }, +} + +define_result!(Error); + +/// Read toml file from given `path` to `toml_buf`, then parsed it to `T` and +/// return. +pub fn parse_toml_from_path<'a, T>(path: &str, toml_buf: &'a mut String) -> Result +where + T: de::Deserialize<'a>, +{ + let mut file = File::open(path).context(OpenFile { path })?; + file.read_to_string(toml_buf).context(ReadToml { path })?; + + toml::from_str(toml_buf).context(ParseToml { path }) +} + +#[cfg(test)] +mod tests { + use std::io::Write; + + use serde_derive::Deserialize; + use tempfile::tempdir; + + use super::*; + + #[test] + fn test_parse_toml_from_path() { + let dir = tempdir().unwrap(); + let file_path = dir.path().join("test.toml"); + let path = file_path.to_str().unwrap(); + + let mut f = File::create(path).expect("Failed to create test config file"); + f.write_all(b"host=\"localhost\"\nport=1081") + .expect("Failed to write test config"); + + f.sync_all().expect("Failed to sync test config"); + + #[derive(Clone, Debug, Deserialize)] + struct TestConfig { + host: String, + port: u16, + } + let mut config = TestConfig { + host: "".to_string(), + port: 0, + }; + + assert_eq!("", config.host); + assert_eq!(0, config.port); + + let mut toml_str = String::new(); + + config = parse_toml_from_path(path, 
&mut toml_str).unwrap(); + + assert_eq!("localhost", config.host); + assert_eq!(1081, config.port); + } +} diff --git a/components/arena/Cargo.toml b/components/arena/Cargo.toml new file mode 100644 index 0000000000..ec70993c17 --- /dev/null +++ b/components/arena/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "arena" +version = "0.1.0" +authors = ["Ruihang Xia "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +parking_lot = "0.11.1" \ No newline at end of file diff --git a/components/arena/src/arena_trait.rs b/components/arena/src/arena_trait.rs new file mode 100644 index 0000000000..a8808fa38b --- /dev/null +++ b/components/arena/src/arena_trait.rs @@ -0,0 +1,70 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::{alloc::Layout, ptr::NonNull, sync::Arc}; + +/// Memory Arena trait. +/// +/// The trait itself provides and enforces no guarantee about alignment. It's +/// implementation's responsibility to cover. +/// +/// All memory-relavent methods (`alloc()` etc.) are not "unsafe". Compare with +/// "deallocate" which is not included in this trait, allocating is more safer +/// and not likely to run into UB. However in fact, playing with raw pointer is +/// always dangerous and needs to be careful for both who implements and uses +/// this trait. +pub trait Arena { + type Stats; + + // required methods + + /// Try to allocate required memory described by layout. Return a pointer of + /// allocated space in success, while `None` if failed. + fn try_alloc(&self, layout: Layout) -> Option>; + + /// Get arena's statistics. + fn stats(&self) -> Self::Stats; + + // provided methods + + /// Allocate required memory. Panic if failed. + fn alloc(&self, layout: Layout) -> NonNull { + self.try_alloc(layout).unwrap() + } +} + +/// Basic statistics of arena. Offers [bytes_allocated] +/// and [bytes_used]. 
+#[derive(Debug, Clone, Copy)] +pub struct BasicStats { + pub(crate) bytes_allocated: usize, + pub(crate) bytes_used: usize, +} + +impl BasicStats { + /// Total bytes allocated from system. + #[inline] + pub fn bytes_allocated(&self) -> usize { + self.bytes_allocated + } + + /// Total bytes allocated to user. + #[inline] + pub fn bytes_used(&self) -> usize { + self.bytes_used + } +} + +/// Collect memory usage from Arean +pub trait Collector { + /// Called when `bytes` bytes memory is allocated in arena. + fn on_alloc(&self, bytes: usize); + + /// Called when `bytes` bytes memory is used in arena. + fn on_used(&self, bytes: usize); + + /// Called when `allocated` bytes memory is released, and `used` bytes in + /// it. + fn on_free(&self, used: usize, allocated: usize); +} + +pub type CollectorRef = Arc; diff --git a/components/arena/src/fixed_size.rs b/components/arena/src/fixed_size.rs new file mode 100644 index 0000000000..f7305e6144 --- /dev/null +++ b/components/arena/src/fixed_size.rs @@ -0,0 +1,107 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::{ + alloc::{alloc, dealloc, Layout}, + ptr::NonNull, + sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, + }, +}; + +use crate::arena_trait::{Arena, BasicStats}; + +const DEFAULT_ALIGN: usize = 8; + +#[derive(Clone)] +pub struct FixedSizeArena { + core: Arc, +} + +impl FixedSizeArena { + pub fn with_capacity(cap: usize) -> Self { + Self { + core: Arc::new(Core::with_capacity(cap)), + } + } +} + +struct Core { + len: AtomicUsize, + cap: usize, + ptr: NonNull, +} + +impl Core { + /// # Safety + /// - alloc + /// See [std::alloc::alloc]. + /// - new_unchecked + /// `ptr` is allocated from allocator. 
+ fn with_capacity(cap: usize) -> Self { + let layout = Layout::from_size_align(cap as usize, DEFAULT_ALIGN).unwrap(); + let ptr = unsafe { alloc(layout) }; + + Self { + len: AtomicUsize::new(0), + cap, + ptr: unsafe { NonNull::new_unchecked(ptr) }, + } + } + + /// # Safety + /// `self.ptr` is allocated from allocator + fn try_alloc(&self, layout: Layout) -> Option> { + let layout = layout.pad_to_align(); + let size = layout.size(); + + let offset = self.len.fetch_add(size, Ordering::SeqCst) as usize; + if offset + size > self.cap { + self.len.fetch_sub(size, Ordering::SeqCst); + return None; + } + + unsafe { Some(NonNull::new_unchecked(self.ptr.as_ptr().add(size))) } + } +} + +impl Drop for Core { + /// Reclaim space pointed by `data`. + fn drop(&mut self) { + unsafe { + dealloc( + self.ptr.as_ptr(), + Layout::from_size_align_unchecked(self.cap, DEFAULT_ALIGN), + ) + } + } +} + +impl Arena for FixedSizeArena { + type Stats = BasicStats; + + fn try_alloc(&self, layout: Layout) -> Option> { + self.core.try_alloc(layout) + } + + fn stats(&self) -> Self::Stats { + Self::Stats { + bytes_used: self.core.cap, + bytes_allocated: self.core.len.load(Ordering::SeqCst) as usize, + } + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn capacity_overflow() { + let arena = FixedSizeArena::with_capacity(1024); + let layout = unsafe { Layout::from_size_align_unchecked(768, DEFAULT_ALIGN) }; + let _ = arena.alloc(layout); + + assert_eq!(None, arena.try_alloc(layout)); + } +} diff --git a/components/arena/src/lib.rs b/components/arena/src/lib.rs new file mode 100644 index 0000000000..963dd47933 --- /dev/null +++ b/components/arena/src/lib.rs @@ -0,0 +1,11 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! `Arena` Trait and implementations. 
+ +mod arena_trait; +mod fixed_size; +mod mono_inc; + +pub use arena_trait::{Arena, BasicStats, Collector, CollectorRef}; +pub use fixed_size::FixedSizeArena; +pub use mono_inc::{MonoIncArena, NoopCollector}; diff --git a/components/arena/src/mono_inc.rs b/components/arena/src/mono_inc.rs new file mode 100644 index 0000000000..0adc7253de --- /dev/null +++ b/components/arena/src/mono_inc.rs @@ -0,0 +1,347 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::{ + alloc::{alloc, dealloc, Layout}, + ptr::NonNull, + sync::Arc, +}; + +use parking_lot::Mutex; + +use crate::arena_trait::{Arena, BasicStats, Collector, CollectorRef}; + +/// The noop collector does nothing on alloc and free +pub struct NoopCollector; + +impl Collector for NoopCollector { + fn on_alloc(&self, _bytes: usize) {} + + fn on_used(&self, _bytes: usize) {} + + fn on_free(&self, _used: usize, _allocated: usize) {} +} + +const DEFAULT_ALIGN: usize = 8; + +/// A thread-safe arena. All allocated memory is aligned to 8. Organizes its +/// allocated memory as blocks. 
+#[derive(Clone)] +pub struct MonoIncArena { + core: Arc>, +} + +impl MonoIncArena { + pub fn new(regular_block_size: usize) -> Self { + Self { + core: Arc::new(Mutex::new(ArenaCore::new( + regular_block_size, + Arc::new(NoopCollector {}), + ))), + } + } + + pub fn with_collector(regular_block_size: usize, collector: CollectorRef) -> Self { + Self { + core: Arc::new(Mutex::new(ArenaCore::new(regular_block_size, collector))), + } + } +} + +impl Arena for MonoIncArena { + type Stats = BasicStats; + + fn try_alloc(&self, layout: Layout) -> Option> { + Some(self.core.lock().alloc(layout)) + } + + fn stats(&self) -> Self::Stats { + self.core.lock().stats + } + + fn alloc(&self, layout: Layout) -> NonNull { + self.core.lock().alloc(layout) + } +} + +struct ArenaCore { + collector: CollectorRef, + regular_layout: Layout, + regular_blocks: Vec, + special_blocks: Vec, + stats: BasicStats, +} + +impl ArenaCore { + /// # Safety + /// Required property is tested in debug assertions. + fn new(regular_block_size: usize, collector: CollectorRef) -> Self { + debug_assert_ne!(DEFAULT_ALIGN, 0); + debug_assert_eq!(DEFAULT_ALIGN & (DEFAULT_ALIGN - 1), 0); + // TODO(yingwen): Avoid panic. + let regular_layout = Layout::from_size_align(regular_block_size, DEFAULT_ALIGN).unwrap(); + let regular_blocks = vec![Block::new(regular_layout)]; + let special_blocks = vec![]; + let bytes = regular_layout.size(); + collector.on_alloc(bytes); + + Self { + collector, + regular_layout, + regular_blocks, + special_blocks, + stats: BasicStats { + bytes_allocated: bytes, + bytes_used: 0, + }, + } + } + + /// Input layout will be aligned. 
+ fn alloc(&mut self, layout: Layout) -> NonNull { + let layout = layout + .align_to(self.regular_layout.align()) + .unwrap() + .pad_to_align(); + let bytes = layout.size(); + // TODO(Ruihang): determine threshold + if layout.size() > self.regular_layout.size() { + self.stats.bytes_used += bytes; + self.collector.on_used(bytes); + Self::add_new_block( + layout, + &mut self.special_blocks, + &mut self.stats, + &self.collector, + ); + let block = self.special_blocks.last().unwrap(); + return block.data; + } + + self.stats.bytes_used += bytes; + self.collector.on_used(bytes); + if let Some(ptr) = self.try_alloc(layout) { + ptr + } else { + Self::add_new_block( + self.regular_layout, + &mut self.regular_blocks, + &mut self.stats, + &self.collector, + ); + self.try_alloc(layout).unwrap() + } + } + + /// # Safety + /// `regular_blocks` vector is guaranteed to contains at least one element. + fn try_alloc(&mut self, layout: Layout) -> Option> { + self.regular_blocks.last_mut().unwrap().alloc(layout) + } + + fn add_new_block( + layout: Layout, + container: &mut Vec, + stats: &mut BasicStats, + collector: &CollectorRef, + ) { + let new_block = Block::new(layout); + container.push(new_block); + // Update allocated stats once a new block has been allocated from the system. + stats.bytes_allocated += layout.size(); + collector.on_alloc(layout.size()); + } +} + +impl Drop for ArenaCore { + fn drop(&mut self) { + self.collector + .on_free(self.stats.bytes_used, self.stats.bytes_allocated); + } +} + +struct Block { + data: NonNull, + len: usize, + layout: Layout, +} + +impl Block { + /// Create a new block. Return the pointer of this new block. + /// + /// # Safety + /// See [std::alloc::alloc]. The allocated memory will be deallocated in + /// drop(). + fn new(layout: Layout) -> Block { + let data = unsafe { alloc(layout) }; + + Self { + data: NonNull::new(data).unwrap(), + len: 0, + layout, + } + } + + /// # Safety + /// ## ptr:add() + /// The added offset is checked before. 
+ /// ## NonNull::new_unchecked() + /// `ptr` is added from a NonNull. + fn alloc(&mut self, layout: Layout) -> Option> { + let size = layout.size(); + + if self.len + size <= self.layout.size() { + let ptr = unsafe { self.data.as_ptr().add(self.len) }; + self.len += size; + unsafe { Some(NonNull::new_unchecked(ptr)) } + } else { + None + } + } +} + +impl Drop for Block { + /// Reclaim space pointed by `data`. + fn drop(&mut self) { + unsafe { dealloc(self.data.as_ptr(), self.layout) } + } +} + +unsafe impl Send for Block {} +unsafe impl Sync for Block {} + +#[cfg(test)] +mod test { + use std::{ + mem, + sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, + }, + }; + + use super::*; + + /// # Safety: + /// Caller should check the input buf has enough space. + fn consume_buf_as_u64_slice(buf: NonNull, n: usize) { + unsafe { + let mut buf = buf.as_ptr() as *mut u64; + for i in 0..n { + *buf = i as u64; + buf = buf.add(1); + } + } + } + + #[test] + fn test_stats() { + let arena = MonoIncArena::new(1024 * 1024); + + // Size is 80 + let layout_slice = Layout::new::<[u64; 10]>().align_to(8).unwrap(); + for _ in 0..20 { + arena.alloc(layout_slice); + } + + assert_eq!(1024 * 1024, arena.stats().bytes_allocated()); + assert_eq!(1600, arena.stats().bytes_used()); + } + + struct MockCollector { + allocated: AtomicUsize, + used: AtomicUsize, + } + + impl Collector for MockCollector { + fn on_alloc(&self, bytes: usize) { + self.allocated.fetch_add(bytes, Ordering::Relaxed); + } + + fn on_used(&self, bytes: usize) { + self.used.fetch_add(bytes, Ordering::Relaxed); + } + + fn on_free(&self, _used: usize, _allocated: usize) {} + } + + #[test] + fn test_collector() { + let collector = Arc::new(MockCollector { + allocated: AtomicUsize::new(0), + used: AtomicUsize::new(0), + }); + + let arena = MonoIncArena::with_collector(1024 * 1024, collector.clone()); + + // Size is 80 + let layout_slice = Layout::new::<[u64; 10]>().align_to(8).unwrap(); + for _ in 0..20 { + 
arena.alloc(layout_slice); + } + + assert_eq!(1024 * 1024, collector.allocated.load(Ordering::Relaxed)); + assert_eq!(1600, collector.used.load(Ordering::Relaxed)); + } + + #[test] + fn alloc_small_slice() { + let arena = MonoIncArena::new(128); + + let layout_slice = Layout::new::<[u64; 10]>().align_to(8).unwrap(); + for _ in 0..20 { + let buf = arena.alloc(layout_slice); + consume_buf_as_u64_slice(buf, 10); + } + + assert_eq!(2560, arena.stats().bytes_allocated()); + assert_eq!(1600, arena.stats().bytes_used()); + } + + #[test] + fn alloc_huge_slice() { + let arena = MonoIncArena::new(128); + + let layout_slice = Layout::new::<[u64; 20]>().align_to(8).unwrap(); + for _ in 0..20 { + let buf = arena.alloc(layout_slice); + consume_buf_as_u64_slice(buf, 20); + } + + assert_eq!(3328, arena.stats().bytes_allocated()); + assert_eq!(3200, arena.stats().bytes_used()); + } + + #[test] + fn alloc_various_slice() { + let arena = MonoIncArena::new(1024); + const SIZES: [usize; 12] = [10, 200, 30, 1024, 512, 77, 89, 1, 3, 29, 16, 480]; + let total_used: usize = SIZES.iter().map(|v| v * 8).sum(); + + for size in &SIZES { + let layout_slice = Layout::from_size_align(mem::size_of::() * *size, 8).unwrap(); + let buf = arena.alloc(layout_slice); + consume_buf_as_u64_slice(buf, *size); + } + + assert_eq!(20800, arena.stats().bytes_allocated()); + assert_eq!(total_used, arena.stats().bytes_used()); + } + + #[test] + fn unaligned_alloc_request() { + let arena = MonoIncArena::new(1024); + + let regular_req_layout = Layout::from_size_align(mem::size_of::(), 2).unwrap(); + for _ in 0..10 { + let buf = arena.alloc(regular_req_layout).as_ptr() as usize; + assert_eq!(0, buf % DEFAULT_ALIGN); + } + + // 2003 is a prime number and 2004 % 8 != 0 + let special_req_layout = Layout::from_size_align(2003, 2).unwrap(); + for _ in 0..10 { + let buf = arena.alloc(special_req_layout).as_ptr() as usize; + assert_eq!(0, buf % DEFAULT_ALIGN); + } + } +} diff --git a/components/bytes/Cargo.toml 
b/components/bytes/Cargo.toml new file mode 100644 index 0000000000..0fecefbe8e --- /dev/null +++ b/components/bytes/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "bytes" +version = "0.1.0" +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +upstream = { version = "1.0", package = "bytes" } +snafu = { version ="0.6.10", features = ["backtraces"]} diff --git a/components/bytes/src/lib.rs b/components/bytes/src/lib.rs new file mode 100644 index 0000000000..015aabce0c --- /dev/null +++ b/components/bytes/src/lib.rs @@ -0,0 +1,368 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Provides utilities for byte arrays +//! +//! Use Bytes instead of Vec. Currently just re-export bytes crate + +use std::{ + fmt, + io::{self, Read, Write}, +}; + +use snafu::{ensure, Backtrace, GenerateBacktrace, Snafu}; +// Should not use bytes crate outside of this mod so we can replace the actual +// implementations if needed +pub use upstream::{Buf, BufMut, Bytes, BytesMut}; + +/// Error of MemBuf/MemBufMut +/// +/// We do not use `std::io::Error` because it is too large +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to fill whole buffer.\nBacktrace:\n{}", backtrace))] + UnexpectedEof { backtrace: Backtrace }, + + #[snafu(display("Failed to write whole buffer.\nBacktrace:\n{}", backtrace))] + WouldOverflow { backtrace: Backtrace }, +} + +pub type Result = std::result::Result; + +/// Now is just an alias to `Vec`, prefer to use this alias instead of +/// `Vec` +pub type ByteVec = Vec; + +/// Read bytes from a buffer. +/// +/// Unlike `bytes::Buf`, the underlying storage is in contiguous memory +pub trait MemBuf: fmt::Debug { + /// Return the remaining byte slice + fn remaining_slice(&self) -> &[u8]; + + /// Advance the internal cursor of the buffer, panic if overflow + fn must_advance(&mut self, cnt: usize); + + /// Read bytes from self into dst. 
+ /// + /// The cursor is advanced by the number of bytes copied. + /// + /// Returns error if self does not have enough remaining bytes to fill dst. + fn read_to_slice(&mut self, dst: &mut [u8]) -> Result<()>; + + /// Gets an unsigned 8 bit integer from self and advance current position + /// + /// Returns error if the capacity is not enough + fn read_u8(&mut self) -> Result { + let mut buf = [0; 1]; + self.read_to_slice(&mut buf)?; + Ok(buf[0]) + } + + /// Gets an unsighed 32 bit integer from self in big-endian byte order and + /// advance current position + /// + /// Returns error if the capacity is not enough + fn read_u32(&mut self) -> Result { + let mut buf = [0; 4]; + self.read_to_slice(&mut buf)?; + Ok(u32::from_be_bytes(buf)) + } + + /// Gets an unsighed 64 bit integer from self in big-endian byte order and + /// advance current position + /// + /// Returns error if the capacity is not enough + fn read_u64(&mut self) -> Result { + let mut buf = [0; 8]; + self.read_to_slice(&mut buf)?; + Ok(u64::from_be_bytes(buf)) + } + + fn read_f64(&mut self) -> Result { + let mut buf = [0; 8]; + self.read_to_slice(&mut buf)?; + Ok(f64::from_be_bytes(buf)) + } + + fn read_f32(&mut self) -> Result { + let mut buf = [0; 4]; + self.read_to_slice(&mut buf)?; + Ok(f32::from_be_bytes(buf)) + } +} + +/// Write bytes to a buffer +/// +/// Unlike `bytes::BufMut`, write operations may fail +pub trait MemBufMut: fmt::Debug { + /// Write bytes into self from src, advance the buffer position + /// + /// Returns error if the capacity is not enough + fn write_slice(&mut self, src: &[u8]) -> Result<()>; + + /// Write an unsigned 8 bit integer to self, advance the buffer position + /// + /// Returns error if the capacity is not enough + fn write_u8(&mut self, n: u8) -> Result<()> { + let src = [n]; + self.write_slice(&src) + } + + /// Writes an unsigned 32 bit integer to self in the big-endian byte order, + /// advance the buffer position + /// + /// Returns error if the capacity is not 
enough + fn write_u32(&mut self, n: u32) -> Result<()> { + self.write_slice(&n.to_be_bytes()) + } + + /// Writes an unsigned 64 bit integer to self in the big-endian byte order, + /// advance the buffer position + /// + /// Returns error if the capacity is not enough + fn write_u64(&mut self, n: u64) -> Result<()> { + self.write_slice(&n.to_be_bytes()) + } + + /// Writes an float 64 to self in the big-endian byte order, + /// advance the buffer position + /// + /// Returns error if the capacity is not enough + fn write_f64(&mut self, n: f64) -> Result<()> { + self.write_slice(&n.to_be_bytes()) + } + + /// Writes an float 32 to self in the big-endian byte order, + /// advance the buffer position + /// + /// Returns error if the capacity is not enough + fn write_f32(&mut self, n: f32) -> Result<()> { + self.write_slice(&n.to_be_bytes()) + } +} + +macro_rules! impl_mem_buf { + () => { + #[inline] + fn remaining_slice(&self) -> &[u8] { + &self + } + + #[inline] + fn must_advance(&mut self, cnt: usize) { + self.advance(cnt); + } + + #[inline] + fn read_to_slice(&mut self, dst: &mut [u8]) -> Result<()> { + ensure!(self.remaining() >= dst.len(), UnexpectedEof); + self.copy_to_slice(dst); + Ok(()) + } + }; +} + +impl MemBuf for Bytes { + impl_mem_buf!(); +} + +impl MemBuf for BytesMut { + impl_mem_buf!(); +} + +impl MemBufMut for BytesMut { + fn write_slice(&mut self, src: &[u8]) -> Result<()> { + ensure!(self.remaining_mut() >= src.len(), WouldOverflow); + self.put_slice(src); + Ok(()) + } +} + +impl MemBuf for &[u8] { + #[inline] + fn remaining_slice(&self) -> &[u8] { + self + } + + #[inline] + fn must_advance(&mut self, cnt: usize) { + *self = &self[cnt..]; + } + + #[inline] + fn read_to_slice(&mut self, dst: &mut [u8]) -> Result<()> { + // slice::read_exact() only throws UnexpectedEof error, see + // + // https://doc.rust-lang.org/src/std/io/impls.rs.html#264-281 + self.read_exact(dst).map_err(|_| Error::UnexpectedEof { + backtrace: Backtrace::generate(), + }) + } +} + 
+impl MemBufMut for &mut [u8] { + fn write_slice(&mut self, src: &[u8]) -> Result<()> { + // slice::write_all() actually wont fail, see + // + // https://doc.rust-lang.org/src/std/io/impls.rs.html#344-350 + self.write_all(src).map_err(|_| Error::WouldOverflow { + backtrace: Backtrace::generate(), + }) + } +} + +impl MemBufMut for Vec { + fn write_slice(&mut self, src: &[u8]) -> Result<()> { + self.extend_from_slice(src); + Ok(()) + } +} + +/// A `MemBufMut` adapter which implements [std::io::Write] for the inner value +#[derive(Debug)] +pub struct Writer<'a, B> { + buf: &'a mut B, +} + +impl<'a, B: MemBufMut> Writer<'a, B> { + /// Create a new Writer from a mut ref to buf + pub fn new(buf: &'a mut B) -> Self { + Self { buf } + } +} + +impl<'a, B: MemBufMut> Write for Writer<'a, B> { + fn write(&mut self, src: &[u8]) -> io::Result { + self.buf.write_slice(src).map_err(|e| match &e { + Error::UnexpectedEof { .. } => io::Error::new(io::ErrorKind::UnexpectedEof, e), + Error::WouldOverflow { .. } => io::Error::new(io::ErrorKind::WriteZero, e), + })?; + Ok(src.len()) + } + + fn flush(&mut self) -> io::Result<()> { + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_bytes_mut_mem_buf() { + let hello = b"hello"; + let mut buffer = BytesMut::new(); + buffer.write_u8(8).unwrap(); + buffer.write_u64(u64::MAX - 5).unwrap(); + buffer.write_slice(hello).unwrap(); + + assert_eq!(&buffer, buffer.remaining_slice()); + assert_eq!(8, buffer.read_u8().unwrap()); + assert_eq!(u64::MAX - 5, buffer.read_u64().unwrap()); + let mut dst = [0; 5]; + buffer.read_to_slice(&mut dst).unwrap(); + assert_eq!(hello, &dst); + + assert!(buffer.remaining_slice().is_empty()); + } + + #[test] + fn test_bytes_mut_empty() { + let mut buffer = BytesMut::new(); + assert!(buffer.remaining_slice().is_empty()); + assert!(matches!(buffer.read_u8(), Err(Error::UnexpectedEof { .. }))); + assert!(matches!( + buffer.read_u64(), + Err(Error::UnexpectedEof { .. 
}) + )); + } + + #[test] + fn test_bytes_mem_buf() { + let mut buffer = Bytes::from_static(b"hello world"); + assert_eq!(b"hello world", buffer.remaining_slice()); + + let mut dst = [0; 5]; + buffer.read_to_slice(&mut dst).unwrap(); + assert_eq!(b"hello", &dst); + + assert_eq!(b" world", buffer.remaining_slice()); + buffer.must_advance(1); + assert_eq!(b"world", buffer.remaining_slice()); + + let mut dst = [0; 50]; + assert!(matches!( + buffer.read_to_slice(&mut dst), + Err(Error::UnexpectedEof { .. }) + )); + } + + #[test] + fn test_slice_mem_buf() { + let hello = b"hello world"; + let mut buf = &hello[..]; + + assert_eq!(hello, buf.remaining_slice()); + let mut dst = [0; 6]; + buf.read_to_slice(&mut dst).unwrap(); + assert_eq!(b"hello ", &dst); + assert_eq!(b"world", buf.remaining_slice()); + + buf.must_advance(1); + assert_eq!(b"orld", buf.remaining_slice()); + } + + #[test] + fn test_slice_mem_buf_mut() { + let mut dst = [b'x'; 11]; + { + let mut buf = &mut dst[..]; + + buf.write_slice(b"abcde").unwrap(); + assert_eq!(b"abcdexxxxxx", &dst); + } + + { + let mut buf = &mut dst[..]; + + buf.write_slice(b"hello").unwrap(); + buf.write_slice(b" world").unwrap(); + assert_eq!(b"hello world", &dst); + } + + let mut dst = [0; 3]; + let mut buf = &mut dst[..]; + assert!(matches!( + buf.write_slice(b"a long long long slice"), + Err(Error::WouldOverflow { .. 
}) + )); + } + + #[test] + fn test_vec_mem_buf_mut() { + let mut buf = Vec::new(); + buf.write_slice(b"hello").unwrap(); + assert_eq!(b"hello", &buf[..]); + } + + #[test] + fn test_writer_write() { + let mut buf = Vec::new(); + let mut writer = Writer::new(&mut buf); + writer.write_all(b"he").unwrap(); + writer.write_all(b"llo").unwrap(); + assert_eq!(b"hello", &buf[..]); + } + + #[test] + fn test_writer_overflow() { + let mut dst = [0; 3]; + let mut buf = &mut dst[..]; + let mut writer = Writer::new(&mut buf); + assert_eq!( + io::ErrorKind::WriteZero, + writer.write_all(b"0123456789").err().unwrap().kind() + ); + } +} diff --git a/components/logger/Cargo.toml b/components/logger/Cargo.toml new file mode 100644 index 0000000000..9fdc938340 --- /dev/null +++ b/components/logger/Cargo.toml @@ -0,0 +1,21 @@ +[package] +name = "logger" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +chrono = "0.4" +grpcio = { path = "../../grpcio" } +log = "0.4" +slog = "2.7" +slog-async = "2.6" +slog-term = "2.8" +slog_derive = "0.2" + +[dependencies.slog-global] +version = "0.1" +git = "https://github.com/breeswish/slog-global.git" +rev = "0e23a5baff302a9d7bccd85f8f31e43339c2f2c1" diff --git a/components/logger/src/lib.rs b/components/logger/src/lib.rs new file mode 100644 index 0000000000..f0317ab586 --- /dev/null +++ b/components/logger/src/lib.rs @@ -0,0 +1,422 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +use std::{ + fmt, + fs::{File, OpenOptions}, + io, + str::FromStr, + sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, Mutex, + }, +}; + +use log::{info, SetLoggerError}; +pub use slog::Level; +use slog::{slog_o, Drain, Key, OwnedKVList, Record, KV}; +use slog_async::{Async, OverflowStrategy}; +use slog_term::{Decorator, PlainDecorator, RecordDecorator, TermDecorator}; + +const ASYNC_CHAN_SIZE: usize = 102400; +// This format is required for xflush monitor +const TIMESTAMP_FORMAT: &str = "%Y-%m-%d %H:%M:%S%.3f"; + +// Thanks to tikv +// https://github.com/tikv/tikv/blob/eaeb39a2c85684de08c48cf4b9426b3faf4defe6/components/tikv_util/src/logger/mod.rs + +pub fn convert_slog_level_to_log_level(lv: Level) -> log::Level { + match lv { + Level::Critical | Level::Error => log::Level::Error, + Level::Warning => log::Level::Warn, + Level::Debug => log::Level::Debug, + Level::Trace => log::Level::Trace, + Level::Info => log::Level::Info, + } +} + +pub fn convert_log_level_to_slog_level(lv: log::Level) -> Level { + match lv { + log::Level::Error => Level::Error, + log::Level::Warn => Level::Warning, + log::Level::Debug => Level::Debug, + log::Level::Trace => Level::Trace, + log::Level::Info => Level::Info, + } +} + +// The `to_string()` function of `slog::Level` produces values like `erro` and +// `trce` instead of the full words. This produces the full word. 
+fn get_string_by_level(lv: Level) -> &'static str { + match lv { + Level::Critical => "critical", + Level::Error => "error", + Level::Warning => "warn", + Level::Debug => "debug", + Level::Trace => "trace", + Level::Info => "info", + } +} + +pub fn term_drainer() -> CeresFormat { + let decorator = TermDecorator::new().stdout().build(); + CeresFormat::new(decorator) +} + +pub fn file_drainer(path: &Option) -> Option>> { + match path { + Some(path) => { + let file = OpenOptions::new() + .create(true) + .append(true) + .open(path) + .unwrap(); + let decorator = PlainDecorator::new(file); + Some(CeresFormat::new(decorator)) + } + None => None, + } +} + +// dispacher +pub struct LogDispatcher { + normal: N, +} + +impl LogDispatcher { + pub fn new(normal: N) -> Self { + Self { normal } + } +} + +impl Drain for LogDispatcher +where + N: Drain, +{ + type Err = io::Error; + type Ok = (); + + fn log(&self, record: &Record, values: &OwnedKVList) -> Result { + self.normal.log(record, values) + } +} + +pub fn init_log( + drain: D, + level: Level, + use_async: bool, + async_log_channel_len: i32, + init_stdlog: bool, +) -> Result +where + D: Drain + Send + 'static, + ::Err: std::fmt::Display, +{ + let runtime_level = RuntimeLevel::new(level); + // TODO(yingwen): Consider print the error instead of just ignoring it? 
+ let root_logger = if use_async { + let drain = if async_log_channel_len <= 0 { + Async::new(drain.ignore_res()) + .chan_size(ASYNC_CHAN_SIZE) + .overflow_strategy(OverflowStrategy::Block) + .build() + } else { + Async::new(drain.ignore_res()) + .chan_size(async_log_channel_len as usize) + .build() + }; + let drain = RuntimeLevelFilter::new(drain, runtime_level.clone()); + slog::Logger::root(drain.ignore_res(), slog_o!()) + } else { + let drain = RuntimeLevelFilter::new(Mutex::new(drain), runtime_level.clone()); + slog::Logger::root(drain.ignore_res(), slog_o!()) + }; + + slog_global::set_global(root_logger); + if init_stdlog { + slog_global::redirect_std_log(Some(level))?; + grpcio::redirect_log(); + } + + Ok(runtime_level) +} + +// e.g. +// ```text +// 2020-01-20 13:00:14.998 INFO [src/engine/rocksdb/rocks_kv.rs:394] RocksKV::open_with_op start, name:autogen +// ``` +pub struct CeresFormat +where + D: Decorator, +{ + decorator: D, +} + +impl CeresFormat +where + D: Decorator, +{ + fn new(decorator: D) -> Self { + Self { decorator } + } +} + +impl Drain for CeresFormat +where + D: Decorator, +{ + type Err = io::Error; + type Ok = (); + + fn log(&self, record: &Record, values: &OwnedKVList) -> Result { + self.decorator.with_record(record, values, |decorator| { + write_log_header(decorator, record)?; + write_log_msg(decorator, record)?; + write_log_fields(decorator, record, values)?; + + decorator.start_whitespace()?; + writeln!(decorator)?; + + decorator.flush()?; + + Ok(()) + }) + } +} + +#[derive(Clone)] +pub struct RuntimeLevel { + level: Arc, + default_level: Level, +} + +impl RuntimeLevel { + fn new(default_level: Level) -> Self { + Self { + level: Arc::new(AtomicUsize::new(default_level.as_usize())), + default_level, + } + } + + #[inline] + pub fn current_level(&self) -> Level { + Level::from_usize(self.level.load(Ordering::Relaxed)).unwrap_or(self.default_level) + } + + pub fn set_level(&self, level: Level) { + self.level.store(level.as_usize(), 
Ordering::Relaxed); + // Log level of std log is not changed unless we call `log::set_max_level` + log::set_max_level(convert_slog_level_to_log_level(level).to_level_filter()); + + info!( + "RuntimeLevel::set_level log level changed to {}", + get_string_by_level(level) + ); + } + + #[inline] + pub fn reset(&self) { + self.set_level(self.default_level); + } + + #[inline] + pub fn default_level(&self) -> Level { + self.default_level + } + + #[inline] + pub fn current_level_str(&self) -> &str { + get_string_by_level(self.current_level()) + } + + pub fn set_level_by_str(&self, level_str: &str) -> Result<(), String> { + Level::from_str(level_str) + .map_err(|_| format!("Invalid level {}", level_str)) + .and_then(|level| match level { + Level::Trace | Level::Debug | Level::Info => Ok(level), + _ => Err("Only allow to change log level to ".to_owned()), + }) + .map(|level| self.set_level(level)) + } +} + +struct RuntimeLevelFilter { + drain: D, + runtime_level: RuntimeLevel, +} + +impl RuntimeLevelFilter { + fn new(drain: D, runtime_level: RuntimeLevel) -> Self { + Self { + drain, + runtime_level, + } + } +} + +impl Drain for RuntimeLevelFilter +where + D: Drain, +{ + type Err = D::Err; + type Ok = Option; + + fn log(&self, record: &Record, values: &OwnedKVList) -> Result { + let current_level = self.runtime_level.current_level(); + + if record.level().is_at_least(current_level) { + Ok(Some(self.drain.log(record, values)?)) + } else { + Ok(None) + } + } +} + +fn write_log_header(decorator: &mut dyn RecordDecorator, record: &Record<'_>) -> io::Result<()> { + decorator.start_timestamp()?; + write!( + decorator, + "{}", + chrono::Local::now().format(TIMESTAMP_FORMAT) + )?; + + decorator.start_whitespace()?; + write!(decorator, " ")?; + + decorator.start_level()?; + write!(decorator, "{}", record.level().as_short_str())?; + + decorator.start_whitespace()?; + write!(decorator, " ")?; + + // Writes source file info. 
+ decorator.start_msg()?; // There is no `start_file()` or `start_line()`. + write!(decorator, "[{}:{}]", record.file(), record.line())?; + + Ok(()) +} + +fn write_log_msg(decorator: &mut dyn RecordDecorator, record: &Record<'_>) -> io::Result<()> { + decorator.start_whitespace()?; + write!(decorator, " ")?; + + decorator.start_msg()?; + write!(decorator, "{}", record.msg())?; + + Ok(()) +} + +fn write_log_fields( + decorator: &mut dyn RecordDecorator, + record: &Record<'_>, + values: &OwnedKVList, +) -> io::Result<()> { + let mut serializer = Serializer::new(decorator); + + record.kv().serialize(record, &mut serializer)?; + + values.serialize(record, &mut serializer)?; + + serializer.finish()?; + + Ok(()) +} + +struct Serializer<'a> { + decorator: &'a mut dyn RecordDecorator, +} + +impl<'a> Serializer<'a> { + fn new(decorator: &'a mut dyn RecordDecorator) -> Self { + Serializer { decorator } + } + + fn write_whitespace(&mut self) -> io::Result<()> { + self.decorator.start_whitespace()?; + write!(self.decorator, " ")?; + Ok(()) + } + + fn finish(self) -> io::Result<()> { + Ok(()) + } +} + +impl<'a> Drop for Serializer<'a> { + fn drop(&mut self) {} +} + +impl<'a> slog::Serializer for Serializer<'a> { + fn emit_none(&mut self, key: Key) -> slog::Result { + self.emit_arguments(key, &format_args!("None")) + } + + fn emit_arguments(&mut self, key: Key, val: &fmt::Arguments<'_>) -> slog::Result { + self.write_whitespace()?; + + // Write key + write!(self.decorator, "[")?; + self.decorator.start_key()?; + write!(self.decorator, "{}", key)?; + + // Write separator + self.decorator.start_separator()?; + write!(self.decorator, ":")?; + + // Write value + self.decorator.start_value()?; + write!(self.decorator, "{}", val)?; + self.decorator.reset()?; + write!(self.decorator, "]")?; + + Ok(()) + } +} + +pub fn init_test_logger() { + // level + let level = Level::Info; + + // drain + let term_drain = term_drainer(); + let drain = LogDispatcher::new(term_drain); + + // Use async 
and init stdlog + let _ = init_log(drain, level, false, 12400, true); +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_runtime_level() { + let runtime_level = RuntimeLevel::new(Level::Info); + + assert_eq!(runtime_level.current_level(), Level::Info); + assert_eq!(runtime_level.default_level(), Level::Info); + + runtime_level.set_level(Level::Debug); + assert_eq!(runtime_level.current_level(), Level::Debug); + assert_eq!(runtime_level.default_level(), Level::Info); + + runtime_level.reset(); + assert_eq!(runtime_level.current_level(), Level::Info); + assert_eq!(runtime_level.current_level_str(), "info"); + + runtime_level.set_level_by_str("trace").unwrap(); + assert_eq!(runtime_level.current_level(), Level::Trace); + runtime_level.set_level_by_str("debug").unwrap(); + assert_eq!(runtime_level.current_level(), Level::Debug); + runtime_level.set_level_by_str("info").unwrap(); + assert_eq!(runtime_level.current_level(), Level::Info); + + assert!(runtime_level.set_level_by_str("warn").is_err()); + assert_eq!(runtime_level.current_level(), Level::Info); + assert!(runtime_level.set_level_by_str("warning").is_err()); + assert!(runtime_level.set_level_by_str("critical").is_err()); + assert!(runtime_level.set_level_by_str("error").is_err()); + assert!(runtime_level.set_level_by_str("no such level").is_err()); + + assert_eq!(runtime_level.current_level(), Level::Info); + } +} diff --git a/components/object_store/Cargo.toml b/components/object_store/Cargo.toml new file mode 100644 index 0000000000..787f330dcd --- /dev/null +++ b/components/object_store/Cargo.toml @@ -0,0 +1,21 @@ +[package] +name = "object_store" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" + +[dependencies] # In alphabetical order +async-trait = "0.1.42" +bytes = "1.0" +common_util = { path = "../../common_util" } +futures = "0.3" +itertools = "0.10" +percent-encoding = "2.1" +snafu = { version = "0.6.10", features = ["futures", "backtraces"] } +tokio = { version = 
"1.0", features = ["macros", "fs"] } +# Filesystem integration +tokio-util = { version = "0.6.3", features = [ "io","compat" ] } +walkdir = "2.3.2" + +[dev-dependencies] +tempfile = "3.1.0" diff --git a/components/object_store/src/disk.rs b/components/object_store/src/disk.rs new file mode 100644 index 0000000000..14cdbb9cc0 --- /dev/null +++ b/components/object_store/src/disk.rs @@ -0,0 +1,389 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! This module contains the IOx implementation for using local disk as the +//! object store. +use std::{collections::BTreeSet, convert::TryFrom, io, path::PathBuf}; + +use async_trait::async_trait; +use futures::{ + stream, + stream::{BoxStream, StreamExt}, + AsyncRead, +}; +use snafu::{Backtrace, GenerateBacktrace, OptionExt, ResultExt, Snafu}; +use tokio::fs; +use tokio_util::compat::{Compat, FuturesAsyncReadCompatExt}; +use walkdir::WalkDir; + +use crate::{path::file::FilePath, ListResult, ObjectMeta, ObjectStore}; + +/// A specialized `Result` for filesystem object store-related errors +pub type Result = std::result::Result; + +/// A specialized `Error` for filesystem object store-related errors +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "Expected streamed data to have length {}, got {}.\nBacktrace:\n{}", + expected, + actual, + backtrace + ))] + DataDoesNotMatchLength { + expected: usize, + actual: usize, + backtrace: Backtrace, + }, + + #[snafu(display("File size for {} did not fit in a usize: {}.\nBacktrace:\n{}", path.display(), source, backtrace))] + FileSizeOverflowedUsize { + path: PathBuf, + source: std::num::TryFromIntError, + backtrace: Backtrace, + }, + + #[snafu(display("Unable to walk dir: {}.\nBacktrace:\n{}", source, backtrace))] + UnableToWalkDir { + source: walkdir::Error, + backtrace: Backtrace, + }, + + #[snafu(display("Unable to access metadata for {}: {}.\nBacktrace:\n{}", path.display(), source, backtrace))] + UnableToAccessMetadata { + path: PathBuf, 
+ source: walkdir::Error, + backtrace: Backtrace, + }, + + #[snafu(display("Unable to copy data to file: {}.\nBacktrace:\n{}", source, backtrace))] + UnableToCopyDataToFile { + source: io::Error, + backtrace: Backtrace, + }, + + #[snafu(display("Unable to create dir {}: {}.\nBacktrace:\n{}", path.display(), source, backtrace))] + UnableToCreateDir { + source: io::Error, + path: PathBuf, + backtrace: Backtrace, + }, + + #[snafu(display("Unable to create file {}: {}.\nBacktrace:\n{}", path.display(), err, backtrace))] + UnableToCreateFile { + path: PathBuf, + err: io::Error, + backtrace: Backtrace, + }, + + #[snafu(display("Unable to delete file {}: {}.\nBacktrace:\n{}", path.display(), source, backtrace))] + UnableToDeleteFile { + source: io::Error, + path: PathBuf, + backtrace: Backtrace, + }, + + #[snafu(display("Unable to open file {}: {}.\nBacktrace:\n{}", path.display(), source, backtrace))] + UnableToOpenFile { + source: io::Error, + path: PathBuf, + backtrace: Backtrace, + }, + + #[snafu(display("Unable to read data from file {}: {}.\nBacktrace:\n{}", path.display(), source, backtrace))] + UnableToReadBytes { + source: io::Error, + path: PathBuf, + backtrace: Backtrace, + }, + + #[snafu(display( + "Unable to stream data from the request into memory: {}.\nBacktrace:\n{}", + source, + backtrace + ))] + UnableToStreamDataIntoMemory { + source: std::io::Error, + backtrace: Backtrace, + }, +} + +/// Local filesystem storage suitable for testing or for opting out of using a +/// cloud storage provider. 
+#[derive(Debug)] +pub struct File { + root: FilePath, +} + +#[async_trait] +impl ObjectStore for File { + type Error = Error; + type Path = FilePath; + type Reader = Compat; + + fn new_path(&self) -> Self::Path { + FilePath::default() + } + + async fn put( + &self, + location: &Self::Path, + bytes: R, + _length: Option, + ) -> Result<(), Self::Error> + where + R: AsyncRead + Send + Unpin, + { + let path = self.path(location); + + let mut file = match fs::File::create(&path).await { + Ok(f) => f, + Err(err) if err.kind() == std::io::ErrorKind::NotFound => { + let parent = path + .parent() + .context(UnableToCreateFile { path: &path, err })?; + fs::create_dir_all(&parent) + .await + .context(UnableToCreateDir { path: parent })?; + + match fs::File::create(&path).await { + Ok(f) => f, + Err(err) => return UnableToCreateFile { path, err }.fail(), + } + } + Err(err) => return UnableToCreateFile { path, err }.fail(), + }; + + tokio::io::copy(&mut bytes.compat(), &mut file) + .await + .context(UnableToCopyDataToFile)?; + + Ok(()) + } + + async fn get(&self, location: &Self::Path) -> Result { + let path = self.path(location); + let file = fs::File::open(&path) + .await + .context(UnableToOpenFile { path: &path })?; + Ok(file.into_std().await) + } + + async fn delete(&self, location: &Self::Path) -> Result<(), Self::Error> { + let path = self.path(location); + fs::remove_file(&path) + .await + .context(UnableToDeleteFile { path })?; + Ok(()) + } + + async fn list<'a>( + &'a self, + prefix: Option<&'a Self::Path>, + ) -> Result, Self::Error>>, Self::Error> { + let root_path = self.root.to_raw(); + let walkdir = WalkDir::new(&root_path) + // Don't include the root directory itself + .min_depth(1); + + let s = + walkdir.into_iter().filter_map(move |result_dir_entry| { + match convert_walkdir_result(result_dir_entry) { + Err(e) => Some(Err(e)), + Ok(None) => None, + Ok(entry @ Some(_)) => entry + .filter(|dir_entry| dir_entry.file_type().is_file()) + .map(|file| { + let 
relative_path = file.path().strip_prefix(&root_path).expect( + "Must start with root path because this came from walking the root", + ); + FilePath::raw(relative_path, false) + }) + .filter(|name| prefix.map_or(true, |p| name.prefix_matches(p))) + .map(|name| Ok(vec![name])), + } + }); + + Ok(stream::iter(s).boxed()) + } + + async fn list_with_delimiter( + &self, + prefix: &Self::Path, + ) -> Result, Self::Error> { + // Always treat prefix as relative because the list operations don't know + // anything about where on disk the root of this object store is; they + // only care about what's within this object store's directory. See + // documentation for `push_path`: it deliberately does *not* behave as + // `PathBuf::push` does: there is no way to replace the root. So even if + // `prefix` isn't relative, we treat it as such here. + let mut resolved_prefix = self.root.clone(); + resolved_prefix.push_path(prefix); + + // It is valid to specify a prefix with directories `[foo, bar]` and filename + // `baz`, in which case we want to treat it like a glob for + // `foo/bar/baz*` and there may not actually be a file or directory + // named `foo/bar/baz`. We want to look at all the entries in + // `foo/bar/`, so remove the file name. + let mut search_path = resolved_prefix.clone(); + search_path.unset_file_name(); + + let walkdir = WalkDir::new(&search_path.to_raw()) + .min_depth(1) + .max_depth(1); + + let mut common_prefixes = BTreeSet::new(); + let mut objects = Vec::new(); + + let root_path = self.root.to_raw(); + for entry_res in walkdir.into_iter().map(convert_walkdir_result) { + if let Some(entry) = entry_res? 
{ + let entry_location = FilePath::raw(entry.path(), false); + + if entry_location.prefix_matches(&resolved_prefix) { + let metadata = entry + .metadata() + .context(UnableToAccessMetadata { path: entry.path() })?; + + if metadata.is_dir() { + let parts = entry_location + .parts_after_prefix(&resolved_prefix) + .expect("must have prefix because of the if prefix_matches condition"); + + let mut relative_location = prefix.to_owned(); + relative_location.push_part_as_dir(&parts[0]); + common_prefixes.insert(relative_location); + } else { + let path = entry + .path() + .strip_prefix(&root_path) + .expect("must have prefix because of the if prefix_matches condition"); + let location = FilePath::raw(path, false); + + let last_modified = metadata + .modified() + .expect("Modified file time should be supported on this platform"); + let size = usize::try_from(metadata.len()) + .context(FileSizeOverflowedUsize { path: entry.path() })?; + + objects.push(ObjectMeta { + location, + last_modified, + size, + }); + } + } + } + } + + Ok(ListResult { + next_token: None, + common_prefixes: common_prefixes.into_iter().collect(), + objects, + }) + } +} + +impl File { + /// Create new filesystem storage. + pub fn new(root: impl Into) -> Self { + Self { + root: FilePath::raw(root, true), + } + } + + /// Return full path of the given location + pub fn path(&self, location: &FilePath) -> PathBuf { + let mut path = self.root.clone(); + path.push_path(location); + path.to_raw() + } +} + +/// Convert walkdir results and converts not-found errors into `None`. 
+fn convert_walkdir_result( + res: std::result::Result, +) -> Result> { + match res { + Ok(entry) => Ok(Some(entry)), + Err(walkdir_err) => match walkdir_err.io_error() { + Some(io_err) => match io_err.kind() { + io::ErrorKind::NotFound => Ok(None), + _ => Err(Error::UnableToWalkDir { + source: walkdir_err, + backtrace: Backtrace::generate(), + }), + }, + None => Err(Error::UnableToWalkDir { + source: walkdir_err, + backtrace: Backtrace::generate(), + }), + }, + } +} + +#[cfg(test)] +mod tests { + use std::io::Read; + + use bytes::Bytes; + use tempfile::TempDir; + + use super::*; + use crate::{ + path::ObjectStorePath, + tests::{list_with_delimiter, put_get_delete_list}, + ObjectStore, + }; + + #[tokio::test] + async fn file_test() { + let root = TempDir::new().unwrap(); + let file = File::new(root.path()); + + put_get_delete_list(&file).await.unwrap(); + list_with_delimiter(&file).await.unwrap(); + } + + #[tokio::test] + async fn creates_dir_if_not_present() { + let root = TempDir::new().unwrap(); + let file = File::new(root.path()); + + let data = Bytes::from("arbitrary data"); + let mut location = file.new_path(); + location.push_all_dirs(&["nested", "file", "test_file"]); + + file.put(&location, Box::new(data.as_ref()), Some(data.len())) + .await + .unwrap(); + + let mut read_data = Vec::with_capacity(data.len()); + file.get(&location) + .await + .unwrap() + .read_to_end(&mut read_data) + .unwrap(); + assert_eq!(&*read_data, data); + } + + #[tokio::test] + async fn unknown_length() { + let root = TempDir::new().unwrap(); + let file = File::new(root.path()); + + let data = Bytes::from("arbitrary data"); + + let mut location = file.new_path(); + location.set_file_name("some_file"); + file.put(&location, Box::new(data.as_ref()), None) + .await + .unwrap(); + let mut read_data = Vec::with_capacity(data.len()); + file.get(&location) + .await + .unwrap() + .read_to_end(&mut read_data) + .unwrap(); + assert_eq!(&*read_data, data); + } +} diff --git 
a/components/object_store/src/lib.rs b/components/object_store/src/lib.rs new file mode 100644 index 0000000000..326a68459c --- /dev/null +++ b/components/object_store/src/lib.rs @@ -0,0 +1,329 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! # object_store +//! +//! This crate provides APIs for interacting with object storage services. It +//! currently supports PUT, GET, DELETE, and list for in-memory and +//! local file storage. +//! +//! Future compatibility will include Aliyun OSS. +//! +//! Fork from https://github.com/influxdata/influxdb_iox/tree/main/object_store + +use std::time::SystemTime; + +use async_trait::async_trait; +use futures::{stream::BoxStream, AsyncRead}; +use path::ObjectStorePath; + +pub mod disk; +pub mod path; + +/// Universal API to multiple object store services. +// TODO(xikai): ObjectStore -> FileStore +#[async_trait] +pub trait ObjectStore: std::fmt::Debug + Send + Sync + 'static { + /// The type of the locations used in interacting with this object store. + type Path: ObjectStorePath; + + /// The error returned from fallible methods + type Error: std::error::Error + Send + Sync + 'static; + + type Reader: AsyncRead + Send + Unpin; + + /// Return a new location path appropriate for this object storage + fn new_path(&self) -> Self::Path; + + /// Save the provided bytes to the specified location. + async fn put( + &self, + location: &Self::Path, + bytes: R, + length: Option, + ) -> Result<(), Self::Error> + where + R: AsyncRead + Send + Unpin; + + /// Return the bytes that are stored at the specified location. + async fn get(&self, location: &Self::Path) -> Result; + + /// Delete the object at the specified location. + async fn delete(&self, location: &Self::Path) -> Result<(), Self::Error>; + + /// List all the objects with the given prefix. 
+ async fn list<'a>( + &'a self, + prefix: Option<&'a Self::Path>, + ) -> Result, Self::Error>>, Self::Error>; + + /// List objects with the given prefix and an implementation specific + /// delimiter. Returns common prefixes (directories) in addition to object + /// metadata. + async fn list_with_delimiter( + &self, + prefix: &Self::Path, + ) -> Result, Self::Error>; +} + +/// Result of a list call that includes objects, prefixes (directories) and a +/// token for the next set of results. Individual result sets may be limited to +/// 1,00 objects based on the underlying object storage's limitations. +#[derive(Debug)] +pub struct ListResult { + /// Token passed to the API for the next page of list results. + pub next_token: Option, + /// Prefixes that are common (like directories) + pub common_prefixes: Vec

{ + provider: P, +} + +impl

Frontend

{ + pub fn new(provider: P) -> Self { + Self { provider } + } + + /// Parse the sql and returns the statements + pub fn parse_sql(&self, _ctx: &mut Context, sql: &str) -> Result { + Parser::parse_sql(sql).context(InvalidSql { sql }) + } + + /// Parse the request and returns the Expr + pub fn parse_promql( + &self, + _ctx: &mut Context, + mut req: PrometheusQueryRequest, + ) -> Result { + req.take_expr().try_into().context(InvalidPromRequest) + } +} + +impl Frontend

{ + /// Create logical plan for the statement + pub fn statement_to_plan(&self, ctx: &mut Context, stmt: Statement) -> Result { + let planner = Planner::new(&self.provider, ctx.request_id, ctx.read_parallelism); + + planner.statement_to_plan(stmt).context(CreatePlan) + } + + pub fn promql_expr_to_plan( + &self, + ctx: &mut Context, + expr: Expr, + ) -> Result<(Plan, Arc)> { + let planner = Planner::new(&self.provider, ctx.request_id, ctx.read_parallelism); + + planner.promql_expr_to_plan(expr).context(CreatePlan) + } +} diff --git a/sql/src/lib.rs b/sql/src/lib.rs new file mode 100644 index 0000000000..fe2f41e287 --- /dev/null +++ b/sql/src/lib.rs @@ -0,0 +1,19 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! SQL frontend +//! +//! Parse sql into logical plan that can be handled by interpreters + +#[macro_use] +extern crate common_util; + +pub mod ast; +pub mod container; +pub mod frontend; +pub mod parser; +pub mod plan; +pub mod planner; +pub mod promql; +pub mod provider; +#[cfg(any(test, feature = "test"))] +pub mod tests; diff --git a/sql/src/parser.rs b/sql/src/parser.rs new file mode 100644 index 0000000000..dca4d82ba2 --- /dev/null +++ b/sql/src/parser.rs @@ -0,0 +1,814 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! SQL parser +//! +//! Some codes are copied from datafusion: + +use log::debug; +use paste::paste; +use sqlparser::{ + ast::{ColumnDef, ColumnOption, ColumnOptionDef, Ident, TableConstraint}, + dialect::{keywords::Keyword, Dialect, MySqlDialect}, + parser::{IsOptional::Mandatory, Parser as SqlParser, ParserError}, + tokenizer::{Token, Tokenizer}, +}; +use table_engine::ANALYTIC_ENGINE_TYPE; + +use crate::ast::{ + AlterAddColumn, AlterModifySetting, CreateTable, DescribeTable, DropTable, ExistsTable, + ShowCreate, ShowCreateObject, Statement, +}; + +define_result!(ParserError); + +// Use `Parser::expected` instead, if possible +macro_rules! 
parser_err { + ($MSG:expr) => { + Err(ParserError::ParserError($MSG.to_string())) + }; +} + +const TS_KEY: &str = "__ts_key"; +const TAG: &str = "TAG"; +const COMMENT: &str = "COMMENT"; +const UNSIGN: &str = "UNSIGN"; +const MODIFY: &str = "MODIFY"; +const SETTING: &str = "SETTING"; + +macro_rules! is_custom_column { + ($name: ident) => { + paste! { + #[inline] + pub fn [](opt: &ColumnOption) -> bool { + match opt { + ColumnOption::DialectSpecific(tokens) => { + if let [Token::Word(word)] = &tokens[..] { + return word.value == $name; + } + } + _ => return false, + } + return false; + } + + } + }; +} + +is_custom_column!(TAG); +is_custom_column!(UNSIGN); + +/// Get the comment from the [`ColumnOption`] if it is a comment option. +pub fn get_column_comment(opt: &ColumnOption) -> Option { + if let ColumnOption::DialectSpecific(tokens) = opt { + if let [Token::Word(keyword), Token::SingleQuotedString(comment)] = &tokens[..] { + if keyword.value == COMMENT { + return Some(comment.clone()); + } + } + } + + None +} + +/// Returns true when is a TIMESTAMP KEY table constraint +pub fn is_timestamp_key_constraint(constrait: &TableConstraint) -> bool { + if let TableConstraint::Unique { + name: Some(Ident { + value, + quote_style: None, + }), + columns: _, + is_primary: false, + } = constrait + { + return value == TS_KEY; + } + false +} + +/// SQL Parser with ceresdb dialect support +pub struct Parser<'a> { + parser: SqlParser<'a>, +} + +impl<'a> Parser<'a> { + // Parse the specified tokens with dialect + fn new_with_dialect(sql: &str, dialect: &'a dyn Dialect) -> Result { + let mut tokenizer = Tokenizer::new(dialect, sql); + let tokens = tokenizer.tokenize()?; + + Ok(Parser { + parser: SqlParser::new(tokens, dialect), + }) + } + + /// Parse a SQL statement and produce a set of statements + pub fn parse_sql(sql: &str) -> Result> { + // Use MySqlDialect, so we can support "`" and chinese characters. 
+ let dialect = &MySqlDialect {}; + let mut parser = Parser::new_with_dialect(sql, dialect)?; + let mut stmts = Vec::new(); + let mut expecting_statement_delimiter = false; + loop { + // ignore empty statements (between successive statement delimiters) + while parser.parser.consume_token(&Token::SemiColon) { + expecting_statement_delimiter = false; + } + + if parser.parser.peek_token() == Token::EOF { + break; + } + if expecting_statement_delimiter { + return parser.expected("end of statement", parser.parser.peek_token()); + } + + let statement = parser.parse_statement()?; + stmts.push(statement); + expecting_statement_delimiter = true; + } + + debug!("Parser parsed sql, sql:{}, stmts:{:#?}", sql, stmts); + + Ok(stmts) + } + + // Report unexpected token + fn expected(&self, expected: &str, found: Token) -> Result { + parser_err!(format!("Expected {}, found: {}", expected, found)) + } + + // Parse a new expression + fn parse_statement(&mut self) -> Result { + match self.parser.peek_token() { + Token::Word(w) => { + match w.keyword { + Keyword::CREATE => { + // Move one token forward + self.parser.next_token(); + // Use custom parse + self.parse_create() + } + Keyword::DROP => { + // Move one token forward + self.parser.next_token(); + // Use custom parse + self.parse_drop() + } + Keyword::DESCRIBE | Keyword::DESC => { + self.parser.next_token(); + self.parse_describe() + } + Keyword::ALTER => { + self.parser.next_token(); + self.parse_alter() + } + Keyword::SHOW => { + self.parser.next_token(); + self.parse_show() + } + Keyword::EXISTS => { + self.parser.next_token(); + self.parse_exists() + } + _ => { + // use the native parser + Ok(Statement::Standard(Box::new( + self.parser.parse_statement()?, + ))) + } + } + } + _ => { + // use the native parser + Ok(Statement::Standard(Box::new( + self.parser.parse_statement()?, + ))) + } + } + } + + pub fn parse_alter(&mut self) -> Result { + let nth1_token = self.parser.peek_token(); + let nth2_token = 
self.parser.peek_nth_token(2); + let nth3_token = self.parser.peek_nth_token(3); + if let (Token::Word(nth1_word), Token::Word(nth2_word), Token::Word(nth3_word)) = + (nth1_token, nth2_token, nth3_token) + { + // example: ALTER TABLE test_ttl modify SETTING ttl='8d' + if let (Keyword::TABLE, MODIFY, SETTING) = ( + nth1_word.keyword, + nth2_word.value.to_uppercase().as_str(), + nth3_word.value.to_uppercase().as_str(), + ) { + return self.parse_alter_modify_setting(); + } + // examples: + // ALTER TABLE test_table ADD COLUMN col_17 STRING TAG + // ALTER TABLE test_table ADD COLUMN (col_18 STRING TAG, col_19 UNIT64) + if let (Keyword::TABLE, Keyword::ADD, Keyword::COLUMN) = + (nth1_word.keyword, nth2_word.keyword, nth3_word.keyword) + { + return self.parse_alter_add_column(); + } + } + Ok(Statement::Standard(Box::new(self.parser.parse_alter()?))) + } + + pub fn parse_show(&mut self) -> Result { + if self + .parser + .parse_one_of_keywords(&[Keyword::CREATE]) + .is_some() + { + Ok(self.parse_show_create()?) + } else { + self.expected("create", self.parser.peek_token()) + } + } + + fn parse_show_create(&mut self) -> Result { + let obj_type = match self.parser.expect_one_of_keywords(&[Keyword::TABLE])? 
{ + Keyword::TABLE => Ok(ShowCreateObject::Table), + keyword => Err(ParserError::ParserError(format!( + "Unable to map keyword to ShowCreateObject: {:?}", + keyword + ))), + }?; + + let obj_name = self.parser.parse_object_name()?; + + Ok(Statement::ShowCreate(ShowCreate { obj_type, obj_name })) + } + + fn parse_alter_add_column(&mut self) -> Result { + self.parser.expect_keyword(Keyword::TABLE)?; + let table_name = self.parser.parse_object_name()?; + self.parser + .expect_keywords(&[Keyword::ADD, Keyword::COLUMN])?; + let (mut columns, _) = self.parse_columns()?; + if columns.is_empty() { + let column_def = self.parse_column_def()?; + columns.push(column_def); + } + Ok(Statement::AlterAddColumn(AlterAddColumn { + table_name, + columns, + })) + } + + fn parse_alter_modify_setting(&mut self) -> Result { + self.parser.expect_keyword(Keyword::TABLE)?; + let table_name = self.parser.parse_object_name()?; + if self.consume_token(MODIFY) && self.consume_token(SETTING) { + let options = self + .parser + .parse_comma_separated(SqlParser::parse_sql_option)?; + Ok(Statement::AlterModifySetting(AlterModifySetting { + table_name, + options, + })) + } else { + unreachable!() + } + } + + pub fn parse_describe(&mut self) -> Result { + let _ = self.parser.parse_keyword(Keyword::TABLE); + let table_name = self.parser.parse_object_name()?; + Ok(Statement::Describe(DescribeTable { table_name })) + } + + // Parse a SQL CREATE statement + pub fn parse_create(&mut self) -> Result { + self.parser.expect_keyword(Keyword::TABLE)?; + let if_not_exists = + self.parser + .parse_keywords(&[Keyword::IF, Keyword::NOT, Keyword::EXISTS]); + let table_name = self.parser.parse_object_name()?; + let (columns, constraints) = self.parse_columns()?; + let engine = self.parse_table_engine()?; + let options = self.parser.parse_options(Keyword::WITH)?; + + Ok(Statement::Create(CreateTable { + if_not_exists, + name: table_name, + columns, + engine, + constraints, + options, + })) + } + + pub fn 
parse_drop(&mut self) -> Result { + self.parser.expect_keyword(Keyword::TABLE)?; + let if_exists = self.parser.parse_keywords(&[Keyword::IF, Keyword::EXISTS]); + let table_name = self.parser.parse_object_name()?; + let engine = self.parse_table_engine()?; + + Ok(Statement::Drop(DropTable { + name: table_name, + if_exists, + engine, + })) + } + + pub fn parse_exists(&mut self) -> Result { + let _ = self.parser.parse_keyword(Keyword::TABLE); + let table_name = self.parser.parse_object_name()?; + Ok(Statement::Exists(ExistsTable { table_name })) + } + + // Copy from sqlparser + fn parse_columns(&mut self) -> Result<(Vec, Vec)> { + let mut columns = vec![]; + let mut constraints = vec![]; + if !self.parser.consume_token(&Token::LParen) || self.parser.consume_token(&Token::RParen) { + return Ok((columns, constraints)); + } + + loop { + if let Some(constraint) = self.parse_optional_table_constraint()? { + constraints.push(constraint); + } else if let Token::Word(_) = self.parser.peek_token() { + columns.push(self.parse_column_def()?); + } else { + return self.expected( + "column name or constraint definition", + self.parser.peek_token(), + ); + } + let comma = self.parser.consume_token(&Token::Comma); + if self.parser.consume_token(&Token::RParen) { + // allow a trailing comma, even though it's not in standard + break; + } else if !comma { + return self.expected( + "',' or ')' after column definition", + self.parser.peek_token(), + ); + } + } + + Ok((columns, constraints)) + } + + /// Parses the set of valid formats + fn parse_table_engine(&mut self) -> Result { + // TODO make ENGINE as a keyword + if !self.consume_token("ENGINE") { + return Ok(ANALYTIC_ENGINE_TYPE.to_string()); + } + + self.parser.expect_token(&Token::Eq)?; + + match self.parser.next_token() { + Token::Word(w) => Ok(w.value), + unexpected => self.expected("Engine is missing", unexpected), + } + } + + // Copy from sqlparser + fn parse_column_def(&mut self) -> Result { + let name = 
self.parser.parse_identifier()?; + let data_type = self.parser.parse_data_type()?; + let collation = if self.parser.parse_keyword(Keyword::COLLATE) { + Some(self.parser.parse_object_name()?) + } else { + None + }; + let mut options = vec![]; + loop { + if self.parser.parse_keyword(Keyword::CONSTRAINT) { + let name = Some(self.parser.parse_identifier()?); + if let Some(option) = self.parse_optional_column_option()? { + options.push(ColumnOptionDef { name, option }); + } else { + return self.expected( + "constraint details after CONSTRAINT ", + self.parser.peek_token(), + ); + } + } else if let Some(option) = self.parse_optional_column_option()? { + options.push(ColumnOptionDef { name: None, option }); + } else { + break; + }; + } + Ok(ColumnDef { + name, + data_type, + collation, + options, + }) + } + + // Copy from sqlparser by boyan + fn parse_optional_table_constraint(&mut self) -> Result> { + let name = if self.parser.parse_keyword(Keyword::CONSTRAINT) { + Some(self.parser.parse_identifier()?) + } else { + None + }; + match self.parser.next_token() { + Token::Word(w) if w.keyword == Keyword::PRIMARY => { + self.parser.expect_keyword(Keyword::KEY)?; + let columns = self.parser.parse_parenthesized_column_list(Mandatory)?; + Ok(Some(TableConstraint::Unique { + name, + columns, + is_primary: true, + })) + } + Token::Word(w) if w.keyword == Keyword::TIMESTAMP => { + self.parser.expect_keyword(Keyword::KEY)?; + let columns = self.parser.parse_parenthesized_column_list(Mandatory)?; + // TODO(boyan), TableConstraint doesn't support dialect right now + // we use unique constraint as TIMESTAMP KEY constraint. 
+ Ok(Some(TableConstraint::Unique { + name: Some(Ident { + value: TS_KEY.to_owned(), + quote_style: None, + }), + columns, + is_primary: false, + })) + } + unexpected => { + if name.is_some() { + self.expected("PRIMARY, TIMESTAMP", unexpected) + } else { + self.parser.prev_token(); + Ok(None) + } + } + } + } + + // Copy from sqlparser by boyan + fn parse_optional_column_option(&mut self) -> Result> { + if self.parser.parse_keywords(&[Keyword::NOT, Keyword::NULL]) { + Ok(Some(ColumnOption::NotNull)) + } else if self.parser.parse_keyword(Keyword::NULL) { + Ok(Some(ColumnOption::Null)) + } else if self.parser.parse_keyword(Keyword::DEFAULT) { + Ok(Some(ColumnOption::Default(self.parser.parse_expr()?))) + } else if self + .parser + .parse_keywords(&[Keyword::PRIMARY, Keyword::KEY]) + { + Ok(Some(ColumnOption::Unique { is_primary: true })) + } else if self.consume_token(TAG) { + // Support TAG for ceresdbx + Ok(Some(ColumnOption::DialectSpecific(vec![ + Token::make_keyword(TAG), + ]))) + } else if self.consume_token(UNSIGN) { + // Support unsign for ceresdbx + Ok(Some(ColumnOption::DialectSpecific(vec![ + Token::make_keyword(UNSIGN), + ]))) + } else if self.consume_token(COMMENT) { + let comment = self.parser.parse_literal_string()?; + Ok(Some(ColumnOption::DialectSpecific(vec![ + Token::make_keyword(COMMENT), + Token::SingleQuotedString(comment), + ]))) + } else { + Ok(None) + } + } + + fn consume_token(&mut self, expected: &str) -> bool { + if self.parser.peek_token().to_string().to_uppercase() == *expected.to_uppercase() { + self.parser.next_token(); + true + } else { + false + } + } +} + +#[cfg(test)] +mod tests { + use sqlparser::ast::{DataType, Ident, ObjectName, Value}; + + use super::*; + + fn expect_parse_ok(sql: &str, expected: Statement) -> Result<()> { + let statements = Parser::parse_sql(sql)?; + assert_eq!( + statements.len(), + 1, + "Expected to parse exactly one statement" + ); + assert_eq!(statements[0], expected); + Ok(()) + } + + /// Parses sql and 
asserts that the expected error message was found + fn expect_parse_error(sql: &str, expected_error: &str) { + match Parser::parse_sql(sql) { + Ok(statements) => { + panic!( + "Expected parse error for '{}', but was successful: {:?}", + sql, statements + ); + } + Err(e) => { + let error_message = e.to_string(); + assert!( + error_message.contains(expected_error), + "Expected error '{}' not found in actual error '{}'", + expected_error, + error_message + ); + } + } + } + + fn make_column_def(name: impl Into, data_type: DataType) -> ColumnDef { + ColumnDef { + name: Ident { + value: name.into(), + quote_style: None, + }, + data_type, + collation: None, + options: vec![], + } + } + + fn make_tag_column_def(name: impl Into, data_type: DataType) -> ColumnDef { + ColumnDef { + name: Ident { + value: name.into(), + quote_style: None, + }, + data_type, + collation: None, + options: vec![ColumnOptionDef { + name: None, + option: ColumnOption::DialectSpecific(vec![Token::make_keyword(TAG)]), + }], + } + } + + fn make_object_name(name: impl Into) -> ObjectName { + ObjectName(vec![Ident::new(name)]) + } + + #[test] + fn create_table() { + // positive case + let sql = "CREATE TABLE IF NOT EXISTS t(c1 double)"; + let expected = Statement::Create(CreateTable { + if_not_exists: true, + name: make_object_name("t"), + columns: vec![make_column_def("c1", DataType::Double)], + engine: table_engine::ANALYTIC_ENGINE_TYPE.to_string(), + constraints: vec![], + options: vec![], + }); + expect_parse_ok(sql, expected).unwrap(); + + // positive case, multiple columns + let sql = "CREATE TABLE mytbl(c1 timestamp, c2 double, c3 string,) ENGINE = XX"; + let expected = Statement::Create(CreateTable { + if_not_exists: false, + name: make_object_name("mytbl"), + columns: vec![ + make_column_def("c1", DataType::Timestamp), + make_column_def("c2", DataType::Double), + make_column_def("c3", DataType::String), + ], + engine: "XX".to_string(), + constraints: vec![], + options: vec![], + }); + 
expect_parse_ok(sql, expected).unwrap(); + + // Error cases: Invalid sql + let sql = "CREATE TABLE t(c1 timestamp) AS"; + expect_parse_error( + sql, + "sql parser error: Expected end of statement, found: AS", + ); + } + + #[test] + fn test_unsign_tag_column() { + let sql = "CREATE TABLE IF NOT EXISTS t(c1 string tag, c2 float, c3 bigint unsign)"; + let statements = Parser::parse_sql(sql).unwrap(); + assert_eq!(statements.len(), 1); + match &statements[0] { + Statement::Create(v) => { + let columns = &v.columns; + assert_eq!(3, columns.len()); + for c in columns { + if c.name.value == "c1" { + assert_eq!(1, c.options.len()); + let opt = &c.options[0]; + assert!(is_tag_column(&opt.option)); + } else if c.name.value == "c2" { + assert_eq!(0, c.options.len()); + } else if c.name.value == "c3" { + assert_eq!(1, c.options.len()); + let opt = &c.options[0]; + assert!(is_unsign_column(&opt.option)); + } else { + panic!("failed"); + } + } + } + _ => panic!("failed"), + } + } + + #[test] + fn test_comment_column() { + let sql = "CREATE TABLE IF NOT EXISTS t(c1 string, c2 float, c3 bigint comment 'id')"; + let statements = Parser::parse_sql(sql).unwrap(); + assert_eq!(statements.len(), 1); + match &statements[0] { + Statement::Create(v) => { + let columns = &v.columns; + assert_eq!(3, columns.len()); + for c in columns { + if c.name.value == "c3" { + assert_eq!(1, c.options.len()); + let opt = &c.options[0]; + let comment = get_column_comment(&opt.option).unwrap(); + assert_eq!("id", comment); + } + } + } + _ => panic!("failed"), + } + } + + #[test] + fn test_timestamp_key_constraint() { + let sql = "CREATE TABLE IF NOT EXISTS t(c1 TIMESTAMP, TIMESTAMP key(c1))"; + let statements = Parser::parse_sql(sql).unwrap(); + assert_eq!(statements.len(), 1); + match &statements[0] { + Statement::Create(v) => { + let constraints = &v.constraints; + assert_eq!(1, constraints.len()); + assert!(is_timestamp_key_constraint(&constraints[0])); + } + _ => panic!("failed"), + } + } + + #[test] 
+ fn create_table_engine() { + let sql = "CREATE TABLE IF NOT EXISTS t(c1 double)"; + let statements = Parser::parse_sql(sql).unwrap(); + assert_eq!(statements.len(), 1); + match &statements[0] { + Statement::Create(v) => { + assert_eq!(v.engine, table_engine::ANALYTIC_ENGINE_TYPE.to_string()) + } + _ => panic!("failed"), + } + + let sql = "CREATE TABLE IF NOT EXISTS t(c1 double) ENGINE = XX"; + let statements = Parser::parse_sql(sql).unwrap(); + assert_eq!(statements.len(), 1); + match &statements[0] { + Statement::Create(v) => assert_eq!(v.engine, "XX".to_string()), + _ => panic!("failed"), + } + + let sql = "CREATE TABLE IF NOT EXISTS t(c1 double) engine = XX2"; + let statements = Parser::parse_sql(sql).unwrap(); + assert_eq!(statements.len(), 1); + match &statements[0] { + Statement::Create(v) => assert_eq!(v.engine, "XX2".to_string()), + _ => panic!("failed"), + } + } + + #[test] + fn test_alter_table_option() { + let sql = "ALTER TABLE test_ttl modify SETTING arena_block_size='1k';"; + let statements = Parser::parse_sql(sql).unwrap(); + assert_eq!(statements.len(), 1); + match &statements[0] { + Statement::AlterModifySetting(v) => { + assert_eq!(v.table_name.to_string(), "test_ttl".to_string()); + assert_eq!(v.options.len(), 1); + assert_eq!(v.options[0].name.value, "arena_block_size".to_string()); + assert_eq!( + v.options[0].value, + Value::SingleQuotedString("1k".to_string()) + ); + } + _ => panic!("failed"), + } + } + + #[test] + fn test_alter_table_column() { + { + let sql = "ALTER TABLE t ADD COLUMN (c1 DOUBLE, c2 STRING)"; + let expected = Statement::AlterAddColumn(AlterAddColumn { + table_name: make_object_name("t"), + columns: vec![ + make_column_def("c1", DataType::Double), + make_column_def("c2", DataType::String), + ], + }); + expect_parse_ok(sql, expected).unwrap(); + } + + { + let sql = "ALTER TABLE t ADD COLUMN c1 DOUBLE"; + let expected = Statement::AlterAddColumn(AlterAddColumn { + table_name: make_object_name("t"), + columns: 
vec![make_column_def("c1", DataType::Double)], + }); + expect_parse_ok(sql, expected).unwrap(); + } + } + + #[test] + fn test_alter_table_tag_column() { + { + let sql = "ALTER TABLE t ADD COLUMN (c1 DOUBLE, c2 STRING tag)"; + let expected = Statement::AlterAddColumn(AlterAddColumn { + table_name: make_object_name("t"), + columns: vec![ + make_column_def("c1", DataType::Double), + make_tag_column_def("c2", DataType::String), + ], + }); + expect_parse_ok(sql, expected).unwrap(); + } + + { + let sql = "ALTER TABLE t ADD COLUMN c1 string tag"; + let expected = Statement::AlterAddColumn(AlterAddColumn { + table_name: make_object_name("t"), + columns: vec![make_tag_column_def("c1", DataType::String)], + }); + expect_parse_ok(sql, expected).unwrap(); + } + } + + #[test] + fn test_drop_table() { + let sql = "drop table test_ttl"; + let statements = Parser::parse_sql(sql).unwrap(); + assert_eq!(statements.len(), 1); + match &statements[0] { + Statement::Drop(DropTable { + name, + if_exists, + engine, + }) => { + assert_eq!(name.to_string(), "test_ttl".to_string()); + assert!(!if_exists); + assert_eq!(*engine, ANALYTIC_ENGINE_TYPE.to_string()); + } + _ => panic!("failed"), + } + + let sql = "drop table if exists test_ttl"; + let statements = Parser::parse_sql(sql).unwrap(); + assert_eq!(statements.len(), 1); + match &statements[0] { + Statement::Drop(DropTable { + name, + if_exists, + engine, + }) => { + assert_eq!(name.to_string(), "test_ttl".to_string()); + assert!(if_exists); + assert_eq!(*engine, ANALYTIC_ENGINE_TYPE.to_string()); + } + _ => panic!("failed"), + } + } + + #[test] + fn test_exists_table() { + { + let sql = "EXISTS TABLE xxx_table"; + let expected = Statement::Exists(ExistsTable { + table_name: make_object_name("xxx_table"), + }); + expect_parse_ok(sql, expected).unwrap(); + } + + { + let sql = "EXISTS xxx_table"; + let expected = Statement::Exists(ExistsTable { + table_name: make_object_name("xxx_table"), + }); + expect_parse_ok(sql, expected).unwrap() + } 
+ } +} diff --git a/sql/src/plan.rs b/sql/src/plan.rs new file mode 100644 index 0000000000..25c9fe9874 --- /dev/null +++ b/sql/src/plan.rs @@ -0,0 +1,158 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Logical plans such as select/insert/update/delete + +use std::{ + collections::{BTreeMap, HashMap}, + fmt, + fmt::{Debug, Formatter}, + sync::Arc, +}; + +use arrow_deps::datafusion::logical_plan::LogicalPlan as DataFusionLogicalPlan; +use common_types::{column_schema::ColumnSchema, row::RowGroup, schema::Schema}; +use common_util::define_result; +use snafu::Snafu; +use table_engine::table::TableRef; + +use crate::{ast::ShowCreateObject, container::TableContainer}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Unsupported alter table operation."))] + UnsupportedOperation, + + #[snafu(display("Unsupported column data type, err:{}.", source))] + UnsupportedDataType { source: common_types::datum::Error }, + + #[snafu(display("Unsupported column option:{}.", name))] + UnsupportedColumnOption { name: String }, + + #[snafu(display("Alter primary key is not allowed."))] + AlterPrimaryKey, +} + +define_result!(Error); + +// TODO(yingwen): Custom Debug format +/// Logical plan to be processed by interpreters +#[derive(Debug)] +pub enum Plan { + /// A SQL SELECT plan or other plans related to query + Query(QueryPlan), + // TODO(yingwen): Other sql command + Insert(InsertPlan), + /// Create table plan + Create(CreateTablePlan), + /// Drop table plan + Drop(DropTablePlan), + /// Describe table plan + Describe(DescribeTablePlan), + /// Alter table plan + AlterTable(AlterTablePlan), + /// Show create plan + ShowCreate(ShowCreatePlan), + /// Exists table + Exists(ExistsTablePlan), +} + +pub struct QueryPlan { + pub df_plan: DataFusionLogicalPlan, + // Contains the TableProviders so we can register the them to ExecutionContext later. 
+ // Use TableProviderAdapter here so we can get the underlying TableRef and also be + // able to cast to Arc + pub tables: Arc, +} + +impl Debug for QueryPlan { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("QueryPlan") + .field("df_plan", &self.df_plan) + .finish() + } +} + +pub struct CreateTablePlan { + /// Engine + pub engine: String, + /// Create table if not exists + pub if_not_exists: bool, + /// Table name + pub table: String, + /// Table schema + pub table_schema: Schema, + /// Table options + pub options: HashMap, +} + +impl Debug for CreateTablePlan { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + f.debug_struct("CreateTablePlan") + .field("engine", &self.engine) + .field("if_not_exists", &self.if_not_exists) + .field("table", &self.table) + .field("table_schema", &self.table_schema) + .field( + "options", + &self + .options + .iter() + .map(|(k, v)| (k.clone(), v.clone())) + .collect::>(), + ) + .finish() + } +} + +#[derive(Debug)] +pub struct DropTablePlan { + /// Engine + pub engine: String, + /// If exists + pub if_exists: bool, + /// Table name + pub table: String, +} + +/// Insert logical plan +#[derive(Debug)] +pub struct InsertPlan { + /// The table to insert + pub table: TableRef, + /// RowGroup to insert + pub rows: RowGroup, +} + +#[derive(Debug)] +pub struct DescribeTablePlan { + /// The table to describe + pub table: TableRef, +} + +#[derive(Debug)] +pub enum AlterTableOperation { + /// Add a new column, the column id will be ignored. + AddColumn(Vec), + ModifySetting(HashMap), +} + +#[derive(Debug)] +pub struct AlterTablePlan { + /// The table to alter. + pub table: TableRef, + // TODO(yingwen): Maybe use smallvec. + pub operations: AlterTableOperation, +} + +#[derive(Debug)] +pub struct ShowCreatePlan { + /// The table to show. 
+ pub table: TableRef, + /// The type to show + pub obj_type: ShowCreateObject, +} + +#[derive(Debug)] +pub struct ExistsTablePlan { + pub exists: bool, +} diff --git a/sql/src/planner.rs b/sql/src/planner.rs new file mode 100644 index 0000000000..5bc467c5c5 --- /dev/null +++ b/sql/src/planner.rs @@ -0,0 +1,1277 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Planner converts a SQL AST into logical plans + +use std::{ + collections::{BTreeMap, HashMap}, + convert::TryFrom, + mem, + sync::Arc, +}; + +use arrow_deps::datafusion::{error::DataFusionError, sql::planner::SqlToRel}; +use common_types::{ + column_schema::{self, ColumnSchema}, + datum::{Datum, DatumKind}, + request_id::RequestId, + row::{RowGroup, RowGroupBuilder}, + schema::{self, Schema, TSID_COLUMN}, +}; +use log::debug; +use snafu::{ensure, Backtrace, OptionExt, ResultExt, Snafu}; +use sqlparser::ast::{ + ColumnDef, ColumnOption, Expr, ObjectName, Query, SetExpr, SqlOption, + Statement as SqlStatement, TableConstraint, Value, Values, +}; +use table_engine::table::TableRef; + +use crate::{ + ast::{ + AlterAddColumn, AlterModifySetting, CreateTable, DescribeTable, DropTable, ExistsTable, + ShowCreate, Statement, + }, + container::TableReference, + parser, + plan::{ + AlterTableOperation, AlterTablePlan, CreateTablePlan, DescribeTablePlan, DropTablePlan, + ExistsTablePlan, InsertPlan, Plan, QueryPlan, ShowCreatePlan, + }, + promql::{ColumnNames, Expr as PromExpr}, + provider::{ContextProviderAdapter, MetaProvider}, +}; + +// We do not carry backtrace in sql error because it is mainly used in server +// handler and the error is usually caused by invalid/unsupported sql, which +// should be easy to find out the reason. 
+#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("DataFusion Failed to plan, err:{}", source))] + DataFusionPlan { source: DataFusionError }, + + // Statement is too large and complicate to carry in Error, so we + // only return error here, so the caller should attach sql to its + // error context + #[snafu(display("Unsupported SQL statement"))] + UnsupportedStatement, + + #[snafu(display("Create table name is empty"))] + CreateTableNameEmpty, + + #[snafu(display("Table must contain timestamp constraint"))] + RequireTimestamp, + + #[snafu(display( + "Table must contain only one timestamp key and it's data type must be TIMESTAMP" + ))] + InvalidTimetampKey, + + #[snafu(display("Invalid unsign type: {}.\nBacktrace:\n{}", kind, backtrace))] + InvalidUnsignType { + kind: DatumKind, + backtrace: Backtrace, + }, + + #[snafu(display("Primary key not found, column name:{}", name))] + PrimaryKeyNotFound { name: String }, + + #[snafu(display("Tag column not found, name:{}", name))] + TagColumnNotFound { name: String }, + + #[snafu(display("Timestamp column not found, name:{}", name))] + TimestampColumnNotFound { name: String }, + + #[snafu(display("{} is a reserved column name", name))] + ColumnNameReserved { name: String }, + + #[snafu(display("Invalid create table name, err:{}", source))] + InvalidCreateTableName { source: DataFusionError }, + + #[snafu(display("Failed to build schema, err:{}", source))] + BuildTableSchema { source: common_types::schema::Error }, + + #[snafu(display("Unsupported SQL data type, err:{}", source))] + UnsupportedDataType { source: common_types::datum::Error }, + + #[snafu(display("Invalid column schema, column_name:{}, err:{}", column_name, source))] + InvalidColumnSchema { + column_name: String, + source: column_schema::Error, + }, + + #[snafu(display("Invalid table name, err:{}", source))] + InvalidTableName { source: DataFusionError }, + + #[snafu(display("Table not found, table:{}", name))] + TableNotFound { name: String }, + + 
#[snafu(display("Column is not null, table:{}, column:{}", table, column))] + InsertMissingColumn { table: String, column: String }, + + #[snafu(display("Column is reserved, table:{}, column:{}", table, column))] + InsertReservedColumn { table: String, column: String }, + + #[snafu(display("Unknown insert column, name:{}", name))] + UnknownInsertColumn { name: String }, + + #[snafu(display("Insert values not enough, len:{}, index:{}", len, index))] + InsertValuesNotEnough { len: usize, index: usize }, + + #[snafu(display("Invalid insert stmt, contains duplicate columns"))] + InsertDuplicateColumns, + + #[snafu(display("Invalid insert stmt, source should be a set"))] + InsertSourceBodyNotSet, + + #[snafu(display("Invalid insert stmt, source expr is not value"))] + InsertExprNotValue, + + #[snafu(display("Insert Failed to convert value, err:{}", source))] + InsertConvertValue { source: common_types::datum::Error }, + + #[snafu(display("Failed to build row, err:{}", source))] + BuildRow { source: common_types::row::Error }, + + #[snafu(display("MetaProvider Failed to find table, err:{}", source))] + MetaProviderFindTable { source: crate::provider::Error }, + + #[snafu(display("Failed to find meta during planning, err:{}", source))] + FindMeta { source: crate::provider::Error }, + + #[snafu(display("Invalid alter table operation, err:{}", source))] + InvalidAlterTableOperation { source: crate::plan::Error }, + + #[snafu(display("Unsupported sql option, value:{}", value))] + UnsupportedOption { value: String }, + + #[snafu(display("Failed to build plan from promql, error:{}", source))] + BuildPromPlanError { source: crate::promql::Error }, +} + +define_result!(Error); + +/// Planner produces logical plans from SQL AST +// TODO(yingwen): Rewrite Planner instead of using datafusion's planner +pub struct Planner<'a, P: MetaProvider> { + provider: &'a P, + request_id: RequestId, + read_parallelism: usize, +} + +impl<'a, P: MetaProvider> Planner<'a, P> { + /// Create a new 
logical planner + pub fn new(provider: &'a P, request_id: RequestId, read_parallelism: usize) -> Self { + Self { + provider, + request_id, + read_parallelism, + } + } + + /// Create a logical plan from Statement + /// + /// Takes the ownership of statement because some statements like INSERT + /// statements contains lots of data + pub fn statement_to_plan(&self, statement: Statement) -> Result { + let adapter = + ContextProviderAdapter::new(self.provider, self.request_id, self.read_parallelism); + // SqlToRel needs to hold the reference to adapter, thus we can't both holds the + // adapter and the SqlToRel in Planner, which is a self-referential + // case. We wrap a PlannerDelegate to workaround this and avoid the usage of + // pin. + let planner = PlannerDelegate::new(adapter); + + match statement { + Statement::Standard(s) => planner.sql_statement_to_plan(*s), + Statement::Create(s) => planner.create_table_to_plan(s), + Statement::Drop(s) => planner.drop_table_to_plan(s), + Statement::Describe(s) => planner.describe_table_to_plan(s), + Statement::AlterModifySetting(s) => planner.alter_modify_setting_to_plan(s), + Statement::AlterAddColumn(s) => planner.alter_add_column_to_plan(s), + Statement::ShowCreate(s) => planner.show_create_to_plan(s), + Statement::Exists(s) => planner.exists_table_to_plan(s), + } + } + + pub fn promql_expr_to_plan(&self, expr: PromExpr) -> Result<(Plan, Arc)> { + let adapter = + ContextProviderAdapter::new(self.provider, self.request_id, self.read_parallelism); + // SqlToRel needs to hold the reference to adapter, thus we can't both holds the + // adapter and the SqlToRel in Planner, which is a self-referential + // case. We wrap a PlannerDelegate to workaround this and avoid the usage of + // pin. 
+ let planner = PlannerDelegate::new(adapter); + + expr.to_plan(planner.meta_provider, self.read_parallelism) + .context(BuildPromPlanError) + } +} + +/// A planner wraps the datafusion's logical planner, and delegate sql like +/// select/explain to datafusion's planner. +struct PlannerDelegate<'a, P: MetaProvider> { + meta_provider: ContextProviderAdapter<'a, P>, +} + +impl<'a, P: MetaProvider> PlannerDelegate<'a, P> { + fn new(meta_provider: ContextProviderAdapter<'a, P>) -> Self { + Self { meta_provider } + } + + fn sql_statement_to_plan(self, sql_stmt: SqlStatement) -> Result { + match sql_stmt { + // Query statement use datafusion planner + SqlStatement::Explain { .. } | SqlStatement::Query(_) => { + self.sql_statement_to_datafusion_plan(sql_stmt) + } + SqlStatement::Insert { .. } => self.insert_to_plan(sql_stmt), + _ => UnsupportedStatement.fail(), + } + } + + fn sql_statement_to_datafusion_plan(self, sql_stmt: SqlStatement) -> Result { + let df_planner = SqlToRel::new(&self.meta_provider); + + let df_plan = df_planner + .sql_statement_to_plan(&sql_stmt) + .context(DataFusionPlan)?; + + debug!("Sql statement to datafusion plan, df_plan:\n{:#?}", df_plan); + + // Get all tables needed in the plan + let tables = self.meta_provider.try_into_container().context(FindMeta)?; + + Ok(Plan::Query(QueryPlan { + df_plan, + tables: Arc::new(tables), + })) + } + + fn create_table_to_plan(&self, stmt: CreateTable) -> Result { + ensure!(!stmt.name.0.is_empty(), CreateTableNameEmpty); + + debug!("Create table to plan, stmt:{:?}", stmt); + + // TODO(yingwen): Maybe support create table on other schema? 
+ let table_ref = TableReference::try_from(&stmt.name).context(InvalidCreateTableName)?; + + // Now we only takes the table name and ignore the schema and catalog name + let table = table_ref.table().to_string(); + + let mut schema_builder = + schema::Builder::with_capacity(stmt.columns.len()).auto_increment_column_id(true); + let mut name_column_map = BTreeMap::new(); + + // Build all column schemas. + for col in &stmt.columns { + name_column_map.insert(col.name.value.as_str(), parse_column(col)?); + } + + // Tsid column is a reserved column. + ensure!( + !name_column_map.contains_key(TSID_COLUMN), + ColumnNameReserved { + name: TSID_COLUMN.to_string(), + } + ); + + // Find timestamp key and primary key contraint + let mut primary_key_constraint_idx = None; + let mut timestamp_name = None; + for (idx, constraint) in stmt.constraints.iter().enumerate() { + if let TableConstraint::Unique { + columns, + is_primary, + .. + } = constraint + { + if *is_primary { + primary_key_constraint_idx = Some(idx); + } else if parser::is_timestamp_key_constraint(constraint) { + // Only one timestamp key constraint + ensure!(timestamp_name.is_none(), InvalidTimetampKey); + // Only one column in constraint + ensure!(columns.len() == 1, InvalidTimetampKey); + + let name = &columns[0].value; + let timestamp_column = name_column_map + .get(name as &str) + .context(TimestampColumnNotFound { name })?; + // Ensure type is timestamp + ensure!( + timestamp_column.data_type == DatumKind::Timestamp, + InvalidTimetampKey + ); + + timestamp_name = Some(name.clone()); + } + } + } + + // Timestamp column must be provided. + let timestamp_name = timestamp_name.context(RequireTimestamp)?; + + // Build primary key, the builder will check timestamp column is in primary key. + if let Some(idx) = primary_key_constraint_idx { + // If primary key is already provided, use that primary key. + if let TableConstraint::Unique { columns, .. 
} = &stmt.constraints[idx] { + for col in columns { + let key_column = name_column_map.remove(&*col.value).with_context(|| { + PrimaryKeyNotFound { + name: col.value.clone(), + } + })?; + // The schema builder will checks there is only one timestamp column in primary + // key. + schema_builder = schema_builder + .add_key_column(key_column) + .context(BuildTableSchema)?; + } + } + } else { + // If primary key is not set, Use (timestamp, tsid) as primary key. + let timestamp_column = name_column_map.remove(timestamp_name.as_str()).context( + TimestampColumnNotFound { + name: ×tamp_name, + }, + )?; + let column_schema = + column_schema::Builder::new(TSID_COLUMN.to_string(), DatumKind::UInt64) + .is_nullable(false) + .build() + .context(InvalidColumnSchema { + column_name: TSID_COLUMN, + })?; + schema_builder = schema_builder + .enable_tsid_primary_key(true) + .add_key_column(timestamp_column) + .context(BuildTableSchema)? + .add_key_column(column_schema) + .context(BuildTableSchema)?; + } + + // The key columns have been consumed. 
+ for col in name_column_map.into_values() { + schema_builder = schema_builder + .add_normal_column(col) + .context(BuildTableSchema)?; + } + + let table_schema = schema_builder.build().context(BuildTableSchema)?; + + let options = parse_options(stmt.options)?; + + let plan = CreateTablePlan { + engine: stmt.engine, + if_not_exists: stmt.if_not_exists, + table, + table_schema, + options, + }; + + debug!("Create table to plan, plan:{:?}", plan); + + Ok(Plan::Create(plan)) + } + + fn drop_table_to_plan(&self, stmt: DropTable) -> Result { + let table = if stmt.if_exists { + stmt.name.to_string() + } else { + self.find_table(stmt.name)?.name().to_string() + }; + + Ok(Plan::Drop(DropTablePlan { + engine: stmt.engine, + if_exists: stmt.if_exists, + table, + })) + } + + fn describe_table_to_plan(&self, stmt: DescribeTable) -> Result { + let table = self.find_table(stmt.table_name)?; + + Ok(Plan::Describe(DescribeTablePlan { table })) + } + + // REQUIRE: SqlStatement must be INSERT stmt + fn insert_to_plan(&self, sql_stmt: SqlStatement) -> Result { + match sql_stmt { + SqlStatement::Insert { + table_name, + columns, + source, + .. + } => { + let table = self.find_table(table_name)?; + + let schema = table.schema(); + // Column name and its index in insert stmt: {column name} => index + let column_names_idx: HashMap<_, _> = columns + .iter() + .enumerate() + .map(|(idx, ident)| (&ident.value, idx)) + .collect(); + ensure!( + column_names_idx.len() == columns.len(), + InsertDuplicateColumns + ); + + validate_insert_stmt(table.name(), &schema, &column_names_idx)?; + + // Index in insert values stmt of each column in table schema + let mut column_index_in_insert = Vec::with_capacity(schema.num_columns()); + + // Check all not null columns are provided in stmt, also init + // `column_index_in_insert` + for (idx, column) in schema.columns().iter().enumerate() { + if let Some(tsid_idx) = schema.index_of_tsid() { + if idx == tsid_idx { + // This is a tsid column. 
+ column_index_in_insert.push(InsertMode::Auto); + continue; + } + } + match column_names_idx.get(&column.name) { + Some(idx_in_insert) => { + // This column in schema is also in insert stmt + column_index_in_insert.push(InsertMode::Direct(*idx_in_insert)); + } + None => { + // This column in schema is not in insert stmt + if column.is_nullable { + column_index_in_insert.push(InsertMode::Null); + } else { + // Column is not null and input does not contains that column + return InsertMissingColumn { + table: table.name(), + column: &column.name, + } + .fail(); + } + } + } + } + + let rows = build_row_group(schema, source, column_index_in_insert)?; + + Ok(Plan::Insert(InsertPlan { table, rows })) + } + // We already known this stmt is a INSERT stmt + _ => unreachable!(), + } + } + + fn alter_modify_setting_to_plan(&self, stmt: AlterModifySetting) -> Result { + let table = self.find_table(stmt.table_name)?; + let plan = AlterTablePlan { + table, + operations: AlterTableOperation::ModifySetting(parse_options(stmt.options)?), + }; + Ok(Plan::AlterTable(plan)) + } + + fn alter_add_column_to_plan(&self, stmt: AlterAddColumn) -> Result { + let table = self.find_table(stmt.table_name)?; + let plan = AlterTablePlan { + table, + operations: AlterTableOperation::AddColumn(parse_columns(stmt.columns)?), + }; + Ok(Plan::AlterTable(plan)) + } + + fn exists_table_to_plan(&self, stmt: ExistsTable) -> Result { + let table = self.find_table(stmt.table_name); + match table { + Ok(_) => Ok(Plan::Exists(ExistsTablePlan { exists: true })), + Err(_) => Ok(Plan::Exists(ExistsTablePlan { exists: false })), + } + } + + fn show_create_to_plan(&self, show_create: ShowCreate) -> Result { + let table = self.find_table(show_create.obj_name)?; + let plan = ShowCreatePlan { + table, + obj_type: show_create.obj_type, + }; + Ok(Plan::ShowCreate(plan)) + } + + fn find_table(&self, table_name: ObjectName) -> Result { + let table_ref = TableReference::try_from(&table_name).context(InvalidTableName)?; + 
+ self.meta_provider + .table(table_ref) + .context(MetaProviderFindTable)? + .with_context(|| TableNotFound { + name: table_name.to_string(), + }) + } +} + +#[derive(Debug)] +enum InsertMode { + // Insert the value in expr with given index directly. + Direct(usize), + // No value provided, insert a null. + Null, + // Auto generated column, just temporary fill by default value, the real value will + // be filled by interpreter. + Auto, +} + +/// Build RowGroup +fn build_row_group( + schema: Schema, + source: Box, + column_index_in_insert: Vec, +) -> Result { + // Build row group by schema + match source.body { + SetExpr::Values(Values(values)) => { + let mut row_group_builder = + RowGroupBuilder::with_capacity(schema.clone(), values.len()); + for mut exprs in values { + // Try to build row + let mut row_builder = row_group_builder.row_builder(); + + // For each column in schema, append datum into row builder + for (index_opt, column_schema) in + column_index_in_insert.iter().zip(schema.columns()) + { + match index_opt { + InsertMode::Direct(index) => { + let exprs_len = exprs.len(); + let expr = exprs.get_mut(*index).context(InsertValuesNotEnough { + len: exprs_len, + index: *index, + })?; + + match expr { + Expr::Value(value) => { + let datum = Datum::try_from_sql_value( + &column_schema.data_type, + mem::replace(value, Value::Null), + ) + .context(InsertConvertValue)?; + row_builder = + row_builder.append_datum(datum).context(BuildRow)?; + } + _ => { + InsertExprNotValue.fail()?; + } + } + } + InsertMode::Null => { + // This is a null column + row_builder = + row_builder.append_datum(Datum::Null).context(BuildRow)?; + } + InsertMode::Auto => { + // This is an auto generated column, fill by default value. 
+ let kind = &column_schema.data_type; + row_builder = row_builder + .append_datum(Datum::empty(kind)) + .context(BuildRow)?; + } + } + } + + // Finish this row and append into row group + row_builder.finish().context(BuildRow)?; + } + + // Build the whole row group + Ok(row_group_builder.build()) + } + _ => InsertSourceBodyNotSet.fail(), + } +} + +#[inline] +fn is_tsid_column(name: &str) -> bool { + name == TSID_COLUMN +} + +fn validate_insert_stmt( + table_name: &str, + schema: &Schema, + column_name_idx: &HashMap<&String, usize>, +) -> Result<()> { + for name in column_name_idx.keys() { + if is_tsid_column(name.as_str()) { + return Err(Error::InsertReservedColumn { + table: table_name.to_string(), + column: name.to_string(), + }); + } + schema.column_with_name(name).context(UnknownInsertColumn { + name: name.to_string(), + })?; + } + + Ok(()) +} + +fn parse_options(options: Vec) -> Result> { + let mut parsed_options = HashMap::with_capacity(options.len()); + + for option in options { + let key = option.name.value; + if let Some(value) = parse_for_option(option.value)? { + parsed_options.insert(key, value); + }; + } + + Ok(parsed_options) +} + +/// Parse value for sql option. +pub fn parse_for_option(value: Value) -> Result> { + let value_opt = match value { + Value::Number(n, _long) => Some(n), + Value::SingleQuotedString(v) | Value::DoubleQuotedString(v) => Some(v), + Value::NationalStringLiteral(v) | Value::HexStringLiteral(v) => { + return UnsupportedOption { value: v }.fail(); + } + Value::Boolean(v) => Some(v.to_string()), + Value::Interval { value, .. } => { + return UnsupportedOption { value }.fail(); + } + // Ignore this option if value is null. + Value::Null => None, + }; + + Ok(value_opt) +} + +fn parse_columns(cols: Vec) -> Result> { + let mut parsed_columns = Vec::with_capacity(cols.len()); + + // Build all column schemas. 
+ for col in &cols { + parsed_columns.push(parse_column(col)?); + } + + Ok(parsed_columns) +} + +fn parse_column(col: &ColumnDef) -> Result { + let mut data_type = DatumKind::try_from(&col.data_type).context(UnsupportedDataType)?; + + // Process column options + let mut is_nullable = true; // A column is nullable by default. + let mut is_tag = false; + let mut is_unsign = false; + let mut comment = String::new(); + for option_def in &col.options { + if matches!(option_def.option, ColumnOption::NotNull) { + is_nullable = false; + } else if parser::is_tag_column(&option_def.option) { + is_tag = true; + } else if parser::is_unsign_column(&option_def.option) { + is_unsign = true; + } else if let Some(v) = parser::get_column_comment(&option_def.option) { + comment = v; + } + } + + if is_unsign { + data_type = data_type + .unsign_kind() + .context(InvalidUnsignType { kind: data_type })?; + } + + let builder = column_schema::Builder::new(col.name.value.clone(), data_type) + .is_nullable(is_nullable) + .is_tag(is_tag) + .comment(comment); + + builder.build().context(InvalidColumnSchema { + column_name: &col.name.value, + }) +} + +#[cfg(test)] +mod tests { + use sqlparser::ast::Value; + + use super::*; + use crate::{ + parser::Parser, + planner::{parse_for_option, Planner}, + tests::MockMetaProvider, + }; + + fn quick_test(sql: &str, expected: &str) -> Result<()> { + let mock = MockMetaProvider::default(); + let planner = build_planner(&mock); + let mut statements = Parser::parse_sql(sql).unwrap(); + assert_eq!(statements.len(), 1); + let plan = planner.statement_to_plan(statements.remove(0))?; + assert_eq!(format!("{:#?}", plan), expected); + Ok(()) + } + + fn build_planner(provider: &MockMetaProvider) -> Planner { + Planner::new(provider, RequestId::next_id(), 1) + } + + #[test] + pub fn test_parse_for_option() { + let test_string = "aa".to_string(); + // input is_err expected + let test_cases = vec![ + ( + Value::Number("1000".to_string(), false), + false, + 
Some("1000".to_string()), + ), + ( + Value::SingleQuotedString(test_string.clone()), + false, + Some(test_string.clone()), + ), + ( + Value::DoubleQuotedString(test_string.clone()), + false, + Some(test_string.clone()), + ), + ( + Value::NationalStringLiteral(test_string.clone()), + true, + None, + ), + (Value::HexStringLiteral(test_string.clone()), true, None), + (Value::Boolean(true), false, Some("true".to_string())), + ( + Value::Interval { + value: test_string, + leading_field: None, + leading_precision: None, + last_field: None, + fractional_seconds_precision: None, + }, + true, + None, + ), + (Value::Null, false, None), + ]; + + for (input, is_err, expected) in test_cases { + let ret = parse_for_option(input); + assert_eq!(ret.is_err(), is_err); + if !is_err { + assert_eq!(ret.unwrap(), expected); + } + } + } + + #[test] + fn test_create_statement_to_plan() { + let sql = "CREATE TABLE IF NOT EXISTS t(c1 string tag not null,ts timestamp not null, c3 string, timestamp key(ts),primary key(c1, ts)) \ + ENGINE=Analytic WITH (ttl='70d',update_mode='overwrite',arena_block_size='1KB')"; + quick_test( + sql, + r#"Create( + CreateTablePlan { + engine: "Analytic", + if_not_exists: true, + table: "t", + table_schema: Schema { + num_key_columns: 2, + timestamp_index: 1, + tsid_index: None, + enable_tsid_primary_key: false, + column_schemas: ColumnSchemas { + columns: [ + ColumnSchema { + id: 1, + name: "c1", + data_type: String, + is_nullable: false, + is_tag: true, + comment: "", + }, + ColumnSchema { + id: 2, + name: "ts", + data_type: Timestamp, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 3, + name: "c3", + data_type: String, + is_nullable: true, + is_tag: false, + comment: "", + }, + ], + }, + version: 1, + }, + options: { + "arena_block_size": "1KB", + "ttl": "70d", + "update_mode": "overwrite", + }, + }, +)"#, + ) + .unwrap(); + } + + #[test] + fn test_query_statement_to_plan() { + let sql = "select * from test_tablex;"; + 
assert!(quick_test(sql, "").is_err()); + + let sql = "select * from test_table;"; + quick_test(sql, "Query( + QueryPlan { + df_plan: Projection: #test_table.key1, #test_table.key2, #test_table.field1, #test_table.field2 + TableScan: test_table projection=None, + }, +)").unwrap(); + } + + #[test] + fn test_insert_statement_to_plan() { + let sql = "INSERT INTO test_tablex(key1, key2, field1,field2) VALUES('tagk', 1638428434000,100, 'hello3');"; + assert!(quick_test(sql, "").is_err()); + + let sql = "INSERT INTO test_table(key1, key2, field1,field2) VALUES('tagk', 1638428434000,100, 'hello3');"; + quick_test( + sql, + r#"Insert( + InsertPlan { + table: MemoryTable { + name: "test_table", + id: TableId(100, 0, 100), + schema: Schema { + num_key_columns: 2, + timestamp_index: 1, + tsid_index: None, + enable_tsid_primary_key: false, + column_schemas: ColumnSchemas { + columns: [ + ColumnSchema { + id: 1, + name: "key1", + data_type: Varbinary, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 2, + name: "key2", + data_type: Timestamp, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 3, + name: "field1", + data_type: Double, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 4, + name: "field2", + data_type: String, + is_nullable: false, + is_tag: false, + comment: "", + }, + ], + }, + version: 1, + }, + }, + rows: RowGroup { + schema: Schema { + num_key_columns: 2, + timestamp_index: 1, + tsid_index: None, + enable_tsid_primary_key: false, + column_schemas: ColumnSchemas { + columns: [ + ColumnSchema { + id: 1, + name: "key1", + data_type: Varbinary, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 2, + name: "key2", + data_type: Timestamp, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 3, + name: "field1", + data_type: Double, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 
4, + name: "field2", + data_type: String, + is_nullable: false, + is_tag: false, + comment: "", + }, + ], + }, + version: 1, + }, + rows: [ + Row { + cols: [ + Varbinary( + b"tagk", + ), + Timestamp( + Timestamp( + 1638428434000, + ), + ), + Double( + 100.0, + ), + String( + StringBytes( + b"hello3", + ), + ), + ], + }, + ], + min_timestamp: Timestamp( + 1638428434000, + ), + max_timestamp: Timestamp( + 1638428434000, + ), + }, + }, +)"#, + ) + .unwrap(); + } + + #[test] + fn test_drop_statement_to_plan() { + let sql = "drop table test_table;"; + quick_test( + sql, + r#"Drop( + DropTablePlan { + engine: "Analytic", + if_exists: false, + table: "test_table", + }, +)"#, + ) + .unwrap(); + + let sql = "drop table test_tablex;"; + assert!(quick_test(sql, "",).is_err()); + + let sql = "drop table if exists test_tablex;"; + quick_test( + sql, + r#"Drop( + DropTablePlan { + engine: "Analytic", + if_exists: true, + table: "test_tablex", + }, +)"#, + ) + .unwrap(); + } + + #[test] + fn test_desc_statement_to_plan() { + let sql = "desc test_tablex;"; + assert!(quick_test(sql, "").is_err()); + + let sql = "desc test_table;"; + quick_test( + sql, + r#"Describe( + DescribeTablePlan { + table: MemoryTable { + name: "test_table", + id: TableId(100, 0, 100), + schema: Schema { + num_key_columns: 2, + timestamp_index: 1, + tsid_index: None, + enable_tsid_primary_key: false, + column_schemas: ColumnSchemas { + columns: [ + ColumnSchema { + id: 1, + name: "key1", + data_type: Varbinary, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 2, + name: "key2", + data_type: Timestamp, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 3, + name: "field1", + data_type: Double, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 4, + name: "field2", + data_type: String, + is_nullable: false, + is_tag: false, + comment: "", + }, + ], + }, + version: 1, + }, + }, + }, +)"#, + ) + .unwrap(); + } + + 
#[test] + fn test_alter_column_statement_to_plan() { + let sql = "ALTER TABLE test_tablex ADD column add_col string;"; + assert!(quick_test(sql, "").is_err()); + + let sql = "ALTER TABLE test_table ADD column add_col string;"; + quick_test( + sql, + r#"AlterTable( + AlterTablePlan { + table: MemoryTable { + name: "test_table", + id: TableId(100, 0, 100), + schema: Schema { + num_key_columns: 2, + timestamp_index: 1, + tsid_index: None, + enable_tsid_primary_key: false, + column_schemas: ColumnSchemas { + columns: [ + ColumnSchema { + id: 1, + name: "key1", + data_type: Varbinary, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 2, + name: "key2", + data_type: Timestamp, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 3, + name: "field1", + data_type: Double, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 4, + name: "field2", + data_type: String, + is_nullable: false, + is_tag: false, + comment: "", + }, + ], + }, + version: 1, + }, + }, + operations: AddColumn( + [ + ColumnSchema { + id: 0, + name: "add_col", + data_type: String, + is_nullable: true, + is_tag: false, + comment: "", + }, + ], + ), + }, +)"#, + ) + .unwrap(); + } + + #[test] + fn test_alter_option_statement_to_plan() { + let sql = "ALTER TABLE test_tablex modify SETTING ttl='9d';"; + assert!(quick_test(sql, "").is_err()); + + let sql = "ALTER TABLE test_table modify SETTING ttl='9d';"; + quick_test( + sql, + r#"AlterTable( + AlterTablePlan { + table: MemoryTable { + name: "test_table", + id: TableId(100, 0, 100), + schema: Schema { + num_key_columns: 2, + timestamp_index: 1, + tsid_index: None, + enable_tsid_primary_key: false, + column_schemas: ColumnSchemas { + columns: [ + ColumnSchema { + id: 1, + name: "key1", + data_type: Varbinary, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 2, + name: "key2", + data_type: Timestamp, + is_nullable: false, + is_tag: false, 
+ comment: "", + }, + ColumnSchema { + id: 3, + name: "field1", + data_type: Double, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 4, + name: "field2", + data_type: String, + is_nullable: false, + is_tag: false, + comment: "", + }, + ], + }, + version: 1, + }, + }, + operations: ModifySetting( + { + "ttl": "9d", + }, + ), + }, +)"#, + ) + .unwrap(); + } + + #[test] + fn test_show_create_statement_to_plan() { + let sql = "show create table test_tablex;"; + assert!(quick_test(sql, "").is_err()); + + let sql = "show create table test_table;"; + quick_test( + sql, + r#"ShowCreate( + ShowCreatePlan { + table: MemoryTable { + name: "test_table", + id: TableId(100, 0, 100), + schema: Schema { + num_key_columns: 2, + timestamp_index: 1, + tsid_index: None, + enable_tsid_primary_key: false, + column_schemas: ColumnSchemas { + columns: [ + ColumnSchema { + id: 1, + name: "key1", + data_type: Varbinary, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 2, + name: "key2", + data_type: Timestamp, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 3, + name: "field1", + data_type: Double, + is_nullable: false, + is_tag: false, + comment: "", + }, + ColumnSchema { + id: 4, + name: "field2", + data_type: String, + is_nullable: false, + is_tag: false, + comment: "", + }, + ], + }, + version: 1, + }, + }, + obj_type: Table, + }, +)"#, + ) + .unwrap(); + } +} diff --git a/sql/src/promql.rs b/sql/src/promql.rs new file mode 100644 index 0000000000..2113681eea --- /dev/null +++ b/sql/src/promql.rs @@ -0,0 +1,10 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +mod convert; +mod datafusion_util; +mod pushdown; +mod udf; + +pub use convert::{Error, Expr}; +pub use datafusion_util::{ColumnNames, PromAlignNode}; +pub use pushdown::{AlignParameter, Func}; diff --git a/sql/src/promql/convert.rs b/sql/src/promql/convert.rs new file mode 100644 index 0000000000..005f2ebeb1 --- /dev/null +++ b/sql/src/promql/convert.rs @@ -0,0 +1,673 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::{ + convert::{TryFrom, TryInto}, + sync::Arc, +}; + +use arrow_deps::datafusion::{ + error::DataFusionError, + logical_plan::{ + avg, col, combine_filters, count, lit, max, min, plan::Extension, sum, + Expr as DataFusionExpr, LogicalPlan, LogicalPlanBuilder, + }, + sql::planner::ContextProvider, +}; +use ceresdbproto::prometheus::{ + Expr as ExprPb, Filter as FilterPb, FilterType as FilterPbType, Operand as OperandPb, + Selector as PbSelector, SubExpr as PbSubExpr, SubExpr_OperatorType, +}; +use common_types::{ + schema::{Schema, TSID_COLUMN}, + time::{TimeRange, Timestamp}, +}; +use snafu::{ensure, Backtrace, OptionExt, ResultExt, Snafu}; + +use crate::{ + plan::{Plan, QueryPlan}, + promql::{ + datafusion_util::{default_sort_exprs, timerange_to_expr}, + pushdown::{AlignParameter, Func}, + udf::{create_unique_id, regex_match_expr}, + ColumnNames, PromAlignNode, + }, + provider::{ContextProviderAdapter, MetaProvider}, +}; + +const INIT_LEVEL: usize = 1; +const DEFAULT_LOOKBACK: i64 = 300_000; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Invalid expr, expected: {}, actual:{:?}", expected, actual))] + UnexpectedExpr { expected: String, actual: String }, + + #[snafu(display("Expr pushdown not implemented. 
expr:{:?}", expr))] + NotImplemented { expr: String }, + + #[snafu(display("MetaProvider {}, err:{}", msg, source))] + MetaProviderError { + msg: String, + source: crate::provider::Error, + }, + + #[snafu(display("Table not found, table:{}", name))] + TableNotFound { name: String }, + + #[snafu(display("Failed to build schema, err:{}", source))] + BuildTableSchema { source: common_types::schema::Error }, + + #[snafu(display("Failed to build plan, source:{}", source,))] + BuildPlanError { source: DataFusionError }, + + #[snafu(display("Invalid expr, msg:{}\nBacktrace:\n{}", msg, backtrace))] + InvalidExpr { msg: String, backtrace: Backtrace }, + + #[snafu(display("Failed to pushdown, source:{}", source))] + PushdownError { + source: crate::promql::pushdown::Error, + }, +} + +define_result!(Error); + +impl From for Error { + fn from(df_err: DataFusionError) -> Self { + Error::BuildPlanError { source: df_err } + } +} + +#[derive(Debug, Clone)] +pub enum Expr { + SimpleExpr(Operand), + RecursiveExpr(SubExpr), +} + +impl TryFrom for Expr { + type Error = Error; + + fn try_from(mut pb_operand: OperandPb) -> Result { + let op = if pb_operand.has_selector() { + let PbSelector { + measurement: table, + start, + end, + align_start, + align_end, + filters, + range, + field, + offset, + step, + .. 
+ } = pb_operand.take_selector(); + let filters = Into::>::into(filters) + .into_iter() + .map(Filter::from) + .collect::>(); + Operand::Selector(Selector { + table, + filters, + field, + query_range: TimeRange::new_unchecked( + Timestamp::new(start), + Timestamp::new(end + 1), + ), /* [start, end] */ + align_range: TimeRange::new_unchecked( + Timestamp::new(align_start), + Timestamp::new(align_end + 1), + ), /* [align_start, align_end] */ + step, + range, + offset, + }) + } else if pb_operand.has_float_val() { + Operand::Float(pb_operand.get_float_val()) + } else if pb_operand.has_string_val() { + Operand::String(pb_operand.take_string_val()) + } else { + return InvalidExpr { + msg: format!("unknown operand:{:?}", pb_operand), + } + .fail(); + }; + + Ok(Expr::SimpleExpr(op)) + } +} + +impl TryFrom for Expr { + type Error = Error; + + fn try_from(mut expr: ExprPb) -> Result { + if expr.has_operand() { + let operand = expr.take_operand(); + return operand.try_into(); + } else if expr.has_sub_expr() { + let sub_expr = expr.take_sub_expr(); + return sub_expr.try_into(); + } + + InvalidExpr { + msg: format!("unknown expr:{:?}", expr), + } + .fail() + } +} + +impl Expr { + pub fn get_selector(&self) -> &Selector { + match self { + Expr::SimpleExpr(se) => match se { + Operand::Selector(sel) => sel, + _ => unreachable!(), + }, + Expr::RecursiveExpr(re) => re.get_selector(), + } + } + + pub fn is_selector(&self) -> bool { + matches!(self, Expr::SimpleExpr(e) if matches!(e, Operand::Selector(_))) + } + + /// For now, only filters and timestamp are pushdown, we translate it + /// into plan like: + /// Aggregate: (when needed) + /// PromAlign: + /// Sort: (tsid, timestamp) asc + /// Project: + /// Filter: + /// TableScan + pub fn to_plan( + self, + meta_provider: ContextProviderAdapter<'_, P>, + read_parallelism: usize, + ) -> Result<(Plan, Arc)> { + let (logic_plan, column_name, _) = + self.build_plan_iter(&meta_provider, INIT_LEVEL, read_parallelism)?; + let tables = 
Arc::new( + meta_provider + .try_into_container() + .context(MetaProviderError { + msg: "Failed to find meta", + })?, + ); + Ok(( + Plan::Query(QueryPlan { + df_plan: logic_plan, + tables, + }), + column_name, + )) + } + + fn build_plan_iter( + self, + meta_provider: &ContextProviderAdapter<'_, P>, + level: usize, + read_parallelism: usize, + ) -> Result<(LogicalPlan, Arc, String)> { + match self { + Expr::SimpleExpr(simple_expr) => match simple_expr { + Operand::Selector(selector) => { + let (sub_plan, column_name, table_name) = + selector.clone().into_scan_plan(meta_provider)?; + if level == INIT_LEVEL { + // when only selector is pushdown, align is done in Prometheus itself + // since maybe there are subquery inside one query which require complex + // align logic. + return Ok((sub_plan, column_name, table_name)); + } + // insert PromAlignNode into plan with Func::Instant + let Selector { + align_range, + step, + offset, + .. + } = selector; + let align_param = AlignParameter { + align_range, + step: step.into(), + offset: offset.into(), + lookback_delta: DEFAULT_LOOKBACK.into(), + }; + let align_plan = LogicalPlan::Extension(Extension { + node: Arc::new(PromAlignNode { + input: sub_plan, + func: Func::Instant, + table_name: table_name.clone(), + align_param, + column_name: column_name.clone(), + read_parallelism, + }), + }); + Ok((align_plan, column_name, table_name)) + } + Operand::Float(_) | Operand::String(_) => InvalidExpr { + msg: "scalar value not allowed in plan node", + } + .fail(), + }, + // New plan like: + // PromAlign: + // SubPlan + Expr::RecursiveExpr(recursive_expr) => match recursive_expr { + SubExpr::Func(FuncExpr { op, operands }) => { + assert!(!operands.is_empty()); + let func = Func::try_from(op.as_str()).context(PushdownError {})?; + let first_arg = &operands[0]; + if first_arg.is_selector() { + let selector = first_arg.get_selector(); + let (sub_plan, column_name, table_name) = + selector.clone().into_scan_plan(meta_provider)?; + let 
Selector { + align_range, + step, + range, + offset, + .. + } = selector; + let align_param = AlignParameter { + align_range: *align_range, + step: step.into(), + offset: offset.into(), + lookback_delta: range.into(), + }; + let align_plan = LogicalPlan::Extension(Extension { + node: Arc::new(PromAlignNode { + input: sub_plan, + table_name: table_name.clone(), + func, + align_param, + column_name: column_name.clone(), + read_parallelism, + }), + }); + return Ok((align_plan, column_name, table_name)); + } + InvalidExpr { + msg: "first arg of func must be selector", + } + .fail() + } + + // New plan like: + // Sort: + // Projection + // Aggregate + // SubPlan + SubExpr::Aggr(AggrExpr { + op, + operands, + group_by, + without, + }) => { + assert!(!operands.is_empty()); + let next_level = level + 1; + // aggregators don't have args, only need to deal with sub_node now. + let sub_node = operands.into_iter().next().unwrap(); + let (sub_plan, column_name, table_name) = + sub_node.build_plan_iter(meta_provider, next_level, read_parallelism)?; + // filter out nonexistent tags + let group_by = group_by + .into_iter() + .filter(|by| column_name.tag_keys.contains(by)) + .collect::>(); + let groupby_columns = if without { + column_name + .tag_keys + .iter() + .filter_map(|tag_key| { + if group_by.contains(tag_key) { + None + } else { + Some(tag_key.as_str()) + } + }) + .collect::>() + } else { + group_by.iter().map(|s| (s.as_str())).collect::>() + }; + let aggr_expr = + Self::aggr_op_expr(&op, &column_name.field, column_name.field.clone())?; + let tag_exprs = groupby_columns.iter().map(|v| col(v)).collect::>(); + let udf_args = tag_exprs.clone(); + let mut groupby_expr = vec![col(&column_name.timestamp)]; + groupby_expr.extend(udf_args); + let unique_id_expr = + // TSID is lost after aggregate, but PromAlignNode need a unique id, so + // mock UUID as tsid based on groupby keys + DataFusionExpr::Alias( + Box::new(DataFusionExpr::ScalarUDF { + fun: 
Arc::new(create_unique_id(tag_exprs.len())), + args: tag_exprs.clone(), + }), + TSID_COLUMN.to_string(), + ); + let mut projection = tag_exprs.clone(); + projection.extend(vec![ + col(&column_name.timestamp), + col(&column_name.field), + unique_id_expr.clone(), + ]); + let sort_exprs = if tag_exprs.is_empty() { + vec![col(&column_name.timestamp).sort(true, true)] + } else { + vec![ + unique_id_expr.sort(true, true), + col(&column_name.timestamp).sort(true, true), + ] + }; + let builder = LogicalPlanBuilder::from(sub_plan); + let plan = builder + .aggregate(groupby_expr, vec![aggr_expr])? + .project(projection)? + .sort(sort_exprs)? + .build()?; + + Ok((plan, column_name, table_name)) + } + SubExpr::Binary(_) => InvalidExpr { + msg: "Binary Expr not supported", + } + .fail(), + }, + } + } + + fn aggr_op_expr(aggr_op: &str, field: &str, alias: String) -> Result { + let expr = match aggr_op { + "sum" => sum(col(field)), + "max" => max(col(field)), + "min" => min(col(field)), + "count" => count(col(field)), + "avg" => avg(col(field)), + _ => { + return InvalidExpr { + msg: format!("aggr {} not supported now", aggr_op), + } + .fail() + } + }; + + Ok(DataFusionExpr::Alias(Box::new(expr), alias)) + } +} + +#[derive(Debug, Clone)] +pub enum Operand { + String(String), + Float(f64), + Selector(Selector), +} + +#[derive(Debug, Clone)] +pub enum SubExpr { + Aggr(AggrExpr), + Func(FuncExpr), + Binary(BinaryExpr), +} + +impl TryFrom for Expr { + type Error = Error; + + fn try_from(mut pb_sub_expr: PbSubExpr) -> Result { + let op_type = pb_sub_expr.get_op_type(); + + let operator = pb_sub_expr.take_operator(); + let operands = pb_sub_expr + .take_operands() + .into_iter() + .map(Expr::try_from) + .collect::>>()?; + let sub_expr = match op_type { + SubExpr_OperatorType::AGGR => SubExpr::Aggr(AggrExpr { + op: operator, + operands, + group_by: pb_sub_expr.take_group().into_vec(), + without: pb_sub_expr.get_without(), + }), + SubExpr_OperatorType::FUNC => SubExpr::Func(FuncExpr { + 
op: operator, + operands, + }), + SubExpr_OperatorType::BINARY => { + return NotImplemented { + expr: format!("{:?}", pb_sub_expr), + } + .fail() + } + }; + + Ok(Expr::RecursiveExpr(sub_expr)) + } +} + +impl SubExpr { + pub fn get_selector(&self) -> &Selector { + match self { + SubExpr::Aggr(AggrExpr { operands, .. }) => operands[0].get_selector(), + SubExpr::Func(FuncExpr { operands, .. }) => operands[0].get_selector(), + SubExpr::Binary(BinaryExpr { operands, .. }) => operands[0].get_selector(), + } + } + + pub fn is_range_fn(&self) -> bool { + match self { + Self::Func(FuncExpr { operands, .. }) => match &operands[0] { + Expr::SimpleExpr(Operand::Selector(sel)) => sel.range > 0, + _ => false, + }, + _ => false, + } + } +} + +#[derive(Debug, Clone)] +pub struct AggrExpr { + op: String, + operands: Vec, + group_by: Vec, + without: bool, +} + +#[derive(Debug, Clone)] +pub struct FuncExpr { + op: String, + operands: Vec, +} + +#[derive(Debug, Clone)] +pub struct BinaryExpr { + _op: String, + operands: Vec, + _return_bool: bool, +} + +#[derive(Debug, Clone)] +pub enum FilterType { + LiteralOr, + NotLiteralOr, + Regexp, + NotRegexpMatch, +} + +impl From for FilterType { + fn from(pb_type: FilterPbType) -> Self { + match pb_type { + FilterPbType::LITERAL_OR => FilterType::LiteralOr, + FilterPbType::NOT_LITERAL_OR => FilterType::NotLiteralOr, + FilterPbType::REGEXP => FilterType::Regexp, + FilterPbType::NOT_REGEXP_MATCH => FilterType::NotRegexpMatch, + } + } +} + +#[derive(Debug, Clone)] +pub struct FilterOperator { + typ: FilterType, + params: Vec, +} + +#[derive(Debug, Clone)] +pub struct Filter { + tag_key: String, + operators: Vec, +} + +impl From for DataFusionExpr { + fn from(mut f: Filter) -> DataFusionExpr { + let tag_key = col(&f.tag_key); + // TODO(chenxiang): only compare first op now + let mut first_op = f.operators.remove(0); + match first_op.typ { + // regepx filter only have one param + FilterType::Regexp => regex_match_expr(tag_key, 
first_op.params.remove(0), true), + FilterType::NotRegexpMatch => { + regex_match_expr(tag_key, first_op.params.remove(0), false) + } + FilterType::LiteralOr => tag_key.in_list( + first_op + .params + .iter() + .map(|v| lit(v.as_str())) + .collect::>(), + false, + ), + FilterType::NotLiteralOr => tag_key.in_list( + first_op + .params + .iter() + .map(|v| lit(v.as_str())) + .collect::>(), + true, + ), + } + } +} + +impl From for Filter { + fn from(mut pb_filter: FilterPb) -> Self { + Self { + tag_key: pb_filter.take_tag_key(), + operators: Into::>::into(pb_filter.take_operators()) + .into_iter() + .map(|mut f| FilterOperator { + typ: f.get_filter_type().into(), + params: f.take_params().into(), + }) + .collect::>(), + } + } +} + +#[derive(Debug, Clone)] +pub struct Selector { + // query params + pub query_range: TimeRange, + pub table: String, + pub filters: Vec, + pub field: String, + + // align params + pub align_range: TimeRange, + pub step: i64, + pub range: i64, + pub offset: i64, +} + +impl Selector { + fn into_scan_plan( + self, + meta_provider: &ContextProviderAdapter<'_, P>, + ) -> Result<(LogicalPlan, Arc, String)> { + let Selector { + query_range, + field, + filters, + table, + .. + } = self; + let table_ref = meta_provider + .table(table.as_str().into()) + .context(MetaProviderError { + msg: "failed to find table".to_string(), + })? 
+ .context(TableNotFound { name: &table })?; + + let table_provider = meta_provider + .get_table_provider(table_ref.name().into()) + .context(TableNotFound { name: &table })?; + let schema = Schema::try_from(table_provider.schema()).context(BuildTableSchema)?; + let timestamp_column_name = schema.timestamp_name().to_string(); + let (projection, tag_keys) = Self::build_projection_tag_keys(&schema, &field)?; + let mut filter_exprs = filters + .iter() + .filter_map(|f| { + // drop non_exist filter + if tag_keys.contains(&f.tag_key) { + Some(DataFusionExpr::from(f.clone())) + } else { + None + } + }) + .collect::>(); + filter_exprs.push(timerange_to_expr(query_range, ×tamp_column_name)); + + let builder = LogicalPlanBuilder::scan(table.clone(), table_provider, None)? + .filter(combine_filters(&filter_exprs).expect("at least one filter(timestamp)"))? + .project(projection)? + .sort(default_sort_exprs(×tamp_column_name))?; + let column_name = Arc::new(ColumnNames { + timestamp: timestamp_column_name, + tag_keys, + field, + }); + let scan_plan = builder.build().context(BuildPlanError)?; + Ok((scan_plan, column_name, table)) + } + + fn build_projection_tag_keys( + schema: &Schema, + field: &str, + ) -> Result<(Vec, Vec)> { + if let Some(f) = schema.column_with_name(field) { + ensure!( + f.data_type.is_f64_castable(), + InvalidExpr { + msg: "field type must be f64-compatibile type", + } + ); + } else { + return InvalidExpr { + msg: format!("field:{} not found", field), + } + .fail(); + }; + let mut tag_keys = Vec::new(); + let mut projection = schema + .columns() + .iter() + .filter_map(|column| { + if column.is_tag { + tag_keys.push(column.name.clone()); + Some(col(&column.name)) + } else { + None + } + }) + .collect::>(); + + let timestamp_expr = col(&schema.column(schema.timestamp_index()).name); + let tsid_expr = schema + .tsid_column() + .map(|c| col(&c.name)) + .context(InvalidExpr { + msg: format!("{} not found", TSID_COLUMN), + })?; + let field_expr = col(field); + 
projection.extend(vec![timestamp_expr, tsid_expr, field_expr]); + + Ok((projection, tag_keys)) + } +} diff --git a/sql/src/promql/datafusion_util.rs b/sql/src/promql/datafusion_util.rs new file mode 100644 index 0000000000..4e5003e963 --- /dev/null +++ b/sql/src/promql/datafusion_util.rs @@ -0,0 +1,105 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::{any::Any, fmt, sync::Arc}; + +use arrow_deps::datafusion::logical_plan::{ + col, lit, DFSchemaRef, Expr as DataFusionExpr, Expr, LogicalPlan, UserDefinedLogicalNode, +}; +use common_types::{schema::TSID_COLUMN, time::TimeRange}; + +use crate::promql::pushdown::{AlignParameter, Func}; + +/// ColumnNames represents meaning of columns in one table. +#[derive(Debug)] +pub struct ColumnNames { + pub timestamp: String, + pub tag_keys: Vec, + pub field: String, +} + +/// Translate to `column_name BETWEEN start AND end` expr +pub fn timerange_to_expr(query_range: TimeRange, column_name: &str) -> DataFusionExpr { + DataFusionExpr::Between { + expr: Box::new(col(column_name)), + negated: false, + low: Box::new(lit(query_range.inclusive_start().as_i64())), + high: Box::new(lit(query_range.exclusive_end().as_i64() + 1)), + } +} + +pub fn default_sort_exprs(timestamp_column: &str) -> Vec { + vec![ + col(TSID_COLUMN).sort(true, true), + col(timestamp_column).sort(true, true), + ] +} + +pub struct PromAlignNode { + pub input: LogicalPlan, + pub column_name: Arc, + pub table_name: String, + pub func: Func, + pub align_param: AlignParameter, + pub read_parallelism: usize, +} + +impl fmt::Debug for PromAlignNode { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + self.fmt_for_explain(f) + } +} + +impl UserDefinedLogicalNode for PromAlignNode { + fn as_any(&self) -> &dyn Any { + self + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![&self.input] + } + + fn schema(&self) -> &DFSchemaRef { + self.input.schema() + } + + fn expressions(&self) -> Vec { + let qualified_name = |n| 
col(&format!("{}.{}", self.table_name, n)); + + let mut exprs = self + .column_name + .tag_keys + .iter() + .map(qualified_name) + .collect::>(); + + exprs.extend(vec![ + qualified_name(&self.column_name.timestamp), + qualified_name(&self.column_name.field), + ]); + + exprs + } + + fn fmt_for_explain(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "PromAlign: align_param={:?}, column_name={:?}, read_parallelism={}", + self.align_param, self.column_name, self.read_parallelism + ) + } + + fn from_template( + &self, + _exprs: &[Expr], + inputs: &[LogicalPlan], + ) -> std::sync::Arc { + Arc::new(PromAlignNode { + input: inputs[0].clone(), + func: self.func, + table_name: self.table_name.clone(), + column_name: self.column_name.clone(), + align_param: self.align_param, + read_parallelism: self.read_parallelism, + }) + } +} diff --git a/sql/src/promql/pushdown.rs b/sql/src/promql/pushdown.rs new file mode 100644 index 0000000000..f9c0a279d9 --- /dev/null +++ b/sql/src/promql/pushdown.rs @@ -0,0 +1,50 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+
+use std::convert::TryFrom;
+
+use common_types::time::{TimeRange, Timestamp};
+use snafu::Snafu;
+
+#[derive(Debug, Snafu)]
+pub enum Error {
+    #[snafu(display("Func {} is not supported yet", func))]
+    NotSupportedFunc { func: String },
+}
+
+define_result!(Error);
+
+#[derive(Debug, Clone, Copy)]
+pub enum Func {
+    Instant, // used to simulate instant query
+    Rate,
+    Irate,
+    Delta,
+    Idelta,
+    Increase,
+}
+
+impl TryFrom<&str> for Func {
+    type Error = Error;
+
+    fn try_from(op: &str) -> Result<Self> {
+        let t = match op {
+            "rate" => Func::Rate,
+            "delta" => Func::Delta,
+            "irate" => Func::Irate,
+            "idelta" => Func::Idelta,
+            "increase" => Func::Increase,
+            func => return NotSupportedFunc { func }.fail(),
+        };
+
+        Ok(t)
+    }
+}
+
+#[derive(Debug, Clone, Copy)]
+pub struct AlignParameter {
+    pub align_range: TimeRange,
+    pub step: Timestamp,
+    pub offset: Timestamp,
+    /// 0 for no look back
+    pub lookback_delta: Timestamp,
+}
diff --git a/sql/src/promql/udf.rs b/sql/src/promql/udf.rs
new file mode 100644
index 0000000000..8928f6f790
--- /dev/null
+++ b/sql/src/promql/udf.rs
@@ -0,0 +1,300 @@
+// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0.
+
+// Copy from IOx
+// https://github.com/influxdata/influxdb_iox/blob/d0f588d3b800894fe0ebd06b6f9a184ca6a603d7/predicate/src/regex.rs
+
+use std::sync::Arc;
+
+use arrow_deps::{
+    arrow::{
+        array::{ArrayRef, BooleanArray, StringArray, UInt64Array},
+        datatypes::DataType,
+    },
+    datafusion::{
+        error::{DataFusionError, Result as DataFusionResult},
+        logical_plan::{create_udf, Expr},
+        physical_plan::{
+            functions::{make_scalar_function, Volatility},
+            udf::ScalarUDF,
+        },
+    },
+};
+use common_types::hash::hash64;
+use common_util::codec::{compact::MemCompactEncoder, Encoder};
+
+/// The name of the regex_match UDF given to DataFusion.
+pub const REGEX_MATCH_UDF_NAME: &str = "RegexMatch";
+pub const REGEX_NOT_MATCH_UDF_NAME: &str = "RegexNotMatch";
+
+/// Given a column containing string values and a single regex pattern,
+/// `regex_match_expr` determines which values satisfy the pattern and which do
+/// not.
+///
+/// If `matches` is true then this expression will filter values that do not
+/// satisfy the regex (equivalent to `col ~= /pattern/`). If `matches` is
+/// `false` then the expression will filter values that *do* match the regex,
+/// which is equivalent to `col !~ /pattern/`.
+///
+/// This UDF is designed to support the regex operator that can be pushed down
+/// via the InfluxRPC API.
+pub fn regex_match_expr(input: Expr, pattern: String, matches: bool) -> Expr {
+    // N.B., this function does not utilise the Arrow regexp compute kernel because
+    // in order to act as a filter it needs to return a boolean array of comparison
+    // results, not an array of strings as the regex compute kernel does.
+    let func = move |args: &[ArrayRef]| {
+        assert_eq!(args.len(), 1); // only works over a single column at a time.
+
+        let input_arr = &args[0].as_any().downcast_ref::<StringArray>().unwrap();
+
+        let pattern = regex::Regex::new(&pattern).map_err(|e| {
+            DataFusionError::Internal(format!("error compiling regex pattern: {}", e))
+        })?;
+
+        let results = input_arr
+            .iter()
+            .map(|row| {
+                // in arrow, any value can be null.
+                // Here we decide to make our UDF to return null when either base or exponent is
+                // null.
+                row.map(|v| pattern.is_match(v) == matches)
+            })
+            .collect::<BooleanArray>();
+
+        Ok(Arc::new(results) as ArrayRef)
+    };
+
+    // make_scalar_function is a helper to support accepting scalar values as
+    // well as arrays.
+ let func = make_scalar_function(func); + + let udf_name = if matches { + REGEX_MATCH_UDF_NAME + } else { + REGEX_NOT_MATCH_UDF_NAME + }; + + let udf = create_udf( + udf_name, + vec![DataType::Utf8], + Arc::new(DataType::Boolean), + Volatility::Stable, + func, + ); + + udf.call(vec![input]) +} + +pub fn create_unique_id(input_len: usize) -> ScalarUDF { + let func = move |args: &[ArrayRef]| { + if args.is_empty() { + let builder = UUIDBuilder::new(); + let tsid: UInt64Array = [Some(builder.finish())].iter().collect(); + return Ok(Arc::new(tsid) as ArrayRef); + } + let array_len = args[0].len(); + let inputs = args + .iter() + .map(|a| { + a.as_any() + .downcast_ref::() + .ok_or_else(|| DataFusionError::Execution("tag column not string".to_string())) + }) + .collect::>>()?; + + let mut builders = Vec::new(); + builders.resize_with(array_len, UUIDBuilder::new); + for array in &inputs { + array + .iter() + .zip(builders.iter_mut()) + .for_each(|(v, builder)| { + builder.write(v); + }); + } + let results: UInt64Array = builders.into_iter().map(|b| Some(b.finish())).collect(); + Ok(Arc::new(results) as ArrayRef) + }; + + create_udf( + "create_unique_id", + vec![DataType::Utf8; input_len], + Arc::new(DataType::UInt64), + Volatility::Stable, + make_scalar_function(func), + ) +} + +struct UUIDBuilder { + encoder: MemCompactEncoder, + buf: Vec, +} + +impl UUIDBuilder { + fn new() -> Self { + Self { + encoder: MemCompactEncoder, + buf: Vec::new(), + } + } + + fn write(&mut self, value: Option<&str>) { + let value = value.unwrap_or(""); + self.encoder + .encode(&mut self.buf, value.as_bytes()) + .unwrap(); // write mem is safe + } + + fn finish(self) -> u64 { + hash64(&self.buf) + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use arrow_deps::{ + arrow::{ + array::{StringArray, UInt64Array}, + record_batch::RecordBatch, + util::pretty::pretty_format_batches, + }, + datafusion::{ + datasource::MemTable, + error::DataFusionError, + logical_plan::{col, Expr as 
DataFusionExpr}, + prelude::ExecutionContext, + }, + }; + use common_types::schema::{ArrowSchema, ArrowSchemaRef, DataType, Field}; + + #[tokio::test] + async fn regex_match_expr() { + let cases = vec![ + ( + ".*", // match everything except NULL values + true, // keep the values matched + vec![ + "+---------------+--------+", + "| words | length |", + "+---------------+--------+", + "| air | 3 |", + "| aphex twin | 10 |", + "| bruce | 5 |", + "| Blood Orange | 12 |", + "| cocteau twins | 13 |", + "+---------------+--------+", + ], + ), + ( + ".*", // match everything except NULL values + false, // filter away all the values matched + vec!["++", "++"], + ), + ( + "", // an empty pattern also matches everything except NULL + true, + vec![ + "+---------------+--------+", + "| words | length |", + "+---------------+--------+", + "| air | 3 |", + "| aphex twin | 10 |", + "| bruce | 5 |", + "| Blood Orange | 12 |", + "| cocteau twins | 13 |", + "+---------------+--------+", + ], + ), + ( + ".+O.*", // match just words containing "O". 
+ true, + vec![ + "+--------------+--------+", + "| words | length |", + "+--------------+--------+", + "| Blood Orange | 12 |", + "+--------------+--------+", + ], + ), + ( + "^(a|b).*", // match everything beginning with "a" or "b" + false, // negate expression and filter away anything that matches + vec![ + "+---------------+--------+", + "| words | length |", + "+---------------+--------+", + "| Blood Orange | 12 |", + "| cocteau twins | 13 |", + "+---------------+--------+", + ], + ), + ]; + + let schema = Arc::new(ArrowSchema::new(vec![ + Field::new("words", DataType::Utf8, true), + Field::new("length", DataType::UInt64, false), + ])); + + // define data for table + let words = vec![ + Some("air"), + Some("aphex twin"), + Some("bruce"), + Some("Blood Orange"), + None, + None, + Some("cocteau twins"), + ]; + let rb = RecordBatch::try_new( + Arc::clone(&schema), + vec![ + Arc::new(StringArray::from(words.clone())), + Arc::new( + words + .iter() + .map(|word| word.map(|word| word.len() as u64)) + .collect::(), + ), + ], + ) + .unwrap(); + let rb = vec![vec![rb]]; + for (pattern, matches, expected) in cases.into_iter() { + let regex_expr = super::regex_match_expr(col("words"), pattern.to_string(), matches); + let actual = run_plan(schema.clone(), rb.clone(), regex_expr) + .await + .unwrap(); + + assert_eq!( + expected, actual, + "\n\nEXPECTED:\n{:#?}\nACTUAL:\n{:#?}\n", + expected, actual + ); + } + } + + // Run a plan against the following input table as "t" + async fn run_plan( + schema: ArrowSchemaRef, + rb: Vec>, + op: DataFusionExpr, + ) -> Result, DataFusionError> { + let provider = MemTable::try_new(Arc::clone(&schema), rb).unwrap(); + let mut ctx = ExecutionContext::new(); + ctx.register_table("t", Arc::new(provider)).unwrap(); + + let df = ctx.table("t").unwrap(); + let df = df.filter(op).unwrap(); + + // execute the query + let record_batches = df.collect().await?; + + Ok(pretty_format_batches(&record_batches) + .unwrap() + .to_string() + .split('\n') + 
.map(|s| s.to_owned()) + .collect()) + } +} diff --git a/sql/src/provider.rs b/sql/src/provider.rs new file mode 100644 index 0000000000..fee689c411 --- /dev/null +++ b/sql/src/provider.rs @@ -0,0 +1,345 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Adapter to providers in datafusion + +use std::{any::Any, cell::RefCell, collections::HashMap, sync::Arc}; + +use arrow_deps::datafusion::{ + catalog::{catalog::CatalogProvider, schema::SchemaProvider}, + datasource::TableProvider, + physical_plan::{udaf::AggregateUDF, udf::ScalarUDF}, + sql::planner::ContextProvider, +}; +use catalog::manager::Manager; +use common_types::request_id::RequestId; +use snafu::{ResultExt, Snafu}; +use table_engine::{provider::TableProviderAdapter, table::TableRef}; +use udf::{registry::FunctionRegistry, scalar::ScalarUdf, udaf::AggregateUdf}; + +use crate::container::{TableContainer, TableReference}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to find catalog, name:{}, err:{}", name, source))] + FindCatalog { + name: String, + source: catalog::manager::Error, + }, + + #[snafu(display("Failed to find schema, name:{}, err:{}", name, source))] + FindSchema { + name: String, + source: catalog::Error, + }, + + #[snafu(display("Failed to find table, name:{}, err:{}", name, source))] + FindTable { + name: String, + source: catalog::schema::Error, + }, + + #[snafu(display("Failed to find udf, err:{}", source))] + FindUdf { source: udf::registry::Error }, +} + +define_result!(Error); + +/// MetaProvider provides meta info needed by Frontend +pub trait MetaProvider { + /// Default catalog name + fn default_catalog_name(&self) -> &str; + + /// Default schema name + fn default_schema_name(&self) -> &str; + + /// Get table meta by table reference + /// + /// Note that this function may block current thread. We can't make this + /// function async as the underlying (aka. datafusion) planner needs a + /// sync provider. 
+ fn table(&self, name: TableReference) -> Result>; + + /// Get udf by name. + fn scalar_udf(&self, name: &str) -> Result>; + + /// Get udaf by name. + fn aggregate_udf(&self, name: &str) -> Result>; +} + +/// We use an adapter instead of using [catalog::Manager] directly, because +/// - MetaProvider provides blocking method, but catalog::Manager may provide +/// async method +/// - Other meta data like default catalog and schema are needed +// TODO(yingwen): Maybe support schema searching instead of using a fixed +// default schema +pub struct CatalogMetaProvider<'a, M> { + pub manager: &'a M, + pub default_catalog: &'a str, + pub default_schema: &'a str, + pub function_registry: &'a (dyn FunctionRegistry + Send + Sync), +} + +impl<'a, M: Manager> MetaProvider for CatalogMetaProvider<'a, M> { + fn default_catalog_name(&self) -> &str { + self.default_catalog + } + + fn default_schema_name(&self) -> &str { + self.default_schema + } + + fn table(&self, name: TableReference) -> Result> { + let resolved = name.resolve(self.default_catalog, self.default_schema); + + let catalog = match self + .manager + .catalog_by_name(resolved.catalog) + .context(FindCatalog { + name: resolved.catalog, + })? { + Some(c) => c, + None => return Ok(None), + }; + + let schema = match catalog + .schema_by_name(resolved.schema) + .context(FindSchema { + name: resolved.schema, + })? 
{ + Some(s) => s, + None => return Ok(None), + }; + + schema.table_by_name(resolved.table).context(FindTable { + name: resolved.table, + }) + } + + fn scalar_udf(&self, name: &str) -> Result> { + self.function_registry.find_udf(name).context(FindUdf) + } + + fn aggregate_udf(&self, name: &str) -> Result> { + self.function_registry.find_udaf(name).context(FindUdf) + } +} + +/// An adapter to ContextProvider, not thread safe +pub struct ContextProviderAdapter<'a, P> { + /// Local cache for TableProvider to avoid create multiple adapter for the + /// same table, also save all the table needed during planning + table_cache: RefCell, + /// Store the first error MetaProvider returns + err: RefCell>, + meta_provider: &'a P, + request_id: RequestId, + /// Read parallelism for each table. + read_parallelism: usize, +} + +impl<'a, P: MetaProvider> ContextProviderAdapter<'a, P> { + /// Create a adapter from meta provider + pub fn new(meta_provider: &'a P, request_id: RequestId, read_parallelism: usize) -> Self { + let default_catalog = meta_provider.default_catalog_name().to_string(); + let default_schema = meta_provider.default_schema_name().to_string(); + + Self { + table_cache: RefCell::new(TableContainer::new(default_catalog, default_schema)), + err: RefCell::new(None), + meta_provider, + request_id, + read_parallelism, + } + } + + /// Consumes the adapter, returning the tables used during planning if no + /// error occurs, otherwise returning the error + pub fn try_into_container(self) -> Result { + if let Some(e) = self.err.into_inner() { + return Err(e); + } + + Ok(self.table_cache.into_inner()) + } + + /// Save error if there is no existing error. + /// + /// The datafusion's ContextProvider can't return error, so here we save the + /// error in the adapter and return None, also let datafusion + /// return a provider not found error and abort the planning + /// procedure. 
+ fn maybe_set_err(&self, err: Error) { + if self.err.borrow().is_none() { + *self.err.borrow_mut() = Some(err); + } + } +} + +impl<'a, P: MetaProvider> MetaProvider for ContextProviderAdapter<'a, P> { + fn default_catalog_name(&self) -> &str { + self.meta_provider.default_catalog_name() + } + + fn default_schema_name(&self) -> &str { + self.meta_provider.default_schema_name() + } + + fn table(&self, name: TableReference) -> Result> { + self.meta_provider.table(name) + } + + fn scalar_udf(&self, name: &str) -> Result> { + self.meta_provider.scalar_udf(name) + } + + fn aggregate_udf(&self, name: &str) -> Result> { + self.meta_provider.aggregate_udf(name) + } +} + +impl<'a, P: MetaProvider> ContextProvider for ContextProviderAdapter<'a, P> { + fn get_table_provider(&self, name: TableReference) -> Option> { + // Find in local cache + if let Some(p) = self.table_cache.borrow().get(name) { + return Some(p); + } + + // Find in meta provider + match self.meta_provider.table(name) { + Ok(Some(table)) => { + let table_adapter = Arc::new(TableProviderAdapter::new( + table, + self.request_id, + self.read_parallelism, + )); + // Put into cache + self.table_cache + .borrow_mut() + .insert(name, table_adapter.clone()); + + Some(table_adapter) + } + Ok(None) => None, + Err(e) => { + self.maybe_set_err(e); + None + } + } + } + + // ScalarUDF is not supported now + fn get_function_meta(&self, name: &str) -> Option> { + // We don't cache udf used by the query because now we will register all udf to + // datafusion's context. 
+ match self.meta_provider.scalar_udf(name) { + Ok(Some(udf)) => Some(udf.to_datafusion_udf()), + Ok(None) => None, + Err(e) => { + self.maybe_set_err(e); + None + } + } + } + + // AggregateUDF is not supported now + fn get_aggregate_meta(&self, name: &str) -> Option> { + match self.meta_provider.aggregate_udf(name) { + Ok(Some(udaf)) => Some(udaf.to_datafusion_udaf()), + Ok(None) => None, + Err(e) => { + self.maybe_set_err(e); + None + } + } + } +} + +struct SchemaProviderAdapter { + catalog: String, + schema: String, + tables: Arc, +} + +impl SchemaProvider for SchemaProviderAdapter { + fn as_any(&self) -> &dyn Any { + self + } + + fn table_names(&self) -> Vec { + let mut names = Vec::new(); + let _ = self.tables.visit::<_, ()>(|name, table| { + if name.catalog == self.catalog && name.schema == self.schema { + names.push(table.as_table_ref().name().to_string()); + } + Ok(()) + }); + names + } + + fn table(&self, name: &str) -> Option> { + let name_ref = TableReference::Full { + catalog: &self.catalog, + schema: &self.schema, + table: name, + }; + self.tables + .get(name_ref) + .map(|v| v as Arc) + } + + fn table_exist(&self, name: &str) -> bool { + self.table(name).is_some() + } +} + +#[derive(Default)] +pub struct CatalogProviderAdapter { + schemas: HashMap>, +} + +impl CatalogProviderAdapter { + pub fn new_adapters(tables: Arc) -> HashMap { + let mut catalog_adapters = HashMap::with_capacity(tables.num_catalogs()); + let _ = tables.visit::<_, ()>(|name, _| { + // Get or create catalog + let catalog = match catalog_adapters.get_mut(name.catalog) { + Some(v) => v, + None => catalog_adapters + .entry(name.catalog.to_string()) + .or_insert_with(CatalogProviderAdapter::default), + }; + // Get or create schema + if catalog.schemas.get(name.schema).is_none() { + catalog.schemas.insert( + name.schema.to_string(), + Arc::new(SchemaProviderAdapter { + catalog: name.catalog.to_string(), + schema: name.schema.to_string(), + tables: tables.clone(), + }), + ); + } + + Ok(()) 
+ }); + + catalog_adapters + } +} + +impl CatalogProvider for CatalogProviderAdapter { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema_names(&self) -> Vec { + self.schemas.keys().cloned().collect() + } + + fn schema(&self, name: &str) -> Option> { + self.schemas + .get(name) + .cloned() + .map(|v| v as Arc) + } +} diff --git a/sql/src/tests.rs b/sql/src/tests.rs new file mode 100644 index 0000000000..bd49bded4b --- /dev/null +++ b/sql/src/tests.rs @@ -0,0 +1,69 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::sync::Arc; + +use arrow_deps::datafusion::catalog::TableReference; +use catalog::consts::{DEFAULT_CATALOG, DEFAULT_SCHEMA}; +use common_types::tests::build_schema; +use table_engine::{ + memory::MemoryTable, + table::{Table, TableId, TableRef}, + ANALYTIC_ENGINE_TYPE, +}; +use udf::{scalar::ScalarUdf, udaf::AggregateUdf}; + +use crate::provider::MetaProvider; + +pub struct MockMetaProvider { + tables: Vec>, +} + +impl Default for MockMetaProvider { + fn default() -> Self { + Self { + tables: vec![ + Arc::new(MemoryTable::new( + "test_table".to_string(), + TableId::from(100), + build_schema(), + ANALYTIC_ENGINE_TYPE.to_string(), + )), + Arc::new(MemoryTable::new( + "test_table2".to_string(), + TableId::from(101), + build_schema(), + ANALYTIC_ENGINE_TYPE.to_string(), + )), + ], + } + } +} + +impl MetaProvider for MockMetaProvider { + fn default_catalog_name(&self) -> &str { + DEFAULT_CATALOG + } + + fn default_schema_name(&self) -> &str { + DEFAULT_SCHEMA + } + + fn table(&self, name: TableReference) -> crate::provider::Result> { + let resolved = name.resolve(self.default_catalog_name(), self.default_schema_name()); + for table in &self.tables { + if resolved.table == table.name() { + return Ok(Some(table.clone())); + } + } + + Ok(None) + } + + fn scalar_udf(&self, _name: &str) -> crate::provider::Result> { + todo!() + } + + fn aggregate_udf(&self, _name: &str) -> crate::provider::Result> { + todo!() + } +} diff 
--git a/src/bin/ceresdb-server.rs b/src/bin/ceresdb-server.rs new file mode 100644 index 0000000000..627e9ab296 --- /dev/null +++ b/src/bin/ceresdb-server.rs @@ -0,0 +1,83 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! The main entry point to start the server + +// TODO(yingwen): ceresdb-server is a legacy name, maybe use a new name + +use std::env; + +use ceresdbx::setup; +use clap::{App, Arg}; +use common_util::{panic, toml}; +use log::info; +use server::config::Config; + +/// The ip address of current node. +const NODE_ADDR: &str = "CSE_CERES_META_NODE_ADDR"; +const META_PEERS: &str = "META_PEERS"; +const CLUSTER_NAME: &str = "CLUSTER_NAME"; +/// Enable communication with meta node. +const ENABLE_META: &str = "ENABLE_META"; + +fn fetch_version() -> String { + let build_version = env!("VERGEN_BUILD_SEMVER"); + let git_branch = env!("VERGEN_GIT_BRANCH"); + let git_commit_id = env!("VERGEN_GIT_SHA_SHORT"); + let build_time = env!("VERGEN_BUILD_TIMESTAMP"); + + format!( + "\nCeresDB Version: {}\nGit branch: {}\nGit commit: {}\nBuild: {}", + build_version, git_branch, git_commit_id, build_time + ) +} + +fn main() { + let version = fetch_version(); + let matches = App::new("CeresDB Server") + .version(version.as_str()) + .arg( + Arg::with_name("config") + .short("c") + .long("config") + .required(false) + .takes_value(true) + .help("Set configuration file, eg: \"/path/server.toml\""), + ) + .get_matches(); + + let mut config = match matches.value_of("config") { + Some(path) => { + let mut toml_buf = String::new(); + toml::parse_toml_from_path(path, &mut toml_buf).expect("Failed to parse config.") + } + None => Config::default(), + }; + + // Combine configs from env. 
+    if let Ok(enable_meta) = env::var(ENABLE_META) {
+        if let Ok(enable_meta) = enable_meta.parse::<bool>() {
+            config.meta_client.enable_meta = enable_meta;
+        }
+    }
+    if let Ok(node_addr) = env::var(NODE_ADDR) {
+        config.meta_client.node = node_addr;
+    }
+    if let Ok(meta_addr) = env::var(META_PEERS) {
+        config.meta_client.meta_addr = meta_addr;
+    }
+    if let Ok(cluster) = env::var(CLUSTER_NAME) {
+        config.meta_client.cluster = cluster;
+    }
+
+    // Setup log.
+    let _runtime_level = setup::setup_log(&config);
+    // Setup tracing.
+    let _writer_guard = setup::setup_tracing(&config);
+
+    panic::set_panic_hook(false);
+
+    // Log version.
+    info!("version:{}", version);
+
+    setup::run_server(config);
+}
diff --git a/src/docs/config.toml b/src/docs/config.toml
new file mode 100644
index 0000000000..5a2ede377c
--- /dev/null
+++ b/src/docs/config.toml
@@ -0,0 +1,27 @@
+// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0.
+
+bind_addr = "0.0.0.0"
+http_port = 5000
+grpc_port = 8831
+log_level = "debug"
+
+[analytic]
+data_path = "/tmp/ceresdbx/"
+
+[analytic.table_opts]
+arena_block_size = 128
+
+[[meta_client.cluster_view.shards]]
+shard_id = 0
+[[meta_client.cluster_view.shards.nodes]]
+addr = '127.0.0.1'
+port = 38082
+
+[[meta_client.cluster_view.shards]]
+shard_id = 1
+[[meta_client.cluster_view.shards.nodes]]
+addr = '127.0.0.1'
+port = 48082
+[[meta_client.cluster_view.shards.nodes]]
+addr = '127.0.0.1'
+port = 58082
\ No newline at end of file
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000000..22fed20ac2
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,6 @@
+// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0.
+
+//! ceresdbx
+
+pub mod setup;
+mod signal_handler;
diff --git a/src/setup.rs b/src/setup.rs
new file mode 100644
index 0000000000..6c2d8263d4
--- /dev/null
+++ b/src/setup.rs
@@ -0,0 +1,127 @@
+// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0.
+
+//!
Setup server + +use std::sync::Arc; + +use analytic_engine::{self, setup}; +use catalog_impls::{table_based::TableBasedManager, CatalogManagerImpl}; +use common_util::runtime; +use log::info; +use logger::RuntimeLevel; +use query_engine::executor::ExecutorImpl; +use server::{ + config::{Config, RuntimeConfig}, + server::Builder, + table_engine::{MemoryTableEngine, TableEngineProxy}, +}; +use table_engine::engine::EngineRuntimes; +use tracing_util::{ + self, + tracing_appender::{non_blocking::WorkerGuard, rolling::Rotation}, +}; +use udf::registry::FunctionRegistryImpl; + +use crate::signal_handler; + +/// Setup log with given `config`, returns the runtime log level switch. +pub fn setup_log(config: &Config) -> RuntimeLevel { + server::logger::init_log(config).expect("Failed to init log.") +} + +/// Setup tracing with given `config`, returns the writer guard. +pub fn setup_tracing(config: &Config) -> WorkerGuard { + tracing_util::init_tracing_with_file( + &config.tracing_log_name, + &config.tracing_log_dir, + &config.tracing_level, + Rotation::NEVER, + ) +} + +fn build_runtime(name: &str, threads_num: usize) -> runtime::Runtime { + runtime::Builder::default() + .worker_threads(threads_num) + .thread_name(name) + .enable_all() + .build() + .unwrap_or_else(|e| { + //TODO(yingwen) replace panic with fatal + panic!("Failed to create runtime, err:{}", e); + }) +} + +fn build_engine_runtimes(config: &RuntimeConfig) -> EngineRuntimes { + EngineRuntimes { + read_runtime: Arc::new(build_runtime("cse-read", config.read_thread_num)), + write_runtime: Arc::new(build_runtime("cse-write", config.write_thread_num)), + bg_runtime: Arc::new(build_runtime("cse-bg", config.background_thread_num)), + } +} + +/// Run a server, returns when the server is shutdown by user +pub fn run_server(config: Config) { + let runtimes = Arc::new(build_engine_runtimes(&config.runtime)); + let engine_runtimes = runtimes.clone(); + + info!("Server starts up, config:{:#?}", config); + + 
runtimes.bg_runtime.block_on(async { + // Build all table engine + // Create memory engine + let memory = MemoryTableEngine; + // Create analytic engine + let analytic_config = config.analytic.clone(); + let analytic = setup::open_analytic_table_engine(analytic_config, engine_runtimes) + .await + .unwrap_or_else(|e| { + panic!("Failed to setup analytic engine, err:{}", e); + }); + + // Create table engine proxy + let engine_proxy = Arc::new(TableEngineProxy { + memory, + analytic: analytic.clone(), + }); + + // Create catalog manager, use analytic table as backend + let catalog_manager = CatalogManagerImpl::new( + TableBasedManager::new(&analytic, engine_proxy.clone()) + .await + .unwrap_or_else(|e| { + panic!("Failed to create catalog manager, err:{}", e); + }), + ); + + // Init function registry. + let mut function_registry = FunctionRegistryImpl::new(); + function_registry.load_functions().unwrap_or_else(|e| { + panic!("Failed to create function registry, err:{}", e); + }); + let function_registry = Arc::new(function_registry); + + // Create query executor + let query_executor = ExecutorImpl::new(); + + // Build and start server + let mut server = Builder::new(config) + .runtimes(runtimes.clone()) + .catalog_manager(catalog_manager) + .query_executor(query_executor) + .table_engine(engine_proxy) + .function_registry(function_registry) + .build() + .unwrap_or_else(|e| { + panic!("Failed to create server, err:{}", e); + }); + server.start().await.unwrap_or_else(|e| { + panic!("Failed to start server,, err:{}", e); + }); + + // Wait for signal + signal_handler::wait_for_signal(); + + // Stop server + server.stop(); + }); +} diff --git a/src/signal_handler.rs b/src/signal_handler.rs new file mode 100644 index 0000000000..39ad1733f4 --- /dev/null +++ b/src/signal_handler.rs @@ -0,0 +1,31 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Signal handler +//! +//! 
Only works on unix like environments + +pub use self::details::wait_for_signal; + +#[cfg(unix)] +mod details { + use log::info; + use signal_hook::{consts::TERM_SIGNALS, iterator::Signals}; + + pub fn wait_for_signal() { + let mut sigs = Signals::new(TERM_SIGNALS).unwrap_or_else(|e| { + // TODO(yingwen): Log here + panic!("Failed to register signal handlers, err:{}", e); + }); + for signal in &mut sigs { + if TERM_SIGNALS.contains(&signal) { + info!("Received signal {}, stopping server...", signal); + break; + } + } + } +} + +#[cfg(not(unix))] +mod details { + pub fn wait_for_signal() {} +} diff --git a/system_catalog/Cargo.toml b/system_catalog/Cargo.toml new file mode 100644 index 0000000000..c6d4ff7b7a --- /dev/null +++ b/system_catalog/Cargo.toml @@ -0,0 +1,22 @@ +[package] +name = "system_catalog" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +# In alphabetical order +arrow_deps = { path = "../arrow_deps" } +async-trait = "0.1.41" +catalog = { path = "../catalog" } +common_types = { path = "../common_types" } +common_util = { path = "../common_util" } +futures = "0.3" +log = "0.4" +proto = { path = "../proto" } +protobuf = "2.20" +snafu = { version = "0.6.10", features = ["backtraces"] } +table_engine = { path = "../table_engine" } +tokio = { version = "1.0", features = ["sync"] } diff --git a/system_catalog/src/lib.rs b/system_catalog/src/lib.rs new file mode 100644 index 0000000000..a0e1855a70 --- /dev/null +++ b/system_catalog/src/lib.rs @@ -0,0 +1,168 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
System catalog implementations + +use std::{ + collections::HashMap, + fmt::Debug, + pin::Pin, + sync::Arc, + task::{Context, Poll}, +}; + +use async_trait::async_trait; +use common_types::{ + record_batch::RecordBatch, + row::Row, + schema::{RecordSchema, Schema}, + time::Timestamp, +}; +use futures::Stream; +use table_engine::{ + stream, + stream::{PartitionedStreams, RecordBatchStream, SendableRecordBatchStream}, + table::{ + AlterSchemaRequest, FlushRequest, GetRequest, ReadRequest, Table, TableId, TableStats, + WriteRequest, + }, +}; + +pub mod sys_catalog_table; +pub mod tables; + +/// Timestamp of entry +pub const ENTRY_TIMESTAMP: Timestamp = Timestamp::new(0); + +/// The minimal thing that a system table needs to implement +#[async_trait] +pub trait SystemTable: Send + Sync + Debug { + /// System table name + fn name(&self) -> &str; + + /// System table name + fn id(&self) -> TableId; + + /// Produce the schema from this system table + fn schema(&self) -> Schema; + + /// Get the contents of the system table as a single RecordBatch + async fn read( + &self, + request: ReadRequest, + ) -> table_engine::table::Result; +} + +#[derive(Debug)] +pub struct SystemTableAdapter { + inner: Arc, +} + +impl SystemTableAdapter { + pub fn new(inner: impl SystemTable + 'static) -> Self { + Self { + inner: Arc::new(inner), + } + } +} + +#[async_trait] +impl Table for SystemTableAdapter { + fn name(&self) -> &str { + self.inner.name() + } + + fn id(&self) -> TableId { + self.inner.id() + } + + fn schema(&self) -> Schema { + self.inner.schema() + } + + fn options(&self) -> HashMap { + HashMap::new() + } + + fn engine_type(&self) -> &str { + "system" + } + + fn stats(&self) -> TableStats { + TableStats::default() + } + + async fn write(&self, _request: WriteRequest) -> table_engine::table::Result { + Ok(0) + } + + async fn read( + &self, + request: ReadRequest, + ) -> table_engine::table::Result { + self.inner.read(request).await + } + + async fn get(&self, _request: 
GetRequest) -> table_engine::table::Result> { + Ok(None) + } + + async fn partitioned_read( + &self, + request: ReadRequest, + ) -> table_engine::table::Result { + let read_parallelism = request.opts.read_parallelism; + let stream = self.inner.read(request).await?; + let mut streams = Vec::with_capacity(read_parallelism); + streams.push(stream); + for _ in 0..read_parallelism - 1 { + streams.push(Box::pin(OneRecordBatchStream { + schema: self.schema().clone().to_record_schema(), + record_batch: None, + })); + } + Ok(PartitionedStreams { streams }) + } + + async fn alter_schema( + &self, + _request: AlterSchemaRequest, + ) -> table_engine::table::Result { + Ok(0) + } + + async fn alter_options( + &self, + _options: HashMap, + ) -> table_engine::table::Result { + Ok(0) + } + + async fn flush(&self, _request: FlushRequest) -> table_engine::table::Result<()> { + Ok(()) + } + + async fn compact(&self) -> table_engine::table::Result<()> { + Ok(()) + } +} + +pub struct OneRecordBatchStream { + schema: RecordSchema, + record_batch: Option, +} +impl Stream for OneRecordBatchStream { + type Item = stream::Result; + + fn poll_next(mut self: Pin<&mut Self>, _cx: &mut Context<'_>) -> Poll> { + if self.record_batch.is_none() { + Poll::Ready(None) + } else { + Poll::Ready(Some(Ok(self.record_batch.take().unwrap()))) + } + } +} +impl RecordBatchStream for OneRecordBatchStream { + fn schema(&self) -> &RecordSchema { + &self.schema + } +} diff --git a/system_catalog/src/sys_catalog_table.rs b/system_catalog/src/sys_catalog_table.rs new file mode 100644 index 0000000000..e1a4a004be --- /dev/null +++ b/system_catalog/src/sys_catalog_table.rs @@ -0,0 +1,1017 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Table to store system catalog + +use std::{collections::HashMap, convert::TryFrom, mem}; + +use async_trait::async_trait; +use catalog::consts; +use common_types::{ + bytes::{Bytes, BytesMut, MemBuf, MemBufMut}, + column_schema, + datum::{Datum, DatumKind}, + projected_schema::ProjectedSchema, + record_batch::RecordBatch, + request_id::RequestId, + row::{Row, RowGroup, RowGroupBuilder}, + schema::{self, Schema}, + time::Timestamp, +}; +use common_util::{ + codec::{memcomparable::MemComparable, Encoder}, + define_result, +}; +use futures::TryStreamExt; +use log::{debug, info, warn}; +use proto::sys_catalog::{CatalogEntry, SchemaEntry, TableEntry}; +use protobuf::Message; +use snafu::{ensure, Backtrace, OptionExt, ResultExt, Snafu}; +use table_engine::{ + self, + engine::{ + CreateTableRequest, DropTableRequest, OpenTableRequest, TableEngine, TableRequestType, + TableState, + }, + predicate::PredicateBuilder, + table::{ + GetRequest, ReadOptions, ReadOrder, ReadRequest, SchemaId, TableId, TableInfo, TableRef, + TableSeq, WriteRequest, + }, +}; +use tokio::sync::Mutex; + +use crate::ENTRY_TIMESTAMP; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to build schema for sys_catalog, err:{}", source))] + BuildSchema { source: common_types::schema::Error }, + + #[snafu(display( + "Failed to get column index for sys_catalog, name:{}.\nBacktrace:\n{}", + name, + backtrace + ))] + GetColumnIndex { name: String, backtrace: Backtrace }, + + #[snafu(display("Failed to build table for sys_catalog, err:{}", source))] + BuildTable { source: table_engine::engine::Error }, + + #[snafu(display("Failed to open table for sys_catalog, err:{}", source))] + OpenTable { source: table_engine::engine::Error }, + + #[snafu(display("Failed to convert into RowGroup, err:{}", source))] + IntoRowGroup { source: common_types::row::Error }, + + #[snafu(display("Failed to persist catalog to table, err:{}", source))] + PersistCatalog { source: table_engine::table::Error }, + + 
#[snafu(display("Failed to persist schema to table, err:{}", source))] + PersistSchema { source: table_engine::table::Error }, + + #[snafu(display("Failed to persist tables to table, err:{}", source))] + PersistTables { source: table_engine::table::Error }, + + #[snafu(display("Failed to read table, err:{}", source))] + ReadTable { source: table_engine::table::Error }, + + #[snafu(display("Failed to read stream, err:{}", source))] + ReadStream { source: table_engine::stream::Error }, + + #[snafu(display( + "Visitor catalog not found, catalog:{}.\nBacktrace:\n{}", + catalog, + backtrace + ))] + #[snafu(visibility(pub))] + VisitorCatalogNotFound { + catalog: String, + backtrace: Backtrace, + }, + + #[snafu(display( + "Visitor schema not found, catalog:{}, schema:{}.\nBacktrace:\n{}", + catalog, + schema, + backtrace + ))] + #[snafu(visibility(pub))] + VisitorSchemaNotFound { + catalog: String, + schema: String, + backtrace: Backtrace, + }, + + #[snafu(display("Visitor Failed to open table, err:{}", source))] + #[snafu(visibility(pub))] + VisitorOpenTable { source: table_engine::engine::Error }, + + #[snafu(display("Failed to encode entry key header, err:{}", source))] + EncodeKeyHeader { source: common_types::bytes::Error }, + + #[snafu(display("Failed to encode entry body, err:{}", source))] + EncodeKeyBody { + source: common_util::codec::memcomparable::Error, + }, + + #[snafu(display("Failed to encode table key type, err:{}", source))] + EncodeTableKeyType { source: common_types::bytes::Error }, + + #[snafu(display("Failed to read entry key header, err:{}", source))] + ReadKeyHeader { source: common_types::bytes::Error }, + + #[snafu(display("Failed to read table key header, err:{}", source))] + ReadTableKeyHeader { source: common_types::bytes::Error }, + + #[snafu(display( + "Invalid entry key header, value:{}.\nBacktrace:\n{}", + value, + backtrace + ))] + InvalidKeyHeader { value: u8, backtrace: Backtrace }, + + #[snafu(display("Invalid table key type, 
value:{}.\nBacktrace:\n{}", value, backtrace))] + InvalidTableKeyType { value: u8, backtrace: Backtrace }, + + #[snafu(display( + "Failed to encode protobuf for entry, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + EncodeEntryPb { + source: protobuf::error::ProtobufError, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to build row for entry, err:{}", source))] + BuildRow { source: common_types::row::Error }, + + #[snafu(display( + "Failed to decode protobuf for entry, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + DecodeEntryPb { + source: protobuf::error::ProtobufError, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to decode table entry, err:{}", source))] + DecodeTableEntry { + source: table_engine::table::TryFromTableEntryError, + }, + + #[snafu(display( + "Failed to decode schema for table alter entry, table:{}, err:{}", + table, + source + ))] + DecodeSchema { + table: String, + source: common_types::schema::Error, + }, + + #[snafu(display("Table key type not found in key.\nBacktrace:\n{}", backtrace))] + EmptyTableKeyType { backtrace: Backtrace }, + + #[snafu(display( + "The row in the sys_catalog_table is invalid, row:{:?}.\nBacktrace:\n{}", + row, + backtrace + ))] + InvalidTableRow { row: Row, backtrace: Backtrace }, + + #[snafu(display( + "The fetched table is mismatched, expect:{}, given:{}.\nBacktrace:\n{}", + expect_table, + given_table, + backtrace + ))] + TableKeyMismatch { + expect_table: String, + given_table: String, + backtrace: Backtrace, + }, + + #[snafu(display("The table is not found, table:{}.\nBacktrace:\n{}", table, backtrace))] + TableNotFound { table: String, backtrace: Backtrace }, + + #[snafu(display("Fail to get the table info, table:{}, err:{}.", table, source))] + GetTableInfo { + table: String, + source: table_engine::table::Error, + }, + + #[snafu(display("Invalid table state transition, table:{}, err:{}.", table, source))] + InvalidTableStateTransition { + table: String, + source: 
table_engine::engine::Error, + }, + + #[snafu(display("Invalid schema id, id:{}", id))] + InvalidSchemaId { id: u32 }, +} + +define_result!(Error); + +/// Table name of the sys catalog +pub const TABLE_NAME: &str = "sys_catalog"; +/// Schema id of the sys catalog schema (`system/public`). +pub const SCHEMA_ID: SchemaId = SchemaId::from_u16(1); +/// Table sequence of the sys catalog table, always set to 1 +pub const TABLE_SEQ: TableSeq = TableSeq::from_u32(1); +/// Table id of the `sys_catalog` table. +pub const TABLE_ID: TableId = TableId::new(SCHEMA_ID, TABLE_SEQ); +/// Name of key column (field) +pub const KEY_COLUMN_NAME: &str = "key"; +/// Name of timestamp column (field) +pub const TIMESTAMP_COLUMN_NAME: &str = "timestamp"; +/// Name of value column (field) +pub const VALUE_COLUMN_NAME: &str = "value"; +/// Default enable ttl is false +pub const DEFAULT_ENABLE_TTL: &str = "false"; + +// TODO(yingwen): Add a type column once support int8 type and maybe split key +// into multiple columns. +/// SysCatalogTable is a special table to keep tracks of the system infomations +/// +/// Similar to kudu's SysCatalogTable +/// - see +/// - schema: (key, timestamp) -> metadata +/// +/// The timestamp is used to support metadata ttl in the future, now it can set +/// to 0. +#[derive(Debug)] +pub struct SysCatalogTable { + // TODO(yingwen): Table id + /// Underlying Table to actually store data + table: TableRef, + /// Index of the key column + key_column_index: usize, + /// Index of the value column + value_column_index: usize, + /// Protects table create/alter/drop + // TODO(xikai): A better way is to use a specific struct with the lock that takes + // responsibilities to update table. 
+ update_table_lock: Mutex<()>, +} + +impl SysCatalogTable { + /// Create a new [SysCatalogTable] + pub async fn new(table_engine: &T) -> Result { + let table_schema = new_sys_catalog_schema().context(BuildSchema)?; + let key_column_index = table_schema + .index_of(KEY_COLUMN_NAME) + .context(GetColumnIndex { + name: KEY_COLUMN_NAME, + })?; + let value_column_index = + table_schema + .index_of(VALUE_COLUMN_NAME) + .context(GetColumnIndex { + name: VALUE_COLUMN_NAME, + })?; + + let open_request = OpenTableRequest { + catalog_name: consts::SYSTEM_CATALOG.to_string(), + schema_name: consts::SYSTEM_CATALOG_SCHEMA.to_string(), + table_name: TABLE_NAME.to_string(), + engine: table_engine.engine_type().to_string(), + }; + + let table_opt = table_engine + .open_table(open_request) + .await + .context(OpenTable)?; + match table_opt { + Some(table) => { + info!("Sys catalog table open existing table"); + + // The sys_catalog table is already created + return Ok(Self { + table, + key_column_index, + value_column_index, + update_table_lock: Mutex::new(()), + }); + } + None => { + info!("Sys catalog table is not exists, try to create a new table"); + } + } + + let mut options = HashMap::new(); + options.insert( + table_engine::OPTION_KEY_ENABLE_TTL.to_string(), + DEFAULT_ENABLE_TTL.to_string(), + ); + let create_request = CreateTableRequest { + catalog_name: consts::SYSTEM_CATALOG.to_string(), + schema_name: consts::SYSTEM_CATALOG_SCHEMA.to_string(), + table_id: TABLE_ID, + table_name: TABLE_NAME.to_string(), + table_schema, + partition_info: None, + engine: table_engine.engine_type().to_string(), + options, + state: TableState::Stable, + }; + + let table = table_engine + .create_table(create_request) + .await + .context(BuildTable)?; + + Ok(Self { + table, + key_column_index, + value_column_index, + update_table_lock: Mutex::new(()), + }) + } + + /// Returns the table id of the sys catalog table. 
+ #[inline] + pub fn table_id(&self) -> TableId { + TABLE_ID + } + + /// Add and store the catalog info + pub async fn create_catalog(&self, request: CreateCatalogRequest) -> Result<()> { + info!("Add catalog to sys_catalog table, request:{:?}", request); + + let row_group = request.into_row_group(self.table.schema())?; + + let write_req = WriteRequest { row_group }; + self.table.write(write_req).await.context(PersistCatalog)?; + + Ok(()) + } + + /// Add and store the schema info + pub async fn create_schema(&self, request: CreateSchemaRequest) -> Result<()> { + info!("Add schema to sys_catalog table, request:{:?}", request); + + let row_group = request.into_row_group(self.table.schema())?; + + let write_req = WriteRequest { row_group }; + self.table.write(write_req).await.context(PersistSchema)?; + + Ok(()) + } + + /// Create table in the catalog. + pub async fn create_table(&self, table_info: TableInfo) -> Result<()> { + info!( + "Create table to sys_catalog table, table_info:{:?}", + table_info + ); + + let _lock = self.update_table_lock.lock().await; + self.write_table_info(table_info, TableRequestType::Create) + .await?; + + Ok(()) + } + + /// Prepare to drop the table. + pub async fn prepare_drop_table(&self, request: DropTableRequest) -> Result<()> { + info!( + "Prepare to drop table to sys_catalog table, request:{:?}", + request + ); + + let table_key = TableKey { + catalog: &request.catalog_name, + schema: &request.schema_name, + table: &request.table_name, + }; + + // update the dropped flag the lock held. + { + let _lock = self.update_table_lock.lock().await; + if let Some(mut table_info) = self.get_table_info(table_key).await? 
{ + table_info.state.try_transit(TableState::Dropping).context( + InvalidTableStateTransition { + table: &request.table_name, + }, + )?; + + self.write_table_info(table_info, TableRequestType::Drop) + .await?; + } else { + warn!("Prepare to drop a dropped table, request:{:?}", request); + } + } + + Ok(()) + } + + /// Drop the table. + /// + /// Note that [prepare_drop_table] should be called before this method. + pub async fn drop_table(&self, request: DropTableRequest) -> Result<()> { + info!("Drop table to sys_catalog table, request:{:?}", request); + + let table_key = TableKey { + catalog: &request.catalog_name, + schema: &request.schema_name, + table: &request.table_name, + }; + + // update the table state with the lock held. + { + if let Some(mut table_info) = self.get_table_info(table_key).await? { + table_info.state.try_transit(TableState::Dropped).context( + InvalidTableStateTransition { + table: &request.table_name, + }, + )?; + + self.write_table_info(table_info, TableRequestType::Drop) + .await?; + } else { + warn!("Drop a dropped table, request:{:?}", request); + } + } + + Ok(()) + } + + /// Returns the inner table of the sys catalog. + #[inline] + pub fn inner_table(&self) -> TableRef { + self.table.clone() + } + + /// Write the table info to the sys_catalog table without lock. 
+ async fn write_table_info(&self, table_info: TableInfo, typ: TableRequestType) -> Result<()> { + info!( + "Write table info to sys_catalog table, table_info:{:?}", + table_info + ); + + let table_writer = TableWriter { + catalog_table: self.table.clone(), + table_to_write: table_info, + typ, + }; + + table_writer.write().await?; + + Ok(()) + } + + async fn get_table_info<'a>(&'a self, table_key: TableKey<'a>) -> Result> { + let projected_schema = ProjectedSchema::no_projection(self.table.schema()); + let primary_key = TableWriter::build_table_primary_key(table_key.clone())?; + let get_req = GetRequest { + request_id: RequestId::next_id(), + projected_schema, + primary_key, + }; + + match self.table.get(get_req).await.context(GetTableInfo { + table: table_key.table, + })? { + Some(row) => { + let table_info = self.decode_table_info(row)?; + let decoded_table_key = TableKey { + catalog: &table_info.catalog_name, + schema: &table_info.schema_name, + table: &table_info.table_name, + }; + + ensure!( + table_key == decoded_table_key, + TableKeyMismatch { + expect_table: table_key.table, + given_table: decoded_table_key.table, + } + ); + + Ok(Some(table_info)) + } + None => Ok(None), + } + } + + fn decode_table_info(&self, row: Row) -> Result { + ensure!( + row.num_columns() > self.key_column_index, + InvalidTableRow { row } + ); + + ensure!( + row.num_columns() > self.value_column_index, + InvalidTableRow { row } + ); + + // Key and value column is always varbinary. + let key = &row[self.key_column_index] + .as_varbinary() + .with_context(|| InvalidTableRow { row: row.clone() })?; + let value = &row[self.value_column_index] + .as_varbinary() + .with_context(|| InvalidTableRow { row: row.clone() })?; + + match decode_one_request(key, value)? 
{ + DecodedRequest::TableEntry(request) => Ok(request), + _ => InvalidTableRow { row }.fail(), + } + } + + /// Visit all data in the sys catalog table + // TODO(yingwen): Expose read options + pub async fn visit(&self, opts: ReadOptions, visitor: &mut dyn Visitor) -> Result<()> { + let read_request = ReadRequest { + request_id: RequestId::next_id(), + opts, + // The schema of sys catalog table is never changed + projected_schema: ProjectedSchema::no_projection(self.table.schema()), + predicate: PredicateBuilder::default().build(), + order: ReadOrder::None, + }; + let mut batch_stream = self.table.read(read_request).await.context(ReadTable)?; + + info!("batch_stream schema is:{:?}", batch_stream.schema()); + // TODO(yingwen): Check stream schema and table schema? + while let Some(batch) = batch_stream.try_next().await.context(ReadStream)? { + // Visit all requests in the record batch + info!("real batch_stream schema is:{:?}", batch.schema()); + self.visit_record_batch(batch, visitor).await?; + } + + Ok(()) + } + + /// Visit the record batch + async fn visit_record_batch( + &self, + batch: RecordBatch, + visitor: &mut dyn Visitor, + ) -> Result<()> { + let key_column = batch.column(self.key_column_index); + let value_column = batch.column(self.value_column_index); + + info!( + "Sys catalog table visit record batch, column_num:{}, row_num:{}", + batch.num_columns(), + batch.num_rows() + ); + + let num_rows = batch.num_rows(); + for i in 0..num_rows { + // Key and value column is not nullable + let key = key_column.datum(i); + let value = value_column.datum(i); + + debug!( + "Sys catalog table visit row, i:{}, key:{:?}, value:{:?}", + i, key, value + ); + + // Key and value column is always varbinary. 
+ let request = + decode_one_request(key.as_varbinary().unwrap(), value.as_varbinary().unwrap())?; + + Self::call_visitor(request, visitor).await?; + } + + Ok(()) + } + + /// Invoke visitor + async fn call_visitor(request: DecodedRequest, visitor: &mut dyn Visitor) -> Result<()> { + match request { + DecodedRequest::CreateCatalog(req) => visitor.visit_catalog(req), + DecodedRequest::CreateSchema(req) => visitor.visit_schema(req), + DecodedRequest::TableEntry(req) => visitor.visit_tables(req).await, + } + } +} + +/// Visitor for sys catalog requests +// TODO(yingwen): Define an Error for visitor +#[async_trait] +pub trait Visitor { + // TODO(yingwen): Use enum another type if need more operation (delete/update) + fn visit_catalog(&mut self, request: CreateCatalogRequest) -> Result<()>; + + fn visit_schema(&mut self, request: CreateSchemaRequest) -> Result<()>; + + async fn visit_tables(&mut self, table_info: TableInfo) -> Result<()>; +} + +/// Build a new table schema for sys catalog +fn new_sys_catalog_schema() -> schema::Result { + // NOTICE: Both key and value must be non-nullable, the visit function takes + // this assumption + schema::Builder::with_capacity(3) + .auto_increment_column_id(true) + // key + .add_key_column( + column_schema::Builder::new(KEY_COLUMN_NAME.to_string(), DatumKind::Varbinary) + .is_nullable(false) + .is_tag(false) + .build() + .expect("Should succeed to build column schema of catalog"), + )? + // timestamp + .add_key_column( + column_schema::Builder::new(TIMESTAMP_COLUMN_NAME.to_string(), DatumKind::Timestamp) + .is_nullable(false) + .is_tag(false) + .build() + .expect("Should succeed to build column schema of catalog"), + )? + // value + .add_normal_column( + column_schema::Builder::new(VALUE_COLUMN_NAME.to_string(), DatumKind::Varbinary) + .is_nullable(false) + .is_tag(false) + .build() + .expect("Should succeed to build column schema of catalog"), + )? 
+ .build() +} + +/// Request type, used as key header +/// +/// 0 is reserved +#[derive(Debug, Clone, Copy)] +enum KeyType { + CreateCatalog = 1, + CreateSchema = 2, + TableEntry = 3, +} + +impl KeyType { + fn to_u8(self) -> u8 { + self as u8 + } + + fn decode_from_bytes(mut buf: &[u8]) -> Result { + let v = buf.read_u8().context(ReadKeyHeader)?; + + match v { + v if v == Self::CreateCatalog as u8 => Ok(Self::CreateCatalog), + v if v == Self::CreateSchema as u8 => Ok(Self::CreateSchema), + v if v == Self::TableEntry as u8 => Ok(Self::TableEntry), + value => InvalidKeyHeader { value }.fail(), + } + } +} + +/// Catalog entry key +/// +/// Use catalog name as key +struct CatalogKey<'a>(&'a str); + +/// Schema entry key +/// +/// Use (catalog, schema) as key +struct SchemaKey<'a>(&'a str, &'a str); + +// TODO(yingwen): Maybe use same key for create/alter table. +/// Table entry key +/// +/// Use (catalog, schema, table_id) as key +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct TableKey<'a> { + catalog: &'a str, + schema: &'a str, + table: &'a str, +} + +/// Encoder for entry key +struct EntryKeyEncoder; + +impl<'a> Encoder> for EntryKeyEncoder { + type Error = Error; + + fn encode(&self, buf: &mut B, value: &CatalogKey) -> Result<()> { + buf.write_u8(KeyType::CreateCatalog.to_u8()) + .context(EncodeKeyHeader)?; + let encoder = MemComparable; + encoder + .encode(buf, value.0.as_bytes()) + .context(EncodeKeyBody) + } + + fn estimate_encoded_size(&self, value: &CatalogKey) -> usize { + let encoder = MemComparable; + mem::size_of::() + encoder.estimate_encoded_size(value.0.as_bytes()) + } +} + +impl<'a> Encoder> for EntryKeyEncoder { + type Error = Error; + + fn encode(&self, buf: &mut B, value: &SchemaKey) -> Result<()> { + buf.write_u8(KeyType::CreateSchema.to_u8()) + .context(EncodeKeyHeader)?; + let encoder = MemComparable; + encoder + .encode(buf, value.0.as_bytes()) + .context(EncodeKeyBody)?; + encoder + .encode(buf, value.1.as_bytes()) + .context(EncodeKeyBody) 
+ } + + fn estimate_encoded_size(&self, value: &SchemaKey) -> usize { + let encoder = MemComparable; + mem::size_of::() + + encoder.estimate_encoded_size(value.0.as_bytes()) + + encoder.estimate_encoded_size(value.1.as_bytes()) + } +} + +impl<'a> Encoder> for EntryKeyEncoder { + type Error = Error; + + fn encode(&self, buf: &mut B, value: &TableKey) -> Result<()> { + buf.write_u8(KeyType::TableEntry.to_u8()) + .context(EncodeKeyHeader)?; + let encoder = MemComparable; + encoder + .encode(buf, value.catalog.as_bytes()) + .context(EncodeKeyBody)?; + encoder + .encode(buf, value.schema.as_bytes()) + .context(EncodeKeyBody)?; + encoder + .encode(buf, value.table.as_bytes()) + .context(EncodeKeyBody)?; + Ok(()) + } + + fn estimate_encoded_size(&self, value: &TableKey) -> usize { + let encoder = MemComparable; + mem::size_of::() + + encoder.estimate_encoded_size(value.catalog.as_bytes()) + + encoder.estimate_encoded_size(value.schema.as_bytes()) + + encoder.estimate_encoded_size(value.table.as_bytes()) + } +} + +/// Information of the catalog to add +#[derive(Debug)] +pub struct CreateCatalogRequest { + /// Catalog name + pub catalog_name: String, +} + +impl CreateCatalogRequest { + /// Convert into [common_types::row::RowGroup] + fn into_row_group(self, schema: Schema) -> Result { + let key = self.to_key()?; + let value = self.into_value()?; + let mut builder = RowGroupBuilder::new(schema); + builder + .row_builder() + // key + .append_datum(Datum::Varbinary(key)) + .context(BuildRow)? + // timestamp + .append_datum(Datum::Timestamp(ENTRY_TIMESTAMP)) + .context(BuildRow)? + // value + .append_datum(Datum::Varbinary(value)) + .context(BuildRow)? 
+ .finish() + .context(BuildRow)?; + + Ok(builder.build()) + } + + fn to_key(&self) -> Result { + let encoder = EntryKeyEncoder; + let key = CatalogKey(&self.catalog_name); + let mut buf = BytesMut::with_capacity(encoder.estimate_encoded_size(&key)); + encoder.encode(&mut buf, &key)?; + Ok(buf.into()) + } + + fn into_value(self) -> Result { + let entry = self.into_pb(); + + let buf = entry.write_to_bytes().context(EncodeEntryPb)?; + Ok(buf.into()) + } + + fn into_pb(self) -> CatalogEntry { + let mut entry = CatalogEntry::new(); + entry.set_catalog_name(self.catalog_name); + entry.set_created_time(Timestamp::now().as_i64()); + + entry + } +} + +impl From for CreateCatalogRequest { + fn from(entry: CatalogEntry) -> Self { + Self { + catalog_name: entry.catalog_name, + } + } +} + +/// Information of the schema to add. +#[derive(Debug)] +pub struct CreateSchemaRequest { + pub catalog_name: String, + pub schema_name: String, + pub schema_id: SchemaId, +} + +impl CreateSchemaRequest { + /// Convert into [common_types::row::RowGroup] + fn into_row_group(self, schema: Schema) -> Result { + let key = self.to_key()?; + let value = self.into_value()?; + let mut builder = RowGroupBuilder::new(schema); + builder + .row_builder() + // key + .append_datum(Datum::Varbinary(key)) + .context(BuildRow)? + // timestamp + .append_datum(Datum::Timestamp(ENTRY_TIMESTAMP)) + .context(BuildRow)? + // value + .append_datum(Datum::Varbinary(value)) + .context(BuildRow)? 
+ .finish() + .context(BuildRow)?; + + Ok(builder.build()) + } + + fn to_key(&self) -> Result { + let encoder = EntryKeyEncoder; + let key = SchemaKey(&self.catalog_name, &self.schema_name); + let mut buf = BytesMut::with_capacity(encoder.estimate_encoded_size(&key)); + encoder.encode(&mut buf, &key)?; + Ok(buf.into()) + } + + fn into_value(self) -> Result { + let entry = self.into_pb(); + + let buf = entry.write_to_bytes().context(EncodeEntryPb)?; + Ok(buf.into()) + } + + fn into_pb(self) -> SchemaEntry { + let mut entry = SchemaEntry::new(); + entry.set_catalog_name(self.catalog_name); + entry.set_schema_name(self.schema_name); + entry.set_schema_id(self.schema_id.as_u32()); + entry.set_created_time(Timestamp::now().as_i64()); + + entry + } +} + +impl TryFrom for CreateSchemaRequest { + type Error = Error; + + fn try_from(entry: SchemaEntry) -> Result { + let schema_id = SchemaId::new(entry.schema_id).context(InvalidSchemaId { + id: entry.schema_id, + })?; + + Ok(Self { + catalog_name: entry.catalog_name, + schema_name: entry.schema_name, + schema_id, + }) + } +} + +/// Information of the alter operations to the table. +#[derive(Clone, Debug)] +pub struct AlterTableRequest { + pub catalog_name: String, + pub schema_name: String, + pub table_name: String, + /// Schema after alteration. + pub schema: Schema, +} + +/// Writer for writing the table information into the catalog table. +pub struct TableWriter { + catalog_table: TableRef, + table_to_write: TableInfo, + typ: TableRequestType, +} + +impl TableWriter { + async fn write(&self) -> Result<()> { + let row_group = self.convert_table_info_to_row_group()?; + let write_req = WriteRequest { row_group }; + self.catalog_table + .write(write_req) + .await + .context(PersistTables)?; + + Ok(()) + } + + /// Convert the table to write into [common_types::row::RowGroup]. 
+ fn convert_table_info_to_row_group(&self) -> Result { + let mut builder = RowGroupBuilder::new(self.catalog_table.schema()); + let key = Self::build_create_table_key(&self.table_to_write)?; + let value = Self::build_create_table_value(self.table_to_write.clone(), self.typ)?; + + debug!( + "TableWriter build key value, key:{:?}, value:{:?}", + key, value + ); + + Self::build_row(&mut builder, key, value)?; + + Ok(builder.build()) + } + + fn build_row(builder: &mut RowGroupBuilder, key: Bytes, value: Bytes) -> Result<()> { + builder + .row_builder() + // key + .append_datum(Datum::Varbinary(key)) + .context(BuildRow)? + // timestamp + .append_datum(Datum::Timestamp(ENTRY_TIMESTAMP)) + .context(BuildRow)? + // value + .append_datum(Datum::Varbinary(value)) + .context(BuildRow)? + .finish() + .context(BuildRow)?; + Ok(()) + } + + fn build_create_table_key(table_info: &TableInfo) -> Result { + let key = TableKey { + catalog: &table_info.catalog_name, + schema: &table_info.schema_name, + table: &table_info.table_name, + }; + Self::encode_table_key(key) + } + + fn encode_table_key(key: TableKey) -> Result { + let encoder = EntryKeyEncoder; + let mut buf = BytesMut::with_capacity(encoder.estimate_encoded_size(&key)); + encoder.encode(&mut buf, &key)?; + Ok(buf.into()) + } + + fn build_create_table_value(table_info: TableInfo, typ: TableRequestType) -> Result { + let entry = table_info.into_pb(typ); + + let buf = entry.write_to_bytes().context(EncodeEntryPb)?; + Ok(buf.into()) + } + + fn build_table_primary_key(table_key: TableKey) -> Result> { + let encoded_key = Self::encode_table_key(table_key)?; + + Ok(vec![ + Datum::Varbinary(encoded_key), + Datum::Timestamp(ENTRY_TIMESTAMP), + ]) + } +} + +/// Decoded sys catalog request +#[derive(Debug)] +enum DecodedRequest { + CreateCatalog(CreateCatalogRequest), + CreateSchema(CreateSchemaRequest), + TableEntry(TableInfo), +} + +/// Decode request from key/value +fn decode_one_request(key: &[u8], value: &[u8]) -> Result { + let 
key_type = KeyType::decode_from_bytes(key)?; + let req = match key_type { + KeyType::CreateCatalog => { + let entry = CatalogEntry::parse_from_bytes(value).context(DecodeEntryPb)?; + DecodedRequest::CreateCatalog(CreateCatalogRequest::from(entry)) + } + KeyType::CreateSchema => { + let entry = SchemaEntry::parse_from_bytes(value).context(DecodeEntryPb)?; + DecodedRequest::CreateSchema(CreateSchemaRequest::try_from(entry)?) + } + KeyType::TableEntry => { + let entry = TableEntry::parse_from_bytes(value).context(DecodeEntryPb)?; + let table_info = TableInfo::try_from(entry).context(DecodeTableEntry)?; + DecodedRequest::TableEntry(table_info) + } + }; + + Ok(req) +} diff --git a/system_catalog/src/tables.rs b/system_catalog/src/tables.rs new file mode 100644 index 0000000000..67edfeaa35 --- /dev/null +++ b/system_catalog/src/tables.rs @@ -0,0 +1,179 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +/// implementation of system table: Tables +/// For example `SELECT * FROM system.public.tables` +use std::fmt::{Debug, Formatter}; + +use async_trait::async_trait; +use catalog::{manager::Manager, schema::SchemaRef, CatalogRef}; +use common_types::{ + column_schema, + datum::{Datum, DatumKind}, + record_batch::RecordBatchWithKeyBuilder, + row::Row, + schema, + schema::Schema, +}; +use snafu::ResultExt; +use table_engine::{ + stream::SendableRecordBatchStream, + table::{ReadRequest, SchemaId, TableId, TableRef, TableSeq}, +}; + +use crate::{OneRecordBatchStream, SystemTable, ENTRY_TIMESTAMP}; + +/// Table name of the sys tables +const TABLE_NAME: &str = "tables"; +/// Schema id of the sys catalog schema (`system/public`). +pub const SCHEMA_ID: SchemaId = SchemaId::from_u16(1); +/// Table sequence of the sys tables +pub const TABLE_SEQ: TableSeq = TableSeq::from_u32(2); +/// Table id of the `sys_catalog` table. 
+pub const TABLE_ID: TableId = TableId::new(SCHEMA_ID, TABLE_SEQ); + +/// Build a new table schema for tables +fn tables_schema() -> Schema { + schema::Builder::with_capacity(6) + .auto_increment_column_id(true) + .add_key_column( + column_schema::Builder::new("timestamp".to_string(), DatumKind::Timestamp) + .is_nullable(false) + .is_tag(false) + .build() + .unwrap(), + ) + .unwrap() + .add_key_column( + column_schema::Builder::new("catalog".to_string(), DatumKind::String) + .is_nullable(false) + .is_tag(false) + .build() + .unwrap(), + ) + .unwrap() + .add_key_column( + column_schema::Builder::new("schema".to_string(), DatumKind::String) + .is_nullable(false) + .is_tag(false) + .build() + .unwrap(), + ) + .unwrap() + .add_key_column( + column_schema::Builder::new("table_name".to_string(), DatumKind::String) + .is_nullable(false) + .is_tag(false) + .build() + .unwrap(), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("table_id".to_string(), DatumKind::UInt64) + .is_nullable(false) + .is_tag(false) + .build() + .unwrap(), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("engine".to_string(), DatumKind::String) + .is_nullable(false) + .is_tag(false) + .build() + .unwrap(), + ) + .unwrap() + .build() + .unwrap() +} + +pub struct Tables { + schema: Schema, + catalog_manager: M, +} + +impl Debug for Tables { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("SysTables") + .field("schema", &self.schema) + .finish() + } +} + +impl Tables { + pub fn new(catalog_manager: M) -> Self { + Self { + schema: tables_schema(), + catalog_manager, + } + } + + fn from_table(&self, catalog: CatalogRef, schema: SchemaRef, table: TableRef) -> Row { + let mut datums = Vec::with_capacity(self.schema.num_columns()); + datums.push(Datum::Timestamp(ENTRY_TIMESTAMP)); + datums.push(Datum::from(catalog.name())); + datums.push(Datum::from(schema.name())); + datums.push(Datum::from(table.name())); + 
datums.push(Datum::from(table.id().as_u64())); + datums.push(Datum::from(table.engine_type())); + Row::from_datums(datums) + } +} + +#[async_trait] +impl SystemTable for Tables { + fn name(&self) -> &str { + TABLE_NAME + } + + fn id(&self) -> TableId { + TABLE_ID + } + + fn schema(&self) -> Schema { + self.schema.clone() + } + + async fn read( + &self, + request: ReadRequest, + ) -> table_engine::table::Result { + let catalogs = self + .catalog_manager + .all_catalogs() + .map_err(|e| Box::new(e) as _) + .context(table_engine::table::Scan { table: self.name() })?; + let mut builder = + RecordBatchWithKeyBuilder::new(self.schema.clone().to_record_schema_with_key()); + + let projector = request + .projected_schema + .try_project_with_key(&self.schema) + .expect("Should succeed to try_project_key of sys_tables"); + for catalog in &catalogs { + for schema in &catalog + .all_schemas() + .map_err(|e| Box::new(e) as _) + .context(table_engine::table::Scan { table: self.name() })? + { + for table in &schema + .all_tables() + .map_err(|e| Box::new(e) as _) + .context(table_engine::table::Scan { table: self.name() })? 
+ { + let row = self.from_table(catalog.clone(), schema.clone(), table.clone()); + let projected_row = projector.project_row(&row, Vec::new()); + builder + .append_row(projected_row) + .map_err(|e| Box::new(e) as _) + .context(table_engine::table::Scan { table: self.name() })?; + } + } + } + let record_batch = builder.build().unwrap().into_record_batch(); + Ok(Box::pin(OneRecordBatchStream { + schema: self.schema.clone().to_record_schema(), + record_batch: Some(record_batch), + })) + } +} diff --git a/table_engine/Cargo.toml b/table_engine/Cargo.toml new file mode 100644 index 0000000000..b617b9f7cc --- /dev/null +++ b/table_engine/Cargo.toml @@ -0,0 +1,23 @@ +[package] +name = "table_engine" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +# In alphabetical order +arrow_deps = { path = "../arrow_deps" } +async-trait = "0.1.41" +common_types = { path = "../common_types" } +common_util = { path = "../common_util" } +futures = "0.3" +log = "0.4" +proto = { path = "../proto" } +protobuf = "2.20" +serde = "1.0" +serde_derive = "1.0" +smallvec = "1.6" +snafu = { version ="0.6.10", features = ["backtraces"]} +tokio = { version = "1.0", features = ["sync"] } diff --git a/table_engine/src/engine.rs b/table_engine/src/engine.rs new file mode 100644 index 0000000000..b2aaeaaf6c --- /dev/null +++ b/table_engine/src/engine.rs @@ -0,0 +1,261 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Table factory trait + +use std::{collections::HashMap, sync::Arc}; + +use async_trait::async_trait; +use common_types::{schema::Schema, time::Timestamp}; +use common_util::runtime::Runtime; +use proto::sys_catalog::{TableEntry, TableState as TableStatePb}; +use snafu::{ensure, Backtrace, Snafu}; + +use crate::{ + partition::PartitionInfo, + table::{TableId, TableInfo, TableRef}, +}; + +#[derive(Debug, Snafu)] +#[snafu(visibility(pub))] +pub enum Error { + #[snafu(display("Invalid table path, path:{}.\nBacktrace:\n{}", path, backtrace))] + InvalidTablePath { path: String, backtrace: Backtrace }, + + #[snafu(display("Table already exists, table:{}.\nBacktrace:\n{}", table, backtrace))] + TableExists { table: String, backtrace: Backtrace }, + + #[snafu(display("Invalid arguments, err:{}", source))] + InvalidArguments { + table: String, + source: Box, + }, + + #[snafu(display("Failed to write meta data, err:{}", source))] + WriteMeta { + source: Box, + }, + + #[snafu(display("Unexpected error, err:{}", source))] + Unexpected { + source: Box, + }, + + #[snafu(display( + "Unknown engine type, type:{}.\nBacktrace:\n{}", + engine_type, + backtrace + ))] + UnknownEngineType { + engine_type: String, + backtrace: Backtrace, + }, + + #[snafu(display( + "Invalid table state transition, from:{:?}, to:{:?}.\nBacktrace:\n{}", + from, + to, + backtrace + ))] + InvalidTableStateTransition { + from: TableState, + to: TableState, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to close the table engine, err:{}", source))] + Close { + source: Box, + }, +} + +define_result!(Error); + +/// The state of table. +/// +/// Transition rule is defined in the validate function. 
+#[derive(Clone, Copy, Debug)] +pub enum TableState { + Stable = 0, + Dropping = 1, + Dropped = 2, +} + +impl TableState { + pub fn validate(&self, to: TableState) -> bool { + match self { + TableState::Stable => matches!(to, TableState::Stable | TableState::Dropping), + TableState::Dropping => matches!(to, TableState::Dropped), + TableState::Dropped => false, + } + } + + /// Try to transit from the self state to the `to` state. + /// + /// Returns error if it is an invalid transition. + pub fn try_transit(&mut self, to: TableState) -> Result<()> { + ensure!( + self.validate(to), + InvalidTableStateTransition { from: *self, to } + ); + *self = to; + + Ok(()) + } +} + +impl From for TableStatePb { + fn from(state: TableState) -> TableStatePb { + match state { + TableState::Stable => TableStatePb::STABLE, + TableState::Dropping => TableStatePb::DROPPING, + TableState::Dropped => TableStatePb::DROPPED, + } + } +} + +impl From for TableState { + fn from(state: TableStatePb) -> TableState { + match state { + TableStatePb::STABLE => TableState::Stable, + TableStatePb::DROPPING => TableState::Dropping, + TableStatePb::DROPPED => TableState::Dropped, + } + } +} + +#[derive(Copy, Clone)] +pub enum TableRequestType { + Create, + Drop, +} + +/// Create table request +// TODO(yingwen): Add option for create_if_not_exists? +#[derive(Debug, Clone)] +pub struct CreateTableRequest { + /// Catalog name + pub catalog_name: String, + /// Schema name + pub schema_name: String, + /// Table id + pub table_id: TableId, + // TODO(yingwen): catalog and schema, or add a table path struct?
+ /// Table name + pub table_name: String, + /// Table schema + pub table_schema: Schema, + /// Partition info if this is a partitioned table + // TODO(yingwen): TableEngine should not have knowledge of partitioning + pub partition_info: Option, + /// Table engine type + pub engine: String, + /// Table options used by each engine + pub options: HashMap, + /// Tells state of the table + pub state: TableState, +} + +impl CreateTableRequest { + // TODO(chunshao.rcs): refactor + pub fn into_pb(self, typ: TableRequestType) -> TableEntry { + let mut table_entry: TableEntry = self.into(); + match typ { + TableRequestType::Create => table_entry.set_created_time(Timestamp::now().as_i64()), + TableRequestType::Drop => table_entry.set_modified_time(Timestamp::now().as_i64()), + } + table_entry + } +} + +impl From for TableEntry { + fn from(req: CreateTableRequest) -> Self { + let mut entry = TableEntry::new(); + entry.set_catalog_name(req.catalog_name); + entry.set_schema_name(req.schema_name); + entry.set_table_id(req.table_id.as_u64()); + entry.set_table_name(req.table_name); + entry.set_engine(req.engine); + entry.set_state(TableStatePb::from(req.state)); + + entry + } +} + +impl From for TableInfo { + fn from(req: CreateTableRequest) -> Self { + Self { + catalog_name: req.catalog_name, + schema_name: req.schema_name, + table_id: req.table_id, + table_name: req.table_name, + engine: req.engine, + state: req.state, + } + } +} + +/// Drop table request +#[derive(Debug, Clone)] +pub struct DropTableRequest { + /// Catalog name + pub catalog_name: String, + /// Schema name + pub schema_name: String, + /// Table name + pub table_name: String, + /// Table engine type + pub engine: String, +} + +#[derive(Debug, Clone)] +pub struct OpenTableRequest { + /// Catalog name + pub catalog_name: String, + /// Schema name + pub schema_name: String, + /// Table name + pub table_name: String, + /// Table engine type + pub engine: String, +} + +impl From for OpenTableRequest { + fn 
from(table_info: TableInfo) -> Self { + Self { + catalog_name: table_info.catalog_name, + schema_name: table_info.schema_name, + table_name: table_info.table_name, + engine: table_info.engine, + } + } +} + +/// Table engine +// TODO(yingwen): drop table support to release resource owned by the table +#[async_trait] +pub trait TableEngine { + /// Returns the name of engine. + fn engine_type(&self) -> &str; + + /// Close the engine gracefully. + async fn close(&self) -> Result<()>; + + /// Create table + async fn create_table(&self, request: CreateTableRequest) -> Result; + + /// Drop table + async fn drop_table(&self, request: DropTableRequest) -> Result; + + /// Open table, return None if table not exists + async fn open_table(&self, request: OpenTableRequest) -> Result>; +} + +/// A reference counted pointer to table engine +pub type TableEngineRef = Arc; + +#[derive(Clone, Debug)] +pub struct EngineRuntimes { + pub read_runtime: Arc, + pub write_runtime: Arc, + pub bg_runtime: Arc, +} diff --git a/table_engine/src/lib.rs b/table_engine/src/lib.rs new file mode 100644 index 0000000000..ac60c1e8dc --- /dev/null +++ b/table_engine/src/lib.rs @@ -0,0 +1,20 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Table engine facade, provides read/write interfaces of table + +#[macro_use] +extern crate common_util; + +pub mod engine; +pub mod memory; +pub mod partition; +pub mod predicate; +pub mod provider; +pub mod stream; +pub mod table; + +/// Enable ttl key +pub const OPTION_KEY_ENABLE_TTL: &str = "enable_ttl"; + +pub const MEMORY_ENGINE_TYPE: &str = "Memory"; +pub const ANALYTIC_ENGINE_TYPE: &str = "Analytic"; diff --git a/table_engine/src/memory.rs b/table_engine/src/memory.rs new file mode 100644 index 0000000000..d26448fddf --- /dev/null +++ b/table_engine/src/memory.rs @@ -0,0 +1,252 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
In-memory table implementations + +use std::{ + collections::HashMap, + fmt, + pin::Pin, + sync::{Arc, RwLock}, + task::{Context, Poll}, +}; + +use async_trait::async_trait; +use common_types::{ + column::{ColumnBlock, ColumnBlockBuilder}, + datum::{Datum, DatumKind}, + record_batch::RecordBatch, + row::{Row, RowGroup}, + schema::{RecordSchema, Schema}, +}; +use futures::stream::Stream; +use snafu::{OptionExt, ResultExt}; + +use crate::{ + stream::{ + self, ErrNoSource, ErrWithSource, PartitionedStreams, RecordBatchStream, + SendableRecordBatchStream, + }, + table::{ + AlterSchemaRequest, FlushRequest, GetRequest, ReadRequest, Result, Table, TableId, + TableStats, UnsupportedMethod, WriteRequest, + }, +}; + +type RowGroupVec = Vec; + +/// In-memory table +/// +/// Mainly for test, DO NOT use it in production. All data inserted are buffered +/// in memory, does not support schema change. +pub struct MemoryTable { + /// Table name + name: String, + /// Table id + id: TableId, + /// Table schema + schema: Schema, + /// Rows + row_groups: Arc>, + /// Engine type + engine_type: String, +} + +impl MemoryTable { + pub fn new(name: String, id: TableId, schema: Schema, engine_type: String) -> Self { + Self { + name, + id, + schema, + row_groups: Arc::new(RwLock::new(Vec::new())), + engine_type, + } + } +} + +impl fmt::Debug for MemoryTable { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("MemoryTable") + .field("name", &self.name) + .field("id", &self.id) + .field("schema", &self.schema) + // row_groups is ignored + .finish() + } +} + +#[async_trait] +impl Table for MemoryTable { + fn name(&self) -> &str { + &self.name + } + + fn id(&self) -> TableId { + self.id + } + + fn options(&self) -> HashMap { + HashMap::new() + } + + fn schema(&self) -> Schema { + self.schema.clone() + } + + fn engine_type(&self) -> &str { + &self.engine_type + } + + fn stats(&self) -> TableStats { + TableStats::default() + } + + async fn write(&self, request: 
WriteRequest) -> Result { + // TODO(yingwen) Maybe check schema? + let mut row_groups = self.row_groups.write().unwrap(); + let n = request.row_group.num_rows(); + row_groups.push(request.row_group); + + Ok(n) + } + + // batch_size is ignored now + async fn read(&self, request: ReadRequest) -> Result { + let scan = MemoryScan { + schema: request.projected_schema.to_record_schema(), + row_groups: self.row_groups.clone(), + index: 0, + }; + + Ok(Box::pin(scan)) + } + + async fn get(&self, _request: GetRequest) -> Result> { + // Alter schema is not supported now. + UnsupportedMethod { + table: &self.name, + method: "get", + } + .fail() + } + + async fn partitioned_read(&self, request: ReadRequest) -> Result { + let stream = self.read(request).await?; + + Ok(PartitionedStreams::one_stream(stream)) + } + + // TODO: Alter schema is not supported now + async fn alter_schema(&self, _request: AlterSchemaRequest) -> Result { + Ok(1) + } + + // TODO: Alter modify setting is not supported now + async fn alter_options(&self, _options: HashMap) -> Result { + Ok(1) + } + + async fn flush(&self, _request: FlushRequest) -> Result<()> { + // Flush is not supported now. + UnsupportedMethod { + table: self.name(), + method: "flush", + } + .fail() + } + + async fn compact(&self) -> Result<()> { + // Compact is not supported now. 
+ UnsupportedMethod { + table: self.name(), + method: "compact", + } + .fail() + } +} + +#[derive(Debug)] +struct MemoryScan { + // The schema of projected column indexed by ReadRequest::projection + schema: RecordSchema, + row_groups: Arc>, + index: usize, +} + +impl Stream for MemoryScan { + type Item = stream::Result; + + fn poll_next(mut self: Pin<&mut Self>, _ctx: &mut Context<'_>) -> Poll> { + // TODO(yingwen): Batch row groups + let record_batch = { + let row_groups = self.row_groups.read().unwrap(); + if self.index >= row_groups.len() { + return Poll::Ready(None); + } + + let rows = &row_groups[self.index]; + // Because the row group inserted may have different column order, so we cannot + // reuse the projection index, and must find projection index for each row + // group, which is inefficient + row_group_to_record_batch(rows, &self.schema) + }; + + self.index += 1; + Poll::Ready(Some(record_batch)) + } +} + +impl RecordBatchStream for MemoryScan { + fn schema(&self) -> &RecordSchema { + &self.schema + } +} + +// REQUIRE: The schema is the projected schema +fn row_group_to_record_batch( + rows: &RowGroup, + record_schema: &RecordSchema, +) -> stream::Result { + if rows.is_empty() { + return Ok(RecordBatch::new_empty(record_schema.clone())); + } + + let num_cols = record_schema.num_columns(); + let mut column_blocks = Vec::with_capacity(num_cols); + // For each column, create an array for that column + for column in record_schema.columns().iter() { + let rows_schema = rows.schema(); + let col_index = rows_schema + .index_of(&column.name) + .with_context(|| ErrNoSource { + msg: format!( + "Failed to convert RowGroup to RecordBatch, column not found, column:{}", + &column.name + ), + })?; + let cols = rows.iter_column(col_index); + let column_block = build_column_block(&column.data_type, cols)?; + column_blocks.push(column_block); + } + + RecordBatch::new(record_schema.clone(), column_blocks) + .map_err(|e| Box::new(e) as _) + .context(ErrWithSource { + msg: 
"Failed to create RecordBatch", + }) +} + +fn build_column_block<'a, I: Iterator>( + data_type: &DatumKind, + iter: I, +) -> stream::Result { + let mut builder = ColumnBlockBuilder::with_capacity(data_type, iter.size_hint().0); + for datum in iter { + builder + .append(datum.clone()) + .map_err(|e| Box::new(e) as _) + .context(ErrWithSource { + msg: "Append datum", + })?; + } + Ok(builder.build()) +} diff --git a/table_engine/src/partition/expression.rs b/table_engine/src/partition/expression.rs new file mode 100644 index 0000000000..ae89d3a099 --- /dev/null +++ b/table_engine/src/partition/expression.rs @@ -0,0 +1,71 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Partition expression + +use std::ops::Deref; + +use common_types::datum::Datum; +use common_util::define_result; +use snafu::{Backtrace, OptionExt, Snafu}; + +use crate::partition::PartitionInfo; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("No datums for eval.\nBacktrace:\n{}", backtrace))] + EmptyDatums { backtrace: Backtrace }, +} + +define_result!(Error); + +/// Partition expression +#[derive(Debug)] +pub enum Expression { + ColumnExpr(ColumnExpr), +} + +impl Expression { + pub fn new(partition_info: &PartitionInfo) -> Self { + Self::parse_expr(partition_info.expr.to_string()) + } + + /// Extract column name in expression + pub fn extract_column_name(&self) -> impl Iterator { + match self { + Expression::ColumnExpr(col_expr) => col_expr.extract_column_name(), + } + } + + fn parse_expr(expr_str: String) -> Expression { + Expression::ColumnExpr(ColumnExpr::new(expr_str)) + } + + pub fn eval_uint>(&self, datums: &[T]) -> Result { + match self { + Expression::ColumnExpr(column_expr) => { + column_expr.eval_uint(datums.get(0).context(EmptyDatums)?) 
+ } + } + } +} + +/// Column +#[derive(Debug)] +pub struct ColumnExpr { + column_name: String, +} + +impl ColumnExpr { + fn new(column_name: String) -> Self { + Self { column_name } + } + + fn extract_column_name(&self) -> impl Iterator { + std::iter::once(self.column_name.as_str()) + } + + // TODO: handle error + fn eval_uint(&self, datum: &Datum) -> Result { + Ok(datum.convert_to_uint64()) + } +} diff --git a/table_engine/src/partition/mod.rs b/table_engine/src/partition/mod.rs new file mode 100644 index 0000000000..e419b3ef72 --- /dev/null +++ b/table_engine/src/partition/mod.rs @@ -0,0 +1,27 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Partitioned table supports + +mod expression; +pub mod rule; + +/// Partition type of table +#[derive(Clone, Debug, PartialEq)] +pub enum PartitionType { + None = 0, + Hash = 1, +} + +/// Size type of partition num +pub type PartitionNum = u16; + +/// Info for how to partition table +#[derive(Debug, Clone)] +pub struct PartitionInfo { + /// Partition type + pub partition_type: PartitionType, + /// Partition expression + pub expr: String, + /// Partition num + pub partition_num: PartitionNum, +} diff --git a/table_engine/src/partition/rule.rs b/table_engine/src/partition/rule.rs new file mode 100644 index 0000000000..28b31401c5 --- /dev/null +++ b/table_engine/src/partition/rule.rs @@ -0,0 +1,108 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Partition rules + +use common_types::{datum::Datum, row::Row, schema::Schema}; +use common_util::define_result; +use smallvec::SmallVec; +use snafu::{ensure, Backtrace, ResultExt, Snafu}; + +use crate::partition::{expression::Expression, PartitionInfo, PartitionType}; + +const HASH_COLUMN_NUM: usize = 1; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("No column for hash partitioning.\nBacktrace:\n{}", backtrace))] + NoColumnForHash { backtrace: Backtrace }, + + #[snafu(display("Only support one hash column.\nBacktrace:\n{}", backtrace))] + TooMuchHashColumn { backtrace: Backtrace }, + + #[snafu(display("Failed to eval partition expr, err:{}", source))] + EvalExpr { + source: crate::partition::expression::Error, + }, +} + +define_result!(Error); + +/// Partition rule locate partition by input records +// TODO(yingwen): Recreate partition rule once the schema of the table is changed +#[derive(Debug)] +pub enum PartitionRule { + None, + Hash(HashPartitionRule), +} + +impl PartitionRule { + pub fn new(partition_info: &PartitionInfo, schema: &Schema) -> Result { + match partition_info.partition_type { + PartitionType::None => Ok(PartitionRule::None), + PartitionType::Hash => { + let rule = HashPartitionRule::new(partition_info, schema)?; + Ok(PartitionRule::Hash(rule)) + } + } + } + + /// Return the index of partition + pub fn locate_partition(&self, row: &Row) -> Result { + match self { + // Always return the first partition + PartitionRule::None => Ok(0), + PartitionRule::Hash(rule) => rule.eval_partition_index(row), + } + } +} + +/// Partition rule based on hash +#[derive(Debug)] +pub struct HashPartitionRule { + /// Total number of partitions + partition_num: u16, + /// Expression to evaluate a hash value + expression: Expression, + /// Offsets of columns for evaluate + // TODO(yingwen): The column index may be invalid after schema change (add/del column) + column_index: SmallVec<[usize; HASH_COLUMN_NUM]>, +} + +impl HashPartitionRule { + pub fn 
new(partition_info: &PartitionInfo, schema: &Schema) -> Result { + let expr = Expression::new(partition_info); + + let col_name_list = expr.extract_column_name(); + let mut column_index = SmallVec::with_capacity(col_name_list.size_hint().0); + for col_name in col_name_list { + for (i, v) in schema.columns().iter().enumerate() { + if col_name == v.name { + column_index.push(i); + break; + } + } + } + + ensure!(!column_index.is_empty(), NoColumnForHash); + ensure!(column_index.len() == 1, TooMuchHashColumn); + + Ok(Self { + partition_num: partition_info.partition_num, + expression: expr, + column_index, + }) + } + + // TODO(yingwen): Also pass schema? + pub fn eval_partition_index(&self, row: &Row) -> Result { + let mut col_vals: SmallVec<[&Datum; HASH_COLUMN_NUM]> = + SmallVec::with_capacity(self.column_index.len()); + for i in &self.column_index { + // TODO(yingwen): Check index? + col_vals.push(&row[*i]); + } + let eval_uint = self.expression.eval_uint(&col_vals).context(EvalExpr)?; + + Ok((eval_uint % self.partition_num as u64) as usize) + } +} diff --git a/table_engine/src/predicate/filter_record_batch.rs b/table_engine/src/predicate/filter_record_batch.rs new file mode 100644 index 0000000000..cafbd960da --- /dev/null +++ b/table_engine/src/predicate/filter_record_batch.rs @@ -0,0 +1,249 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +use arrow_deps::datafusion::{ + logical_plan::{Expr, Operator}, + scalar::ScalarValue, +}; +use common_types::{datum::DatumView, record_batch::RecordBatchWithKey}; + +#[derive(Debug)] +struct ColumnFilter { + name: String, + op: Operator, + literal: ScalarValue, +} + +fn evaluate_by_operator(lhs: &T, rhs: &T, op: &Operator) -> Option { + let cmp_res = lhs.partial_cmp(rhs)?; + let v = match op { + Operator::Lt => cmp_res.is_lt(), + Operator::LtEq => cmp_res.is_le(), + Operator::Gt => cmp_res.is_gt(), + Operator::GtEq => cmp_res.is_ge(), + Operator::NotEq => cmp_res.is_ne(), + Operator::Eq => cmp_res.is_eq(), + _ => return None, + }; + Some(v) +} + +fn evaluate_datums_by_operator<'a>( + lhs: &DatumView<'a>, + rhs: &DatumView<'a>, + op: &Operator, +) -> Option { + macro_rules! impl_evaluate { + ($($Kind: ident), *) => { + match (lhs, rhs){ + (DatumView::Null, DatumView::Null) => Some(true), + $((DatumView::$Kind(v1), DatumView::$Kind(v2)) => evaluate_by_operator(v1, v2, op),)* + _ => None, + } + }; + } + + impl_evaluate!( + Timestamp, Double, Float, Varbinary, String, UInt64, UInt32, UInt16, UInt8, Int64, Int32, + Int16, Int8, Boolean + ) +} + +impl ColumnFilter { + fn filter(&self, record_batch: &RecordBatchWithKey, selected_buf: &mut [bool]) -> Option<()> { + let filter_datum_view = DatumView::from_scalar_value(&self.literal)?; + + let column_idx = record_batch.schema_with_key().index_of(&self.name)?; + let column_data = record_batch.column(column_idx); + + assert!(selected_buf.len() >= column_data.num_rows()); + for (i, selected) in selected_buf + .iter_mut() + .enumerate() + .take(column_data.num_rows()) + { + if *selected { + let datum_view = column_data.datum_view(i); + *selected = evaluate_datums_by_operator(&datum_view, &filter_datum_view, &self.op) + .unwrap_or(true); + } + } + + Some(()) + } +} + +/// Filter record batch by applying the `column_filters`. 
+pub struct RecordBatchFilter { + column_filters: Vec, +} + +impl RecordBatchFilter { + /// Create filter according to the `exprs` whose logical relationship is + /// `AND` between each other. Note that the created filter is not + /// equivalent to the original `exprs` and actually only a subset of the + /// exprs is chosen to create the [`RecordBatchFilter`]. + pub fn new(exprs: &[Expr]) -> Self { + let mut filters = Vec::with_capacity(exprs.len()); + for expr in exprs { + if let Expr::BinaryExpr { left, op, right } = expr { + let (column_name, literal) = match (left.as_ref(), right.as_ref()) { + (Expr::Column(col), Expr::Literal(v)) + | (Expr::Literal(v), Expr::Column(col)) => (col.name.to_string(), v.clone()), + _ => continue, + }; + + if matches!( + op, + Operator::NotEq + | Operator::Eq + | Operator::Gt + | Operator::GtEq + | Operator::Lt + | Operator::LtEq + ) { + filters.push(ColumnFilter { + name: column_name, + op: *op, + literal, + }) + } + } + } + + RecordBatchFilter { + column_filters: filters, + } + } + + /// Filter `record_batch` and save the filtering results into the + /// `selected_rows_buf`. + /// + /// Requires: `selected_rows_buf.len() == record_batch.num_rows()`. 
+ pub fn filter( + &self, + record_batch: &RecordBatchWithKey, + selected_rows_buf: &mut [bool], + ) -> usize { + assert_eq!(record_batch.num_rows(), selected_rows_buf.len()); + + for selected in &mut *selected_rows_buf { + *selected = true; + } + + for column_filter in &self.column_filters { + column_filter.filter(record_batch, selected_rows_buf.as_mut()); + } + + selected_rows_buf + .iter() + .map(|selected| if *selected { 1 } else { 0 }) + .sum() + } + + #[inline] + pub fn is_empty(&self) -> bool { + self.column_filters.is_empty() + } +} + +impl From<&[Expr]> for RecordBatchFilter { + fn from(exprs: &[Expr]) -> Self { + Self::new(exprs) + } +} + +#[cfg(test)] +mod test { + use arrow_deps::datafusion::prelude::Column; + use common_types::{ + row::Row, + tests::{build_record_batch_with_key_by_rows, build_row}, + }; + + use super::*; + + fn build_record_batch(rows: Vec) -> RecordBatchWithKey { + build_record_batch_with_key_by_rows(rows) + } + + fn build_filter_expr(column_name: &str, literal: ScalarValue, op: Operator) -> Expr { + Expr::BinaryExpr { + left: Box::new(Expr::Column(Column::from_name(column_name.to_string()))), + op, + right: Box::new(Expr::Literal(literal)), + } + } + + #[test] + fn test_empty_filter() { + let rows = vec![ + build_row(b"aaaa", 1, 11.0, "AAAA"), + build_row(b"aaaa", 1, 21.0, "BBBB"), + ]; + let batch = build_record_batch(rows); + + let filter = RecordBatchFilter::new(&[]); + let mut selected_rows = vec![false; batch.num_rows()]; + let selected_num = filter.filter(&batch, &mut selected_rows); + + assert_eq!(selected_num, selected_rows.len()); + assert!(selected_rows.iter().all(|v| *v)); + } + + #[test] + fn test_all_filter() { + let rows = vec![ + build_row(b"aaaa", 1, 11.0, "AAAA"), + build_row(b"aaaa", 1, 21.0, "BBBB"), + build_row(b"aaaa", 2, 21.0, "CCCC"), + build_row(b"bbbb", 2, 31.0, "DDDD"), + build_row(b"bbbb", 2, 31.0, "DDDD"), + ]; + let batch = build_record_batch(rows); + + let expr = build_filter_expr("key2", 
ScalarValue::Int64(Some(2)), Operator::LtEq); + let filter = RecordBatchFilter::new(&[expr]); + let mut selected_rows = vec![false; batch.num_rows()]; + let selected_num = filter.filter(&batch, &mut selected_rows); + + assert_eq!(selected_num, selected_rows.len()); + assert!(selected_rows.iter().all(|v| *v)); + } + + #[test] + fn test_partial_filter() { + let rows = vec![ + build_row(b"aaaa", 1, 11.0, "AAAA"), + build_row(b"aaaa", 1, 21.0, "BBBB"), + build_row(b"aaaa", 2, 21.0, "CCCC"), + build_row(b"bbbb", 2, 31.0, "DDDD"), + build_row(b"bbbb", 2, 31.0, "DDDD"), + ]; + let batch = build_record_batch(rows); + + let expr1 = build_filter_expr("key2", ScalarValue::Int64(Some(2)), Operator::LtEq); + let expr2 = build_filter_expr( + "key1", + ScalarValue::Binary(Some(b"aabb".to_vec())), + Operator::GtEq, + ); + let filter = RecordBatchFilter::new(&[expr1, expr2]); + let mut selected_rows = vec![false; batch.num_rows()]; + let selected_num = filter.filter(&batch, &mut selected_rows); + let expect_selected_rows = vec![false, false, false, true, true]; + + assert_eq!(selected_num, 2); + assert_eq!(selected_rows, expect_selected_rows); + } + + #[test] + fn test_filter_empty_batch() { + let batch = build_record_batch(vec![]); + let expr1 = build_filter_expr("key2", ScalarValue::Int64(Some(2)), Operator::LtEq); + let filter = RecordBatchFilter::new(&[expr1]); + let mut selected_rows = vec![false; batch.num_rows()]; + filter.filter(&batch, &mut selected_rows); + + assert!(selected_rows.is_empty()); + } +} diff --git a/table_engine/src/predicate/mod.rs b/table_engine/src/predicate/mod.rs new file mode 100644 index 0000000000..2758dac513 --- /dev/null +++ b/table_engine/src/predicate/mod.rs @@ -0,0 +1,540 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Predict for query table. +//! 
Reference to: https://github.com/influxdata/influxdb_iox/blob/29b10413051f8c4a2193e8633aa133e45b0e505a/query/src/predicate.rs + +use std::{collections::HashSet, convert::TryInto, sync::Arc}; + +use arrow_deps::{ + arrow::{ + array::ArrayRef, + datatypes::{Schema as ArrowSchema, SchemaRef}, + }, + datafusion::{ + logical_plan::{Column, Expr, Operator}, + optimizer::utils as datafusion_util, + parquet::file::metadata::RowGroupMetaData, + physical_optimizer::pruning::{PruningPredicate, PruningStatistics}, + scalar::ScalarValue, + }, + parquet::file::statistics::Statistics as ParquetStatistics, +}; +use common_types::{ + schema::Schema, + time::{TimeRange, Timestamp}, +}; +use log::{debug, error}; +use snafu::{ResultExt, Snafu}; + +pub mod filter_record_batch; + +#[derive(Debug, Snafu)] +#[snafu(visibility = "pub")] +pub enum Error { + #[snafu(display("Failed to do pruning, err:{}", source))] + Prune { + source: arrow_deps::datafusion::error::DataFusionError, + }, +} + +define_result!(Error); + +/// port from datafusion. +/// Extract the min/max statistics from a `ParquetStatistics` object +macro_rules!
get_statistic { + ($column_statistics:expr, $func:ident, $bytes_func:ident) => {{ + if !$column_statistics.has_min_max_set() { + return None; + } + match $column_statistics { + ParquetStatistics::Boolean(s) => Some(ScalarValue::Boolean(Some(*s.$func()))), + ParquetStatistics::Int32(s) => Some(ScalarValue::Int32(Some(*s.$func()))), + ParquetStatistics::Int64(s) => Some(ScalarValue::Int64(Some(*s.$func()))), + // 96 bit ints not supported + ParquetStatistics::Int96(_) => None, + ParquetStatistics::Float(s) => Some(ScalarValue::Float32(Some(*s.$func()))), + ParquetStatistics::Double(s) => Some(ScalarValue::Float64(Some(*s.$func()))), + ParquetStatistics::ByteArray(s) => { + let s = std::str::from_utf8(s.$bytes_func()) + .map(|s| s.to_string()) + .ok(); + Some(ScalarValue::Utf8(s)) + } + // type not supported yet + ParquetStatistics::FixedLenByteArray(_) => None, + } + }}; +} + +/// port from datafusion. +// Extract the min or max value calling `func` or `bytes_func` on the +// ParquetStatistics as appropriate +macro_rules! get_min_max_values { + ($self:expr, $column:expr, $func:ident, $bytes_func:ident) => {{ + let (column_index, field) = + if let Some((v, f)) = $self.parquet_schema.column_with_name(&$column.name) { + (v, f) + } else { + // Named column was not present + return None; + }; + + let data_type = field.data_type(); + let null_scalar: ScalarValue = if let Ok(v) = data_type.try_into() { + v + } else { + // DataFusion doesn't have support for ScalarValues of the column type + return None; + }; + + let scalar_values: Vec = $self + .row_group_metadata + .iter() + .flat_map(|meta| meta.column(column_index).statistics()) + .map(|stats| get_statistic!(stats, $func, $bytes_func)) + .map(|maybe_scalar| { + // column either did't have statistics at all or didn't have min/max values + maybe_scalar.unwrap_or_else(|| null_scalar.clone()) + }) + .collect(); + + // ignore errors converting to arrays (e.g. 
different types) + ScalarValue::iter_to_array(scalar_values).ok() + }}; +} + +/// Wraps parquet statistics in a way +/// that implements [`PruningStatistics`] +struct RowGroupPruningStatistics<'a> { + row_group_metadata: &'a [RowGroupMetaData], + parquet_schema: &'a ArrowSchema, +} + +impl<'a> PruningStatistics for RowGroupPruningStatistics<'a> { + fn min_values(&self, column: &Column) -> Option { + get_min_max_values!(self, column, min, min_bytes) + } + + fn max_values(&self, column: &Column) -> Option { + get_min_max_values!(self, column, max, max_bytes) + } + + fn num_containers(&self) -> usize { + self.row_group_metadata.len() + } +} + +fn build_row_group_predicate( + predicate_builder: &PruningPredicate, + row_group_metadata: &[RowGroupMetaData], +) -> Result> { + let parquet_schema = predicate_builder.schema().as_ref(); + + let pruning_stats = RowGroupPruningStatistics { + row_group_metadata, + parquet_schema, + }; + + predicate_builder + .prune(&pruning_stats) + .map_err(|e| { + error!("Error evaluating row group predicate values {}", e); + e + }) + .context(Prune) +} + +/// Predicate helps determine whether specific row group should be read. +#[derive(Debug, Clone)] +pub struct Predicate { + /// Predicates in the query for filter out the columns that meet all the + /// exprs. + pub exprs: Vec, + /// The time range involved by the query. + pub time_range: TimeRange, +} + +pub type PredicateRef = Arc; + +impl Predicate { + pub fn empty() -> Self { + Self::new(TimeRange::min_to_max()) + } + + pub fn new(time_range: TimeRange) -> Self { + Self { + exprs: Vec::new(), + time_range, + } + } + + /// Determine whether a row group should be read according to the meta data + /// in the `row_groups`. + /// + /// The boolean value in the returned vector denotes the corresponding row + /// group in the `row_groups` whether should be read. 
+ pub fn filter_row_groups(&self, schema: &Schema, row_groups: &[RowGroupMetaData]) -> Vec { + let mut results = vec![true; row_groups.len()]; + let arrow_schema: SchemaRef = schema.clone().into_arrow_schema_ref(); + for expr in &self.exprs { + match PruningPredicate::try_new(expr, arrow_schema.clone()) { + Ok(pruning_predicate) => { + debug!("pruning_predicate is:{:?}", pruning_predicate); + + if let Ok(values) = build_row_group_predicate(&pruning_predicate, row_groups) { + for (curr_val, result_val) in values.into_iter().zip(results.iter_mut()) { + *result_val = curr_val && *result_val + } + }; + // if fail to build, just ignore this filter so that all the + // row groups should be read for this + // filter. + } + Err(e) => { + // for any error just ignore it and that is to say, for this filter all the row + // groups should be read. + error!("fail to build pruning predicate, err:{}", e); + } + } + } + + results + } +} + +/// Builder for [Predicate] +#[derive(Debug, Clone, Default)] +#[must_use] +pub struct PredicateBuilder { + time_range: Option, + exprs: Vec, +} + +impl PredicateBuilder { + /// Adds the expressions from `filter_exprs` that can be pushed down to + /// query engine. + pub fn add_pushdown_exprs(mut self, filter_exprs: &[Expr]) -> Self { + // For each expression of the filter_exprs, recursively split it if it is an + // AND conjunction. For example, expression (x AND y) is split into [x, + // y]. + let mut split_exprs = vec![]; + for filter_expr in filter_exprs { + Self::split_and_expr(filter_expr, &mut split_exprs) + } + + // Only keep single_column and primitive binary expressions + let pushdown_exprs: Vec<_> = split_exprs + .into_iter() + .filter(Self::is_able_to_pushdown) + .collect(); + + self.exprs = pushdown_exprs; + + self + } + + /// Extract the time range from the `filter_exprs` and set it as + /// `TimeRange::min_to_max()` if no timestamp predicate is found.
+ pub fn set_time_range(mut self, schema: &Schema, filter_exprs: &[Expr]) -> Self { + let time_range_extractor = TimeRangeExtractor { + timestamp_column_name: schema.timestamp_name(), + filters: filter_exprs, + }; + + let time_range = time_range_extractor.extract(); + debug!( + "finish extract time range from the filters, time_range:{:?}, filters:{:?}", + time_range, filter_exprs + ); + + self.time_range = Some(time_range); + + self + } + + pub fn build(self) -> PredicateRef { + Arc::new(Predicate { + exprs: self.exprs, + time_range: self.time_range.unwrap_or_else(TimeRange::min_to_max), + }) + } + + /// Determine whether the `expr` can be pushed down. + /// Returns false if any error occurs. + fn is_able_to_pushdown(expr: &Expr) -> bool { + let mut columns = HashSet::new(); + if let Err(e) = datafusion_util::expr_to_columns(expr, &mut columns) { + error!( + "Failed to extract columns from the expr, ignore this expr:{:?}, err:{}", + expr, e + ); + return false; + } + + columns.len() == 1 && Self::is_primitive_binary_expr(expr) + } + + /// Recursively split all "AND" expressions into smaller one + /// Example: "A AND B AND C" => [A, B, C] + fn split_and_expr(expr: &Expr, predicates: &mut Vec) { + match expr { + Expr::BinaryExpr { + right, + op: Operator::And, + left, + } => { + Self::split_and_expr(left, predicates); + Self::split_and_expr(right, predicates); + } + other => predicates.push(other.clone()), + } + } + + /// Return true if the given expression is in a primitive binary in the + /// form: `column op constant` and op must be a comparison one. 
+ fn is_primitive_binary_expr(expr: &Expr) -> bool { + match expr { + Expr::BinaryExpr { left, op, right } => { + matches!( + (&**left, &**right), + (Expr::Column(_), Expr::Literal(_)) | (Expr::Literal(_), Expr::Column(_)) + ) && matches!( + op, + Operator::Eq + | Operator::NotEq + | Operator::Lt + | Operator::LtEq + | Operator::Gt + | Operator::GtEq + ) + } + _ => false, + } + } +} + +struct TimeRangeExtractor<'a> { + timestamp_column_name: &'a str, + filters: &'a [Expr], +} + +impl<'a> TimeRangeExtractor<'a> { + /// Do extraction from the `self.filters` for TimeRange. + /// + /// Returns `TimeRange::zero_to_max()` if no timestamp predicate is found. + fn extract(&self) -> TimeRange { + let mut time_range = TimeRange::min_to_max(); + for expr in self.filters { + let sub_time_range = self.extract_time_range_from_expr(expr); + let new_time_range = Self::and_time_ranges(&time_range, &sub_time_range); + + debug!( + "do and logic for time range, left:{:?}, right:{:?}, output:{:?}, expr:{:?}", + time_range, sub_time_range, new_time_range, expr + ); + time_range = new_time_range + } + + time_range + } + + /// Extract timestamp from the literal scalar expression. + fn timestamp_from_scalar_expr(expr: &Expr) -> Option { + if let Expr::Literal(ScalarValue::TimestampMillisecond(v, _)) = expr { + return v.map(Timestamp::new); + } + + None + } + + /// Compute the intersection of the two time ranges. + fn and_time_ranges(left: &TimeRange, right: &TimeRange) -> TimeRange { + let start = left.inclusive_start().max(right.inclusive_start()); + let end = left.exclusive_end().min(right.exclusive_end()); + TimeRange::new(start, end).unwrap_or_else(TimeRange::empty) + } + + /// Compute the union of the two time ranges and the union is defined as the + /// [min(left.start(), right.start()), max(left.end(), right.end())). 
+ fn or_time_ranges(left: &TimeRange, right: &TimeRange) -> TimeRange { + let start = left.inclusive_start().min(right.inclusive_start()); + let end = left.exclusive_end().max(right.exclusive_end()); + TimeRange::new_unchecked(start, end) + } + + /// Extract the timestamp from the column expression and its corresponding + /// literal expression. Returns `None` if the expression pair is not + /// involved with timestamp column. No assumption on the order of the + /// `left` and `right`. + fn timestamp_from_column_and_value_expr(&self, left: &Expr, right: &Expr) -> Option { + let (column, val) = match (left, right) { + (Expr::Column(column), Expr::Literal(_)) => (column, right), + (Expr::Literal(_), Expr::Column(column)) => (column, left), + _ => return None, + }; + + if column.name == self.timestamp_column_name { + Self::timestamp_from_scalar_expr(val) + } else { + None + } + } + + /// Extract time range from the binary expression. + fn extract_time_range_from_binary_expr( + &self, + left: &Expr, + right: &Expr, + op: &Operator, + ) -> TimeRange { + match op { + Operator::And => { + let time_range_left = self.extract_time_range_from_expr(left); + let time_range_right = self.extract_time_range_from_expr(right); + Self::and_time_ranges(&time_range_left, &time_range_right) + } + Operator::Or => { + let time_range_left = self.extract_time_range_from_expr(left); + let time_range_right = self.extract_time_range_from_expr(right); + Self::or_time_ranges(&time_range_left, &time_range_right) + } + Operator::Eq => self + .timestamp_from_column_and_value_expr(left, right) + .map(TimeRange::from_timestamp) + .unwrap_or_else(TimeRange::min_to_max), + Operator::NotEq => TimeRange::min_to_max(), + Operator::Lt => self + .timestamp_from_column_and_value_expr(left, right) + .map(|right_t| TimeRange::new_unchecked(Timestamp::MIN, right_t)) + .unwrap_or_else(TimeRange::min_to_max), + Operator::LtEq => self + .timestamp_from_column_and_value_expr(left, right) + .map(|right_t| { + let 
right_t = right_t.checked_add_i64(1).unwrap_or(right_t); + TimeRange::new_unchecked(Timestamp::MIN, right_t) + }) + .unwrap_or_else(TimeRange::min_to_max), + Operator::Gt => self + .timestamp_from_column_and_value_expr(left, right) + .map(|left_t| { + let left_t = left_t.checked_add_i64(1).unwrap_or(left_t); + TimeRange::new_unchecked(left_t, Timestamp::MAX) + }) + .unwrap_or_else(TimeRange::min_to_max), + Operator::GtEq => self + .timestamp_from_column_and_value_expr(left, right) + .map(|left_t| TimeRange::new_unchecked(left_t, Timestamp::MAX)) + .unwrap_or_else(TimeRange::min_to_max), + Operator::Plus + | Operator::Minus + | Operator::Multiply + | Operator::Divide + | Operator::Modulo + | Operator::Like + | Operator::NotLike + | Operator::IsDistinctFrom + | Operator::IsNotDistinctFrom + | Operator::RegexMatch + | Operator::RegexNotMatch + | Operator::RegexIMatch + | Operator::RegexNotIMatch => TimeRange::min_to_max(), + } + } + + /// Extract time range from the between expression. + fn time_range_from_between_expr(low: &Expr, high: &Expr, negated: bool) -> TimeRange { + if negated { + return TimeRange::min_to_max(); + } + + let low_t = Self::timestamp_from_scalar_expr(low).unwrap_or(Timestamp::MIN); + // the two operands are inclusive in the `between` expression. + let high_t = { + let t = Self::timestamp_from_scalar_expr(high).unwrap_or(Timestamp::MAX); + t.checked_add_i64(1).unwrap_or(Timestamp::MAX) + }; + TimeRange::new(low_t, high_t).unwrap_or_else(TimeRange::empty) + } + + /// Extract time range from the list expressions. 
+ fn time_range_from_list_expr(list: &[Expr], negated: bool) -> TimeRange { + if negated { + return TimeRange::min_to_max(); + } + + if list.is_empty() { + return TimeRange::empty(); + } + + let (mut inclusive_start, mut inclusive_end) = (Timestamp::MAX, Timestamp::MIN); + for expr in list { + match Self::timestamp_from_scalar_expr(expr) { + Some(t) => { + inclusive_start = inclusive_start.min(t); + inclusive_end = inclusive_end.max(t); + } + None => return TimeRange::min_to_max(), + } + } + + TimeRange::new(inclusive_start, inclusive_end).unwrap_or_else(TimeRange::empty) + } + + /// Extract the time range recursively from the `expr`. + /// + /// Now the strategy is conservative: for the sub-expr which we are not sure + /// how to handle it, returns `TimeRange::zero_to_max()`. + fn extract_time_range_from_expr(&self, expr: &Expr) -> TimeRange { + match expr { + Expr::BinaryExpr { left, op, right } => { + self.extract_time_range_from_binary_expr(left, right, op) + } + Expr::Between { + expr, + negated, + low, + high, + } => { + if let Expr::Column(column) = expr.as_ref() { + if column.name == self.timestamp_column_name { + return Self::time_range_from_between_expr(&*low, &*high, *negated); + } + } + + TimeRange::min_to_max() + } + Expr::InList { + expr, + list, + negated, + } => { + if let Expr::Column(column) = expr.as_ref() { + if column.name == self.timestamp_column_name { + return Self::time_range_from_list_expr(list, *negated); + } + } + + TimeRange::min_to_max() + } + Expr::Not(_) + | Expr::Alias(_, _) + | Expr::ScalarVariable(_) + | Expr::Column(_) + | Expr::Literal(_) + | Expr::IsNotNull(_) + | Expr::IsNull(_) + | Expr::Negative(_) + | Expr::Case { .. } + | Expr::Cast { .. } + | Expr::TryCast { .. } + | Expr::Sort { .. } + | Expr::ScalarFunction { .. } + | Expr::ScalarUDF { .. } + | Expr::AggregateFunction { .. } + | Expr::WindowFunction { .. } + | Expr::AggregateUDF { .. } + | Expr::Wildcard { .. } + | Expr::GetIndexedField { .. 
} => TimeRange::min_to_max(), + } + } +} diff --git a/table_engine/src/provider.rs b/table_engine/src/provider.rs new file mode 100644 index 0000000000..92e2ed57e0 --- /dev/null +++ b/table_engine/src/provider.rs @@ -0,0 +1,275 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Datafusion `TableProvider` adapter + +use std::{any::Any, fmt, sync::Arc}; + +use arrow_deps::{ + arrow::datatypes::SchemaRef, + datafusion::{ + datasource::datasource::{TableProvider, TableProviderFilterPushDown}, + error::{DataFusionError, Result}, + execution::runtime_env::RuntimeEnv, + logical_plan::Expr, + physical_plan::{ + DisplayFormatType, ExecutionPlan, Partitioning, + SendableRecordBatchStream as DfSendableRecordBatchStream, Statistics, + }, + }, +}; +use async_trait::async_trait; +use common_types::{projected_schema::ProjectedSchema, request_id::RequestId, schema::Schema}; +use log::debug; +use tokio::sync::Mutex; + +use crate::{ + predicate::{PredicateBuilder, PredicateRef}, + stream::{SendableRecordBatchStream, ToDfStream}, + table::{self, ReadOptions, ReadOrder, ReadRequest, TableRef}, +}; + +/// An adapter to [TableProvider] with schema snapshot. +/// +/// This adapter holds a schema snapshot of the table and always returns that +/// schema to caller. 
+#[derive(Debug)] +pub struct TableProviderAdapter { + table: TableRef, + /// The schema of the table when this adapter is created, used as schema + /// snapshot for read to avoid the reader sees different schema during + /// query + read_schema: Schema, + request_id: RequestId, + read_parallelism: usize, +} + +impl TableProviderAdapter { + pub fn new(table: TableRef, request_id: RequestId, read_parallelism: usize) -> Self { + // Take a snapshot of the schema + let read_schema = table.schema(); + + Self { + table, + read_schema, + request_id, + read_parallelism, + } + } + + pub fn as_table_ref(&self) -> &TableRef { + &self.table + } + + pub fn scan_table( + &self, + projection: &Option>, + filters: &[Expr], + limit: Option, + read_order: ReadOrder, + ) -> Result> { + debug!( + "scan table, table:{}, request_id:{}, projection:{:?}, filters:{:?}, limit:{:?}, read_order:{:?}", + self.table.name(), + self.request_id, + projection, + filters, + limit, + read_order, + ); + + // Forbid the parallel reading if the data order is required. 
+ let read_parallelism = if read_order.is_in_order() { + 1 + } else { + self.read_parallelism + }; + + let predicate = self.predicate_from_filters(filters); + Ok(Arc::new(ScanTable { + projected_schema: ProjectedSchema::new(self.read_schema.clone(), projection.clone()) + .map_err(|e| { + DataFusionError::Internal(format!( + "Invalid projection, plan:{:?}, projection:{:?}, err:{:?}", + self, projection, e + )) + })?, + table: self.table.clone(), + request_id: self.request_id, + read_order, + read_parallelism, + predicate, + stream_state: Mutex::new(ScanStreamState::default()), + })) + } + + fn predicate_from_filters(&self, filters: &[Expr]) -> PredicateRef { + PredicateBuilder::default() + .add_pushdown_exprs(filters) + .set_time_range(&self.read_schema, filters) + .build() + } +} + +#[async_trait] +impl TableProvider for TableProviderAdapter { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + // We use the `read_schema` as the schema of this `TableProvider` + self.read_schema.clone().into_arrow_schema_ref() + } + + async fn scan( + &self, + projection: &Option>, + filters: &[Expr], + limit: Option, + ) -> Result> { + self.scan_table(projection, filters, limit, ReadOrder::None) + } + + fn supports_filter_pushdown(&self, _filter: &Expr) -> Result { + Ok(TableProviderFilterPushDown::Inexact) + } +} + +#[derive(Default)] +struct ScanStreamState { + inited: bool, + err: Option, + streams: Vec>, +} + +impl ScanStreamState { + fn take_stream(&mut self, index: usize) -> Result { + if let Some(e) = &self.err { + return Err(DataFusionError::Execution(format!( + "Failed to read table, partition:{}, err:{}", + index, e + ))); + } + + // TODO(yingwen): Return an empty stream if index is out of bound. + self.streams[index].take().ok_or_else(|| { + DataFusionError::Execution(format!( + "Read partition multiple times is not supported, partition:{}", + index + )) + }) + } +} + +/// Physical plan of scanning table. 
+struct ScanTable { + projected_schema: ProjectedSchema, + table: TableRef, + request_id: RequestId, + read_order: ReadOrder, + read_parallelism: usize, + predicate: PredicateRef, + + stream_state: Mutex, +} + +impl ScanTable { + async fn maybe_init_stream(&self, runtime: Arc) -> Result<()> { + let mut stream_state = self.stream_state.lock().await; + if stream_state.inited { + return Ok(()); + } + + let req = ReadRequest { + request_id: self.request_id, + opts: ReadOptions { + batch_size: runtime.batch_size(), + read_parallelism: self.read_parallelism, + }, + projected_schema: self.projected_schema.clone(), + predicate: self.predicate.clone(), + order: self.read_order, + }; + + let read_res = self.table.partitioned_read(req).await; + match read_res { + Ok(partitioned_streams) => { + assert_eq!(self.read_parallelism, partitioned_streams.streams.len()); + stream_state.streams = partitioned_streams.streams.into_iter().map(Some).collect(); + } + Err(e) => { + stream_state.err = Some(e); + } + } + stream_state.inited = true; + + Ok(()) + } +} + +#[async_trait] +impl ExecutionPlan for ScanTable { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + self.projected_schema.to_projected_arrow_schema() + } + + fn output_partitioning(&self) -> Partitioning { + Partitioning::RoundRobinBatch(self.read_parallelism) + } + + fn children(&self) -> Vec> { + // this is a leaf node and has no children + vec![] + } + + fn with_new_children(&self, _: Vec>) -> Result> { + Err(DataFusionError::Internal(format!( + "Children cannot be replaced in {:?}", + self + ))) + } + + async fn execute( + &self, + partition: usize, + runtime: Arc, + ) -> Result { + self.maybe_init_stream(runtime).await?; + + let mut stream_state = self.stream_state.lock().await; + let stream = stream_state.take_stream(partition)?; + + Ok(Box::pin(ToDfStream(stream))) + } + + fn fmt_as(&self, _t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "ScanTable: 
table={}, parallelism={}, order={:?}, ", + self.table.name(), + self.read_parallelism, + self.read_order, + ) + } + + fn statistics(&self) -> Statistics { + // TODO(yingwen): Implement this + Statistics::default() + } +} + +impl fmt::Debug for ScanTable { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("ScanTable") + .field("projected_schema", &self.projected_schema) + .field("table", &self.table.name()) + .field("read_order", &self.read_order) + .field("read_parallelism", &self.read_parallelism) + .field("predicate", &self.predicate) + .finish() + } +} diff --git a/table_engine/src/stream.rs b/table_engine/src/stream.rs new file mode 100644 index 0000000000..fc8245d07c --- /dev/null +++ b/table_engine/src/stream.rs @@ -0,0 +1,128 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Table record stream + +use std::{ + convert::TryFrom, + pin::Pin, + task::{Context, Poll}, +}; + +use arrow_deps::{ + arrow::{ + datatypes::SchemaRef, + error::{ArrowError, Result as ArrowResult}, + record_batch::RecordBatch as ArrowRecordBatch, + }, + datafusion::physical_plan::{ + RecordBatchStream as DfRecordBatchStream, + SendableRecordBatchStream as DfSendableRecordBatchStream, + }, +}; +use common_types::{record_batch::RecordBatch, schema::RecordSchema}; +use common_util::define_result; +use futures::stream::Stream; +use snafu::{Backtrace, ResultExt, Snafu}; + +// TODO(yingwen): Classify the error. +#[derive(Debug, Snafu)] +#[snafu(visibility(pub))] +pub enum Error { + #[snafu(display("Stream error, msg:{}, err:{}", msg, source))] + ErrWithSource { + msg: String, + source: Box, + }, + + #[snafu(display("Stream error, msg:{}.\nBacktrace:\n{}", msg, backtrace))] + ErrNoSource { msg: String, backtrace: Backtrace }, +} + +define_result!(Error); + +pub trait RecordBatchStream: Stream> { + fn schema(&self) -> &RecordSchema; +} + +pub type SendableRecordBatchStream = Pin>; + +/// Record batch streams divided by time range. 
+pub struct PartitionedStreams { + pub streams: Vec, +} + +impl PartitionedStreams { + pub fn one_stream(stream: SendableRecordBatchStream) -> Self { + Self { + streams: vec![stream], + } + } +} + +pub struct ToDfStream(pub SendableRecordBatchStream); + +impl Stream for ToDfStream { + type Item = ArrowResult; + + fn poll_next(mut self: Pin<&mut Self>, ctx: &mut Context<'_>) -> Poll> { + match self.0.as_mut().poll_next(ctx) { + Poll::Ready(Some(Ok(record_batch))) => { + Poll::Ready(Some(Ok(record_batch.into_arrow_record_batch()))) + } + Poll::Ready(Some(Err(e))) => { + Poll::Ready(Some(Err(ArrowError::ExternalError(Box::new(e))))) + } + Poll::Ready(None) => Poll::Ready(None), + Poll::Pending => Poll::Pending, + } + } +} + +impl DfRecordBatchStream for ToDfStream { + fn schema(&self) -> SchemaRef { + self.0.schema().to_arrow_schema_ref() + } +} + +pub struct FromDfStream { + schema: RecordSchema, + df_stream: DfSendableRecordBatchStream, +} + +impl FromDfStream { + pub fn new(df_stream: DfSendableRecordBatchStream) -> Result { + let df_schema = df_stream.schema(); + let schema = RecordSchema::try_from(df_schema) + .map_err(|e| Box::new(e) as _) + .context(ErrWithSource { + msg: "Convert record schema", + })?; + + Ok(Self { schema, df_stream }) + } +} + +impl Stream for FromDfStream { + type Item = Result; + + fn poll_next(mut self: Pin<&mut Self>, ctx: &mut Context<'_>) -> Poll> { + match self.df_stream.as_mut().poll_next(ctx) { + Poll::Ready(Some(record_batch_res)) => Poll::Ready(Some( + record_batch_res + .map_err(|e| Box::new(e) as _) + .and_then(|batch| RecordBatch::try_from(batch).map_err(|e| Box::new(e) as _)) + .context(ErrWithSource { + msg: "Convert from arrow record batch", + }), + )), + Poll::Ready(None) => Poll::Ready(None), + Poll::Pending => Poll::Pending, + } + } +} + +impl RecordBatchStream for FromDfStream { + fn schema(&self) -> &RecordSchema { + &self.schema + } +} diff --git a/table_engine/src/table.rs b/table_engine/src/table.rs new file mode 
100644 index 0000000000..b361756e8d --- /dev/null +++ b/table_engine/src/table.rs @@ -0,0 +1,608 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Table abstraction + +use std::{ + collections::HashMap, + convert::TryFrom, + fmt, + sync::{ + atomic::{AtomicU32, AtomicU64, Ordering}, + Arc, + }, +}; + +use async_trait::async_trait; +use common_types::{ + column_schema::ColumnSchema, + datum::Datum, + projected_schema::ProjectedSchema, + request_id::RequestId, + row::{Row, RowGroup}, + schema::{RecordSchemaWithKey, Schema, Version}, + time::Timestamp, +}; +use proto::sys_catalog::{TableEntry, TableState as TableStatePb}; +use serde_derive::Deserialize; +use snafu::{Backtrace, Snafu}; + +use crate::{ + engine::{TableRequestType, TableState}, + predicate::PredicateRef, + stream::{PartitionedStreams, SendableRecordBatchStream}, +}; + +/// Contains common error variant, implementation specific error should +/// be cast into Box +#[derive(Debug, Snafu)] +#[snafu(visibility = "pub")] +pub enum Error { + #[snafu(display( + "Unsupported table method, table:{}, method:{}.\nBacktrace:\n{}", + table, + method, + backtrace + ))] + UnsupportedMethod { + table: String, + method: String, + backtrace: Backtrace, + }, + + #[snafu(display( + "Get Invalid primary key, expected schema:{:?}, given_primary_keys:{:?}.\nBacktrace:\n{}", + schema, + primary_key_columns, + backtrace + ))] + GetInvalidPrimaryKey { + schema: RecordSchemaWithKey, + primary_key_columns: Vec, + backtrace: Backtrace, + }, + + #[snafu(display( + "Get null primary key, expected schema:{:?}, given_primary_keys:{:?}.\nBacktrace:\n{}", + schema, + primary_key_columns, + backtrace + ))] + GetNullPrimaryKey { + schema: RecordSchemaWithKey, + primary_key_columns: Vec, + backtrace: Backtrace, + }, + + #[snafu(display("Unexpected error, err:{}", source))] + Unexpected { + source: Box, + }, + + #[snafu(display("Invalid arguments, err:{}", source))] + InvalidArguments { + table: String, + source: 
Box, + }, + + #[snafu(display("Failed to write table, table:{}, err:{}", table, source))] + Write { + table: String, + source: Box, + }, + + #[snafu(display("Failed to scan table, table:{}, err:{}", table, source))] + Scan { + table: String, + source: Box, + }, + + #[snafu(display("Failed to get table, table:{}, err:{}", table, source))] + Get { + table: String, + source: Box, + }, + + #[snafu(display("Failed to alter schema, table:{}, err:{}", table, source))] + AlterSchema { + table: String, + source: Box, + }, + + #[snafu(display("Failed to alter options, table:{}, err:{}", table, source))] + AlterOptions { + table: String, + source: Box, + }, + + #[snafu(display("Failed to flush table, table:{}, err:{}", table, source))] + Flush { + table: String, + source: Box, + }, + + #[snafu(display("Failed to compact table, table:{}, err:{}", table, source))] + Compact { + table: String, + source: Box, + }, +} + +define_result!(Error); + +/// Default partition num to scan in parallelism. +pub const DEFAULT_READ_PARALLELISM: usize = 8; + +/// Schema id (24 bits) +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct SchemaId(u32); + +impl SchemaId { + /// Bits of schema id. + const BITS: u32 = 24; + /// 24 bits mask (0xffffff) + const MASK: u32 = (1 << Self::BITS) - 1; + /// Max schema id. + pub const MAX: SchemaId = SchemaId(Self::MASK); + /// Min schema id. + pub const MIN: SchemaId = SchemaId(0); + + /// Create a new schema id from u32, return None if `id` is invalid. + pub fn new(id: u32) -> Option { + // Only need to check max as min is 0. + if id <= SchemaId::MAX.0 { + Some(Self(id)) + } else { + None + } + } + + // It is safe to convert u16 into schema id. + pub const fn from_u16(id: u16) -> Self { + Self(id as u32) + } + + /// Convert the schema id into u32. 
+ #[inline] + pub fn as_u32(&self) -> u32 { + self.0 + } +} + +impl PartialEq for SchemaId { + fn eq(&self, other: &u32) -> bool { + self.0 == *other + } +} + +impl From for SchemaId { + fn from(id: u16) -> SchemaId { + SchemaId::from_u16(id) + } +} + +/// Sequence of a table under a schema (40 bits). +#[derive(Debug, Clone, Copy)] +pub struct TableSeq(u64); + +impl TableSeq { + /// Bits of schema id. + const BITS: u64 = 40; + /// 40 bits mask (0xffffffffff). + const MASK: u64 = (1 << Self::BITS) - 1; + /// Max sequence of table in a schema. + pub const MAX: TableSeq = TableSeq(Self::MASK); + /// Min sequence of table in a schema. + pub const MIN: TableSeq = TableSeq(0); + + /// Create a new table sequence from u64, return None if `seq` is invalid. + pub const fn new(seq: u64) -> Option { + // Only need to check max as min is 0. + if seq <= TableSeq::MAX.0 { + Some(Self(seq)) + } else { + None + } + } + + // It is safe to convert u32 into table seq. + pub const fn from_u32(id: u32) -> Self { + Self(id as u64) + } + + /// Convert the table sequence into u64. + #[inline] + pub fn as_u64(&self) -> u64 { + self.0 + } +} + +impl From for TableSeq { + fn from(id: u32) -> TableSeq { + TableSeq::from_u32(id) + } +} + +/// Table Id (64 bits) +/// +/// Table id is constructed via schema id (24 bits) and a table sequence (40 +/// bits). +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Deserialize)] +pub struct TableId(u64); + +impl TableId { + /// Min table id. + pub const MIN: TableId = TableId(0); + + /// Create a new table id from `schema_id` and `table_seq`. + pub const fn new(schema_id: SchemaId, table_seq: TableSeq) -> Self { + let schema_id_data = schema_id.0 as u64; + let schema_id_part = schema_id_data << TableSeq::BITS; + let table_id_data = schema_id_part | table_seq.0; + + Self(table_id_data) + } + + /// Get the schema id part of the table id. 
+ #[inline] + pub fn schema_id(&self) -> SchemaId { + let schema_id_part = self.0 >> TableSeq::BITS; + + SchemaId(schema_id_part as u32) + } + + /// Get the sequence part of the table id. + #[inline] + pub fn table_seq(&self) -> TableSeq { + let seq_part = self.0 & TableSeq::MASK; + + TableSeq(seq_part) + } + + /// Convert table id into u64. + #[inline] + pub fn as_u64(&self) -> u64 { + self.0 + } +} + +impl From for TableId { + fn from(id: u64) -> TableId { + TableId(id) + } +} + +impl fmt::Debug for TableId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "TableId({}, {}, {})", + self.0, + self.schema_id().as_u32(), + self.table_seq().as_u64() + ) + } +} + +impl fmt::Display for TableId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.0) + } +} + +// TODO(yingwen): Support DELETE/UPDATE... , a mutation type is needed. +#[derive(Debug)] +pub struct WriteRequest { + /// rows to write + pub row_group: RowGroup, +} + +#[derive(Debug)] +pub struct ReadOptions { + pub batch_size: usize, + /// Suggested read parallelism, the actual returned stream should equal to + /// `read_parallelism`. + pub read_parallelism: usize, +} + +impl Default for ReadOptions { + fn default() -> Self { + Self { + batch_size: 10000, + read_parallelism: DEFAULT_READ_PARALLELISM, + } + } +} + +#[derive(Debug)] +pub struct GetRequest { + /// Query request id. + pub request_id: RequestId, + /// The schema and projection for get, the output data should match this + /// schema. + pub projected_schema: ProjectedSchema, + /// The primary key of the row to get. + pub primary_key: Vec, +} + +#[derive(Copy, Clone, Debug)] +pub enum ReadOrder { + /// No order requirements from the read request. 
+ None, + Asc, + Desc, +} + +impl ReadOrder { + pub fn from_is_asc(is_asc: Option) -> Self { + match is_asc { + Some(true) => ReadOrder::Asc, + Some(false) => ReadOrder::Desc, + None => ReadOrder::None, + } + } + + #[inline] + pub fn is_out_of_order(&self) -> bool { + matches!(self, ReadOrder::None) + } + + #[inline] + pub fn is_in_order(&self) -> bool { + !self.is_out_of_order() + } + + #[inline] + pub fn is_in_desc_order(&self) -> bool { + matches!(self, ReadOrder::Desc) + } +} + +#[derive(Debug)] +pub struct ReadRequest { + /// Read request id. + pub request_id: RequestId, + /// Read options. + pub opts: ReadOptions, + /// The schema and projection for read, the output data should match this + /// schema. + pub projected_schema: ProjectedSchema, + /// Predicate of the query. + pub predicate: PredicateRef, + /// Read the rows in reverse order. + pub order: ReadOrder, +} + +#[derive(Debug)] +pub struct AlterSchemaRequest { + /// The new schema. + pub schema: Schema, + /// Previous schema version before alteration. + pub pre_schema_version: Version, +} + +#[derive(Debug)] +pub struct FlushRequest { + /// Trigger a compaction after flush, default is true. + pub compact_after_flush: bool, + /// Whether to wait flush task finishes, default is true. + pub sync: bool, +} + +impl Default for FlushRequest { + fn default() -> Self { + Self { + compact_after_flush: true, + sync: true, + } + } +} + +/// Table abstraction +/// +/// We do not let Table trait extends datafusion's TableProvider, since +/// that will tie out abstraction with datafusion. However, we still use +/// datafusion's RecordBatchStream trait. +#[async_trait] +pub trait Table: std::fmt::Debug { + /// Returns table name. + fn name(&self) -> &str; + + /// Returns the id of this table. + fn id(&self) -> TableId; + + /// Schema of this table. + fn schema(&self) -> Schema; + + /// Options of this table. + fn options(&self) -> HashMap; + + /// Engine type of this table. 
+ fn engine_type(&self) -> &str; + + /// Get table's statistics. + fn stats(&self) -> TableStats; + + /// Write to table. + async fn write(&self, request: WriteRequest) -> Result; + + /// Read from table. + async fn read(&self, request: ReadRequest) -> Result; + + /// Get the specific row according to the primary key. + /// TODO(xikai): object-safety is not ensured by now if the default + /// implementation is provided. Actually it is better to use the read + /// method to implement the get method. + async fn get(&self, request: GetRequest) -> Result>; + + /// Read multiple partition of the table in parallel. + async fn partitioned_read(&self, request: ReadRequest) -> Result; + + /// Alter table schema to the schema specific in [AlterSchemaRequest] if + /// the `pre_schema_version` is equal to current schema version. + /// + /// Returns the affected rows (always 1). + async fn alter_schema(&self, request: AlterSchemaRequest) -> Result; + + /// Alter table options. + /// + /// Returns the affected rows (always 1). + async fn alter_options(&self, options: HashMap) -> Result; + + /// Flush this table. + async fn flush(&self, request: FlushRequest) -> Result<()>; + + /// Compact this table and wait until compaction completes. + async fn compact(&self) -> Result<()>; +} + +/// Basic statistics of table. +#[derive(Debug, Clone, Copy, Default)] +pub struct TableStats { + /// Total write request + pub num_write: u64, + /// Total read request + pub num_read: u64, + /// Total flush request + pub num_flush: u64, +} + +/// A reference-counted pointer to Table +pub type TableRef = Arc; + +/// Helper to generate a schema id. 
+pub struct SchemaIdGenerator { + last_schema_id: AtomicU32, +} + +impl SchemaIdGenerator { + pub fn last_schema_id_u32(&self) -> u32 { + self.last_schema_id.load(Ordering::Relaxed) + } + + pub fn set_last_schema_id(&self, last_schema_id: SchemaId) { + self.last_schema_id + .store(last_schema_id.as_u32(), Ordering::Relaxed); + } + + pub fn alloc_schema_id(&self) -> Option { + let last = self.last_schema_id.fetch_add(1, Ordering::Relaxed); + + SchemaId::new(last + 1) + } +} + +impl Default for SchemaIdGenerator { + fn default() -> Self { + Self { + last_schema_id: AtomicU32::new(SchemaId::MIN.as_u32()), + } + } +} + +/// Helper to generate a table sequence. +pub struct TableSeqGenerator { + last_table_seq: AtomicU64, +} + +impl TableSeqGenerator { + pub fn last_table_seq_u64(&self) -> u64 { + self.last_table_seq.load(Ordering::Relaxed) + } + + pub fn set_last_table_seq(&self, last_table_seq: TableSeq) { + self.last_table_seq + .store(last_table_seq.as_u64(), Ordering::Relaxed); + } + + pub fn alloc_table_seq(&self) -> Option { + let last = self.last_table_seq.fetch_add(1, Ordering::Relaxed); + + TableSeq::new(last + 1) + } +} + +impl Default for TableSeqGenerator { + fn default() -> Self { + Self { + last_table_seq: AtomicU64::new(TableSeq::MIN.as_u64()), + } + } +} + +/// Create table request in catalog +#[derive(Debug, Clone)] +pub struct TableInfo { + /// Catalog name + pub catalog_name: String, + /// Schema name + pub schema_name: String, + /// Table id + pub table_id: TableId, + /// Table name + pub table_name: String, + /// Table engine type + pub engine: String, + /// Tells state of the table + pub state: TableState, +} + +#[derive(Debug, Snafu)] +pub struct TryFromTableEntryError(common_types::schema::Error); + +impl TryFrom for TableInfo { + type Error = TryFromTableEntryError; + + fn try_from(entry: TableEntry) -> std::result::Result { + Ok(Self { + catalog_name: entry.catalog_name, + schema_name: entry.schema_name, + table_id: entry.table_id.into(), + 
table_name: entry.table_name, + engine: entry.engine, + state: TableState::from(entry.state), + }) + } +} + +impl From for TableEntry { + fn from(table_info: TableInfo) -> Self { + let mut entry = TableEntry::new(); + entry.set_catalog_name(table_info.catalog_name); + entry.set_schema_name(table_info.schema_name); + entry.set_table_id(table_info.table_id.as_u64()); + entry.set_table_name(table_info.table_name); + entry.set_engine(table_info.engine); + entry.set_state(TableStatePb::from(table_info.state)); + + entry + } +} + +impl TableInfo { + // TODO(chunshao.rcs): refactor + pub fn into_pb(self, typ: TableRequestType) -> TableEntry { + let mut table_entry: TableEntry = self.into(); + match typ { + TableRequestType::Create => table_entry.set_created_time(Timestamp::now().as_i64()), + TableRequestType::Drop => table_entry.set_modified_time(Timestamp::now().as_i64()), + } + table_entry + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_schema_id() { + assert_eq!(0, SchemaId::MIN.as_u32()); + assert_eq!(0xffffff, SchemaId::MAX.as_u32()); + } + + #[test] + fn test_table_seq() { + assert_eq!(0, TableSeq::MIN.as_u64()); + assert_eq!(0xffffffffff, TableSeq::MAX.as_u64()); + } +} diff --git a/udf/Cargo.toml b/udf/Cargo.toml new file mode 100644 index 0000000000..a4895e787d --- /dev/null +++ b/udf/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "udf" +version = "0.1.0" +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +arrow_deps = { path = "../arrow_deps" } +base64 = "0.13" +chrono = "0.4" +common_types = { path = "../common_types" } +common_util = { path = "../common_util" } +hyperloglog = { path = "../components/rust-hyperloglog" } +smallvec = "1.6" +snafu = { version ="0.6.10", features = ["backtraces"]} diff --git a/udf/src/aggregate.rs b/udf/src/aggregate.rs new file mode 100644 index 0000000000..45fa24b73b --- /dev/null +++ b/udf/src/aggregate.rs @@ -0,0 
+1,164 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Aggregate functions. + +use std::{fmt, ops::Deref}; + +use arrow_deps::{ + arrow::array::ArrayRef as DfArrayRef, + datafusion::{ + error::{DataFusionError, Result as DfResult}, + physical_plan::Accumulator as DfAccumulator, + scalar::ScalarValue as DfScalarValue, + }, +}; +use common_util::define_result; +use snafu::Snafu; + +use crate::functions::{ScalarValue, ScalarValueRef}; + +#[derive(Debug, Snafu)] +#[snafu(visibility(pub(crate)))] +pub enum Error { + #[snafu(display("Failed to get state, err:{}", source))] + GetState { + source: Box, + }, + + #[snafu(display("Failed to merge state, err:{}", source))] + MergeState { + source: Box, + }, +} + +define_result!(Error); + +pub struct State(Vec); + +impl State { + fn into_df_scalar_values(self) -> Vec { + self.0 + } +} + +impl From for State { + fn from(value: ScalarValue) -> Self { + Self(vec![value.into_df_scalar_value()]) + } +} + +pub struct Input<'a>(&'a [DfScalarValue]); + +impl<'a> Input<'a> { + pub fn iter(&self) -> impl Iterator { + self.0.iter().map(ScalarValueRef::from) + } + + pub fn len(&self) -> usize { + self.0.len() + } + + pub fn is_empty(&self) -> bool { + self.0.is_empty() + } + + pub fn value(&self, index: usize) -> ScalarValueRef { + ScalarValueRef::from(&self.0[index]) + } +} + +pub struct StateRef<'a>(Input<'a>); + +impl<'a> Deref for StateRef<'a> { + type Target = Input<'a>; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +/// An accumulator represents a stateful object that lives throughout the +/// evaluation of multiple rows and generically accumulates values. 
+/// +/// An accumulator knows how to: +/// * update its state from inputs via `update` +/// * convert its internal state to a vector of scalar values +/// * update its state from multiple accumulators' states via `merge` +/// * compute the final value from its internal state via `evaluate` +pub trait Accumulator: Send + Sync + fmt::Debug { + /// Returns the state of the accumulator at the end of the accumulation. + // in the case of an average on which we track `sum` and `n`, this function + // should return a vector of two values, sum and n. + fn state(&self) -> Result; + + /// updates the accumulator's state from a vector of scalars. + fn update(&mut self, values: Input) -> Result<()>; + + /// updates the accumulator's state from a vector of scalars. + fn merge(&mut self, states: StateRef) -> Result<()>; + + /// returns its value based on its current state. + fn evaluate(&self) -> Result; +} + +#[derive(Debug)] +pub struct ToDfAccumulator { + accumulator: T, +} + +impl ToDfAccumulator { + pub fn new(accumulator: T) -> Self { + Self { accumulator } + } +} + +impl DfAccumulator for ToDfAccumulator { + fn state(&self) -> DfResult> { + let state = self.accumulator.state().map_err(|e| { + DataFusionError::Execution(format!("Accumulator failed to get state, err:{}", e)) + })?; + Ok(state.into_df_scalar_values()) + } + + fn update_batch(&mut self, values: &[DfArrayRef]) -> DfResult<()> { + if values.is_empty() { + return Ok(()); + }; + (0..values[0].len()).try_for_each(|index| { + let v = values + .iter() + .map(|array| DfScalarValue::try_from_array(array, index)) + .collect::>>()?; + let input = Input(&v); + + self.accumulator.update(input).map_err(|e| { + DataFusionError::Execution(format!("Accumulator failed to update, err:{}", e)) + }) + }) + } + + fn merge_batch(&mut self, states: &[DfArrayRef]) -> DfResult<()> { + if states.is_empty() { + return Ok(()); + }; + (0..states[0].len()).try_for_each(|index| { + let v = states + .iter() + .map(|array| 
DfScalarValue::try_from_array(array, index)) + .collect::>>()?; + let state_ref = StateRef(Input(&v)); + + self.accumulator.merge(state_ref).map_err(|e| { + DataFusionError::Execution(format!("Accumulator failed to merge, err:{}", e)) + }) + }) + } + + fn evaluate(&self) -> DfResult { + let value = self.accumulator.evaluate().map_err(|e| { + DataFusionError::Execution(format!("Accumulator failed to evaluate, err:{}", e)) + })?; + + Ok(value.into_df_scalar_value()) + } +} diff --git a/udf/src/functions.rs b/udf/src/functions.rs new file mode 100644 index 0000000000..6fcd2df4be --- /dev/null +++ b/udf/src/functions.rs @@ -0,0 +1,326 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Functions. + +use std::{ + hash::{Hash, Hasher}, + sync::Arc, +}; + +use arrow_deps::{ + arrow::datatypes::DataType, + datafusion::{ + error::DataFusionError, + physical_plan::{ + aggregates::{AccumulatorFunctionImplementation, StateTypeFunction}, + functions::{ + ReturnTypeFunction, ScalarFunctionImplementation, Signature as DfSignature, + TypeSignature as DfTypeSignature, Volatility, + }, + ColumnarValue as DfColumnarValue, + }, + scalar::ScalarValue as DfScalarValue, + }, +}; +use common_types::{column::ColumnBlock, datum::DatumKind}; +use common_util::define_result; +use smallvec::SmallVec; +use snafu::{ResultExt, Snafu}; + +use crate::aggregate::{Accumulator, ToDfAccumulator}; + +// Most functions have no more than 5 args. +const FUNC_ARG_NUM: usize = 5; + +#[derive(Debug, Snafu)] +#[snafu(visibility(pub(crate)))] +pub enum Error { + #[snafu(display("Failed to convert array to ColumnarValue, err:{}", source))] + InvalidArray { source: common_types::column::Error }, + + #[snafu(display("Invalid function arguments, err:{}", source))] + InvalidArguments { + source: Box, + }, + + #[snafu(display("Failed to execute function, err:{}", source))] + CallFunction { + source: Box, + }, +} + +define_result!(Error); + +/// A dynamically typed, nullable single value. 
+// TODO(yingwen): Can we use Datum? +#[derive(Debug)] +pub struct ScalarValue(DfScalarValue); + +impl ScalarValue { + pub(crate) fn into_df_scalar_value(self) -> DfScalarValue { + self.0 + } + + fn from_df_scalar_value(df_scalar: &DfScalarValue) -> Self { + Self(df_scalar.clone()) + } + + pub fn as_str(&self) -> Option<&str> { + match &self.0 { + DfScalarValue::Utf8(value_opt) => value_opt.as_ref().map(|v| v.as_str()), + _ => None, + } + } +} + +impl From for ScalarValue { + fn from(value: String) -> Self { + Self(DfScalarValue::Utf8(Some(value))) + } +} + +impl From for ScalarValue { + fn from(value: u64) -> Self { + Self(value.into()) + } +} + +pub struct ScalarValueRef<'a>(&'a DfScalarValue); + +impl<'a> ScalarValueRef<'a> { + pub fn as_str(&self) -> Option<&str> { + match self.0 { + DfScalarValue::Utf8(value_opt) | DfScalarValue::LargeUtf8(value_opt) => { + value_opt.as_ref().map(|v| v.as_str()) + } + _ => None, + } + } +} + +impl<'a> From<&'a DfScalarValue> for ScalarValueRef<'a> { + fn from(value: &DfScalarValue) -> ScalarValueRef { + ScalarValueRef(value) + } +} + +impl<'a> Hash for ScalarValueRef<'a> { + fn hash(&self, state: &mut H) { + self.0.hash(state) + } +} + +/// Represent a value of function result. +#[derive(Debug)] +pub enum ColumnarValue { + /// Array of values. + Array(ColumnBlock), + /// A single value. 
+ Scalar(ScalarValue), +} + +impl ColumnarValue { + fn into_df_columnar_value(self) -> DfColumnarValue { + match self { + ColumnarValue::Array(v) => DfColumnarValue::Array(v.to_arrow_array_ref()), + ColumnarValue::Scalar(v) => DfColumnarValue::Scalar(v.into_df_scalar_value()), + } + } + + fn try_from_df_columnar_value(df_value: &DfColumnarValue) -> Result { + let columnar_value = match df_value { + DfColumnarValue::Array(array) => { + let column_block = + ColumnBlock::try_cast_arrow_array_ref(array).context(InvalidArray)?; + ColumnarValue::Array(column_block) + } + DfColumnarValue::Scalar(v) => { + ColumnarValue::Scalar(ScalarValue::from_df_scalar_value(v)) + } + }; + + Ok(columnar_value) + } +} + +/// A function's TypeSignature. +#[derive(Debug)] +pub enum TypeSignature { + /// exact number of arguments of an exact type + Exact(Vec), + /// fixed number of arguments of an arbitrary but equal type out of a list + /// of valid types + // A function of one argument of double is `Uniform(1, vec![DatumKind::Double])` + // A function of one argument of double or uint64 is `Uniform(1, vec![DatumKind::Double, + // DatumKind::UInt64])` + Uniform(usize, Vec), + /// One of a list of signatures + OneOf(Vec), +} + +impl TypeSignature { + pub(crate) fn to_datafusion_signature(&self) -> DfSignature { + DfSignature::new(self.to_datafusion_type_signature(), Volatility::Immutable) + } + + fn to_datafusion_type_signature(&self) -> DfTypeSignature { + match self { + TypeSignature::Exact(kinds) => { + let data_types = kinds.iter().map(|v| DataType::from(*v)).collect(); + DfTypeSignature::Exact(data_types) + } + TypeSignature::Uniform(num, kinds) => { + let data_types = kinds.iter().map(|v| DataType::from(*v)).collect(); + DfTypeSignature::Uniform(*num, data_types) + } + TypeSignature::OneOf(sigs) => { + let df_sigs = sigs + .iter() + .map(|v| v.to_datafusion_type_signature()) + .collect(); + DfTypeSignature::OneOf(df_sigs) + } + } + } +} + +/// A scalar function's return type. 
+#[derive(Debug)] +pub struct ReturnType { + kind: DatumKind, +} + +impl ReturnType { + pub(crate) fn to_datafusion_return_type(&self) -> ReturnTypeFunction { + let data_type = Arc::new(DataType::from(self.kind)); + Arc::new(move |_| Ok(data_type.clone())) + } +} + +pub struct ScalarFunction { + signature: TypeSignature, + return_type: ReturnType, + df_scalar_fn: ScalarFunctionImplementation, +} + +impl ScalarFunction { + pub fn make_by_fn(signature: TypeSignature, return_type: DatumKind, func: F) -> Self + where + F: Fn(&[ColumnarValue]) -> Result + Send + Sync + 'static, + { + let return_type = ReturnType { kind: return_type }; + + // Adapter to map func to Fn(&[DfColumnarValue]) -> Result + let df_adapter = move |df_args: &[DfColumnarValue]| { + // Convert df_args from DfColumnarValue to ColumnarValue. + let mut values: SmallVec<[ColumnarValue; FUNC_ARG_NUM]> = + SmallVec::with_capacity(df_args.len()); + for df_arg in df_args { + let value = ColumnarValue::try_from_df_columnar_value(df_arg).map_err(|e| { + DataFusionError::Internal(format!( + "Failed to convert datafusion columnar value, err:{}", + e + )) + })?; + values.push(value); + } + + // Execute our function. + let result_value = func(&values).map_err(|e| { + DataFusionError::Execution(format!("Failed to execute function, err:{}", e)) + })?; + + // Convert the result value to DfColumnarValue. 
+ Ok(result_value.into_df_columnar_value()) + }; + + let df_scalar_fn = Arc::new(df_adapter); + + Self { + signature, + return_type, + df_scalar_fn, + } + } + + #[inline] + pub fn signature(&self) -> &TypeSignature { + &self.signature + } + + #[inline] + pub fn return_type(&self) -> &ReturnType { + &self.return_type + } + + #[inline] + pub(crate) fn to_datafusion_function(&self) -> ScalarFunctionImplementation { + self.df_scalar_fn.clone() + } +} + +pub struct AggregateFunction { + type_signature: TypeSignature, + return_type: ReturnType, + df_accumulator: AccumulatorFunctionImplementation, + state_type: Vec, +} + +impl AggregateFunction { + pub fn make_by_fn( + type_signature: TypeSignature, + return_type: DatumKind, + state_type: Vec, + accumulator_fn: F, + ) -> Self + where + F: Fn() -> Result + Send + Sync + 'static, + A: Accumulator + 'static, + { + // Create accumulator. + let df_adapter = move || { + let accumulator = accumulator_fn().map_err(|e| { + DataFusionError::Execution(format!("Failed to create accumulator, err:{}", e)) + })?; + let accumulator = Box::new(ToDfAccumulator::new(accumulator)); + + Ok(accumulator as _) + }; + let df_accumulator = Arc::new(df_adapter); + + // Create return type. 
+ let return_type = ReturnType { kind: return_type }; + + Self { + type_signature, + return_type, + df_accumulator, + state_type, + } + } + + #[inline] + pub fn signature(&self) -> &TypeSignature { + &self.type_signature + } + + #[inline] + pub fn return_type(&self) -> &ReturnType { + &self.return_type + } + + #[inline] + pub(crate) fn to_datafusion_accumulator(&self) -> AccumulatorFunctionImplementation { + self.df_accumulator.clone() + } + + pub(crate) fn to_datafusion_state_type(&self) -> StateTypeFunction { + let data_types = Arc::new( + self.state_type + .iter() + .map(|kind| DataType::from(*kind)) + .collect::>(), + ); + Arc::new(move |_| Ok(data_types.clone())) + } +} diff --git a/udf/src/lib.rs b/udf/src/lib.rs new file mode 100644 index 0000000000..36d5f32fdf --- /dev/null +++ b/udf/src/lib.rs @@ -0,0 +1,10 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! UDF support. + +pub mod aggregate; +pub mod functions; +pub mod registry; +pub mod scalar; +pub mod udaf; +pub mod udfs; diff --git a/udf/src/registry.rs b/udf/src/registry.rs new file mode 100644 index 0000000000..34e0af7051 --- /dev/null +++ b/udf/src/registry.rs @@ -0,0 +1,92 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Function registry. 
+ +use std::{collections::HashMap, sync::Arc}; + +use common_util::define_result; +use snafu::{ensure, Backtrace, Snafu}; + +use crate::{scalar::ScalarUdf, udaf::AggregateUdf, udfs}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Udf already exists, name:{}.\nBacktrace:\n{}", name, backtrace))] + UdfExists { name: String, backtrace: Backtrace }, +} + +define_result!(Error); + +/// A registry knows how to build logical expressions out of user-defined +/// function' names +pub trait FunctionRegistry { + fn register_udf(&mut self, udf: ScalarUdf) -> Result<()>; + + fn register_udaf(&mut self, udaf: AggregateUdf) -> Result<()>; + + fn find_udf(&self, name: &str) -> Result>; + + fn find_udaf(&self, name: &str) -> Result>; + + fn list_udfs(&self) -> Result>; +} + +/// Default function registry. +#[derive(Debug, Default)] +pub struct FunctionRegistryImpl { + scalar_functions: HashMap, + aggregate_functions: HashMap, +} + +impl FunctionRegistryImpl { + pub fn new() -> Self { + Self::default() + } + + /// Load all provided udfs. 
+ pub fn load_functions(&mut self) -> Result<()> { + udfs::register_all_udfs(self) + } +} + +impl FunctionRegistry for FunctionRegistryImpl { + fn register_udf(&mut self, udf: ScalarUdf) -> Result<()> { + ensure!( + !self.scalar_functions.contains_key(udf.name()), + UdfExists { name: udf.name() } + ); + + self.scalar_functions.insert(udf.name().to_string(), udf); + + Ok(()) + } + + fn register_udaf(&mut self, udaf: AggregateUdf) -> Result<()> { + ensure!( + !self.aggregate_functions.contains_key(udaf.name()), + UdfExists { name: udaf.name() } + ); + + self.aggregate_functions + .insert(udaf.name().to_string(), udaf); + + Ok(()) + } + + fn find_udf(&self, name: &str) -> Result> { + let udf = self.scalar_functions.get(name).cloned(); + Ok(udf) + } + + fn find_udaf(&self, name: &str) -> Result> { + let udaf = self.aggregate_functions.get(name).cloned(); + Ok(udaf) + } + + fn list_udfs(&self) -> Result> { + let udfs = self.scalar_functions.values().cloned().collect(); + Ok(udfs) + } +} + +pub type FunctionRegistryRef = Arc; diff --git a/udf/src/scalar.rs b/udf/src/scalar.rs new file mode 100644 index 0000000000..2ce056c3f3 --- /dev/null +++ b/udf/src/scalar.rs @@ -0,0 +1,39 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Scalar udfs. + +use std::sync::Arc; + +use arrow_deps::datafusion::physical_plan::udf::ScalarUDF; + +use crate::functions::ScalarFunction; + +/// Logical representation of a UDF. +#[derive(Debug, Clone)] +pub struct ScalarUdf { + /// DataFusion UDF. 
+ df_udf: Arc, +} + +impl ScalarUdf { + pub fn create(name: &str, func: ScalarFunction) -> Self { + let signature = func.signature().to_datafusion_signature(); + let return_type = func.return_type().to_datafusion_return_type(); + let scalar_fn = func.to_datafusion_function(); + + let df_udf = Arc::new(ScalarUDF::new(name, &signature, &return_type, &scalar_fn)); + + Self { df_udf } + } + + #[inline] + pub fn name(&self) -> &str { + &self.df_udf.name + } + + /// Convert into datafusion's udf + #[inline] + pub fn to_datafusion_udf(&self) -> Arc { + self.df_udf.clone() + } +} diff --git a/udf/src/udaf.rs b/udf/src/udaf.rs new file mode 100644 index 0000000000..06f8983460 --- /dev/null +++ b/udf/src/udaf.rs @@ -0,0 +1,45 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! UDAF support. + +use std::sync::Arc; + +use arrow_deps::datafusion::physical_plan::udaf::AggregateUDF; + +use crate::functions::AggregateFunction; + +/// Logical representation of a UDAF. +#[derive(Debug, Clone)] +pub struct AggregateUdf { + /// DataFusion UDAF. + df_udaf: Arc, +} + +impl AggregateUdf { + pub fn create(name: &str, func: AggregateFunction) -> Self { + let signature = func.signature().to_datafusion_signature(); + let return_type = func.return_type().to_datafusion_return_type(); + let accumulator = func.to_datafusion_accumulator(); + let state_type = func.to_datafusion_state_type(); + + let df_udaf = Arc::new(AggregateUDF::new( + name, + &signature, + &return_type, + &accumulator, + &state_type, + )); + + Self { df_udaf } + } + + #[inline] + pub fn name(&self) -> &str { + &self.df_udaf.name + } + + #[inline] + pub fn to_datafusion_udaf(&self) -> Arc { + self.df_udaf.clone() + } +} diff --git a/udf/src/udfs/mod.rs b/udf/src/udfs/mod.rs new file mode 100644 index 0000000000..5d64edf237 --- /dev/null +++ b/udf/src/udfs/mod.rs @@ -0,0 +1,16 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
UDFs + +use crate::registry::{FunctionRegistry, Result}; + +mod thetasketch_distinct; +mod time_bucket; + +pub fn register_all_udfs(registry: &mut dyn FunctionRegistry) -> Result<()> { + // Register all udfs + time_bucket::register_to_registry(registry)?; + thetasketch_distinct::register_to_registry(registry)?; + + Ok(()) +} diff --git a/udf/src/udfs/thetasketch_distinct.rs b/udf/src/udfs/thetasketch_distinct.rs new file mode 100644 index 0000000000..90ef3aefa5 --- /dev/null +++ b/udf/src/udfs/thetasketch_distinct.rs @@ -0,0 +1,166 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! thetasketch_distinct() udaf. + +use std::{fmt, mem}; + +use common_types::datum::DatumKind; +use common_util::define_result; +use hyperloglog::HyperLogLog; +use snafu::{ensure, OptionExt, ResultExt, Snafu}; + +use crate::{ + aggregate::{self, Accumulator, GetState, Input, MergeState, State, StateRef}, + functions::{AggregateFunction, ScalarValue, TypeSignature}, + registry::{self, FunctionRegistry}, + udaf::AggregateUdf, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Invalid argument number."))] + InvalidArgNum, + + #[snafu(display("Invalid state len."))] + InvalidStateLen, + + #[snafu(display("Invalid state, state is not string."))] + StateNotString, + + #[snafu(display("Failed to decode base64 of hll, err:{}.", source))] + DecodeBase64 { source: base64::DecodeError }, + + #[snafu(display("Invalid state, failed to decode hll, err:{}.", source))] + DecodeHll { source: hyperloglog::Error }, +} + +define_result!(Error); + +const HLL_ERROR_RATE: f64 = 0.01; +// Hll seeds: +const HLL_KEY0: u64 = 0; +const HLL_KEY1: u64 = 0; + +pub fn register_to_registry(registry: &mut dyn FunctionRegistry) -> registry::Result<()> { + registry.register_udaf(new_udaf()) +} + +fn new_udaf() -> AggregateUdf { + let aggregate_function = new_function(); + + AggregateUdf::create("thetasketch_distinct", aggregate_function) +} + +pub(crate) fn new_function() -> 
AggregateFunction { + // Aways use the same hasher with same keys. + let hll = HyperLogLog::new_with_keys(HLL_ERROR_RATE, HLL_KEY0, HLL_KEY1); + + let accumulator_fn = move || { + let distinct = HllDistinct { + hll: HyperLogLog::new_from_template(&hll), + }; + + Ok(distinct) + }; + + let type_signature = make_type_signature(); + let state_type = make_state_type(); + + AggregateFunction::make_by_fn( + type_signature, + DatumKind::UInt64, + state_type, + accumulator_fn, + ) +} + +fn make_type_signature() -> TypeSignature { + TypeSignature::Uniform( + 1, + vec![ + DatumKind::Timestamp, + DatumKind::Double, + DatumKind::Varbinary, + DatumKind::String, + DatumKind::UInt64, + ], + ) +} + +fn make_state_type() -> Vec { + vec![DatumKind::String] +} + +/// Distinct counter based on HyperLogLog. +/// +/// The HyperLogLogs must be initialized with same hash seeds (new from same +/// template). +struct HllDistinct { + hll: HyperLogLog, +} + +// TODO(yingwen): Avoid base64 encode/decode if datafusion supports converting +// binary datatype to scalarvalue. +impl HllDistinct { + fn merge_impl(&mut self, states: StateRef) -> Result<()> { + // The states are serialize from hll. + ensure!(states.len() == 1, InvalidStateLen); + let value_ref = states.value(0); + let hll_string = value_ref.as_str().context(StateNotString)?; + let hll_bytes = base64::decode(hll_string).context(DecodeBase64)?; + let mut buf = &hll_bytes[..]; + // Try to deserialize the hll. + let hll = HyperLogLog::read_from_buf(&mut buf).context(DecodeHll)?; + + // Merge the hll, note that the two hlls must created or serialized from the + // same template hll. + self.hll.merge(&hll); + + Ok(()) + } +} + +impl fmt::Debug for HllDistinct { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("HllDistinct") + .field("len", &self.hll.len()) + .finish() + } +} + +impl Accumulator for HllDistinct { + fn state(&self) -> aggregate::Result { + // Serialize `self.hll` to bytes. 
+ let mut buf = Vec::with_capacity(mem::size_of::()); + self.hll + .write_to_buf(&mut buf) + .map_err(|e| Box::new(e) as _) + .context(GetState)?; + // HACK: DataFusion does not support creating a scalar from binary, so we need + // to use base64 to convert a binary into string. + let hll_string = base64::encode(buf); + + Ok(State::from(ScalarValue::from(hll_string))) + } + + fn update(&mut self, values: Input) -> aggregate::Result<()> { + for value_ref in values.iter() { + // Insert value into hll. + self.hll.insert(&value_ref); + } + + Ok(()) + } + + fn merge(&mut self, states: StateRef) -> aggregate::Result<()> { + self.merge_impl(states) + .map_err(|e| Box::new(e) as _) + .context(MergeState) + } + + fn evaluate(&self) -> aggregate::Result { + let count = self.hll.len() as u64; + + Ok(ScalarValue::from(count)) + } +} diff --git a/udf/src/udfs/time_bucket.rs b/udf/src/udfs/time_bucket.rs new file mode 100644 index 0000000000..40e428ec5a --- /dev/null +++ b/udf/src/udfs/time_bucket.rs @@ -0,0 +1,324 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! time_bucket UDF. 
+ +use std::time::Duration; + +use chrono::{Datelike, FixedOffset, TimeZone}; +use common_types::{ + column::{ColumnBlock, ColumnBlockBuilder, TimestampColumn}, + datum::{Datum, DatumKind}, + time::Timestamp, +}; +use common_util::define_result; +use snafu::{ensure, OptionExt, ResultExt, Snafu}; + +use crate::{ + functions::{CallFunction, ColumnarValue, InvalidArguments, ScalarFunction, TypeSignature}, + registry::{self, FunctionRegistry}, + scalar::ScalarUdf, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Invalid period, period:{}", period))] + InvalidPeriod { period: String }, + + #[snafu(display("Invalid period number, period:{}, err:{}", period, source))] + InvalidPeriodNumber { + period: String, + source: std::num::ParseIntError, + }, + + #[snafu(display("Invalid argument number."))] + InvalidArgNum, + + #[snafu(display("Invalid arguments, require timestamp column."))] + NotTimestampColumn, + + #[snafu(display("Invalid arguments, require period."))] + NotPeriod, + + #[snafu(display("Period of week only support P1W."))] + UnsupportedWeek, + + #[snafu(display("Period of month only support P1M."))] + UnsupportedMonth, + + #[snafu(display("Period of year only support P1Y."))] + UnsupportedYear, + + #[snafu(display( + "Failed to truncate timestamp, timestamp:{}, period:{:?}", + timestamp, + period + ))] + TruncateTimestamp { timestamp: i64, period: Period }, + + #[snafu(display("Failed to build result column, err:{}", source))] + BuildColumn { source: common_types::column::Error }, +} + +define_result!(Error); + +/// Default timezone: +08:00 +const DEFAULT_TIMEZONE_OFFSET_SECS: i32 = 8 * 3600; + +pub fn register_to_registry(registry: &mut dyn FunctionRegistry) -> registry::Result<()> { + registry.register_udf(new_udf()) +} + +fn new_udf() -> ScalarUdf { + // args: + // - timestamp column. + // - period. + // - input timestamp format in PARTITION BY (unsed now). + // - input timezone (ignored now). + // - timestamp output format (ignored now). 
+ let func = |args: &[ColumnarValue]| { + let bucket = TimeBucket::parse_args(args) + .map_err(|e| Box::new(e) as _) + .context(InvalidArguments)?; + + let result_column = bucket + .call() + .map_err(|e| Box::new(e) as _) + .context(CallFunction)?; + + Ok(ColumnarValue::Array(result_column)) + }; + + let signature = make_signature(); + let scalar_function = ScalarFunction::make_by_fn(signature, DatumKind::Timestamp, func); + + ScalarUdf::create("time_bucket", scalar_function) +} + +fn make_signature() -> TypeSignature { + let sigs = vec![ + TypeSignature::Exact(vec![DatumKind::Timestamp, DatumKind::String]), + TypeSignature::Exact(vec![ + DatumKind::Timestamp, + DatumKind::String, + DatumKind::String, + ]), + TypeSignature::Exact(vec![ + DatumKind::Timestamp, + DatumKind::String, + DatumKind::String, + DatumKind::String, + ]), + TypeSignature::Exact(vec![ + DatumKind::Timestamp, + DatumKind::String, + DatumKind::String, + DatumKind::String, + DatumKind::String, + ]), + ]; + TypeSignature::OneOf(sigs) +} + +struct TimeBucket<'a> { + column: &'a TimestampColumn, + period: Period, +} + +impl<'a> TimeBucket<'a> { + fn parse_args(args: &[ColumnarValue]) -> Result { + ensure!(args.len() >= 2, InvalidArgNum); + + let column = match &args[0] { + ColumnarValue::Array(block) => block.as_timestamp().context(NotTimestampColumn)?, + _ => return NotTimestampColumn.fail(), + }; + let period = match &args[1] { + ColumnarValue::Scalar(value) => { + let period_str = value.as_str().context(NotPeriod)?; + Period::parse(period_str)? 
+ } + _ => return NotPeriod.fail(), + }; + + Ok(TimeBucket { column, period }) + } + + fn call(&self) -> Result { + let mut out_column_builder = + ColumnBlockBuilder::with_capacity(&DatumKind::Timestamp, self.column.num_rows()); + for ts_opt in self.column.iter() { + match ts_opt { + Some(ts) => { + let truncated = self.period.truncate(ts).context(TruncateTimestamp { + timestamp: ts, + period: self.period, + })?; + out_column_builder + .append(Datum::Timestamp(truncated)) + .context(BuildColumn)?; + } + None => { + out_column_builder + .append(Datum::Null) + .context(BuildColumn)?; + } + } + } + Ok(out_column_builder.build()) + } +} + +/// A time bucket period. +/// +/// e.g. +/// - PT1S +/// - PT1M +/// - PT1H +/// - P1D +/// - P1W +/// - P1M +/// - P1Y +#[derive(Debug, Clone, Copy)] +pub enum Period { + Second(u16), + Minute(u16), + Hour(u16), + Day(u16), + Week, + Month, + Year, +} + +impl Period { + fn parse(period: &str) -> Result { + ensure!(period.len() >= 3, InvalidPeriod { period }); + let is_pt = if period.starts_with("PT") { + true + } else if period.starts_with('P') { + false + } else { + return InvalidPeriod { period }.fail(); + }; + + let back = period.chars().last().context(InvalidPeriod { period })?; + let parsed = if is_pt { + let number = &period[2..period.len() - 1]; + let number = number + .parse::() + .context(InvalidPeriodNumber { period })?; + match back { + 'S' => Period::Second(number), + 'M' => Period::Minute(number), + 'H' => Period::Hour(number), + _ => return InvalidPeriod { period }.fail(), + } + } else { + let number = &period[1..period.len() - 1]; + let number = number + .parse::() + .context(InvalidPeriodNumber { period })?; + match back { + 'D' => Period::Day(number), + 'W' => { + ensure!(number == 1, UnsupportedWeek); + Period::Week + } + 'M' => { + ensure!(number == 1, UnsupportedMonth); + Period::Month + } + 'Y' => { + ensure!(number == 1, UnsupportedYear); + Period::Year + } + _ => return InvalidPeriod { period }.fail(), + } + 
}; + + Ok(parsed) + } + + fn truncate(&self, ts: Timestamp) -> Option { + const MINUTE_SECONDS: u64 = 60; + const HOUR_SECONDS: u64 = 60 * MINUTE_SECONDS; + + let truncated_ts = match self { + Period::Second(period) => { + let duration = Duration::from_secs(u64::from(*period)); + ts.truncate_by(duration) + } + Period::Minute(period) => { + let duration = Duration::from_secs(u64::from(*period) * MINUTE_SECONDS); + ts.truncate_by(duration) + } + Period::Hour(period) => { + let duration = Duration::from_secs(u64::from(*period) * HOUR_SECONDS); + ts.truncate_by(duration) + } + Period::Day(period) => Self::truncate_day(ts, *period)?, + Period::Week => Self::truncate_week(ts), + Period::Month => Self::truncate_month(ts), + Period::Year => Self::truncate_year(ts), + }; + + Some(truncated_ts) + } + + fn truncate_day(ts: Timestamp, period: u16) -> Option { + let offset = FixedOffset::east(DEFAULT_TIMEZONE_OFFSET_SECS); + // Convert to local time. + let datetime = offset.timestamp_millis(ts.as_i64()); + + // Truncate day + let day = datetime.day(); + let day = day - (day % u32::from(period)); + let truncated_datetime = offset + .ymd(datetime.year(), datetime.month(), day) + .and_hms(0, 0, 0); + let truncated_ts = truncated_datetime.timestamp_millis(); + + Some(Timestamp::new(truncated_ts)) + } + + fn truncate_week(ts: Timestamp) -> Timestamp { + let offset = FixedOffset::east(DEFAULT_TIMEZONE_OFFSET_SECS); + // Convert to local time. + let datetime = offset.timestamp_millis(ts.as_i64()); + + // Truncate week. 
+ let week_offset = datetime.weekday().num_days_from_monday(); + let week_millis = 7 * 24 * 3600 * 1000; + let ts_offset = week_offset * week_millis; + // TODO(yingwen): Impl sub/divide for Timestamp + let week_millis = i64::from(week_millis); + let truncated_ts = (ts.as_i64() - i64::from(ts_offset)) / week_millis * week_millis; + + Timestamp::new(truncated_ts) + } + + fn truncate_month(ts: Timestamp) -> Timestamp { + let offset = FixedOffset::east(DEFAULT_TIMEZONE_OFFSET_SECS); + // Convert to local time. + let datetime = offset.timestamp_millis(ts.as_i64()); + + // Truncate month + let truncated_datetime = offset + .ymd(datetime.year(), datetime.month(), 1) + .and_hms(0, 0, 0); + let truncated_ts = truncated_datetime.timestamp_millis(); + + Timestamp::new(truncated_ts) + } + + fn truncate_year(ts: Timestamp) -> Timestamp { + let offset = FixedOffset::east(DEFAULT_TIMEZONE_OFFSET_SECS); + // Convert to local time. + let datetime = offset.timestamp_millis(ts.as_i64()); + + // Truncate year + let truncated_datetime = offset.ymd(datetime.year(), 1, 1).and_hms(0, 0, 0); + let truncated_ts = truncated_datetime.timestamp_millis(); + + Timestamp::new(truncated_ts) + } +} diff --git a/wal/Cargo.toml b/wal/Cargo.toml new file mode 100644 index 0000000000..574cffa9e2 --- /dev/null +++ b/wal/Cargo.toml @@ -0,0 +1,24 @@ +[package] +name = "wal" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +async-trait = "0.1.41" +common_util = {path = "../common_util"} +common_types = {path = "../common_types"} +log = "0.4" +snafu = { version ="0.6.10", features = ["backtraces"] } +tokio = { version = "1.0", features = ["sync"] } + +[dev-dependencies] +tempfile = "3.1.0" +futures = { version = "0.3", features = ["async-await"] } + +[dependencies.rocksdb] +git = "https://github.com/tikv/rust-rocksdb.git" +branch = "tikv-5.2" +features = ["portable"] diff 
--git a/wal/src/lib.rs b/wal/src/lib.rs new file mode 100644 index 0000000000..440edb2d1e --- /dev/null +++ b/wal/src/lib.rs @@ -0,0 +1,10 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Write Ahead Log + +pub mod log_batch; +pub mod manager; +pub mod rocks_impl; + +#[cfg(test)] +mod tests; diff --git a/wal/src/log_batch.rs b/wal/src/log_batch.rs new file mode 100644 index 0000000000..7e08c6c10d --- /dev/null +++ b/wal/src/log_batch.rs @@ -0,0 +1,89 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Log entries definition. + +use std::fmt::Debug; + +use common_types::{ + bytes::{MemBuf, MemBufMut}, + SequenceNumber, +}; + +use crate::manager::RegionId; + +pub trait Payload: Send + Sync + Debug { + type Error: std::error::Error + Send + Sync + 'static; + /// Compute size of the encoded payload. + fn encode_size(&self) -> usize; + /// Append the encoded payload to the `buf`. + fn encode_to(&self, buf: &mut B) -> Result<(), Self::Error>; +} + +#[derive(Debug)] +pub struct LogEntry

{ + pub sequence: SequenceNumber, + pub payload: P, +} + +/// An entry to be written into the Wal. +/// +/// Generally, the `payload` is a lazily encoder whose constraint is +/// `PayloadEncoder`. `region_id` is a logically region and set it as 0 if +/// unnecessary. +#[derive(Debug)] +pub struct LogWriteEntry

{ + pub payload: P, +} + +/// A batch of `LogWriteEntry`s. +#[derive(Debug)] +pub struct LogWriteBatch

{ + pub(crate) region_id: RegionId, + pub(crate) entries: Vec>, +} + +impl LogWriteBatch

{ + pub fn new(region_id: RegionId) -> Self { + Self::with_capacity(region_id, 0) + } + + pub fn with_capacity(region_id: RegionId, cap: usize) -> Self { + Self { + region_id, + entries: Vec::with_capacity(cap), + } + } + + #[inline] + pub fn push(&mut self, entry: LogWriteEntry

) { + self.entries.push(entry) + } + + #[inline] + pub fn len(&self) -> usize { + self.entries.len() + } + + #[inline] + pub fn is_empty(&self) -> bool { + self.entries.is_empty() + } + + #[inline] + pub fn clear(&mut self) { + self.entries.clear() + } +} + +impl Default for LogWriteBatch

{ + fn default() -> Self { + Self::new(0) + } +} + +pub trait PayloadDecoder: Send + Sync { + type Error: std::error::Error + Send + Sync + 'static; + type Target: Send + Sync; + /// Decode `Target` from the `bytes`. + fn decode(&self, buf: &mut B) -> Result; +} diff --git a/wal/src/manager.rs b/wal/src/manager.rs new file mode 100644 index 0000000000..4ea8fe97ab --- /dev/null +++ b/wal/src/manager.rs @@ -0,0 +1,237 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! WalManager abstraction + +use std::{fmt, time::Duration}; + +use async_trait::async_trait; +pub use common_types::SequenceNumber; + +use crate::log_batch::{LogEntry, LogWriteBatch, Payload, PayloadDecoder}; + +pub mod error { + use common_util::define_result; + use snafu::{Backtrace, Snafu}; + + use crate::manager::RegionId; + + // Now most error from manage implementation don't have backtrace, so we add + // backtrace here. + #[derive(Debug, Snafu)] + #[snafu(visibility(pub))] + pub enum Error { + #[snafu(display( + "Failed to open wal, path:{}, err:{}.\nBacktrace:\n{}", + wal_path, + source, + backtrace + ))] + Open { + wal_path: String, + source: Box, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to initialize wal, err:{}.\nBacktrace:\n{}", source, backtrace))] + Initialization { + source: Box, + backtrace: Backtrace, + }, + + #[snafu(display( + "Region is not found, region_id:{}.\nBacktrace:\n{}", + region_id, + backtrace + ))] + RegionNotFound { + region_id: RegionId, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to write log entries, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + Write { + source: Box, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to read log entries, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + Read { + source: Box, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to delete log entries, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + Delete { + source: Box, + backtrace: 
Backtrace, + }, + + #[snafu(display("Failed to encode, err:{}.\nBacktrace:\n{}", source, backtrace))] + Encoding { + source: Box, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to decode, err:{}.\nBacktrace:\n{}", source, backtrace))] + Decoding { + source: Box, + backtrace: Backtrace, + }, + } + + define_result!(Error); +} + +use common_types::{MAX_SEQUENCE_NUMBER, MIN_SEQUENCE_NUMBER}; +pub use error::*; + +pub type RegionId = u64; +pub const MAX_REGION_ID: RegionId = u64::MAX; + +#[derive(Debug, Clone)] +pub struct WriteContext { + /// Timeout to write wal and it only takes effect when writing to a Wal on a + /// remote machine (writing to the local disk does not have timeout). + pub timeout: Duration, +} + +impl Default for WriteContext { + fn default() -> Self { + Self { + timeout: Duration::from_secs(1), + } + } +} + +/// Write abstraction for log entries in Wal. +#[async_trait] +pub trait LogWriter { + /// Write a batch of log entries to log. + /// + /// Returns the max sequence number for the batch of log entries. + async fn write( + &self, + ctx: &WriteContext, + batch: &LogWriteBatch

, + ) -> Result; +} + +#[derive(Debug, Clone)] +pub struct ReadContext { + /// Timeout to read log entries and it only takes effect when reading from a + /// Wal on a remote machine (reading from the local disk does not have + /// timeout). + pub timeout: Duration, +} + +impl Default for ReadContext { + fn default() -> Self { + Self { + timeout: Duration::from_secs(5), + } + } +} + +#[derive(Debug, Clone, Copy)] +pub enum ReadBoundary { + Max, + Min, + Included(SequenceNumber), + Excluded(SequenceNumber), +} + +impl ReadBoundary { + /// Convert the boundary to start sequence number. + /// + /// Returns `None` if the boundary is `Excluded(MAX_SEQUENCE_NUM)` + pub fn as_start_sequence_number(&self) -> Option { + match *self { + ReadBoundary::Max => Some(MAX_SEQUENCE_NUMBER), + ReadBoundary::Min => Some(MIN_SEQUENCE_NUMBER), + ReadBoundary::Included(n) => Some(n), + ReadBoundary::Excluded(n) => { + if n == MAX_SEQUENCE_NUMBER { + None + } else { + Some(n + 1) + } + } + } + } + + /// Convert the boundary to start sequence number. + /// + /// Returns `None` if the boundary is `Excluded(MIN_SEQUENCE_NUM)` + pub fn as_end_sequence_number(&self) -> Option { + match *self { + ReadBoundary::Max => Some(MAX_SEQUENCE_NUMBER), + ReadBoundary::Min => Some(MIN_SEQUENCE_NUMBER), + ReadBoundary::Included(n) => Some(n), + ReadBoundary::Excluded(n) => { + if n == MIN_SEQUENCE_NUMBER { + None + } else { + Some(n - 1) + } + } + } + } +} + +#[derive(Debug, Clone)] +pub struct ReadRequest { + /// Region id of the wal to read + pub region_id: RegionId, + // TODO(yingwen): Or just rename to ReadBound? + /// Start bound + pub start: ReadBoundary, + /// End bound + pub end: ReadBoundary, +} + +/// Iterator abstraction for log entry. +pub trait LogIterator { + fn next_log_entry( + &mut self, + decoder: &D, + ) -> Result>>; +} + +/// Read abstraction for log entries in the Wal. +pub trait LogReader { + /// Iterator over log entries. 
+ type Iterator: LogIterator + Send; + /// Provide iterator on necessary entries according to `ReadRequest`. + fn read(&self, ctx: &ReadContext, req: &ReadRequest) -> Result; +} + +// TODO(xikai): define Error as associate type. +/// Management of multi-region Wals. +/// +/// Every region has its own increasing (and maybe hallow) sequence number +/// space. +#[async_trait] +pub trait WalManager: LogWriter + LogReader + fmt::Debug { + /// Get current sequence number. + fn sequence_num(&self, region_id: RegionId) -> Result; + + /// Mark the entries whose sequence number is in [0, `sequence_number`] to + /// be deleted in the future. + async fn mark_delete_entries_up_to( + &self, + region_id: RegionId, + sequence_num: SequenceNumber, + ) -> Result<()>; +} diff --git a/wal/src/rocks_impl/encoding.rs b/wal/src/rocks_impl/encoding.rs new file mode 100644 index 0000000000..727b5715f2 --- /dev/null +++ b/wal/src/rocks_impl/encoding.rs @@ -0,0 +1,533 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Encoding for Wal logs + +use common_types::{ + bytes::{self, BytesMut, MemBuf, MemBufMut}, + SequenceNumber, +}; +use common_util::{ + codec::{Decoder, Encoder}, + define_result, +}; +use snafu::{ensure, Backtrace, ResultExt, Snafu}; + +use crate::{ + log_batch::{Payload, PayloadDecoder}, + manager::{self, RegionId}, +}; + +const LOG_KEY_ENCODING_V0: u8 = 0; +const NEWEST_LOG_KEY_ENCODING_VERSION: u8 = LOG_KEY_ENCODING_V0; + +const LOG_VALUE_ENCODING_V0: u8 = 0; +const NEWEST_LOG_VALUE_ENCODING_VERSION: u8 = LOG_VALUE_ENCODING_V0; + +const META_KEY_ENCODING_V0: u8 = 0; +const NEWEST_META_KEY_ENCODING_VERSION: u8 = META_KEY_ENCODING_V0; + +const META_VALUE_ENCODING_V0: u8 = 0; +const NEWEST_META_VALUE_ENCODING_VERSION: u8 = META_VALUE_ENCODING_V0; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to encode log key, err:{}", source))] + EncodeLogKey { + source: bytes::Error, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to encode log value header, err:{}", source))] + EncodeLogValueHeader { source: bytes::Error }, + + #[snafu(display("Failed to encode log value payload, err:{}", source))] + EncodeLogValuePayload { + source: Box, + }, + + #[snafu(display("Failed to decode log key, err:{}", source))] + DecodeLogKey { source: bytes::Error }, + + #[snafu(display("Failed to decode log value header, err:{}", source))] + DecodeLogValueHeader { source: bytes::Error }, + + #[snafu(display("Failed to decode log value payload, err:{}", source))] + DecodeLogValuePayload { + source: Box, + }, + + #[snafu(display("Failed to encode meta key, err:{}", source))] + EncodeMetaKey { + source: bytes::Error, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to encode meta value, err:{}", source))] + EncodeMetaValue { source: bytes::Error }, + + #[snafu(display("Failed to decode meta key, err:{}", source))] + DecodeMetaKey { source: bytes::Error }, + + #[snafu(display("Failed to decode meta value, err:{}", source))] + DecodeMetaValue { source: 
bytes::Error }, + + #[snafu(display( + "Found invalid meta key type, expect:{:?}, given:{}.\nBacktrace:\n{}", + expect, + given, + backtrace + ))] + InvalidMetaKeyType { + expect: MetaKeyType, + given: u8, + backtrace: Backtrace, + }, + + #[snafu(display( + "Found invalid namespace, expect:{:?}, given:{}.\nBacktrace:\n{}", + expect, + given, + backtrace + ))] + InvalidNamespace { + expect: Namespace, + given: u8, + backtrace: Backtrace, + }, + + #[snafu(display( + "Found invalid version, expect:{}, given:{}.\nBacktrace:\n{}", + expect, + given, + backtrace + ))] + InvalidVersion { + expect: u8, + given: u8, + backtrace: Backtrace, + }, +} + +define_result!(Error); + +#[derive(Debug, Copy, Clone)] +pub enum Namespace { + Meta = 0, + Log = 1, +} + +#[derive(Debug, Clone)] +pub struct LogEncoding { + key_enc: LogKeyEncoder, + value_enc: LogValueEncoder, + // value decoder is created dynamically from the version, + value_enc_version: u8, +} + +impl LogEncoding { + pub fn newest() -> Self { + Self { + key_enc: LogKeyEncoder { + version: NEWEST_LOG_KEY_ENCODING_VERSION, + namespace: Namespace::Log, + }, + value_enc: LogValueEncoder { + version: NEWEST_LOG_VALUE_ENCODING_VERSION, + }, + value_enc_version: NEWEST_LOG_VALUE_ENCODING_VERSION, + } + } + + // Encode [LogKey] into `buf` and caller should knows that the keys are ordered + // by ([RegionId], [SequenceNum]) so the caller can use this method to + // generate min/max key in specific scope(global or in some region). 
+ pub fn encode_key(&self, buf: &mut BytesMut, log_key: &LogKey) -> manager::Result<()> { + buf.clear(); + buf.reserve(self.key_enc.estimate_encoded_size(log_key)); + self.key_enc + .encode(buf, log_key) + .map_err(|e| Box::new(e) as _) + .context(manager::Encoding)?; + + Ok(()) + } + + pub fn encode_value(&self, buf: &mut BytesMut, payload: &impl Payload) -> manager::Result<()> { + buf.clear(); + buf.reserve(self.value_enc.estimate_encoded_size(payload)); + self.value_enc + .encode(buf, payload) + .map_err(|e| Box::new(e) as _) + .context(manager::Encoding) + } + + pub fn is_log_key(&self, mut buf: &[u8]) -> manager::Result { + self.key_enc + .is_valid(&mut buf) + .map_err(|e| Box::new(e) as _) + .context(manager::Decoding) + } + + pub fn decode_key(&self, mut buf: &[u8]) -> manager::Result { + self.key_enc + .decode(&mut buf) + .map_err(|e| Box::new(e) as _) + .context(manager::Decoding) + } + + pub fn decode_value( + &self, + mut buf: &[u8], + decoder: &D, + ) -> manager::Result { + let value_dec = LogValueDecoder { + version: self.value_enc_version, + payload_dec: decoder, + }; + + value_dec + .decode(&mut buf) + .map_err(|e| Box::new(e) as _) + .context(manager::Decoding) + } +} + +pub type LogKey = (RegionId, SequenceNumber); + +#[derive(Debug, Clone)] +struct LogKeyEncoder { + version: u8, + namespace: Namespace, +} + +impl LogKeyEncoder { + /// Determine whether the raw bytes is a log key. 
+ pub fn is_valid(&self, buf: &mut B) -> Result { + let namespace = buf.read_u8().context(DecodeLogKey)?; + Ok(namespace == self.namespace as u8) + } +} + +impl Encoder for LogKeyEncoder { + type Error = Error; + + /// Key format: + /// + /// ```text + /// +---------------+----------------+-------------------+--------------------+ + /// | namespace(u8) | region_id(u64) | sequence_num(u64) | version header(u8) | + /// +---------------+----------------+-------------------+--------------------+ + /// ``` + /// + /// More information can be extended after the incremented `version header`. + fn encode(&self, buf: &mut B, log_key: &LogKey) -> Result<()> { + buf.write_u8(self.namespace as u8).context(EncodeLogKey)?; + buf.write_u64(log_key.0).context(EncodeLogKey)?; + buf.write_u64(log_key.1).context(EncodeLogKey)?; + buf.write_u8(self.version).context(EncodeLogKey)?; + + Ok(()) + } + + fn estimate_encoded_size(&self, _log_key: &LogKey) -> usize { + // Refer to key format. + 1 + 8 + 8 + 1 + } +} + +impl Decoder for LogKeyEncoder { + type Error = Error; + + fn decode(&self, buf: &mut B) -> Result { + // check namespace + let namespace = buf.read_u8().context(DecodeLogKey)?; + ensure!( + namespace == self.namespace as u8, + InvalidNamespace { + expect: self.namespace, + given: namespace + } + ); + + let log_key = ( + buf.read_u64().context(DecodeLogKey)?, + buf.read_u64().context(DecodeLogKey)?, + ); + + // check version + let version = buf.read_u8().context(DecodeLogKey)?; + ensure!( + version == self.version, + InvalidVersion { + expect: self.version, + given: version + } + ); + + Ok(log_key) + } +} + +#[derive(Debug, Clone)] +struct LogValueEncoder { + version: u8, +} + +impl Encoder for LogValueEncoder { + type Error = Error; + + /// Value format: + /// +--------------------+---------+ + /// | version_header(u8) | payload | + /// +--------------------+---------+ + fn encode(&self, buf: &mut B, payload: &T) -> Result<()> { + 
buf.write_u8(self.version).context(EncodeLogValueHeader)?; + + payload + .encode_to(buf) + .map_err(|e| Box::new(e) as _) + .context(EncodeLogValuePayload) + } + + fn estimate_encoded_size(&self, payload: &T) -> usize { + // Refer to value format. + 1 + payload.encode_size() + } +} + +struct LogValueDecoder<'a, D: PayloadDecoder> { + version: u8, + payload_dec: &'a D, +} + +impl<'a, D: PayloadDecoder> Decoder for LogValueDecoder<'a, D> { + type Error = Error; + + fn decode(&self, buf: &mut B) -> Result { + let version = buf.read_u8().context(DecodeLogValueHeader)?; + ensure!( + version == self.version, + InvalidVersion { + expect: self.version, + given: version + } + ); + + self.payload_dec + .decode(buf) + .map_err(|e| Box::new(e) as _) + .context(DecodeLogValuePayload) + } +} + +#[derive(Clone, Copy, Debug)] +pub enum MetaKeyType { + MaxSeq = 0, +} + +#[derive(Clone, Debug)] +pub struct MetaKeyEncoder { + version: u8, + key_type: MetaKeyType, + namespace: Namespace, +} + +#[derive(Clone, Debug)] +pub struct MetaKey { + pub region_id: RegionId, +} + +impl MetaKeyEncoder { + /// Determine whether the raw bytes is a valid meta key. + pub fn is_valid(&self, buf: &mut B) -> Result { + let namespace = buf.read_u8().context(DecodeMetaKey)?; + let key_type = buf.read_u8().context(DecodeMetaKey)?; + Ok(namespace == self.namespace as u8 && key_type == self.key_type as u8) + } +} + +impl Encoder for MetaKeyEncoder { + type Error = Error; + + /// Key format: + /// + /// ```text + /// +---------------+--------------+----------------+--------------------+ + /// | namespace(u8) | key_type(u8) | region_id(u64) | version header(u8) | + /// +---------------+--------------+----------------+--------------------+ + /// ``` + /// + /// More information can be extended after the incremented `version header`. 
+ fn encode(&self, buf: &mut B, meta_key: &MetaKey) -> Result<()> { + buf.write_u8(self.namespace as u8).context(EncodeMetaKey)?; + buf.write_u8(self.key_type as u8).context(EncodeMetaKey)?; + buf.write_u64(meta_key.region_id).context(EncodeMetaKey)?; + buf.write_u8(self.version).context(EncodeMetaKey)?; + + Ok(()) + } + + fn estimate_encoded_size(&self, _log_key: &MetaKey) -> usize { + // Refer to key format. + 1 + 1 + 8 + 1 + } +} + +impl Decoder for MetaKeyEncoder { + type Error = Error; + + fn decode(&self, buf: &mut B) -> Result { + // check namespace + let namespace = buf.read_u8().context(DecodeMetaKey)?; + ensure!( + namespace == self.namespace as u8, + InvalidNamespace { + expect: self.namespace, + given: namespace + } + ); + + let key_type = buf.read_u8().context(DecodeMetaKey)?; + ensure!( + key_type == self.key_type as u8, + InvalidMetaKeyType { + expect: self.key_type, + given: key_type, + } + ); + + let region_id = buf.read_u64().context(DecodeMetaKey)?; + + // check version + let version = buf.read_u8().context(DecodeMetaKey)?; + ensure!( + version == self.version, + InvalidVersion { + expect: self.version, + given: version + } + ); + + Ok(MetaKey { region_id }) + } +} + +#[derive(Clone, Debug)] +pub struct MaxSeqMetaValue { + pub max_seq: SequenceNumber, +} + +#[derive(Clone, Debug)] +pub struct MaxSeqMetaValueEncoder { + version: u8, +} + +impl Encoder for MaxSeqMetaValueEncoder { + type Error = Error; + + /// Value format: + /// + /// ```text + /// +--------------------+--------------+ + /// | version header(u8) | max_seq(u64) | + /// +--------------------+--------------+ + /// ``` + /// + /// More information can be extended after the incremented `version header`. 
+ fn encode(&self, buf: &mut B, meta_value: &MaxSeqMetaValue) -> Result<()> { + buf.write_u8(self.version).context(EncodeMetaValue)?; + buf.write_u64(meta_value.max_seq).context(EncodeMetaValue)?; + + Ok(()) + } + + fn estimate_encoded_size(&self, _meta_value: &MaxSeqMetaValue) -> usize { + // Refer to value format. + 1 + 8 + } +} + +impl Decoder for MaxSeqMetaValueEncoder { + type Error = Error; + + fn decode(&self, buf: &mut B) -> Result { + // check version + let version = buf.read_u8().context(DecodeMetaValue)?; + ensure!( + version == self.version, + InvalidVersion { + expect: self.version, + given: version + } + ); + + let max_seq = buf.read_u64().context(DecodeMetaValue)?; + Ok(MaxSeqMetaValue { max_seq }) + } +} + +#[derive(Clone, Debug)] +pub struct MaxSeqMetaEncoding { + key_enc: MetaKeyEncoder, + value_enc: MaxSeqMetaValueEncoder, +} + +impl MaxSeqMetaEncoding { + pub fn newest() -> Self { + Self { + key_enc: MetaKeyEncoder { + version: NEWEST_META_KEY_ENCODING_VERSION, + key_type: MetaKeyType::MaxSeq, + namespace: Namespace::Meta, + }, + value_enc: MaxSeqMetaValueEncoder { + version: NEWEST_META_VALUE_ENCODING_VERSION, + }, + } + } + + pub fn is_max_seq_meta_key(&self, mut buf: &[u8]) -> manager::Result { + self.key_enc + .is_valid(&mut buf) + .map_err(|e| Box::new(e) as _) + .context(manager::Decoding) + } + + pub fn encode_key(&self, buf: &mut BytesMut, meta_key: &MetaKey) -> manager::Result<()> { + buf.clear(); + buf.reserve(self.key_enc.estimate_encoded_size(meta_key)); + self.key_enc + .encode(buf, meta_key) + .map_err(|e| Box::new(e) as _) + .context(manager::Encoding)?; + + Ok(()) + } + + pub fn encode_value( + &self, + buf: &mut BytesMut, + meta_value: &MaxSeqMetaValue, + ) -> manager::Result<()> { + buf.clear(); + buf.reserve(self.value_enc.estimate_encoded_size(meta_value)); + self.value_enc + .encode(buf, meta_value) + .map_err(|e| Box::new(e) as _) + .context(manager::Encoding) + } + + pub fn decode_key(&self, mut buf: &[u8]) -> 
manager::Result { + self.key_enc + .decode(&mut buf) + .map_err(|e| Box::new(e) as _) + .context(manager::Decoding) + } + + pub fn decode_value(&self, mut buf: &[u8]) -> manager::Result { + self.value_enc + .decode(&mut buf) + .map_err(|e| Box::new(e) as _) + .context(manager::Decoding) + } +} diff --git a/wal/src/rocks_impl/manager.rs b/wal/src/rocks_impl/manager.rs new file mode 100644 index 0000000000..bdf71eba0e --- /dev/null +++ b/wal/src/rocks_impl/manager.rs @@ -0,0 +1,621 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! WalManager implementation based on RocksDB + +use std::{ + collections::HashMap, + fmt, + fmt::Formatter, + path::PathBuf, + sync::{ + atomic::{AtomicU64, Ordering}, + Arc, RwLock, + }, +}; + +use async_trait::async_trait; +use common_types::{bytes::BytesMut, SequenceNumber, MAX_SEQUENCE_NUMBER, MIN_SEQUENCE_NUMBER}; +use common_util::runtime::Runtime; +use log::{debug, info, warn}; +use rocksdb::{DBIterator, DBOptions, ReadOptions, SeekKey, Writable, WriteBatch, DB}; +use snafu::ResultExt; +use tokio::sync::Mutex; + +use crate::{ + log_batch::{LogEntry, LogWriteBatch, Payload, PayloadDecoder}, + manager::{ + error::*, LogIterator, LogReader, LogWriter, ReadContext, ReadRequest, RegionId, + WalManager, WriteContext, MAX_REGION_ID, + }, + rocks_impl::encoding::{LogEncoding, LogKey, MaxSeqMetaEncoding, MaxSeqMetaValue, MetaKey}, +}; + +/// Region in the Wal. 
+struct Region { + /// id of the Region + id: RegionId, + /// RocksDB instance + db: Arc, + /// `next_sequence_num` is ensured to be positive + next_sequence_num: AtomicU64, + /// Encoding for log entries + log_encoding: LogEncoding, + /// Encoding for meta data of max sequence + max_seq_meta_encoding: MaxSeqMetaEncoding, + /// Runtime for write requests + runtime: Arc, + /// Ensure the delete procedure to be sequential + delete_lock: Mutex<()>, +} + +impl Region { + /// Allocate a continuous range of [SequenceNumber] and returns + /// the start [SequenceNumber] of the range [start, start+`number`). + #[inline] + fn alloc_sequence_num(&self, number: u64) -> SequenceNumber { + self.next_sequence_num.fetch_add(number, Ordering::Relaxed) + } + + #[inline] + /// Generate [LogKey] from `region_id` and `sequence_num` + fn log_key(&self, sequence_num: SequenceNumber) -> LogKey { + (self.id, sequence_num) + } + + /// Returns the current sequence number which must be positive. + fn sequence_num(&self) -> Result { + let next_seq_num = self.next_sequence_num.load(Ordering::Relaxed); + debug_assert!(next_seq_num > 0); + + Ok(next_seq_num - 1) + } + + /// Delete entries in the range `[0, sequence_num]`. + /// + /// The delete procedure is ensured to be sequential. 
+ async fn delete_entries_up_to(&self, mut sequence_num: SequenceNumber) -> Result<()> { + debug!( + "Wal Region delete entries begin deleting, sequence_num:{:?}", + sequence_num + ); + + let _delete_guard = self.delete_lock.lock().await; + let max_seq = self.sequence_num()?; + if sequence_num > max_seq { + warn!( + "Try to delete entries up to sequence number({}) greater than current max sequence \ + number({})", + sequence_num, + max_seq + ); + sequence_num = max_seq; + } + + let wb = { + let wb = WriteBatch::default(); + + // Delete the range [0, sequence_num] + let start_log_key = (self.id, 0); + let end_log_key = if sequence_num < MAX_SEQUENCE_NUMBER { + (self.id, sequence_num + 1) + } else { + // Region id is unlikely to overflow. + (self.id + 1, 0) + }; + let (mut start_key_buf, mut end_key_buf) = (BytesMut::new(), BytesMut::new()); + self.log_encoding + .encode_key(&mut start_key_buf, &start_log_key)?; + self.log_encoding + .encode_key(&mut end_key_buf, &end_log_key)?; + wb.delete_range(&start_key_buf, &end_key_buf) + .map_err(|e| e.into()) + .context(Delete)?; + + // Update the max sequence number. + let meta_key = MetaKey { region_id: self.id }; + let meta_value = MaxSeqMetaValue { max_seq }; + let (mut meta_key_buf, mut meta_value_buf) = (BytesMut::new(), BytesMut::new()); + self.max_seq_meta_encoding + .encode_key(&mut meta_key_buf, &meta_key)?; + self.max_seq_meta_encoding + .encode_value(&mut meta_value_buf, &meta_value)?; + wb.put(&meta_key_buf, &meta_value_buf) + .map_err(|e| e.into()) + .context(Delete)?; + + wb + }; + + let db = self.db.clone(); + self.runtime + .spawn_blocking(move || db.write(&wb).map_err(|e| e.into()).context(Delete)) + .await + .map_err(|e| Box::new(e) as _) + .context(Delete)? 
+ } + + fn read(&self, ctx: &ReadContext, req: &ReadRequest) -> Result { + debug!("Wal region begin reading, ctx:{:?}, req:{:?}", ctx, req); + + let read_opts = ReadOptions::default(); + let iter = DBIterator::new(self.db.clone(), read_opts); + + let start_sequence = if let Some(n) = req.start.as_start_sequence_number() { + n + } else { + return Ok(RocksLogIterator::new_empty(self.log_encoding.clone(), iter)); + }; + + let end_sequence = if let Some(n) = req.end.as_end_sequence_number() { + n + } else { + return Ok(RocksLogIterator::new_empty(self.log_encoding.clone(), iter)); + }; + + let (min_log_key, max_log_key) = (self.log_key(start_sequence), self.log_key(end_sequence)); + + let log_iter = + RocksLogIterator::with_data(self.log_encoding.clone(), iter, min_log_key, max_log_key); + Ok(log_iter) + } + + async fn write(&self, ctx: &WriteContext, batch: &LogWriteBatch

) -> Result { + debug!( + "Wal region begin writing, ctx:{:?}, log_entries_num:{}", + ctx, + batch.entries.len() + ); + + let entries_num = batch.len() as u64; + let (wb, max_sequence_num) = { + let wb = WriteBatch::default(); + let mut next_sequence_num = self.alloc_sequence_num(entries_num); + let mut key_buf = BytesMut::new(); + let mut value_buf = BytesMut::new(); + + for entry in &batch.entries { + self.log_encoding + .encode_key(&mut key_buf, &(batch.region_id, next_sequence_num))?; + self.log_encoding + .encode_value(&mut value_buf, &entry.payload)?; + wb.put(&key_buf, &value_buf) + .map_err(|e| e.into()) + .context(Write)?; + + next_sequence_num += 1; + } + + (wb, next_sequence_num - 1) + }; + + let db = self.db.clone(); + self.runtime + .spawn_blocking(move || { + db.write(&wb) + .map(|_| max_sequence_num) + .map_err(|e| e.into()) + .context(Write) + }) + .await + .map_err(|e| Box::new(e) as _) + .context(Write)? + } +} + +/// [WalManager] implementation based on RocksDB. +/// A [RocksImpl] consists of multiple [Region]s and any read/write/delete +/// request is delegated to specific [Region]. +pub struct RocksImpl { + /// Wal data path + wal_path: String, + /// RocksDB instance + db: Arc, + /// Runtime for read/write log entries + runtime: Arc, + /// Encoding for log entry + log_encoding: LogEncoding, + /// Encoding for meta data of max sequence + max_seq_meta_encoding: MaxSeqMetaEncoding, + /// Regions + regions: RwLock>>, +} + +impl Drop for RocksImpl { + fn drop(&mut self) { + // Clear all regions. 
+ { + let mut regions = self.regions.write().unwrap(); + regions.clear(); + } + + info!("RocksImpl dropped, wal_path:{}", self.wal_path); + } +} + +impl RocksImpl { + fn build_regions(&self) -> Result<()> { + let region_seqs = self.find_region_seqs_from_db()?; + + info!( + "RocksImpl build regions, wal_path:{}, region_seqs:{:?}", + self.wal_path, region_seqs + ); + + let mut regions = self.regions.write().unwrap(); + for (region_id, sequence_number) in region_seqs { + let region = Region { + id: region_id, + db: self.db.clone(), + next_sequence_num: AtomicU64::new(sequence_number + 1), + log_encoding: self.log_encoding.clone(), + max_seq_meta_encoding: self.max_seq_meta_encoding.clone(), + runtime: self.runtime.clone(), + delete_lock: Mutex::new(()), + }; + + regions.insert(region_id, Arc::new(region)); + } + + Ok(()) + } + + fn find_region_seqs_from_region_data( + &self, + region_max_seqs: &mut HashMap, + ) -> Result<()> { + let mut current_region_id = MAX_REGION_ID; + let mut end_boundary_key_buf = BytesMut::new(); + loop { + let log_key = (current_region_id, MAX_SEQUENCE_NUMBER); + self.log_encoding + .encode_key(&mut end_boundary_key_buf, &log_key)?; + let mut iter = self.db.iter(); + let seek_key = SeekKey::Key(&end_boundary_key_buf); + + let found = iter + .seek_for_prev(seek_key) + .map_err(|e| e.into()) + .context(Initialization)?; + + if !found { + debug!("RocksImpl find region pairs stop scanning, because of no entries to scan"); + break; + } + + if !self.log_encoding.is_log_key(iter.key())? 
{ + debug!("RocksImpl find region pairs stop scanning, because log keys are exhausted"); + break; + } + + let log_key = self.log_encoding.decode_key(iter.key())?; + region_max_seqs.insert(log_key.0, log_key.1); + + if log_key.0 == 0 { + debug!("RocksImpl find region pairs stop scanning, because region 0 is reached"); + break; + } + current_region_id = log_key.0 - 1; + } + + Ok(()) + } + + fn find_region_seqs_from_region_meta( + &self, + region_max_seqs: &mut HashMap, + ) -> Result<()> { + let meta_key = MetaKey { region_id: 0 }; + let mut start_boundary_key_buf = BytesMut::new(); + self.max_seq_meta_encoding + .encode_key(&mut start_boundary_key_buf, &meta_key)?; + let mut iter = self.db.iter(); + let seek_key = SeekKey::Key(&start_boundary_key_buf); + iter.seek(seek_key) + .map_err(|e| e.into()) + .context(Initialization)?; + + loop { + if !iter.valid().map_err(|e| e.into()).context(Initialization)? { + debug!("RocksImpl exhausts the iterator for meta information"); + break; + } + if !self.max_seq_meta_encoding.is_max_seq_meta_key(iter.key())? { + debug!("RocksImpl exhausts max sequence meta key"); + break; + } + + let meta_key = self.max_seq_meta_encoding.decode_key(iter.key())?; + let meta_value = self.max_seq_meta_encoding.decode_value(iter.value())?; + region_max_seqs + .entry(meta_key.region_id) + .and_modify(|v| { + *v = meta_value.max_seq.max(*v); + }) + .or_insert(meta_value.max_seq); + + iter.next().map_err(|e| e.into()).context(Initialization)?; + } + + Ok(()) + } + + /// Collect all the regions with its max sequence number from the db. + /// + /// Returns the mapping: region_id -> max_sequence_number + fn find_region_seqs_from_db(&self) -> Result> { + // build the mapping: region_id -> max_sequence_number + let mut region_max_seqs = HashMap::new(); + + // scan the region information from the data part. + self.find_region_seqs_from_region_data(&mut region_max_seqs)?; + + // scan the region information from the meta part. 
+ self.find_region_seqs_from_region_meta(&mut region_max_seqs)?; + + Ok(region_max_seqs) + } + + /// Get the region and create it if not found. + fn get_or_create_region(&self, region_id: RegionId) -> Arc { + { + let regions = self.regions.read().unwrap(); + if let Some(region) = regions.get(®ion_id) { + return region.clone(); + } + } + + let mut regions = self.regions.write().unwrap(); + if let Some(region) = regions.get(®ion_id) { + return region.clone(); + } + + info!( + "RocksImpl create new region, wal_path:{}, region_id:{}", + self.wal_path, region_id + ); + + // create a new region + let region = Arc::new(Region { + id: region_id, + db: self.db.clone(), + // ensure `next_sequence_number` to start from 1 (larger than MIN_SEQUENCE_NUMBER) + next_sequence_num: AtomicU64::new(MIN_SEQUENCE_NUMBER + 1), + log_encoding: self.log_encoding.clone(), + max_seq_meta_encoding: self.max_seq_meta_encoding.clone(), + runtime: self.runtime.clone(), + delete_lock: Mutex::new(()), + }); + + regions.insert(region_id, region.clone()); + region + } + + /// Get the region + fn region(&self, region_id: RegionId) -> Option> { + let regions = self.regions.read().unwrap(); + regions.get(®ion_id).cloned() + } +} + +/// Builder for `RocksImpl`. +pub struct Builder { + wal_path: String, + rocksdb_config: DBOptions, + runtime: Arc, +} + +impl Builder { + pub fn with_default_rocksdb_config( + wal_path: impl Into, + runtime: Arc, + ) -> Self { + let mut rocksdb_config = DBOptions::default(); + // TODO(yingwen): Move to another function? 
+ rocksdb_config.create_if_missing(true); + Self::new(wal_path, runtime, rocksdb_config) + } + + pub fn new( + wal_path: impl Into, + runtime: Arc, + rocksdb_config: DBOptions, + ) -> Self { + let wal_path: PathBuf = wal_path.into(); + Self { + wal_path: wal_path.to_str().unwrap().to_owned(), + rocksdb_config, + runtime, + } + } + + pub fn build(self) -> Result { + let db = DB::open(self.rocksdb_config, &self.wal_path) + .map_err(|e| e.into()) + .context(Open { + wal_path: self.wal_path.clone(), + })?; + let rocks_impl = RocksImpl { + wal_path: self.wal_path, + db: Arc::new(db), + runtime: self.runtime, + log_encoding: LogEncoding::newest(), + max_seq_meta_encoding: MaxSeqMetaEncoding::newest(), + regions: RwLock::new(HashMap::new()), + }; + rocks_impl.build_regions()?; + + Ok(rocks_impl) + } +} + +/// Iterator over log entries based on RocksDB iterator. +pub struct RocksLogIterator { + log_encoding: LogEncoding, + /// denotes no more data to iterate and it is set to true when: + /// - initialized as no data iterator, or + /// - iterate to the end. + no_more_data: bool, + min_log_key: LogKey, + max_log_key: LogKey, + /// denote whether `iter` is seeked + seeked: bool, + /// RocksDB iterator + iter: DBIterator>, +} + +impl RocksLogIterator { + /// Create iterator maybe containing data. + fn with_data( + log_encoding: LogEncoding, + iter: DBIterator>, + min_log_key: LogKey, + max_log_key: LogKey, + ) -> Self { + Self { + log_encoding, + no_more_data: false, + min_log_key, + max_log_key, + seeked: false, + iter, + } + } + + /// Create empty iterator. + fn new_empty(log_encoding: LogEncoding, iter: DBIterator>) -> Self { + Self { + log_encoding, + no_more_data: true, + min_log_key: (0, 0), + max_log_key: (0, 0), + seeked: false, + iter, + } + } + + /// it's a valid log key if it is in the range `[self.min_log_key, + /// self.max_log_key]`. 
+ fn is_valid_log_key(&self, curr_log_key: &LogKey) -> bool { + curr_log_key <= &self.max_log_key && curr_log_key >= &self.min_log_key + } + + /// End is reached iteration if `curr_log_key` is greater than + /// `max_log_key`. + fn is_end_reached(&self, curr_log_key: &LogKey) -> bool { + curr_log_key >= &self.max_log_key + } + + /// let `iter` seek to `min_log_key` + /// no guarantee on that `self.iter` is valid + fn seek(&mut self) -> Result<()> { + self.seeked = true; + + let mut seek_key_buf = BytesMut::new(); + self.log_encoding + .encode_key(&mut seek_key_buf, &self.min_log_key)?; + let seek_key = SeekKey::Key(&seek_key_buf); + self.iter + .seek(seek_key) + .map_err(|e| e.into()) + .context(Read)?; + + Ok(()) + } +} + +impl LogIterator for RocksLogIterator { + fn next_log_entry( + &mut self, + decoder: &D, + ) -> Result>> { + if self.no_more_data { + return Ok(None); + } + + if !self.seeked { + self.seek()?; + + let valid = self.iter.valid().map_err(|e| e.into()).context(Read)?; + if !valid { + self.no_more_data = true; + return Ok(None); + } + } else { + let found = self.iter.next().map_err(|e| e.into()).context(Read)?; + if !found { + self.no_more_data = true; + return Ok(None); + } + } + + let curr_log_key = self.log_encoding.decode_key(self.iter.key())?; + self.no_more_data = self.is_end_reached(&curr_log_key); + + if self.is_valid_log_key(&curr_log_key) { + let payload = self.log_encoding.decode_value(self.iter.value(), decoder)?; + let log_entry = LogEntry { + sequence: curr_log_key.1, + payload, + }; + Ok(Some(log_entry)) + } else { + Ok(None) + } + } +} + +impl LogReader for RocksImpl { + type Iterator = RocksLogIterator; + + fn read(&self, ctx: &ReadContext, req: &ReadRequest) -> Result { + if let Some(region) = self.region(req.region_id) { + region.read(ctx, req) + } else { + let iter = DBIterator::new(self.db.clone(), ReadOptions::default()); + Ok(RocksLogIterator::new_empty(self.log_encoding.clone(), iter)) + } + } +} + +#[async_trait] +impl 
LogWriter for RocksImpl { + async fn write( + &self, + ctx: &WriteContext, + batch: &LogWriteBatch

, + ) -> Result { + let region = self.get_or_create_region(batch.region_id); + region.write(ctx, batch).await + } +} + +#[async_trait] +impl WalManager for RocksImpl { + fn sequence_num(&self, region_id: RegionId) -> Result { + if let Some(region) = self.region(region_id) { + return region.sequence_num(); + } + + Ok(MIN_SEQUENCE_NUMBER) + } + + async fn mark_delete_entries_up_to( + &self, + region_id: RegionId, + sequence_num: SequenceNumber, + ) -> Result<()> { + if let Some(region) = self.region(region_id) { + return region.delete_entries_up_to(sequence_num).await; + } + + Ok(()) + } +} + +impl fmt::Debug for RocksImpl { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + f.debug_struct("RocksImpl") + .field("wal_path", &self.wal_path) + .finish() + } +} diff --git a/wal/src/rocks_impl/mod.rs b/wal/src/rocks_impl/mod.rs new file mode 100644 index 0000000000..e25bca788a --- /dev/null +++ b/wal/src/rocks_impl/mod.rs @@ -0,0 +1,6 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! WalManager implementation based on RocksDB + +pub mod encoding; +pub mod manager; diff --git a/wal/src/tests/mod.rs b/wal/src/tests/mod.rs new file mode 100644 index 0000000000..c52a689521 --- /dev/null +++ b/wal/src/tests/mod.rs @@ -0,0 +1,6 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! integration tests for wal + +mod read_write; +pub mod util; diff --git a/wal/src/tests/read_write.rs b/wal/src/tests/read_write.rs new file mode 100644 index 0000000000..a38bb1282c --- /dev/null +++ b/wal/src/tests/read_write.rs @@ -0,0 +1,449 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +use std::{ops::Deref, sync::Arc}; + +use common_types::SequenceNumber; + +use crate::{ + log_batch::LogWriteBatch, + manager::{LogReader, LogWriter, ReadBoundary, ReadRequest, RegionId, WalManager}, + tests::util::{RocksTestEnv, TestEnv, TestPayload, WalBuilder}, +}; + +fn check_write_batch_with_read_request( + env: &TestEnv, + wal: Arc, + read_req: ReadRequest, + max_seq: SequenceNumber, + write_batch: &LogWriteBatch, +) { + let iter = wal + .read(&env.read_ctx, &read_req) + .expect("should succeed to read"); + env.check_log_entries(max_seq, write_batch, iter); +} + +fn check_write_batch( + env: &TestEnv, + wal: Arc, + region_id: RegionId, + max_seq: SequenceNumber, + write_batch: &LogWriteBatch, +) { + let read_req = ReadRequest { + region_id, + start: ReadBoundary::Included(max_seq + 1 - write_batch.entries.len() as u64), + end: ReadBoundary::Included(max_seq), + }; + check_write_batch_with_read_request(env, wal, read_req, max_seq, write_batch) +} + +async fn simple_read_write_with_wal( + env: impl Deref>, + wal: Arc, + region_id: RegionId, +) { + let write_batch = env.build_log_batch(region_id, 0, 10); + let seq = wal + .write(&env.write_ctx, &write_batch) + .await + .expect("should succeed to write"); + check_write_batch(&env, wal, region_id, seq, &write_batch) +} + +async fn simple_read_write(env: &TestEnv, region_id: RegionId) { + let wal = env.build_wal(); + simple_read_write_with_wal(env, wal.clone(), region_id).await; +} + +/// Test the read with different kinds of boundaries. 
+async fn read_with_boundary(env: &TestEnv) { + let wal = env.build_wal(); + let region_id = 0; + let write_batch = env.build_log_batch(region_id, 0, 10); + let end_seq = wal + .write(&env.write_ctx, &write_batch) + .await + .expect("should succeed to write"); + let start_seq = end_seq + 1 - write_batch.entries.len() as u64; + + // [min, max] + { + let read_req = ReadRequest { + region_id, + start: ReadBoundary::Min, + end: ReadBoundary::Max, + }; + check_write_batch_with_read_request(env, wal.clone(), read_req, end_seq, &write_batch); + } + + // [0, 10] + { + let read_req = ReadRequest { + region_id, + start: ReadBoundary::Included(start_seq), + end: ReadBoundary::Included(end_seq), + }; + check_write_batch_with_read_request(env, wal.clone(), read_req, end_seq, &write_batch); + } + + // (0, 10] + { + let read_req = ReadRequest { + region_id, + start: ReadBoundary::Excluded(start_seq), + end: ReadBoundary::Included(end_seq), + }; + let write_batch = env.build_log_batch(region_id, 1, 10); + check_write_batch_with_read_request(env, wal.clone(), read_req, end_seq, &write_batch); + } + + // [0, 10) + { + let read_req = ReadRequest { + region_id, + start: ReadBoundary::Included(start_seq), + end: ReadBoundary::Excluded(end_seq), + }; + let write_batch = env.build_log_batch(region_id, 0, 9); + check_write_batch_with_read_request(env, wal.clone(), read_req, end_seq - 1, &write_batch); + } + + // (0, 10) + { + let read_req = ReadRequest { + region_id, + start: ReadBoundary::Excluded(start_seq), + end: ReadBoundary::Excluded(end_seq), + }; + let write_batch = env.build_log_batch(region_id, 1, 9); + check_write_batch_with_read_request(env, wal.clone(), read_req, end_seq - 1, &write_batch); + } +} + +/// Test read and write across multiple regions parallely. 
+async fn write_multiple_regions_parallelly(env: Arc>) { + let wal = env.build_wal(); + let mut handles = Vec::with_capacity(10); + for i in 0..5 { + let read_write_0 = + env.runtime + .spawn(simple_read_write_with_wal(env.clone(), wal.clone(), i)); + let read_write_1 = + env.runtime + .spawn(simple_read_write_with_wal(env.clone(), wal.clone(), i)); + handles.push(read_write_0); + handles.push(read_write_1); + } + futures::future::join_all(handles) + .await + .into_iter() + .for_each(|res| { + res.expect("should succeed to join the write"); + }); +} + +/// Test whether the written logs can be read after reopen. +async fn reopen(env: &TestEnv) { + let region_id = 0; + let (write_batch, seq) = { + let wal = env.build_wal(); + let write_batch = env.build_log_batch(region_id, 0, 10); + let seq = wal + .write(&env.write_ctx, &write_batch) + .await + .expect("should succeed to write"); + (write_batch, seq) + }; + + // reopen the wal + let wal = env.build_wal(); + let read_req = ReadRequest { + region_id, + start: ReadBoundary::Included(seq + 1 - write_batch.entries.len() as u64), + end: ReadBoundary::Included(seq), + }; + let iter = wal + .read(&env.read_ctx, &read_req) + .expect("should succeed to read"); + env.check_log_entries(seq, &write_batch, iter); +} + +/// A complex test case for read and write: +/// - Write two log batch +/// - Read the first batch and then read the second batch. +/// - Read the whole batch. +/// - Read the part of first batch and second batch. 
+async fn complex_read_write(env: &TestEnv) { + let wal = env.build_wal(); + let region_id = 0; + + // write two batches + let (start_val, mid_val, end_val) = (0, 10, 50); + let write_batch_1 = env.build_log_batch(region_id, start_val, mid_val); + let seq_1 = wal + .write(&env.write_ctx, &write_batch_1) + .await + .expect("should succeed to write"); + let write_batch_2 = env.build_log_batch(region_id, mid_val, end_val); + let seq_2 = wal + .write(&env.write_ctx, &write_batch_2) + .await + .expect("should succeed to write"); + + // read the first batch + check_write_batch(env, wal.clone(), region_id, seq_1, &write_batch_1); + // read the second batch + check_write_batch(env, wal.clone(), region_id, seq_2, &write_batch_2); + + // read the whole batch + let (seq_3, write_batch_3) = (seq_2, env.build_log_batch(region_id, start_val, end_val)); + check_write_batch(env, wal.clone(), region_id, seq_3, &write_batch_3); + + // read the part of batch1 and batch2 + let (seq_4, write_batch_4) = { + let new_start = (start_val + mid_val) / 2; + let new_end = (mid_val + end_val) / 2; + let seq = seq_2 - (end_val - new_end) as u64; + (seq, env.build_log_batch(region_id, new_start, new_end)) + }; + check_write_batch(env, wal.clone(), region_id, seq_4, &write_batch_4); +} + +/// Test whether data can be deleted. 
+async fn simple_write_delete(env: &TestEnv) { + let region_id = 0; + let wal = env.build_wal(); + let mut write_batch = env.build_log_batch(region_id, 0, 10); + let seq = wal + .write(&env.write_ctx, &write_batch) + .await + .expect("should succeed to write"); + check_write_batch(env, wal.clone(), region_id, seq, &write_batch); + + // delete all logs + wal.mark_delete_entries_up_to(region_id, seq) + .await + .expect("should succeed to delete"); + let read_req = ReadRequest { + region_id, + start: ReadBoundary::Min, + end: ReadBoundary::Max, + }; + let iter = wal + .read(&env.read_ctx, &read_req) + .expect("should succeed to read"); + write_batch.entries.clear(); + env.check_log_entries(seq, &write_batch, iter); +} + +/// Delete half of the written data and check the remaining half can be read. +async fn write_delete_half(env: &TestEnv) { + let region_id = 0; + let wal = env.build_wal(); + let mut write_batch = env.build_log_batch(region_id, 0, 10); + let seq = wal + .write(&env.write_ctx, &write_batch) + .await + .expect("should succeed to write"); + check_write_batch(env, wal.clone(), region_id, seq, &write_batch); + + // delete the first half of the logs (up to seq / 2) + wal.mark_delete_entries_up_to(region_id, seq / 2) + .await + .expect("should succeed to delete"); + let read_req = ReadRequest { + region_id, + start: ReadBoundary::Min, + end: ReadBoundary::Max, + }; + let iter = wal + .read(&env.read_ctx, &read_req) + .expect("should succeed to read"); + write_batch.entries.drain(..write_batch.entries.len() / 2); + env.check_log_entries(seq, &write_batch, iter); +} + +/// Test delete across multiple regions. 
+async fn write_delete_multiple_regions(env: &TestEnv) { + let (region_id_1, region_id_2) = (1, 2); + let wal = env.build_wal(); + let mut write_batch_1 = env.build_log_batch(region_id_1, 0, 10); + let seq_1 = wal + .write(&env.write_ctx, &write_batch_1) + .await + .expect("should succeed to write"); + + let write_batch_2 = env.build_log_batch(region_id_2, 10, 20); + let seq_2 = wal + .write(&env.write_ctx, &write_batch_2) + .await + .expect("should succeed to write"); + + // delete all logs of region 1. + wal.mark_delete_entries_up_to(region_id_1, seq_1) + .await + .expect("should succeed to delete"); + let read_req = ReadRequest { + region_id: region_id_1, + start: ReadBoundary::Min, + end: ReadBoundary::Max, + }; + let iter = wal + .read(&env.read_ctx, &read_req) + .expect("should succeed to read"); + write_batch_1.entries.clear(); + env.check_log_entries(seq_1, &write_batch_1, iter); + + check_write_batch(env, wal.clone(), region_id_2, seq_2, &write_batch_2); +} + +/// The sequence number should increase monotonically after multiple writes. +async fn sequence_increase_monotonically_multiple_writes(env: &TestEnv) { + let region_id = 0; + let wal = env.build_wal(); + let write_batch = env.build_log_batch(region_id, 0, 10); + let seq_1 = wal + .write(&env.write_ctx, &write_batch) + .await + .expect("should succeed to write"); + let seq_2 = wal + .write(&env.write_ctx, &write_batch) + .await + .expect("should succeed to write"); + let seq_3 = wal + .write(&env.write_ctx, &write_batch) + .await + .expect("should succeed to write"); + + assert!(seq_2 > seq_1); + assert!(seq_3 > seq_2); +} + +/// The sequence number should increase monotonically after write, delete and +/// one more write. 
+async fn sequence_increase_monotonically_delete_write(env: &TestEnv) { + let region_id = 0; + let wal = env.build_wal(); + let write_batch = env.build_log_batch(region_id, 0, 10); + // write + let seq_1 = wal + .write(&env.write_ctx, &write_batch) + .await + .expect("should succeed to write"); + // delete + wal.mark_delete_entries_up_to(region_id, seq_1) + .await + .expect("should succeed to delete"); + // write again + let seq_2 = wal + .write(&env.write_ctx, &write_batch) + .await + .expect("should succeed to write"); + + assert!(seq_2 > seq_1); +} + +/// The sequence number should increase monotonically after write, delete, +/// reopen and write. +async fn sequence_increase_monotonically_delete_reopen_write(env: &TestEnv) { + let region_id = 0; + let wal = env.build_wal(); + let write_batch = env.build_log_batch(region_id, 0, 10); + // write + let seq_1 = wal + .write(&env.write_ctx, &write_batch) + .await + .expect("should succeed to write"); + // delete + wal.mark_delete_entries_up_to(region_id, seq_1) + .await + .expect("should succeed to delete"); + // restart + drop(wal); + let wal = env.build_wal(); + // write again + let seq_2 = wal + .write(&env.write_ctx, &write_batch) + .await + .expect("should succeed to write"); + + assert!(seq_2 > seq_1); +} + +#[test] +fn test_simple_read_write() { + let rocks_env = RocksTestEnv::new(2); + rocks_env.runtime.block_on(simple_read_write(&rocks_env, 0)); +} + +#[test] +fn test_read_with_boundary() { + let rocks_env = RocksTestEnv::new(2); + rocks_env.runtime.block_on(read_with_boundary(&rocks_env)); +} + +#[test] +fn test_write_multiple_regions() { + let rocks_env = Arc::new(RocksTestEnv::new(4)); + rocks_env + .runtime + .block_on(write_multiple_regions_parallelly(rocks_env.clone())); +} + +#[test] +fn test_reopen() { + let rocks_env = RocksTestEnv::new(2); + rocks_env.runtime.block_on(reopen(&rocks_env)); +} + +#[test] +fn test_complex_read_write() { + let rocks_env = RocksTestEnv::new(2); + 
rocks_env.runtime.block_on(complex_read_write(&rocks_env)); +} + +#[test] +fn test_simple_write_delete() { + let rocks_env = RocksTestEnv::new(2); + rocks_env.runtime.block_on(simple_write_delete(&rocks_env)); +} + +#[test] +fn test_write_delete_half() { + let rocks_env = RocksTestEnv::new(2); + rocks_env.runtime.block_on(write_delete_half(&rocks_env)); +} +#[test] +fn test_write_delete_multiple_regions() { + let rocks_env = RocksTestEnv::new(2); + rocks_env + .runtime + .block_on(write_delete_multiple_regions(&rocks_env)); +} + +#[test] +fn test_sequence_increase_monotonically_multiple_writes() { + let rocks_env = RocksTestEnv::new(2); + rocks_env + .runtime + .block_on(sequence_increase_monotonically_multiple_writes(&rocks_env)); +} + +#[test] +fn test_sequence_increase_monotonically_delete_write() { + let rocks_env = RocksTestEnv::new(2); + rocks_env + .runtime + .block_on(sequence_increase_monotonically_delete_write(&rocks_env)); +} + +#[test] +fn test_sequence_increase_monotonically_delete_reopen_write() { + let rocks_env = RocksTestEnv::new(2); + rocks_env + .runtime + .block_on(sequence_increase_monotonically_delete_reopen_write( + &rocks_env, + )); +} diff --git a/wal/src/tests/util.rs b/wal/src/tests/util.rs new file mode 100644 index 0000000000..cd631363f6 --- /dev/null +++ b/wal/src/tests/util.rs @@ -0,0 +1,158 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! utilities for testing wal module. 
+ +use std::{path::Path, sync::Arc}; + +use common_types::bytes::{MemBuf, MemBufMut}; +use common_util::runtime::{self, Runtime}; +use tempfile::TempDir; + +use crate::{ + log_batch::{LogWriteBatch, LogWriteEntry, Payload, PayloadDecoder}, + manager::{LogIterator, LogReader, ReadContext, RegionId, WalManager, WriteContext}, + rocks_impl::{self, manager::RocksImpl}, +}; + +pub trait WalBuilder: Default + Send + Sync { + type Wal: WalManager + Send + Sync; + fn build(&self, data_path: &Path, runtime: Arc) -> Arc; +} +use common_types::SequenceNumber; +use snafu::Snafu; + +#[derive(Debug, Snafu)] +pub enum Error {} + +#[derive(Default)] +pub struct RocksWalBuilder; + +impl WalBuilder for RocksWalBuilder { + type Wal = RocksImpl; + + fn build(&self, data_path: &Path, runtime: Arc) -> Arc { + let wal_builder = + rocks_impl::manager::Builder::with_default_rocksdb_config(data_path, runtime); + + Arc::new( + wal_builder + .build() + .expect("should succeed to build rocksimpl wal"), + ) + } +} + +pub type RocksTestEnv = TestEnv; + +/// The environment for testing wal. +pub struct TestEnv { + pub dir: TempDir, + pub runtime: Arc, + pub write_ctx: WriteContext, + pub read_ctx: ReadContext, + /// Builder for a specific wal. + builder: B, +} + +impl TestEnv { + pub fn new(num_workers: usize) -> Self { + let runtime = runtime::Builder::default() + .worker_threads(num_workers) + .enable_all() + .build() + .unwrap(); + + Self { + dir: tempfile::tempdir().unwrap(), + runtime: Arc::new(runtime), + write_ctx: WriteContext::default(), + read_ctx: ReadContext::default(), + builder: B::default(), + } + } + + pub fn build_wal(&self) -> Arc { + self.builder.build(self.dir.path(), self.runtime.clone()) + } + + /// Build the log batch with [TestPayload].val range [start, end). 
+ pub fn build_log_batch( + &self, + region_id: RegionId, + start: u32, + end: u32, + ) -> LogWriteBatch { + let mut write_batch = LogWriteBatch::new(region_id); + for val in start..end { + let payload = TestPayload { val }; + write_batch.entries.push(LogWriteEntry { payload }); + } + + write_batch + } + + /// Check whether the log entries from the iterator equals the + /// `write_batch`. + pub fn check_log_entries( + &self, + max_seq: SequenceNumber, + write_batch: &LogWriteBatch, + mut iter: ::Iterator, + ) { + let dec = TestPayloadDecoder; + let mut log_entries = Vec::with_capacity(write_batch.entries.len()); + loop { + let log_entry = iter + .next_log_entry(&dec) + .expect("should succeed to fetch next log entry"); + if log_entry.is_none() { + break; + } + + log_entries.push(log_entry.unwrap()); + } + + assert_eq!(write_batch.entries.len(), log_entries.len()); + for (idx, (expect_log_write_entry, log_entry)) in write_batch + .entries + .iter() + .zip(log_entries.iter()) + .rev() + .enumerate() + { + assert_eq!(max_seq - idx as u64, log_entry.sequence); + assert_eq!(expect_log_write_entry.payload, log_entry.payload); + } + } +} + +/// The payload for Wal log entry for testing. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct TestPayload { + val: u32, +} + +impl Payload for TestPayload { + type Error = Error; + + fn encode_size(&self) -> usize { + 4 + } + + fn encode_to(&self, buf: &mut B) -> Result<(), Self::Error> { + buf.write_u32(self.val).expect("must write"); + Ok(()) + } +} + +pub struct TestPayloadDecoder; + +impl PayloadDecoder for TestPayloadDecoder { + type Error = Error; + type Target = TestPayload; + + fn decode(&self, buf: &mut B) -> Result { + let val = buf.read_u32().expect("should succeed to read u32"); + Ok(TestPayload { val }) + } +}

, + /// Object metadata for the listing + pub objects: Vec>, +} + +/// The metadata that describes an object. +#[derive(Debug)] +pub struct ObjectMeta { + /// The full path to the object + pub location: P, + /// The last modified time + pub last_modified: SystemTime, + /// The size in bytes of the object + pub size: usize, +} + +#[cfg(test)] +mod tests { + use std::io::Read; + + use bytes::Bytes; + use futures::{stream, StreamExt, TryStreamExt}; + + use super::*; + use crate::path::{file::FilePath, parsed::DirsAndFileName}; + + type Error = Box; + type Result = std::result::Result; + + async fn flatten_list_stream< + P: path::ObjectStorePath, + E: std::error::Error + Send + Sync + 'static, + R: AsyncRead + Unpin, + >( + storage: &impl ObjectStore, + prefix: Option<&P>, + ) -> Result> { + storage + .list(prefix) + .await? + .map_ok(|v| stream::iter(v).map(Ok)) + .try_flatten() + .try_collect() + .await + } + + pub(crate) async fn put_get_delete_list< + P: path::ObjectStorePath, + E: std::error::Error + Send + Sync + 'static, + R: AsyncRead + Unpin, + >( + storage: &impl ObjectStore, + ) -> Result<()> { + delete_fixtures(storage).await; + + let content_list = flatten_list_stream(storage, None).await?; + assert!( + content_list.is_empty(), + "Expected list to be empty; found: {:?}", + content_list + ); + + let data = Bytes::from("arbitrary data"); + let mut location = storage.new_path(); + location.push_dir("test_dir"); + location.set_file_name("test_file.json"); + + storage + .put(&location, data.as_ref(), Some(data.len())) + .await?; + + // List everything + let content_list = flatten_list_stream(storage, None).await?; + assert_eq!(content_list, &[location.clone()]); + + // List everything starting with a prefix that should return results + let mut prefix = storage.new_path(); + prefix.push_dir("test_dir"); + let content_list = flatten_list_stream(storage, Some(&prefix)).await?; + assert_eq!(content_list, &[location.clone()]); + + // List everything starting with a 
prefix that shouldn't return results + let mut prefix = storage.new_path(); + prefix.push_dir("something"); + let content_list = flatten_list_stream(storage, Some(&prefix)).await?; + assert!(content_list.is_empty()); + + let mut read_data = Vec::with_capacity(data.len()); + + storage.get(&location).await?.read_to_end(&mut read_data)?; + assert_eq!(&*read_data, data); + + storage.delete(&location).await?; + + let content_list = flatten_list_stream(storage, None).await?; + assert!(content_list.is_empty()); + + Ok(()) + } + + pub(crate) async fn list_with_delimiter< + P: path::ObjectStorePath, + E: std::error::Error + Send + Sync + 'static, + R: AsyncRead + Unpin, + >( + storage: &impl ObjectStore, + ) -> Result<()> { + delete_fixtures(storage).await; + + // ==================== check: store is empty ==================== + let content_list = flatten_list_stream(storage, None).await?; + assert!(content_list.is_empty()); + + // ==================== do: create files ==================== + let data = Bytes::from("arbitrary data"); + + let files: Vec<_> = [ + "test_file", + "mydb/wb/000/000/000.segment", + "mydb/wb/000/000/001.segment", + "mydb/wb/000/000/002.segment", + "mydb/wb/001/001/000.segment", + "mydb/wb/foo.json", + "mydb/data/whatevs", + ] + .iter() + .map(|&s| str_to_path(storage, s)) + .collect(); + + for f in &files { + storage + .put(f, data.as_ref(), Some(data.len())) + .await + .unwrap(); + } + + // ==================== check: prefix-list `mydb/wb` (directory) + // ==================== + let mut prefix = storage.new_path(); + prefix.push_all_dirs(&["mydb", "wb"]); + + let mut expected_000 = prefix.clone(); + expected_000.push_dir("000"); + let mut expected_001 = prefix.clone(); + expected_001.push_dir("001"); + let mut expected_location = prefix.clone(); + expected_location.set_file_name("foo.json"); + + let result = storage.list_with_delimiter(&prefix).await.unwrap(); + + assert_eq!(result.common_prefixes, vec![expected_000, expected_001]); + 
assert_eq!(result.objects.len(), 1); + + let object = &result.objects[0]; + + assert_eq!(object.location, expected_location); + assert_eq!(object.size, data.len()); + + // ==================== check: prefix-list `mydb/wb/000/000/001` (partial + // filename) ==================== + let mut prefix = storage.new_path(); + prefix.push_all_dirs(&["mydb", "wb", "000", "000"]); + prefix.set_file_name("001"); + + let mut expected_location = storage.new_path(); + expected_location.push_all_dirs(&["mydb", "wb", "000", "000"]); + expected_location.set_file_name("001.segment"); + + let result = storage.list_with_delimiter(&prefix).await.unwrap(); + assert!(result.common_prefixes.is_empty()); + assert_eq!(result.objects.len(), 1); + + let object = &result.objects[0]; + + assert_eq!(object.location, expected_location); + + // ==================== check: prefix-list `not_there` (non-existing prefix) + // ==================== + let mut prefix = storage.new_path(); + prefix.push_all_dirs(&["not_there"]); + + let result = storage.list_with_delimiter(&prefix).await.unwrap(); + assert!(result.common_prefixes.is_empty()); + assert!(result.objects.is_empty()); + + // ==================== do: remove all files ==================== + for f in &files { + storage.delete(f).await.unwrap(); + } + + // ==================== check: store is empty ==================== + let content_list = flatten_list_stream(storage, None).await?; + assert!(content_list.is_empty()); + + Ok(()) + } + + /// Parse a str as a `CloudPath` into a `DirAndFileName`, even though the + /// associated storage might not be cloud storage, to reuse the cloud + /// path parsing logic. Then convert into the correct type of path for + /// the given storage. 
+ fn str_to_path< + P: path::ObjectStorePath, + E: std::error::Error + Send + Sync, + R: AsyncRead + Unpin, + >( + storage: &impl ObjectStore, + val: &str, + ) -> P { + let cloud_path = FilePath::raw(val, false); + let parsed: DirsAndFileName = cloud_path.into(); + + let mut new_path = storage.new_path(); + for part in parsed.directories { + new_path.push_dir(part.to_string()); + } + + if let Some(file_name) = parsed.file_name { + new_path.set_file_name(file_name.to_string()); + } + new_path + } + + async fn delete_fixtures< + P: path::ObjectStorePath, + E: std::error::Error + Send + Sync, + R: AsyncRead + Unpin, + >( + storage: &impl ObjectStore, + ) { + let files: Vec<_> = [ + "test_file", + "mydb/wb/000/000/000.segment", + "mydb/wb/000/000/001.segment", + "mydb/wb/000/000/002.segment", + "mydb/wb/001/001/000.segment", + "mydb/wb/foo.json", + "mydb/data/whatevs", + ] + .iter() + .map(|&s| str_to_path(storage, s)) + .collect(); + + for f in &files { + // don't care if it errors, should fail elsewhere + let _ = storage.delete(f).await; + } + } + + // Tests TODO: + // GET nonexisting location (in_memory/file) + // DELETE nonexisting location + // PUT overwriting +} diff --git a/components/object_store/src/path/file.rs b/components/object_store/src/path/file.rs new file mode 100644 index 0000000000..acdae35f69 --- /dev/null +++ b/components/object_store/src/path/file.rs @@ -0,0 +1,518 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::{ + mem, + path::{is_separator, PathBuf}, +}; + +use crate::path::{parsed::DirsAndFileName, parts::PathPart, ObjectStorePath}; + +/// An object storage location suitable for passing to disk based object +/// storage. 
+#[derive(Debug, Clone, Default, PartialEq, Eq, Ord, PartialOrd)] +pub struct FilePath { + inner: FilePathRepresentation, +} + +impl ObjectStorePath for FilePath { + fn set_file_name(&mut self, part: impl Into) { + self.inner = mem::take(&mut self.inner).set_file_name(part); + } + + fn push_dir(&mut self, part: impl Into) { + self.inner = mem::take(&mut self.inner).push_dir(part); + } + + fn push_all_dirs<'a>(&mut self, parts: impl AsRef<[&'a str]>) { + self.inner = mem::take(&mut self.inner).push_all_dirs(parts); + } + + fn display(&self) -> String { + self.to_raw().display().to_string() + } +} + +impl FilePath { + /// Creates a file storage location from a `PathBuf` without parsing or + /// allocating unless other methods are called on this instance that + /// need it. + /// + /// The "nature" of path (i.e. if it is a directory or file) will be + /// guessed. So paths ending with a separator (e.g. `/foo/bar/` on + /// Linux) are treated as a directory. However for all other paths (like + /// `/foo/bar` on Linux) it is not clear if a directory or file is meant + /// w/o inspecting the underlying store. To workaround that there is the + /// `assume_directory` flag which will treat ambiguous paths as directories. + /// If set to `false`, these cases will be treated as files. + pub fn raw(path: impl Into, assume_directory: bool) -> Self { + let path = path.into(); + Self { + inner: FilePathRepresentation::Raw(path, assume_directory), + } + } + + /// Creates a filesystem `PathBuf` location by using the standard library's + /// `PathBuf` building implementation appropriate for the current + /// platform. 
+ pub fn to_raw(&self) -> PathBuf { + use FilePathRepresentation::*; + + match &self.inner { + Raw(path, _) => path.to_owned(), + Parsed(dirs_and_file_name) => { + let mut path: PathBuf = dirs_and_file_name + .directories + .iter() + .map(PathPart::encoded) + .collect(); + if let Some(file_name) = &dirs_and_file_name.file_name { + path.push(file_name.encoded()); + } + path + } + } + } + + /// Add the parts of `path` to the end of this path. Notably does + /// *not* behave as `PathBuf::push` does: there is no way to replace the + /// root. If `self` has a file name, that will be removed, then the + /// directories of `path` will be appended, then any file name of `path` + /// will be assigned to `self`. + pub fn push_path(&mut self, path: &Self) { + self.inner = mem::take(&mut self.inner).push_path(path) + } + + /// Add a `PathPart` to the end of the path's directories. + pub fn push_part_as_dir(&mut self, part: &PathPart) { + self.inner = mem::take(&mut self.inner).push_part_as_dir(part); + } + + /// Whether the prefix is the start of this path or not. + pub fn prefix_matches(&self, prefix: &Self) -> bool { + self.inner.prefix_matches(&prefix.inner) + } + + /// Returns all directory and file name `PathParts` in `self` after the + /// specified `prefix`. Ignores any `file_name` part of `prefix`. + /// Returns `None` if `self` dosen't start with `prefix`. + pub fn parts_after_prefix(&self, prefix: &Self) -> Option> { + self.inner.parts_after_prefix(&prefix.inner) + } + + /// Remove this path's file name, if there is one. 
+ pub fn unset_file_name(&mut self) { + self.inner = mem::take(&mut self.inner).unset_file_name(); + } +} + +impl From for DirsAndFileName { + fn from(file_path: FilePath) -> Self { + file_path.inner.into() + } +} + +impl From for FilePath { + fn from(dirs_and_file_name: DirsAndFileName) -> Self { + Self { + inner: FilePathRepresentation::Parsed(dirs_and_file_name), + } + } +} + +#[derive(Debug, Clone, Eq)] +enum FilePathRepresentation { + // raw: native path representation and also remember if we always assume it is a directory + // assume_directory: bool + Raw(PathBuf, bool), + Parsed(DirsAndFileName), +} + +impl Default for FilePathRepresentation { + fn default() -> Self { + Self::Parsed(DirsAndFileName::default()) + } +} + +impl PartialEq for FilePathRepresentation { + fn eq(&self, other: &Self) -> bool { + matches!(self.cmp(other), std::cmp::Ordering::Equal) + } +} +impl PartialOrd for FilePathRepresentation { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for FilePathRepresentation { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + use FilePathRepresentation::*; + match (self, other) { + (Parsed(self_parts), Parsed(other_parts)) => self_parts.cmp(other_parts), + (Parsed(self_parts), _) => { + let other_parts: DirsAndFileName = other.to_owned().into(); + self_parts.cmp(&other_parts) + } + (_, Parsed(other_parts)) => { + let self_parts: DirsAndFileName = self.to_owned().into(); + self_parts.cmp(other_parts) + } + _ => { + let self_parts: DirsAndFileName = self.to_owned().into(); + let other_parts: DirsAndFileName = other.to_owned().into(); + self_parts.cmp(&other_parts) + } + } + } +} + +impl FilePathRepresentation { + fn push_dir(self, part: impl Into) -> Self { + let mut dirs_and_file_name: DirsAndFileName = self.into(); + + dirs_and_file_name.push_dir(part); + Self::Parsed(dirs_and_file_name) + } + + fn push_all_dirs<'a>(self, parts: impl AsRef<[&'a str]>) -> Self { + let mut dirs_and_file_name: 
DirsAndFileName = self.into(); + + dirs_and_file_name.push_all_dirs(parts); + Self::Parsed(dirs_and_file_name) + } + + fn set_file_name(self, part: impl Into) -> Self { + let mut dirs_and_file_name: DirsAndFileName = self.into(); + + dirs_and_file_name.set_file_name(part); + Self::Parsed(dirs_and_file_name) + } + + fn unset_file_name(self) -> Self { + let mut dirs_and_file_name: DirsAndFileName = self.into(); + + dirs_and_file_name.unset_file_name(); + Self::Parsed(dirs_and_file_name) + } + + /// Add the parts of `path` to the end of this path. Notably does + /// *not* behave as `PathBuf::push` does: there is no way to replace the + /// root. If `self` has a file name, that will be removed, then the + /// directories of `path` will be appended, then any file name of `path` + /// will be assigned to `self`. + fn push_path(self, path: &FilePath) -> Self { + let DirsAndFileName { + directories: path_dirs, + file_name: path_file_name, + } = path.inner.to_owned().into(); + let mut dirs_and_file_name: DirsAndFileName = self.into(); + + dirs_and_file_name.directories.extend(path_dirs); + dirs_and_file_name.file_name = path_file_name; + + Self::Parsed(dirs_and_file_name) + } + + /// Add a `PathPart` to the end of the path's directories. 
+ fn push_part_as_dir(self, part: &PathPart) -> Self { + let mut dirs_and_file_name: DirsAndFileName = self.into(); + + dirs_and_file_name.push_part_as_dir(part); + + Self::Parsed(dirs_and_file_name) + } + + fn prefix_matches(&self, prefix: &Self) -> bool { + use FilePathRepresentation::*; + match (self, prefix) { + (Parsed(self_parts), Parsed(prefix_parts)) => self_parts.prefix_matches(prefix_parts), + (Parsed(self_parts), _) => { + let prefix_parts: DirsAndFileName = prefix.to_owned().into(); + self_parts.prefix_matches(&prefix_parts) + } + (_, Parsed(prefix_parts)) => { + let self_parts: DirsAndFileName = self.to_owned().into(); + self_parts.prefix_matches(prefix_parts) + } + _ => { + let self_parts: DirsAndFileName = self.to_owned().into(); + let prefix_parts: DirsAndFileName = prefix.to_owned().into(); + self_parts.prefix_matches(&prefix_parts) + } + } + } + + /// Returns all directory and file name `PathParts` in `self` after the + /// specified `prefix`. Ignores any `file_name` part of `prefix`. + /// Returns `None` if `self` doesn't start with `prefix`. 
+ fn parts_after_prefix(&self, prefix: &Self) -> Option> { + use FilePathRepresentation::*; + match (self, prefix) { + (Parsed(self_parts), Parsed(prefix_parts)) => { + self_parts.parts_after_prefix(prefix_parts) + } + (Parsed(self_parts), _) => { + let prefix_parts: DirsAndFileName = prefix.to_owned().into(); + self_parts.parts_after_prefix(&prefix_parts) + } + (_, Parsed(prefix_parts)) => { + let self_parts: DirsAndFileName = self.to_owned().into(); + self_parts.parts_after_prefix(prefix_parts) + } + _ => { + let self_parts: DirsAndFileName = self.to_owned().into(); + let prefix_parts: DirsAndFileName = prefix.to_owned().into(); + self_parts.parts_after_prefix(&prefix_parts) + } + } + } +} + +impl From for DirsAndFileName { + fn from(file_path_rep: FilePathRepresentation) -> Self { + use FilePathRepresentation::*; + + match file_path_rep { + Raw(path, assume_directory) => { + let mut parts: Vec = path + .iter() + .flat_map(|s| s.to_os_string().into_string().map(PathPart)) + .collect(); + + if !assume_directory && !parts.is_empty() && !is_directory(&path) { + let file_name = Some(parts.pop().expect("cannot be empty")); + Self { + directories: parts, + file_name, + } + } else { + Self { + directories: parts, + file_name: None, + } + } + } + Parsed(dirs_and_file_name) => dirs_and_file_name, + } + } +} + +/// Checks if the path is for sure a directory (i.e. ends with a separator). 
+fn is_directory(path: &std::path::Path) -> bool { + if let Some(s) = path.to_str() { + if let Some(c) = s.chars().last() { + return is_separator(c); + } + } + false +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::parsed_path; + + #[test] + fn path_buf_to_dirs_and_file_name_conversion() { + // Last section ending in `.json` is a file name + let path_buf: PathBuf = "/one/two/blah.json".into(); + let file_path = FilePath::raw(path_buf, false); + let parts: DirsAndFileName = file_path.into(); + let mut expected_parts = parsed_path!(["/", "one", "two"], "blah.json"); + expected_parts.directories[0] = PathPart("/".to_string()); // not escaped + assert_eq!(parts, expected_parts); + + // Last section ending in `.segment` is a file name + let path_buf: PathBuf = "/one/two/blah.segment".into(); + let file_path = FilePath::raw(path_buf, false); + let parts: DirsAndFileName = file_path.into(); + let mut expected_parts = parsed_path!(["/", "one", "two"], "blah.segment"); + expected_parts.directories[0] = PathPart("/".to_string()); // not escaped + assert_eq!(parts, expected_parts); + + // Last section ending in `.parquet` is a file name + let path_buf: PathBuf = "/one/two/blah.parquet".into(); + let file_path = FilePath::raw(path_buf, false); + let parts: DirsAndFileName = file_path.into(); + let mut expected_parts = parsed_path!(["/", "one", "two"], "blah.parquet"); + expected_parts.directories[0] = PathPart("/".to_string()); // not escaped + assert_eq!(parts, expected_parts); + + // Last section ending in `.txt` is NOT a file name; we don't recognize that + // extension + let path_buf: PathBuf = "/one/two/blah.txt".into(); + let file_path = FilePath::raw(path_buf, false); + let parts: DirsAndFileName = file_path.into(); + let mut expected_parts = parsed_path!(["/", "one", "two"], "blah.txt"); + expected_parts.directories[0] = PathPart("/".to_string()); // not escaped + assert_eq!(parts, expected_parts); + + // Last section containing a `.` isn't a file name + 
let path_buf: PathBuf = "/one/two/blah.blah".into(); + let file_path = FilePath::raw(path_buf, false); + let parts: DirsAndFileName = file_path.into(); + let mut expected_parts = parsed_path!(["/", "one", "two"], "blah.blah"); + expected_parts.directories[0] = PathPart("/".to_string()); // not escaped + assert_eq!(parts, expected_parts); + + // Last section starting with a `.` isn't a file name (macos temp dirs do this) + let path_buf: PathBuf = "/one/two/.blah".into(); + let file_path = FilePath::raw(path_buf, false); + let parts: DirsAndFileName = file_path.into(); + let mut expected_parts = parsed_path!(["/", "one", "two"], ".blah"); + expected_parts.directories[0] = PathPart("/".to_string()); // not escaped + assert_eq!(parts, expected_parts); + + let path_buf: PathBuf = "/a/b/d".into(); + let file_path = FilePath::raw(path_buf, false); + let parts: DirsAndFileName = file_path.into(); + let mut expected_parts = parsed_path!(["/", "a", "b"], "d"); + expected_parts.directories[0] = PathPart("/".to_string()); // not escaped + assert_eq!(parts, expected_parts); + + let path_buf: PathBuf = "/a/b/c".into(); + let file_path = FilePath::raw(path_buf, true); + let parts: DirsAndFileName = file_path.into(); + let mut expected_parts = parsed_path!(["/", "a", "b", "c"]); + expected_parts.directories[0] = PathPart("/".to_string()); // not escaped + assert_eq!(parts, expected_parts); + } + + #[test] + fn conversions() { + // dir and file name + let path_buf: PathBuf = "foo/bar/blah.json".into(); + let file_path = FilePath::raw(path_buf, false); + let parts: DirsAndFileName = file_path.into(); + + let expected_parts = parsed_path!(["foo", "bar"], "blah.json"); + assert_eq!(parts, expected_parts); + + // dir, no file name + let path_buf: PathBuf = "foo/bar/".into(); + let file_path = FilePath::raw(path_buf, false); + let parts: DirsAndFileName = file_path.into(); + + let expected_parts = parsed_path!(["foo", "bar"]); + assert_eq!(parts, expected_parts); + + // same but w/o the 
final marker + let path_buf: PathBuf = "foo/bar".into(); + let file_path = FilePath::raw(path_buf, false); + let parts: DirsAndFileName = file_path.into(); + + let expected_parts = parsed_path!(["foo"], "bar"); + assert_eq!(parts, expected_parts); + + // same but w/o the final marker, but forced to be a directory + let path_buf: PathBuf = "foo/bar".into(); + let file_path = FilePath::raw(path_buf, true); + let parts: DirsAndFileName = file_path.into(); + + let expected_parts = parsed_path!(["foo", "bar"]); + assert_eq!(parts, expected_parts); + + // no dir, file name + let path_buf: PathBuf = "blah.json".into(); + let file_path = FilePath::raw(path_buf, false); + let parts: DirsAndFileName = file_path.into(); + + let expected_parts = parsed_path!([], "blah.json"); + assert_eq!(parts, expected_parts); + + // empty + let path_buf: PathBuf = "".into(); + let file_path = FilePath::raw(path_buf, false); + let parts: DirsAndFileName = file_path.into(); + + let expected_parts = parsed_path!(); + assert_eq!(parts, expected_parts); + + // weird file name + let path_buf: PathBuf = "blah.x".into(); + let file_path = FilePath::raw(path_buf, false); + let parts: DirsAndFileName = file_path.into(); + + let expected_parts = parsed_path!("blah.x"); + assert_eq!(parts, expected_parts); + } + + #[test] + fn equality() { + let path_buf: PathBuf = "foo/bar/blah.json".into(); + let file_path = FilePath::raw(path_buf, false); + let parts: DirsAndFileName = file_path.clone().into(); + let parsed: FilePath = parts.into(); + + assert_eq!(file_path, parsed); + } + + #[test] + fn ordering() { + let a_path_buf: PathBuf = "foo/bar/a.json".into(); + let a_file_path = FilePath::raw(&a_path_buf, false); + let a_parts: DirsAndFileName = a_file_path.into(); + let a_parsed: FilePath = a_parts.into(); + + let b_path_buf: PathBuf = "foo/bar/b.json".into(); + let b_file_path = FilePath::raw(&b_path_buf, false); + + assert!(a_path_buf < b_path_buf); + assert!( + a_parsed < b_file_path, + "a was not less 
than b: a = {:#?}\nb = {:#?}", + a_parsed, + b_file_path + ); + } + + #[test] + fn path_display() { + let a_path_buf: PathBuf = "foo/bar/a.json".into(); + let expected_display = a_path_buf.display().to_string(); + let a_file_path = FilePath::raw(&a_path_buf, false); + + assert_eq!(a_file_path.display(), expected_display); + + let a_parts: DirsAndFileName = a_file_path.into(); + let a_parsed: FilePath = a_parts.into(); + + assert_eq!(a_parsed.display(), expected_display); + } + + #[test] + fn test_file_path_represent_ord() { + let file1 = FilePathRepresentation::Raw(PathBuf::from("/aa/bb"), false); + let file1_bak = FilePathRepresentation::Raw(PathBuf::from("/aa/bb"), false); + let file2 = FilePathRepresentation::Raw(PathBuf::from("/zz/aa/bb"), false); + + assert!(file1 == file1_bak); + assert!(file1 < file2) + } + + #[test] + fn test_file_path_parts_after_prefix() { + let file = FilePath::raw("/a/b/c", false); + let file2 = FilePath::raw("/a/b", true); + let ret = file.parts_after_prefix(&file2); + assert_eq!(ret, Some(vec![PathPart("c".to_string())])); + + let file = FilePath::raw("/a/b/c", false); + let file2 = FilePath::raw("/a/b", false); + let ret = file.parts_after_prefix(&file2); + assert_eq!( + ret, + Some(vec![PathPart("b".to_string()), PathPart("c".to_string())]) + ); + + let file = FilePath::raw("/a/b/d", false); + let file2 = FilePath::raw("/a/b/c/dd", true); + let ret = file.parts_after_prefix(&file2); + assert_eq!(ret, None); + + let file = FilePath::raw("/a/b/d", true); + let file2 = FilePath::raw("/a/b/c", true); + let ret = file.parts_after_prefix(&file2); + assert_eq!(ret, None); + } +} diff --git a/components/object_store/src/path/mod.rs b/components/object_store/src/path/mod.rs new file mode 100644 index 0000000000..e5922d6df8 --- /dev/null +++ b/components/object_store/src/path/mod.rs @@ -0,0 +1,35 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
This module contains code for abstracting object locations that work +//! across different backing implementations and platforms. + +pub mod file; +pub mod parsed; +pub mod parts; + +/// The delimiter to separate object namespaces, creating a directory structure. +pub const DELIMITER: &str = "/"; + +/// Universal interface for handling paths and locations for objects and +/// directories in the object store. +/// +/// +/// Deliberately does not implement `Display` or `ToString`! +pub trait ObjectStorePath: + std::fmt::Debug + Clone + PartialEq + Eq + Send + Sync + 'static +{ + /// Set the file name of this path + fn set_file_name(&mut self, part: impl Into); + + /// Add a part to the end of the path's directories, encoding any restricted + /// characters. + fn push_dir(&mut self, part: impl Into); + + /// Push a bunch of parts as directories in one go. + fn push_all_dirs<'a>(&mut self, parts: impl AsRef<[&'a str]>); + + /// Like `std::path::Path::display`, converts an `ObjectStorePath` to a + /// `String` suitable for printing; not suitable for sending to + /// APIs. + fn display(&self) -> String; +} diff --git a/components/object_store/src/path/parsed.rs b/components/object_store/src/path/parsed.rs new file mode 100644 index 0000000000..0c9781a9b6 --- /dev/null +++ b/components/object_store/src/path/parsed.rs @@ -0,0 +1,389 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use itertools::Itertools; + +use crate::path::{parts::PathPart, ObjectStorePath, DELIMITER}; + +/// A path stored as a collection of 0 or more directories and 0 or 1 file name +#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Default, Hash)] +pub struct DirsAndFileName { + /// Directory hierarchy. + pub directories: Vec, + + /// Filename, if set. 
+ pub file_name: Option, +} + +impl ObjectStorePath for DirsAndFileName { + fn set_file_name(&mut self, part: impl Into) { + let part = part.into(); + self.file_name = Some((&*part).into()); + } + + fn push_dir(&mut self, part: impl Into) { + let part = part.into(); + self.directories.push((&*part).into()); + } + + fn push_all_dirs<'a>(&mut self, parts: impl AsRef<[&'a str]>) { + self.directories + .extend(parts.as_ref().iter().map(|&v| v.into())); + } + + fn display(&self) -> String { + let mut s = self + .directories + .iter() + .map(PathPart::encoded) + .join(DELIMITER); + + if !s.is_empty() { + s.push_str(DELIMITER); + } + if let Some(file_name) = &self.file_name { + s.push_str(file_name.encoded()); + } + s + } +} + +impl DirsAndFileName { + pub(crate) fn prefix_matches(&self, prefix: &Self) -> bool { + let diff = itertools::diff_with( + self.directories.iter(), + prefix.directories.iter(), + |a, b| a == b, + ); + + use itertools::Diff; + match diff { + None => match (self.file_name.as_ref(), prefix.file_name.as_ref()) { + (Some(self_file), Some(prefix_file)) => { + self_file.encoded().starts_with(prefix_file.encoded()) + } + (Some(_self_file), None) => true, + (None, Some(_prefix_file)) => false, + (None, None) => true, + }, + Some(Diff::Shorter(_, mut remaining_self)) => { + let next_dir = remaining_self + .next() + .expect("must have at least one mismatch to be in this case"); + match prefix.file_name.as_ref() { + Some(prefix_file) => next_dir.encoded().starts_with(prefix_file.encoded()), + None => true, + } + } + Some(Diff::FirstMismatch(_, mut remaining_self, mut remaining_prefix)) => { + let first_prefix = remaining_prefix + .next() + .expect("must have at least one mismatch to be in this case"); + + // There must not be any other remaining parts in the prefix + remaining_prefix.next().is_none() + // and the next item in self must start with the last item in the prefix + && remaining_self + .next() + .expect("must be at least one value") + .encoded() + 
.starts_with(first_prefix.encoded()) + } + _ => false, + } + } + + /// Returns all directory and file name `PathParts` in `self` after the + /// specified `prefix`. Ignores any `file_name` part of `prefix`. + /// Returns `None` if `self` doesn't start with `prefix`. + pub(crate) fn parts_after_prefix(&self, prefix: &Self) -> Option> { + if self.directories.len() < prefix.directories.len() { + return None; + } + + let mut dirs_iter = self.directories.iter(); + let mut prefix_dirs_iter = prefix.directories.iter(); + + let mut parts = vec![]; + + for dir in &mut dirs_iter { + let pre = prefix_dirs_iter.next(); + + match pre { + None => { + parts.push(dir.to_owned()); + break; + } + Some(p) if p == dir => continue, + Some(_) => return None, + } + } + + parts.extend(dirs_iter.cloned()); + + if let Some(file_name) = &self.file_name { + parts.push(file_name.to_owned()); + } + + Some(parts) + } + + /// Add a `PathPart` to the end of the path's directories. + pub(crate) fn push_part_as_dir(&mut self, part: &PathPart) { + self.directories.push(part.to_owned()); + } + + /// Remove the file name, if any. + pub(crate) fn unset_file_name(&mut self) { + self.file_name = None; + } +} + +/// Short-cut macro to create [`DirsAndFileName`] instances. +/// +/// # Example +/// ``` +/// use object_store::parsed_path; +/// +/// // empty path +/// parsed_path!(); +/// +/// // filename only +/// parsed_path!("test.txt"); +/// +/// // directories only +/// parsed_path!(["path", "to"]); +/// +/// // filename + directories +/// parsed_path!(["path", "to"], "test.txt"); +/// ``` +#[macro_export] +macro_rules! 
parsed_path { + ([$($dir:expr),*], $file:expr) => { + $crate::path::parsed::DirsAndFileName { + directories: vec![$($crate::path::parts::PathPart::from($dir)),*], + file_name: Some($crate::path::parts::PathPart::from($file)), + } + }; + ([$($dir:expr),*]) => { + $crate::path::parsed::DirsAndFileName { + directories: vec![$($crate::path::parts::PathPart::from($dir)),*], + file_name: None, + } + }; + ($file:expr) => { + parsed_path!([], $file) + }; + () => { + parsed_path!([]) + }; +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parts_after_prefix_behavior() { + let mut existing_path = DirsAndFileName::default(); + existing_path.push_all_dirs(&["apple", "bear", "cow", "dog"]); + existing_path.file_name = Some("egg.json".into()); + + // Prefix with one directory + let mut prefix = DirsAndFileName::default(); + prefix.push_dir("apple"); + let expected_parts: Vec = vec!["bear", "cow", "dog", "egg.json"] + .into_iter() + .map(Into::into) + .collect(); + let parts = existing_path.parts_after_prefix(&prefix).unwrap(); + assert_eq!(parts, expected_parts); + + // Prefix with two directories + let mut prefix = DirsAndFileName::default(); + prefix.push_all_dirs(&["apple", "bear"]); + let expected_parts: Vec = vec!["cow", "dog", "egg.json"] + .into_iter() + .map(Into::into) + .collect(); + let parts = existing_path.parts_after_prefix(&prefix).unwrap(); + assert_eq!(parts, expected_parts); + + // Not a prefix + let mut prefix = DirsAndFileName::default(); + prefix.push_dir("cow"); + assert!(existing_path.parts_after_prefix(&prefix).is_none()); + + // Prefix with a partial directory + let mut prefix = DirsAndFileName::default(); + prefix.push_dir("ap"); + assert!(existing_path.parts_after_prefix(&prefix).is_none()); + + // Prefix matches but there aren't any parts after it + let mut existing_path = DirsAndFileName::default(); + existing_path.push_all_dirs(&["apple", "bear", "cow", "dog"]); + let prefix = existing_path.clone(); + let parts = 
existing_path.parts_after_prefix(&prefix).unwrap(); + assert!(parts.is_empty()); + } + + #[test] + fn prefix_matches() { + let mut haystack = DirsAndFileName::default(); + haystack.push_all_dirs(&["foo/bar", "baz%2Ftest", "something"]); + + // self starts with self + assert!( + haystack.prefix_matches(&haystack), + "{:?} should have started with {:?}", + haystack, + haystack + ); + + // a longer prefix doesn't match + let mut needle = haystack.clone(); + needle.push_dir("longer now"); + assert!( + !haystack.prefix_matches(&needle), + "{:?} shouldn't have started with {:?}", + haystack, + needle + ); + + // one dir prefix matches + let mut needle = DirsAndFileName::default(); + needle.push_dir("foo/bar"); + assert!( + haystack.prefix_matches(&needle), + "{:?} should have started with {:?}", + haystack, + needle + ); + + // two dir prefix matches + needle.push_dir("baz%2Ftest"); + assert!( + haystack.prefix_matches(&needle), + "{:?} should have started with {:?}", + haystack, + needle + ); + + // partial dir prefix matches + let mut needle = DirsAndFileName::default(); + needle.push_dir("f"); + assert!( + haystack.prefix_matches(&needle), + "{:?} should have started with {:?}", + haystack, + needle + ); + + // one dir and one partial dir matches + let mut needle = DirsAndFileName::default(); + needle.push_all_dirs(&["foo/bar", "baz"]); + assert!( + haystack.prefix_matches(&needle), + "{:?} should have started with {:?}", + haystack, + needle + ); + } + + #[test] + fn prefix_matches_with_file_name() { + let mut haystack = DirsAndFileName::default(); + haystack.push_all_dirs(&["foo/bar", "baz%2Ftest", "something"]); + + let mut needle = haystack.clone(); + + // All directories match and file name is a prefix + haystack.set_file_name("foo.segment"); + needle.set_file_name("foo"); + + assert!( + haystack.prefix_matches(&needle), + "{:?} should have started with {:?}", + haystack, + needle + ); + + // All directories match but file name is not a prefix + 
needle.set_file_name("e"); + + assert!( + !haystack.prefix_matches(&needle), + "{:?} should not have started with {:?}", + haystack, + needle + ); + + // Not all directories match; file name is a prefix of the next directory; this + // matches + let mut needle = DirsAndFileName::default(); + needle.push_all_dirs(&["foo/bar", "baz%2Ftest"]); + needle.set_file_name("s"); + + assert!( + haystack.prefix_matches(&needle), + "{:?} should have started with {:?}", + haystack, + needle + ); + + // Not all directories match; file name is NOT a prefix of the next directory; + // no match + needle.set_file_name("p"); + + assert!( + !haystack.prefix_matches(&needle), + "{:?} should not have started with {:?}", + haystack, + needle + ); + } + + #[test] + fn test_macro() { + let actual = parsed_path!(["foo", "bar"], "baz"); + let expected = DirsAndFileName { + directories: vec![PathPart::from("foo"), PathPart::from("bar")], + file_name: Some(PathPart::from("baz")), + }; + assert_eq!(actual, expected); + + let actual = parsed_path!([], "foo"); + let expected = DirsAndFileName { + directories: vec![], + file_name: Some(PathPart::from("foo")), + }; + assert_eq!(actual, expected); + + let actual = parsed_path!("foo"); + let expected = DirsAndFileName { + directories: vec![], + file_name: Some(PathPart::from("foo")), + }; + assert_eq!(actual, expected); + + let actual = parsed_path!(["foo", "bar"]); + let expected = DirsAndFileName { + directories: vec![PathPart::from("foo"), PathPart::from("bar")], + file_name: None, + }; + assert_eq!(actual, expected); + + let actual = parsed_path!([]); + let expected = DirsAndFileName { + directories: vec![], + file_name: None, + }; + assert_eq!(actual, expected); + + let actual = parsed_path!(); + let expected = DirsAndFileName { + directories: vec![], + file_name: None, + }; + assert_eq!(actual, expected); + } +} diff --git a/components/object_store/src/path/parts.rs b/components/object_store/src/path/parts.rs new file mode 100644 index 
0000000000..b9e69becfb --- /dev/null +++ b/components/object_store/src/path/parts.rs @@ -0,0 +1,142 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use percent_encoding::{percent_decode_str, percent_encode, AsciiSet, CONTROLS}; + +use super::DELIMITER; + +// percent_encode's API needs this as a byte +const DELIMITER_BYTE: u8 = DELIMITER.as_bytes()[0]; + +// special encoding of the empty string part. +// Using '%' is the safest character since it will always be used in the +// output of percent_encode no matter how we evolve the INVALID AsciiSet over +// time. +const EMPTY: &str = "%"; + +/// The PathPart type exists to validate the directory/file names that form part +/// of a path. +/// +/// A PathPart instance is guaranteed to be non-empty and to contain no `/` +/// characters as it can only be constructed by going through the `from` impl. +#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Default, Hash)] +pub struct PathPart(pub(super) String); + +/// Characters we want to encode. 
+const INVALID: &AsciiSet = &CONTROLS + // The delimiter we are reserving for internal hierarchy + .add(DELIMITER_BYTE) + // Characters AWS recommends avoiding for object keys + // https://docs.aws.amazon.com/AmazonS3/latest/dev/UsingMetadata.html + .add(b'\\') + .add(b'{') + // TODO: Non-printable ASCII characters (128–255 decimal characters) + .add(b'^') + .add(b'}') + .add(b'%') + .add(b'`') + .add(b']') + .add(b'"') // " <-- my editor is confused about double quotes within single quotes + .add(b'>') + .add(b'[') + .add(b'~') + .add(b'<') + .add(b'#') + .add(b'|') + // Characters Google Cloud Storage recommends avoiding for object names + // https://cloud.google.com/storage/docs/naming-objects + .add(b'\r') + .add(b'\n') + .add(b'*') + .add(b'?'); + +impl From<&str> for PathPart { + fn from(v: &str) -> Self { + match v { + // We don't want to encode `.` generally, but we do want to disallow parts of paths + // to be equal to `.` or `..` to prevent file system traversal shenanigans. + "." => Self(String::from("%2E")), + ".." => Self(String::from("%2E%2E")), + + // Every string except the empty string will be percent encoded. + // The empty string will be transformed into a sentinel value EMPTY + // which can safely be a prefix of an encoded value since it will be + // fully matched at decode time (see impl Display for PathPart). + "" => Self(String::from(EMPTY)), + other => Self(percent_encode(other.as_bytes(), INVALID).to_string()), + } + } +} + +impl std::fmt::Display for PathPart { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match &self.0[..] { + EMPTY => "".fmt(f), + _ => percent_decode_str(&self.0) + .decode_utf8() + .expect("Valid UTF-8 that came from String") + .fmt(f), + } + } +} + +impl PathPart { + /// Encode as string. 
+ pub fn encoded(&self) -> &str { + &self.0 + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn path_part_delimiter_gets_encoded() { + let part: PathPart = "foo/bar".into(); + assert_eq!(part, PathPart(String::from("foo%2Fbar"))); + } + + #[test] + fn path_part_gets_decoded_for_display() { + let part: PathPart = "foo/bar".into(); + assert_eq!(part.to_string(), "foo/bar"); + } + + #[test] + fn path_part_given_already_encoded_string() { + let part: PathPart = "foo%2Fbar".into(); + assert_eq!(part, PathPart(String::from("foo%252Fbar"))); + assert_eq!(part.to_string(), "foo%2Fbar"); + } + + #[test] + fn path_part_cant_be_one_dot() { + let part: PathPart = ".".into(); + assert_eq!(part, PathPart(String::from("%2E"))); + assert_eq!(part.to_string(), "."); + } + + #[test] + fn path_part_cant_be_two_dots() { + let part: PathPart = "..".into(); + assert_eq!(part, PathPart(String::from("%2E%2E"))); + assert_eq!(part.to_string(), ".."); + } + + #[test] + fn path_part_cant_be_empty() { + let part: PathPart = "".into(); + assert_eq!(part, PathPart(String::from(EMPTY))); + assert_eq!(part.to_string(), ""); + } + + #[test] + fn empty_is_safely_encoded() { + let part: PathPart = EMPTY.into(); + assert_eq!( + part, + PathPart(percent_encode(EMPTY.as_bytes(), INVALID).to_string()) + ); + assert_eq!(part.to_string(), EMPTY); + } +} diff --git a/components/parquet/Cargo.toml b/components/parquet/Cargo.toml new file mode 100644 index 0000000000..c33523280e --- /dev/null +++ b/components/parquet/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "parquet" +version = "0.1.0" +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +arrow_deps = { path = "../../arrow_deps" } +lru = "0.7.0" +parquet-format = "4.0.0" +thrift = "0.13" \ No newline at end of file diff --git a/components/parquet/src/cache.rs b/components/parquet/src/cache.rs new file mode 100644 index 0000000000..393d49b63e --- 
/dev/null +++ b/components/parquet/src/cache.rs @@ -0,0 +1,67 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::{ + fmt::Debug, + sync::{Arc, RwLock}, +}; + +use arrow_deps::parquet::file::metadata::ParquetMetaData; +use lru::LruCache; + +pub trait MetaCache: Debug { + fn get(&self, key: &str) -> Option>; + + fn put(&self, key: String, value: Arc); +} + +pub trait DataCache: Debug { + fn get(&self, key: &str) -> Option>>; + + fn put(&self, key: String, value: Arc>); +} + +#[derive(Debug)] +pub struct LruMetaCache { + cache: RwLock>>, +} + +impl LruMetaCache { + pub fn new(cap: usize) -> Self { + Self { + cache: RwLock::new(LruCache::new(cap)), + } + } +} + +impl MetaCache for LruMetaCache { + fn get(&self, key: &str) -> Option> { + self.cache.write().unwrap().get(key).cloned() + } + + fn put(&self, key: String, value: Arc) { + self.cache.write().unwrap().put(key, value); + } +} + +#[derive(Debug)] +pub struct LruDataCache { + cache: RwLock>>>, +} + +impl LruDataCache { + pub fn new(cap: usize) -> Self { + Self { + cache: RwLock::new(LruCache::new(cap)), + } + } +} + +impl DataCache for LruDataCache { + fn get(&self, key: &str) -> Option>> { + self.cache.write().unwrap().get(key).cloned() + } + + fn put(&self, key: String, value: Arc>) { + self.cache.write().unwrap().put(key, value); + } +} diff --git a/components/parquet/src/lib.rs b/components/parquet/src/lib.rs new file mode 100644 index 0000000000..b2b8d28c46 --- /dev/null +++ b/components/parquet/src/lib.rs @@ -0,0 +1,17 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +pub mod cache; +pub mod reverse_reader; +mod serialized_reader; +#[cfg(test)] +pub mod tests; + +// use cache::Cache; +use std::sync::Arc; + +pub use serialized_reader::CachableSerializedFileReader; + +use crate::cache::{DataCache, MetaCache}; + +pub type MetaCacheRef = Arc; +pub type DataCacheRef = Arc; diff --git a/components/parquet/src/reverse_reader.rs b/components/parquet/src/reverse_reader.rs new file mode 100644 index 0000000000..ca201c3bea --- /dev/null +++ b/components/parquet/src/reverse_reader.rs @@ -0,0 +1,231 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::{collections::VecDeque, sync::Arc}; + +use arrow_deps::{ + arrow::{ + datatypes::SchemaRef, + error::Result as ArrowResult, + record_batch::{RecordBatch, RecordBatchReader}, + }, + parquet::{ + arrow::{ + self, arrow_reader::ParquetRecordBatchReader, ArrowReader, ParquetFileArrowReader, + }, + errors::Result, + file::{ + metadata::{FileMetaData, ParquetMetaData}, + reader::{FileReader, RowGroupReader}, + }, + record::reader::RowIter, + schema::types::Type as SchemaType, + }, +}; + +/// The reverse reader for [FileReader]. +/// +/// The details of implementation is: +/// - Split the original [FileReader] into [RowGroup]s. +/// - Reverse all the [RowGroup]s into `reversed_readers` so the order of +/// [RowGroup] is already reversed. +/// - Reverse all the [RecordBatch]es of the [RowGroup] into the +/// `current_reversed_batches`. +/// - Pop one [RecordBatch] from the `current_reversed_batches`and reverse its +/// data and send it to caller. +pub struct ReversedFileReader { + schema: SchemaRef, + /// The readers are arranged in reversed order and built from the + /// [RowGroup]. + reversed_readers: Vec, + /// Buffer all the record batches of one reader and every record batch is + /// reversed. 
+ current_reversed_batches: VecDeque>, + next_reader_idx: usize, +} + +impl ReversedFileReader { + fn fetch_next_batches_if_necessary(&mut self) { + if !self.current_reversed_batches.is_empty() { + // current reader is not exhausted and no need to fetch data. + return; + } + + if self.next_reader_idx >= self.reversed_readers.len() { + // all the readers have been exhausted. + return; + } + + let reader = &mut self.reversed_readers[self.next_reader_idx]; + for batch in reader { + // reverse the order of the data of every record batch. + let reversed_batch = match batch { + Ok(v) => arrow_deps::util::reverse_record_batch(&v), + Err(e) => Err(e), + }; + // reverse the order of the record batches. + self.current_reversed_batches.push_front(reversed_batch); + } + + self.next_reader_idx += 1; + } +} + +impl Iterator for ReversedFileReader { + type Item = ArrowResult; + + fn next(&mut self) -> Option { + self.fetch_next_batches_if_necessary(); + self.current_reversed_batches.pop_front() + } +} + +impl RecordBatchReader for ReversedFileReader { + fn schema(&self) -> SchemaRef { + self.schema.clone() + } +} + +/// Reader for one [RowGroup] of the [FileReader]. +struct SingleRowGroupFileReader { + file_reader: Arc, + /// The index of row group in `file_reader` to read. + row_group_idx: usize, + /// The meta data for the reader of the one row group. + meta_data: ParquetMetaData, +} + +impl SingleRowGroupFileReader { + fn new(file_reader: Arc, row_group_idx: usize) -> Self { + let meta_data = { + let orig_meta_data = file_reader.metadata(); + let orig_file_meta_data = orig_meta_data.file_metadata(); + let row_group_meta_data = orig_meta_data.row_group(row_group_idx); + let file_meta_data = FileMetaData::new( + orig_file_meta_data.version(), + // provide the row group's row number because of the reader only contains one row + // group. 
+ row_group_meta_data.num_rows(), + orig_file_meta_data.created_by().clone(), + orig_file_meta_data.key_value_metadata().clone(), + orig_file_meta_data.schema_descr_ptr(), + orig_file_meta_data.column_orders().cloned(), + ); + ParquetMetaData::new(file_meta_data, vec![row_group_meta_data.clone()]) + }; + + Self { + file_reader, + row_group_idx, + meta_data, + } + } +} + +impl FileReader for SingleRowGroupFileReader { + fn metadata(&self) -> &ParquetMetaData { + &self.meta_data + } + + fn num_row_groups(&self) -> usize { + 1 + } + + fn get_row_group(&self, i: usize) -> Result> { + self.file_reader.get_row_group(self.row_group_idx + i) + } + + fn get_row_iter(&self, projection: Option) -> Result { + RowIter::from_file(projection, self) + } +} + +/// Builder for [ReverseRecordBatchReader] from the `file_reader`. +#[must_use] +pub struct Builder { + file_reader: Arc, + batch_size: usize, + projection: Option>, +} + +impl Builder { + pub fn new(file_reader: Arc, batch_size: usize) -> Self { + Self { + file_reader, + batch_size, + projection: None, + } + } + + pub fn projection(mut self, projection: Option>) -> Self { + self.projection = projection; + + self + } + + pub fn build(self) -> Result { + let mut reversed_readers = Vec::with_capacity(self.file_reader.num_row_groups()); + for row_group_idx in (0..self.file_reader.num_row_groups()).rev() { + let row_group_file_reader = + SingleRowGroupFileReader::new(self.file_reader.clone(), row_group_idx); + let mut arrow_reader = ParquetFileArrowReader::new(Arc::new(row_group_file_reader)); + let batch_reader = if let Some(proj) = &self.projection { + arrow_reader.get_record_reader_by_columns(proj.iter().cloned(), self.batch_size)? + } else { + arrow_reader.get_record_reader(self.batch_size)? 
+ }; + reversed_readers.push(batch_reader); + } + + let schema = { + let file_metadata = self.file_reader.metadata().file_metadata(); + Arc::new(arrow::parquet_to_arrow_schema( + file_metadata.schema_descr(), + file_metadata.key_value_metadata(), + )?) + }; + + Ok(ReversedFileReader { + schema, + reversed_readers, + current_reversed_batches: VecDeque::new(), + next_reader_idx: 0, + }) + } +} + +#[cfg(test)] +mod tests { + use arrow_deps::parquet::file::reader::SerializedFileReader; + + use super::*; + + const TEST_FILE: &str = "binary.parquet"; + const TEST_BATCH_SIZE: usize = 1000; + + fn check_reversed_row_iter(original: RowIter, reversed: ReversedFileReader) { + let mut original_reversed_rows: Vec<_> = original.into_iter().collect(); + original_reversed_rows.reverse(); + + let reversed_record_batches: Vec<_> = reversed + .into_iter() + .map(|v| v.expect("Fail to fetch record batch")) + .collect(); + + crate::tests::check_rows_and_record_batches( + &original_reversed_rows, + &reversed_record_batches, + ); + } + + #[test] + fn test_reverse_file_reader() { + let test_file = crate::tests::get_test_file(TEST_FILE); + let file_reader: Arc = Arc::new( + SerializedFileReader::new(test_file).expect("Should succeed to init file reader"), + ); + let reversed_reader = Builder::new(file_reader.clone(), TEST_BATCH_SIZE) + .build() + .expect("Should succeed to build reversed file reader"); + check_reversed_row_iter(file_reader.get_row_iter(None).unwrap(), reversed_reader); + } +} diff --git a/components/parquet/src/serialized_reader.rs b/components/parquet/src/serialized_reader.rs new file mode 100644 index 0000000000..a79c13ed07 --- /dev/null +++ b/components/parquet/src/serialized_reader.rs @@ -0,0 +1,738 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! fork from https://github.com/apache/arrow-rs/blob/5.2.0/parquet/src/file/serialized_reader.rs + +//! Contains implementations of the reader traits FileReader, RowGroupReader and +//! PageReader Also contains implementations of the ChunkReader for files (with +//! buffering) and byte arrays (RAM) + +use std::{fs::File, io::Read, option::Option::Some, sync::Arc}; + +use arrow_deps::parquet::{ + basic::{Compression, Encoding, Type}, + column::page::{Page, PageReader}, + compression::{create_codec, Codec}, + errors::{ParquetError, Result}, + file::{footer, metadata::*, reader::*, statistics}, + record::{reader::RowIter, Row}, + schema::types::Type as SchemaType, + util::{cursor::SliceableCursor, memory::ByteBufferPtr}, +}; +use parquet_format::{PageHeader, PageType}; +use thrift::protocol::TCompactInputProtocol; + +use crate::{DataCacheRef, MetaCacheRef}; + +fn format_page_data_key(name: &str, col_start: u64, col_length: u64) -> String { + format!("{}_{}_{}", name, col_start, col_length) +} + +/// Conversion into a [`RowIter`](crate::record::reader::RowIter) +/// using the full file schema over all row groups. 
+impl IntoIterator for CachableSerializedFileReader { + type IntoIter = RowIter<'static>; + type Item = Row; + + fn into_iter(self) -> Self::IntoIter { + RowIter::from_file_into(Box::new(self)) + } +} + +// ---------------------------------------------------------------------- +// Implementations of file & row group readers + +/// A serialized with cache implementation for Parquet [`FileReader`]. +/// Two kinds of items are cacheable: +/// - [`ParquetMetaData`]: only used for creating the reader. +/// - Column chunk bytes: used for reading data by +/// [`SerializedRowGroupReader`]. +/// +/// Note: the implementation is based on the https://github.com/apache/arrow-rs/blob/5.2.0/parquet/src/file/serialized_reader.rs. +pub struct CachableSerializedFileReader { + name: String, + chunk_reader: Arc, + metadata: Arc, + data_cache: Option, +} + +impl CachableSerializedFileReader { + /// Creates file reader from a Parquet file. + /// Returns error if Parquet file does not exist or is corrupt. + pub fn new( + name: String, + chunk_reader: R, + meta_cache: Option, + data_cache: Option, + ) -> Result { + // MODIFICATION START: consider cache for meta data. + let metadata = if let Some(meta_cache) = meta_cache { + if let Some(v) = meta_cache.get(&name) { + v + } else { + let meta_data = Arc::new(footer::parse_metadata(&chunk_reader)?); + meta_cache.put(name.clone(), meta_data.clone()); + meta_data + } + } else { + Arc::new(footer::parse_metadata(&chunk_reader)?) + }; + // MODIFICATION END. 
+ + Ok(Self { + name, + chunk_reader: Arc::new(chunk_reader), + metadata, + data_cache, + }) + } + + /// Filters row group metadata to only those row groups, + /// for which the predicate function returns true + pub fn filter_row_groups(&mut self, predicate: &dyn Fn(&RowGroupMetaData, usize) -> bool) { + let mut filtered_row_groups = Vec::::new(); + for (i, row_group_metadata) in self.metadata.row_groups().iter().enumerate() { + if predicate(row_group_metadata, i) { + filtered_row_groups.push(row_group_metadata.clone()); + } + } + self.metadata = Arc::new(ParquetMetaData::new( + self.metadata.file_metadata().clone(), + filtered_row_groups, + )); + } +} + +impl FileReader for CachableSerializedFileReader { + fn metadata(&self) -> &ParquetMetaData { + &self.metadata + } + + fn num_row_groups(&self) -> usize { + self.metadata.num_row_groups() + } + + fn get_row_group(&self, i: usize) -> Result> { + let row_group_metadata = self.metadata.row_group(i); + // Row groups should be processed sequentially. + let f = Arc::clone(&self.chunk_reader); + Ok(Box::new(SerializedRowGroupReader::new( + f, + row_group_metadata, + self.name.clone(), + self.data_cache.clone(), + ))) + } + + fn get_row_iter(&self, projection: Option) -> Result { + RowIter::from_file(projection, self) + } +} + +/// A serialized with cache implementation for Parquet [`RowGroupReader`]. +/// +/// The cache is used for column data chunk when building [`PageReader`]. +/// +/// NOTE: the implementation is based on the https://github.com/apache/arrow-rs/blob/5.2.0/parquet/src/file/serialized_reader.rs +pub struct SerializedRowGroupReader<'a, R: ChunkReader> { + chunk_reader: Arc, + metadata: &'a RowGroupMetaData, + name: String, + data_cache: Option, +} + +impl<'a, R: ChunkReader> SerializedRowGroupReader<'a, R> { + /// Creates new row group reader from a file and row group metadata. 
+ fn new( + chunk_reader: Arc, + metadata: &'a RowGroupMetaData, + name: String, + data_cache: Option, + ) -> Self { + Self { + chunk_reader, + metadata, + name, + data_cache, + } + } + + fn get_data(&self, col_start: u64, col_length: u64) -> Result> { + let mut file_chunk = self.chunk_reader.get_read(col_start, col_length as usize)?; + let mut buf = Vec::with_capacity(col_length as usize); + file_chunk.read_to_end(&mut buf).unwrap(); + Ok(buf) + } + + fn get_file_chunk(&self, col_start: u64, col_length: u64) -> Result { + if let Some(data_cache) = &self.data_cache { + let key = format_page_data_key(&self.name, col_start, col_length); + if let Some(v) = data_cache.get(&key) { + Ok(SliceableCursor::new(v)) + } else { + let buf_arc = Arc::new(self.get_data(col_start, col_length)?); + data_cache.put(key, buf_arc.clone()); + let slice = SliceableCursor::new(buf_arc); + Ok(slice) + } + } else { + let buf_arc = Arc::new(self.get_data(col_start, col_length)?); + let slice = SliceableCursor::new(buf_arc); + Ok(slice) + } + } +} + +impl<'a, R: 'static + ChunkReader> RowGroupReader for SerializedRowGroupReader<'a, R> { + fn metadata(&self) -> &RowGroupMetaData { + self.metadata + } + + fn num_columns(&self) -> usize { + self.metadata.num_columns() + } + + // TODO: fix PARQUET-816 + fn get_column_page_reader(&self, i: usize) -> Result> { + let col = self.metadata.column(i); + let (col_start, col_length) = col.byte_range(); + + // MODIFICATION START: consider the cache for the data chunk: [col_start, + // col_start+col_length). + let file_chunk = self.get_file_chunk(col_start, col_length)?; + // MODIFICATION END. + + let page_reader = SerializedPageReader::new( + file_chunk, + col.num_values(), + col.compression(), + col.column_descr().physical_type(), + )?; + Ok(Box::new(page_reader)) + } + + fn get_row_iter(&self, projection: Option) -> Result { + RowIter::from_row_group(projection, self) + } +} + +/// A serialized implementation for Parquet [`PageReader`]. 
+pub struct SerializedPageReader { + // The file source buffer which references exactly the bytes for the column trunk + // to be read by this page reader. + buf: T, + + // The compression codec for this column chunk. Only set for non-PLAIN codec. + decompressor: Option>, + + // The number of values we have seen so far. + seen_num_values: i64, + + // The number of total values in this column chunk. + total_num_values: i64, + + // Column chunk type. + physical_type: Type, +} + +impl SerializedPageReader { + /// Creates a new serialized page reader from file source. + pub fn new( + buf: T, + total_num_values: i64, + compression: Compression, + physical_type: Type, + ) -> Result { + let decompressor = create_codec(compression)?; + let result = Self { + buf, + total_num_values, + seen_num_values: 0, + decompressor, + physical_type, + }; + Ok(result) + } + + /// Reads Page header from Thrift. + fn read_page_header(&mut self) -> Result { + let mut prot = TCompactInputProtocol::new(&mut self.buf); + let page_header = PageHeader::read_from_in_protocol(&mut prot)?; + Ok(page_header) + } +} + +impl Iterator for SerializedPageReader { + type Item = Result; + + fn next(&mut self) -> Option { + self.get_next_page().transpose() + } +} + +impl PageReader for SerializedPageReader { + fn get_next_page(&mut self) -> Result> { + while self.seen_num_values < self.total_num_values { + let page_header = self.read_page_header()?; + + // When processing data page v2, depending on enabled compression for the + // page, we should account for uncompressed data ('offset') of + // repetition and definition levels. 
+ // + // We always use 0 offset for other pages other than v2, `true` flag means + // that compression will be applied if decompressor is defined + let mut offset: usize = 0; + let mut can_decompress = true; + + if let Some(ref header_v2) = page_header.data_page_header_v2 { + offset = (header_v2.definition_levels_byte_length + + header_v2.repetition_levels_byte_length) as usize; + // When is_compressed flag is missing the page is considered compressed + can_decompress = header_v2.is_compressed.unwrap_or(true); + } + + let compressed_len = page_header.compressed_page_size as usize - offset; + let uncompressed_len = page_header.uncompressed_page_size as usize - offset; + // We still need to read all bytes from buffered stream + let mut buffer = vec![0; offset + compressed_len]; + self.buf.read_exact(&mut buffer)?; + + // TODO: page header could be huge because of statistics. We should set a + // maximum page header size and abort if that is exceeded. + if let Some(decompressor) = self.decompressor.as_mut() { + if can_decompress { + let mut decompressed_buffer = Vec::with_capacity(uncompressed_len); + let decompressed_size = + decompressor.decompress(&buffer[offset..], &mut decompressed_buffer)?; + if decompressed_size != uncompressed_len { + return Err(ParquetError::General(format!( + "Actual decompressed size doesn't match the expected one ({} vs {})", + decompressed_size, uncompressed_len, + ))); + } + if offset == 0 { + buffer = decompressed_buffer; + } else { + // Prepend saved offsets to the buffer + buffer.truncate(offset); + buffer.append(&mut decompressed_buffer); + } + } + } + + let result = match page_header.type_ { + PageType::DictionaryPage => { + assert!(page_header.dictionary_page_header.is_some()); + let dict_header = page_header.dictionary_page_header.as_ref().unwrap(); + let is_sorted = dict_header.is_sorted.unwrap_or(false); + Page::DictionaryPage { + buf: ByteBufferPtr::new(buffer), + num_values: dict_header.num_values as u32, + encoding: 
Encoding::from(dict_header.encoding), + is_sorted, + } + } + PageType::DataPage => { + assert!(page_header.data_page_header.is_some()); + let header = page_header.data_page_header.unwrap(); + self.seen_num_values += header.num_values as i64; + Page::DataPage { + buf: ByteBufferPtr::new(buffer), + num_values: header.num_values as u32, + encoding: Encoding::from(header.encoding), + def_level_encoding: Encoding::from(header.definition_level_encoding), + rep_level_encoding: Encoding::from(header.repetition_level_encoding), + statistics: statistics::from_thrift(self.physical_type, header.statistics), + } + } + PageType::DataPageV2 => { + assert!(page_header.data_page_header_v2.is_some()); + let header = page_header.data_page_header_v2.unwrap(); + let is_compressed = header.is_compressed.unwrap_or(true); + self.seen_num_values += header.num_values as i64; + Page::DataPageV2 { + buf: ByteBufferPtr::new(buffer), + num_values: header.num_values as u32, + encoding: Encoding::from(header.encoding), + num_nulls: header.num_nulls as u32, + num_rows: header.num_rows as u32, + def_levels_byte_len: header.definition_levels_byte_length as u32, + rep_levels_byte_len: header.repetition_levels_byte_length as u32, + is_compressed, + statistics: statistics::from_thrift(self.physical_type, header.statistics), + } + } + _ => { + // For unknown page type (e.g., INDEX_PAGE), skip and read next. + continue; + } + }; + return Ok(Some(result)); + } + + // We are at the end of this column chunk and no more page left. Return None. 
+ Ok(None) + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use arrow_deps::parquet::basic::ColumnOrder; + + use super::*; + use crate::cache::{LruDataCache, LruMetaCache}; + + #[test] + fn test_cursor_and_file_has_the_same_behaviour() { + let mut buf: Vec = Vec::new(); + crate::tests::get_test_file("alltypes_plain.parquet") + .read_to_end(&mut buf) + .unwrap(); + let cursor = SliceableCursor::new(buf); + let read_from_cursor = + CachableSerializedFileReader::new("read_from_cursor".to_string(), cursor, None, None) + .unwrap(); + + let test_file = crate::tests::get_test_file("alltypes_plain.parquet"); + let read_from_file = + CachableSerializedFileReader::new("read_from_file".to_string(), test_file, None, None) + .unwrap(); + + let file_iter = read_from_file.get_row_iter(None).unwrap(); + let cursor_iter = read_from_cursor.get_row_iter(None).unwrap(); + + assert!(file_iter.eq(cursor_iter)); + } + + #[test] + fn test_reuse_file_chunk() { + // This test covers the case of maintaining the correct start position in a file + // stream for each column reader after initializing and moving to the next one + // (without necessarily reading the entire column). 
+ let test_file = crate::tests::get_test_file("alltypes_plain.parquet"); + let reader = + CachableSerializedFileReader::new("test".to_string(), test_file, None, None).unwrap(); + let row_group = reader.get_row_group(0).unwrap(); + + let mut page_readers = Vec::new(); + for i in 0..row_group.num_columns() { + page_readers.push(row_group.get_column_page_reader(i).unwrap()); + } + + // Now buffer each col reader, we do not expect any failures like: + // General("underlying Thrift error: end of file") + for mut page_reader in page_readers { + assert!(page_reader.get_next_page().is_ok()); + } + } + + fn new_filer_reader_with_cache() -> CachableSerializedFileReader { + let data_cache: Option = Some(Arc::new(LruDataCache::new(1000))); + let meta_cache: Option = Some(Arc::new(LruMetaCache::new(1000))); + let test_file = crate::tests::get_test_file("alltypes_plain.parquet"); + let reader_result = CachableSerializedFileReader::new( + "test".to_string(), + test_file, + meta_cache.clone(), + data_cache.clone(), + ); + assert!(reader_result.is_ok()); + reader_result.unwrap() + } + + fn test_with_file_reader(reader: &CachableSerializedFileReader) { + // Test contents in Parquet metadata + let metadata = reader.metadata(); + assert_eq!(metadata.num_row_groups(), 1); + + // Test contents in file metadata + let file_metadata = metadata.file_metadata(); + assert!(file_metadata.created_by().is_some()); + assert_eq!( + file_metadata.created_by().as_ref().unwrap(), + "impala version 1.3.0-INTERNAL (build 8a48ddb1eff84592b3fc06bc6f51ec120e1fffc9)" + ); + assert!(file_metadata.key_value_metadata().is_none()); + assert_eq!(file_metadata.num_rows(), 8); + assert_eq!(file_metadata.version(), 1); + assert_eq!(file_metadata.column_orders(), None); + + // Test contents in row group metadata + let row_group_metadata = metadata.row_group(0); + assert_eq!(row_group_metadata.num_columns(), 11); + assert_eq!(row_group_metadata.num_rows(), 8); + assert_eq!(row_group_metadata.total_byte_size(), 671); 
+ // Check each column order + for i in 0..row_group_metadata.num_columns() { + assert_eq!(file_metadata.column_order(i), ColumnOrder::UNDEFINED); + } + + // Test row group reader + let row_group_reader_result = reader.get_row_group(0); + assert!(row_group_reader_result.is_ok()); + let row_group_reader: Box = row_group_reader_result.unwrap(); + assert_eq!( + row_group_reader.num_columns(), + row_group_metadata.num_columns() + ); + assert_eq!( + row_group_reader.metadata().total_byte_size(), + row_group_metadata.total_byte_size() + ); + + // Test page readers + // TODO: test for every column + let page_reader_0_result = row_group_reader.get_column_page_reader(0); + assert!(page_reader_0_result.is_ok()); + let mut page_reader_0: Box = page_reader_0_result.unwrap(); + let mut page_count = 0; + while let Ok(Some(page)) = page_reader_0.get_next_page() { + let is_expected_page = match page { + Page::DictionaryPage { + buf, + num_values, + encoding, + is_sorted, + } => { + assert_eq!(buf.len(), 32); + assert_eq!(num_values, 8); + assert_eq!(encoding, Encoding::PLAIN_DICTIONARY); + assert!(!is_sorted); + true + } + Page::DataPage { + buf, + num_values, + encoding, + def_level_encoding, + rep_level_encoding, + statistics, + } => { + assert_eq!(buf.len(), 11); + assert_eq!(num_values, 8); + assert_eq!(encoding, Encoding::PLAIN_DICTIONARY); + assert_eq!(def_level_encoding, Encoding::RLE); + assert_eq!(rep_level_encoding, Encoding::BIT_PACKED); + assert!(statistics.is_none()); + true + } + _ => false, + }; + assert!(is_expected_page); + page_count += 1; + } + assert_eq!(page_count, 2); + } + + #[test] + fn test_file_reader() { + let test_file = crate::tests::get_test_file("alltypes_plain.parquet"); + let reader = CachableSerializedFileReader::new("test".to_string(), test_file, None, None) + .expect("Should succeed to build test reader"); + test_with_file_reader(&reader); + } + + #[test] + fn test_file_reader_with_cache() { + let reader = new_filer_reader_with_cache(); + let 
test_num = 10usize; + for _ in 0..test_num { + test_with_file_reader(&reader); + } + } + + #[test] + fn test_file_reader_datapage_v2() { + let test_file = crate::tests::get_test_file("datapage_v2.snappy.parquet"); + let reader_result = + CachableSerializedFileReader::new("test".to_string(), test_file, None, None); + assert!(reader_result.is_ok()); + let reader = reader_result.unwrap(); + + // Test contents in Parquet metadata + let metadata = reader.metadata(); + assert_eq!(metadata.num_row_groups(), 1); + + // Test contents in file metadata + let file_metadata = metadata.file_metadata(); + assert!(file_metadata.created_by().is_some()); + assert_eq!( + file_metadata.created_by().as_ref().unwrap(), + "parquet-mr version 1.8.1 (build 4aba4dae7bb0d4edbcf7923ae1339f28fd3f7fcf)" + ); + assert!(file_metadata.key_value_metadata().is_some()); + assert_eq!( + file_metadata.key_value_metadata().to_owned().unwrap().len(), + 1 + ); + + assert_eq!(file_metadata.num_rows(), 5); + assert_eq!(file_metadata.version(), 1); + assert_eq!(file_metadata.column_orders(), None); + + let row_group_metadata = metadata.row_group(0); + + // Check each column order + for i in 0..row_group_metadata.num_columns() { + assert_eq!(file_metadata.column_order(i), ColumnOrder::UNDEFINED); + } + + // Test row group reader + let row_group_reader_result = reader.get_row_group(0); + assert!(row_group_reader_result.is_ok()); + let row_group_reader: Box = row_group_reader_result.unwrap(); + assert_eq!( + row_group_reader.num_columns(), + row_group_metadata.num_columns() + ); + assert_eq!( + row_group_reader.metadata().total_byte_size(), + row_group_metadata.total_byte_size() + ); + + // Test page readers + // TODO: test for every column + let page_reader_0_result = row_group_reader.get_column_page_reader(0); + assert!(page_reader_0_result.is_ok()); + let mut page_reader_0: Box = page_reader_0_result.unwrap(); + let mut page_count = 0; + while let Ok(Some(page)) = page_reader_0.get_next_page() { + let 
is_expected_page = match page { + Page::DictionaryPage { + buf, + num_values, + encoding, + is_sorted, + } => { + assert_eq!(buf.len(), 7); + assert_eq!(num_values, 1); + assert_eq!(encoding, Encoding::PLAIN); + assert!(!is_sorted); + true + } + Page::DataPageV2 { + buf, + num_values, + encoding, + num_nulls, + num_rows, + def_levels_byte_len, + rep_levels_byte_len, + is_compressed, + statistics, + } => { + assert_eq!(buf.len(), 4); + assert_eq!(num_values, 5); + assert_eq!(encoding, Encoding::RLE_DICTIONARY); + assert_eq!(num_nulls, 1); + assert_eq!(num_rows, 5); + assert_eq!(def_levels_byte_len, 2); + assert_eq!(rep_levels_byte_len, 0); + assert!(is_compressed); + assert!(statistics.is_some()); + true + } + _ => false, + }; + assert!(is_expected_page); + page_count += 1; + } + assert_eq!(page_count, 2); + } + + #[test] + fn test_page_iterator() { + let file = crate::tests::get_test_file("alltypes_plain.parquet"); + let file_reader = Arc::new( + CachableSerializedFileReader::new("test".to_string(), file, None, None).unwrap(), + ); + + let mut page_iterator = FilePageIterator::new(0, file_reader.clone()).unwrap(); + + // read first page + let page = page_iterator.next(); + assert!(page.is_some()); + assert!(page.unwrap().is_ok()); + + // reach end of file + let page = page_iterator.next(); + assert!(page.is_none()); + + let row_group_indices = Box::new(0..1); + let mut page_iterator = + FilePageIterator::with_row_groups(0, row_group_indices, file_reader).unwrap(); + + // read first page + let page = page_iterator.next(); + assert!(page.is_some()); + assert!(page.unwrap().is_ok()); + + // reach end of file + let page = page_iterator.next(); + assert!(page.is_none()); + } + + #[test] + fn test_file_reader_key_value_metadata() { + let file = crate::tests::get_test_file("binary.parquet"); + let file_reader = Arc::new( + CachableSerializedFileReader::new("test".to_string(), file, None, None).unwrap(), + ); + + let metadata = file_reader + .metadata + .file_metadata() + 
.key_value_metadata() + .as_ref() + .unwrap(); + + assert_eq!(metadata.len(), 3); + + assert_eq!(metadata.get(0).unwrap().key, "parquet.proto.descriptor"); + + assert_eq!(metadata.get(1).unwrap().key, "writer.model.name"); + assert_eq!(metadata.get(1).unwrap().value, Some("protobuf".to_owned())); + + assert_eq!(metadata.get(2).unwrap().key, "parquet.proto.class"); + assert_eq!( + metadata.get(2).unwrap().value, + Some("foo.baz.Foobaz$Event".to_owned()) + ); + } + + #[test] + fn test_file_reader_filter_row_groups() -> Result<()> { + let test_file = crate::tests::get_test_file("alltypes_plain.parquet"); + let mut reader = + CachableSerializedFileReader::new("test".to_string(), test_file, None, None)?; + + // test initial number of row groups + let metadata = reader.metadata(); + assert_eq!(metadata.num_row_groups(), 1); + + // test filtering out all row groups + reader.filter_row_groups(&|_, _| false); + let metadata = reader.metadata(); + assert_eq!(metadata.num_row_groups(), 0); + + Ok(()) + } +} diff --git a/components/parquet/src/tests.rs b/components/parquet/src/tests.rs new file mode 100644 index 0000000000..69d6904e8f --- /dev/null +++ b/components/parquet/src/tests.rs @@ -0,0 +1,118 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::{env, error::Error, fs, path::PathBuf, str::FromStr}; + +use arrow_deps::{ + arrow::{array::*, datatypes::DataType, record_batch::RecordBatch}, + parquet::record::{Field, Row}, +}; + +fn get_data_dir( + udf_env: &str, + submodule_data: &str, +) -> std::result::Result> { + // Try user defined env. + if let Ok(dir) = env::var(udf_env) { + let trimmed = dir.trim().to_string(); + if !trimmed.is_empty() { + let pb = PathBuf::from(trimmed); + if pb.is_dir() { + return Ok(pb); + } else { + return Err(format!( + "the data dir `{}` defined by env {} not found", + pb.display(), + udf_env + ) + .into()); + } + } + } + + // The env is undefined or its value is trimmed to empty, let's try default dir. 
+ + // env "CARGO_MANIFEST_DIR" is "the directory containing the manifest of your + // package", set by `cargo run` or `cargo test`, see: + // https://doc.rust-lang.org/cargo/reference/environment-variables.html + let dir = env!("CARGO_MANIFEST_DIR"); + + let pb = PathBuf::from(dir).join(submodule_data); + if pb.is_dir() { + Ok(pb) + } else { + Err(format!( + "env `{}` is undefined or has empty value, and the pre-defined data dir `{}` not found\n\ + HINT: try running `git submodule update --init`", + udf_env, + pb.display(), + ).into()) + } +} + +fn parquet_test_data() -> String { + match get_data_dir("PARQUET_TEST_DATA", "../parquet-testing/data") { + Ok(pb) => pb.display().to_string(), + Err(err) => panic!("failed to get parquet data dir: {}", err), + } +} + +/// Returns path to the test parquet file in 'data' directory +fn get_test_path(file_name: &str) -> PathBuf { + let mut pathbuf = PathBuf::from_str(&parquet_test_data()).unwrap(); + pathbuf.push(file_name); + pathbuf +} + +/// Returns file handle for a test parquet file from 'data' directory +pub fn get_test_file(file_name: &str) -> fs::File { + let path = get_test_path(file_name); + fs::File::open(path.as_path()).unwrap_or_else(|err| { + panic!( + "Test file {} could not be opened, did you do `git submodule update`?: {}", + path.display(), + err + ) + }) +} + +struct RowViewOfRecordBatch<'a> { + record_batch: &'a RecordBatch, + row_idx: usize, +} + +impl<'a> RowViewOfRecordBatch<'a> { + fn check_row(&self, expect_row: &Row) { + for (col_idx, (_, field)) in expect_row.get_column_iter().enumerate() { + let array_ref = self.record_batch.column(col_idx); + + match array_ref.data_type() { + DataType::Binary => { + let array = array_ref.as_any().downcast_ref::().unwrap(); + let v = array.value(self.row_idx); + + if let Field::Bytes(field_value) = field { + assert_eq!(v, field_value.data()); + } else { + panic!("different value type"); + } + } + _ => unimplemented!("not support {:?}", array_ref.data_type()), + } + 
} + } +} + +pub fn check_rows_and_record_batches(rows: &[Row], record_batches: &[RecordBatch]) { + let mut row_idx = 0; + for record_batch in record_batches { + for row_idx_in_batch in 0..record_batch.num_rows() { + let expect_row = &rows[row_idx]; + let row_view = RowViewOfRecordBatch { + record_batch, + row_idx: row_idx_in_batch, + }; + row_view.check_row(expect_row); + row_idx += 1; + } + } +} diff --git a/components/profile/Cargo.toml b/components/profile/Cargo.toml new file mode 100644 index 0000000000..044fb5685a --- /dev/null +++ b/components/profile/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "profile" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html +[dependencies] +log = "0.4" +tempfile = "3.0" +jemallocator = "0.3.2" +jemalloc-ctl = "0.3.2" + +[dependencies.jemalloc-sys] +version = "0.3.2" +features = ["stats", "profiling", "unprefixed_malloc_on_supported_platforms"] diff --git a/components/profile/src/lib.rs b/components/profile/src/lib.rs new file mode 100644 index 0000000000..2f63f8c536 --- /dev/null +++ b/components/profile/src/lib.rs @@ -0,0 +1,142 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Memory profiler for running application based on jemalloc features. 
+ +use std::{ + fmt::Formatter, + fs::File, + io, + io::Read, + sync::{Mutex, MutexGuard}, + thread, time, +}; + +use jemalloc_ctl::{Access, AsName}; +use jemallocator; +use log::{error, info}; + +#[derive(Debug)] +pub enum Error { + Internal { msg: String }, + IO(io::Error), + Jemalloc(jemalloc_ctl::Error), +} + +impl std::fmt::Display for Error { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "Profile Error: {:?}", self) + } +} + +impl std::error::Error for Error {} + +pub type Result = std::result::Result; + +#[global_allocator] +static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; + +const PROF_ACTIVE: &'static [u8] = b"prof.active\0"; +const PROF_DUMP: &'static [u8] = b"prof.dump\0"; +const PROFILE_OUTPUT: &'static [u8] = b"profile.out\0"; +const PROFILE_OUTPUT_FILE_PATH: &str = "/tmp/profile.out"; + +fn set_prof_active(active: bool) -> Result<()> { + let name = PROF_ACTIVE.name(); + name.write(active).map_err(|e| Error::Jemalloc(e)) +} + +fn dump_profile() -> Result<()> { + let name = PROF_DUMP.name(); + name.write(PROFILE_OUTPUT).map_err(|e| Error::Jemalloc(e)) +} + +struct ProfLockGuard<'a>(MutexGuard<'a, ()>); + +/// ProfLockGuard hold the profile lock and take responsibilities for +/// (de)activating mem profiling. NOTE: Keeping mem profiling on may cause some +/// extra runtime cost so we choose to activating it dynamically. +impl<'a> ProfLockGuard<'a> { + pub fn new(guard: MutexGuard<'a, ()>) -> Result { + set_prof_active(true)?; + Ok(Self(guard)) + } +} + +impl<'a> Drop for ProfLockGuard<'a> { + fn drop(&mut self) { + if let Err(e) = set_prof_active(false) { + error!("Fail to deactivate profiling, err:{}", e); + } + } +} + +pub struct Profiler { + mem_prof_lock: Mutex<()>, +} + +impl Default for Profiler { + fn default() -> Self { + Self::new() + } +} + +impl Profiler { + pub fn new() -> Self { + Self { + mem_prof_lock: Mutex::new(()), + } + } + + // dump_mem_prof collects mem profiling data in `seconds`. 
+ // TODO(xikai): limit the profiling duration + pub fn dump_mem_prof(&self, seconds: u64) -> Result> { + // concurrent profiling is disabled. + let lock_guard = self.mem_prof_lock.try_lock().map_err(|e| Error::Internal { + msg: format!("failed to acquire mem_prof_lock, err:{}", e), + })?; + + let _guard = ProfLockGuard::new(lock_guard)?; + + info!( + "Profiler::dump_mem_prof start memory profiling {} seconds", + seconds + ); + // wait for seconds for collect the profiling data + thread::sleep(time::Duration::from_secs(seconds)); + + // clearing the profile output file before dumping profile results. + { + let f = File::open(PROFILE_OUTPUT_FILE_PATH).map_err(|e| { + error!("Failed to open prof data file, err:{}", e); + Error::IO(e) + })?; + f.set_len(0).map_err(|e| { + error!("Failed to truncate profile output file, err:{}", e); + Error::IO(e) + })?; + } + + // dump the profile results to profile output file. + dump_profile().map_err(|e| { + error!( + "Failed to dump prof to {}, err:{}", + PROFILE_OUTPUT_FILE_PATH, e + ); + e + })?; + + // read the profile results into buffer + let mut f = File::open(PROFILE_OUTPUT_FILE_PATH).map_err(|e| { + error!("Failed to open prof data file, err:{}", e); + Error::IO(e) + })?; + + let mut buffer = Vec::new(); + f.read_to_end(&mut buffer).map_err(|e| { + error!("Failed to read prof data file, err:{}", e); + Error::IO(e) + })?; + + Ok(buffer) + } +} diff --git a/components/rust-hyperloglog/.github/dependabot.yml b/components/rust-hyperloglog/.github/dependabot.yml new file mode 100644 index 0000000000..66cef947a2 --- /dev/null +++ b/components/rust-hyperloglog/.github/dependabot.yml @@ -0,0 +1,10 @@ +# // Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +version: 2 +updates: +- package-ecosystem: cargo + directory: "/" + schedule: + interval: daily + time: "04:00" + open-pull-requests-limit: 10 diff --git a/components/rust-hyperloglog/.gitignore b/components/rust-hyperloglog/.gitignore new file mode 100644 index 0000000000..4468cbfb1c --- /dev/null +++ b/components/rust-hyperloglog/.gitignore @@ -0,0 +1,7 @@ +*.dSYM +*~ +.rust +build +Cargo.lock +src/hyperloglog/hyperloglog +target diff --git a/components/rust-hyperloglog/.travis.yml b/components/rust-hyperloglog/.travis.yml new file mode 100644 index 0000000000..52635e58a8 --- /dev/null +++ b/components/rust-hyperloglog/.travis.yml @@ -0,0 +1,6 @@ +# // Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +language: rust +rust: + - nightly + - stable diff --git a/components/rust-hyperloglog/Cargo.toml b/components/rust-hyperloglog/Cargo.toml new file mode 100644 index 0000000000..40c7cb83f1 --- /dev/null +++ b/components/rust-hyperloglog/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "hyperloglog" +version = "1.0.0" +authors = ["Frank Denis "] +description = "Hyperloglog implementation in Rust" +license = "ISC" +homepage = "https://github.com/jedisct1/rust-hyperloglog" +repository = "https://github.com/jedisct1/rust-hyperloglog" +edition = "2018" + +[lib] +name = "hyperloglog" +path = "src/hyperloglog/lib.rs" + +[dependencies] +bytecount = "0.6" +bytes = { path = "../bytes" } +rand = "0.8.0" +siphasher = "0.3" +snafu = { version ="0.6.10", features = ["backtraces"]} diff --git a/components/rust-hyperloglog/LICENSE b/components/rust-hyperloglog/LICENSE new file mode 100644 index 0000000000..ab647ead82 --- /dev/null +++ b/components/rust-hyperloglog/LICENSE @@ -0,0 +1,23 @@ +Copyright (c) 2013-2016, Frank Denis +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + + Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/components/rust-hyperloglog/README.md b/components/rust-hyperloglog/README.md new file mode 100644 index 0000000000..f104f9d59a --- /dev/null +++ b/components/rust-hyperloglog/README.md @@ -0,0 +1,27 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +hyperloglog +=========== + +A [HyperLogLog](https://static.googleusercontent.com/media/research.google.com/en/us/pubs/archive/40671.pdf) implementation in Rust, with bias correction. 
+ +Installation: use [Cargo](http://crates.io): + +```toml +[dependencies] +hyperloglog = "0" +``` + +Usage: + +```rust +let mut hll = HyperLogLog::new(error_rate); +hll.insert(&"test1"); +hll.insert(&"test2"); +let card_estimation = hll.len(); + +let mut hll2 = HyperLogLog::new_from_template(&hll); +hll2.insert(&"test3"); + +hll.merge(&hll2); +``` diff --git a/components/rust-hyperloglog/THANKS b/components/rust-hyperloglog/THANKS new file mode 100644 index 0000000000..091c37fc33 --- /dev/null +++ b/components/rust-hyperloglog/THANKS @@ -0,0 +1,3 @@ +Nelson Gonçalves (@goncalvesnelson) +Vasily Evseenko (@svpcom) +for Python's hyperloglog implementation this code is based on. diff --git a/components/rust-hyperloglog/src/hyperloglog/lib.rs b/components/rust-hyperloglog/src/hyperloglog/lib.rs new file mode 100644 index 0000000000..242ae9980e --- /dev/null +++ b/components/rust-hyperloglog/src/hyperloglog/lib.rs @@ -0,0 +1,4264 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +// (C)opyleft 2013-2019 Frank Denis + +//! HyperLogLog implementation for Rust +//! +//! 
Forked from +#![crate_name = "hyperloglog"] +#![warn(non_camel_case_types, non_upper_case_globals, unused_qualifications)] +#![allow(non_snake_case)] +#![allow(clippy::unreadable_literal)] + +use std::{ + cmp::Ordering::{Equal, Greater, Less}, + hash::{Hash, Hasher}, + iter::repeat, +}; + +use bytes::{MemBuf, MemBufMut}; +use siphasher::sip::SipHasher13; +use snafu::{ResultExt, Snafu}; + +static TRESHOLD_DATA: [f64; 15] = [ + 10.0, 20.0, 40.0, 80.0, 220.0, 400.0, 900.0, 1800.0, 3100.0, 6500.0, 11500.0, 20000.0, 50000.0, + 120000.0, 350000.0, +]; + +static RAW_ESTIMATE_DATA: &[&[f64]] = &[ + &[ + 11.0, 11.717, 12.207, 12.7896, 13.2882, 13.8204, 14.3772, 14.9342, 15.5202, 16.161, + 16.7722, 17.4636, 18.0396, 18.6766, 19.3566, 20.0454, 20.7936, 21.4856, 22.2666, 22.9946, + 23.766, 24.4692, 25.3638, 26.0764, 26.7864, 27.7602, 28.4814, 29.433, 30.2926, 31.0664, + 31.9996, 32.7956, 33.5366, 34.5894, 35.5738, 36.2698, 37.3682, 38.0544, 39.2342, 40.0108, + 40.7966, 41.9298, 42.8704, 43.6358, 44.5194, 45.773, 46.6772, 47.6174, 48.4888, 49.3304, + 50.2506, 51.4996, 52.3824, 53.3078, 54.3984, 55.5838, 56.6618, 57.2174, 58.3514, 59.0802, + 60.1482, 61.0376, 62.3598, 62.8078, 63.9744, 64.914, 65.781, 67.1806, 68.0594, 68.8446, + 69.7928, 70.8248, 71.8324, 72.8598, 73.6246, 74.7014, 75.393, 76.6708, 77.2394, + ], + &[ + 23.0, 23.1194, 23.8208, 24.2318, 24.77, 25.2436, 25.7774, 26.2848, 26.8224, 27.3742, + 27.9336, 28.503, 29.0494, 29.6292, 30.2124, 30.798, 31.367, 31.9728, 32.5944, 33.217, + 33.8438, 34.3696, 35.0956, 35.7044, 36.324, 37.0668, 37.6698, 38.3644, 39.049, 39.6918, + 40.4146, 41.082, 41.687, 42.5398, 43.2462, 43.857, 44.6606, 45.4168, 46.1248, 46.9222, + 47.6804, 48.447, 49.3454, 49.9594, 50.7636, 51.5776, 52.331, 53.19, 53.9676, 54.7564, + 55.5314, 56.4442, 57.3708, 57.9774, 58.9624, 59.8796, 60.755, 61.472, 62.2076, 63.1024, + 63.8908, 64.7338, 65.7728, 66.629, 67.413, 68.3266, 69.1524, 70.2642, 71.1806, 72.0566, + 72.9192, 73.7598, 74.3516, 75.5802, 76.4386, 
77.4916, 78.1524, 79.1892, 79.8414, 80.8798, + 81.8376, 82.4698, 83.7656, 84.331, 85.5914, 86.6012, 87.7016, 88.5582, 89.3394, 90.3544, + 91.4912, 92.308, 93.3552, 93.9746, 95.2052, 95.727, 97.1322, 98.3944, 98.7588, 100.242, + 101.1914, 102.2538, 102.8776, 103.6292, 105.1932, 105.9152, 107.0868, 107.6728, 108.7144, + 110.3114, 110.8716, 111.245, 112.7908, 113.7064, 114.636, 115.7464, 116.1788, 117.7464, + 118.4896, 119.6166, 120.5082, 121.7798, 122.9028, 123.4426, 124.8854, 125.705, 126.4652, + 128.3464, 128.3462, 130.0398, 131.0342, 131.0042, 132.4766, 133.511, 134.7252, 135.425, + 136.5172, 138.0572, 138.6694, 139.3712, 140.8598, 141.4594, 142.554, 143.4006, 144.7374, + 146.1634, 146.8994, 147.605, 147.9304, 149.1636, 150.2468, 151.5876, 152.2096, 153.7032, + 154.7146, 155.807, 156.9228, 157.0372, 158.5852, + ], + &[ + 46.0, 46.1902, 47.271, 47.8358, 48.8142, 49.2854, 50.317, 51.354, 51.8924, 52.9436, + 53.4596, 54.5262, 55.6248, 56.1574, 57.2822, 57.837, 58.9636, 60.074, 60.7042, 61.7976, + 62.4772, 63.6564, 64.7942, 65.5004, 66.686, 67.291, 68.5672, 69.8556, 70.4982, 71.8204, + 72.4252, 73.7744, 75.0786, 75.8344, 77.0294, 77.8098, 79.0794, 80.5732, 81.1878, 82.5648, + 83.2902, 84.6784, 85.3352, 86.8946, 88.3712, 89.0852, 90.499, 91.2686, 92.6844, 94.2234, + 94.9732, 96.3356, 97.2286, 98.7262, 100.3284, 101.1048, 102.5962, 103.3562, 105.1272, + 106.4184, 107.4974, 109.0822, 109.856, 111.48, 113.2834, 114.0208, 115.637, 116.5174, + 118.0576, 119.7476, 120.427, 122.1326, 123.2372, 125.2788, 126.6776, 127.7926, 129.1952, + 129.9564, 131.6454, 133.87, 134.5428, 136.2, 137.0294, 138.6278, 139.6782, 141.792, + 143.3516, 144.2832, 146.0394, 147.0748, 148.4912, 150.849, 151.696, 153.5404, 154.073, + 156.3714, 157.7216, 158.7328, 160.4208, 161.4184, 163.9424, 165.2772, 166.411, 168.1308, + 168.769, 170.9258, 172.6828, 173.7502, 175.706, 176.3886, 179.0186, 180.4518, 181.927, + 183.4172, 184.4114, 186.033, 188.5124, 189.5564, 191.6008, 192.4172, 193.8044, 194.997, + 
197.4548, 198.8948, 200.2346, 202.3086, 203.1548, 204.8842, 206.6508, 206.6772, 209.7254, + 210.4752, 212.7228, 214.6614, 215.1676, 217.793, 218.0006, 219.9052, 221.66, 223.5588, + 225.1636, 225.6882, 227.7126, 229.4502, 231.1978, 232.9756, 233.1654, 236.727, 238.1974, + 237.7474, 241.1346, 242.3048, 244.1948, 245.3134, 246.879, 249.1204, 249.853, 252.6792, + 253.857, 254.4486, 257.2362, 257.9534, 260.0286, 260.5632, 262.663, 264.723, 265.7566, + 267.2566, 267.1624, 270.62, 272.8216, 273.2166, 275.2056, 276.2202, 278.3726, 280.3344, + 281.9284, 283.9728, 284.1924, 286.4872, 287.587, 289.807, 291.1206, 292.769, 294.8708, + 296.665, 297.1182, 299.4012, 300.6352, 302.1354, 304.1756, 306.1606, 307.3462, 308.5214, + 309.4134, 310.8352, 313.9684, 315.837, 316.7796, 318.9858, + ], + &[ + 92.0, 93.4934, 94.9758, 96.4574, 97.9718, 99.4954, 101.5302, 103.0756, 104.6374, 106.1782, + 107.7888, 109.9522, 111.592, 113.2532, 114.9086, 116.5938, 118.9474, 120.6796, 122.4394, + 124.2176, 125.9768, 128.4214, 130.2528, 132.0102, 133.8658, 135.7278, 138.3044, 140.1316, + 142.093, 144.0032, 145.9092, 148.6306, 150.5294, 152.5756, 154.6508, 156.662, 159.552, + 161.3724, 163.617, 165.5754, 167.7872, 169.8444, 172.7988, 174.8606, 177.2118, 179.3566, + 181.4476, 184.5882, 186.6816, 189.0824, 191.0258, 193.6048, 196.4436, 198.7274, 200.957, + 203.147, 205.4364, 208.7592, 211.3386, 213.781, 215.8028, 218.656, 221.6544, 223.996, + 226.4718, 229.1544, 231.6098, 234.5956, 237.0616, 239.5758, 242.4878, 244.5244, 248.2146, + 250.724, 252.8722, 255.5198, 258.0414, 261.941, 264.9048, 266.87, 269.4304, 272.028, + 274.4708, 278.37, 281.0624, 283.4668, 286.5532, 289.4352, 293.2564, 295.2744, 298.2118, + 300.7472, 304.1456, 307.2928, 309.7504, 312.5528, 315.979, 318.2102, 322.1834, 324.3494, + 327.325, 330.6614, 332.903, 337.2544, 339.9042, 343.215, 345.2864, 348.0814, 352.6764, + 355.301, 357.139, 360.658, 363.1732, 366.5902, 369.9538, 373.0828, 375.922, 378.9902, + 382.7328, 386.4538, 388.1136, 
391.2234, 394.0878, 396.708, 401.1556, 404.1852, 406.6372, + 409.6822, 412.7796, 416.6078, 418.4916, 422.131, 424.5376, 428.1988, 432.211, 434.4502, + 438.5282, 440.912, 444.0448, 447.7432, 450.8524, 453.7988, 456.7858, 458.8868, 463.9886, + 466.5064, 468.9124, 472.6616, 475.4682, 478.582, 481.304, 485.2738, 488.6894, 490.329, + 496.106, 497.6908, 501.1374, 504.5322, 506.8848, 510.3324, 513.4512, 516.179, 520.4412, + 522.6066, 526.167, 528.7794, 533.379, 536.067, 538.46, 542.9116, 545.692, 547.9546, + 552.493, 555.2722, 557.335, 562.449, 564.2014, 569.0738, 571.0974, 574.8564, 578.2996, + 581.409, 583.9704, 585.8098, 589.6528, 594.5998, 595.958, 600.068, 603.3278, 608.2016, + 609.9632, 612.864, 615.43, 620.7794, 621.272, 625.8644, 629.206, 633.219, 634.5154, + 638.6102, + ], + &[ + 184.2152, 187.2454, 190.2096, 193.6652, 196.6312, 199.6822, 203.249, 206.3296, 210.0038, + 213.2074, 216.4612, 220.27, 223.5178, 227.4412, 230.8032, 234.1634, 238.1688, 241.6074, + 245.6946, 249.2664, 252.8228, 257.0432, 260.6824, 264.9464, 268.6268, 272.2626, 276.8376, + 280.4034, 284.8956, 288.8522, 292.7638, 297.3552, 301.3556, 305.7526, 309.9292, 313.8954, + 318.8198, 322.7668, 327.298, 331.6688, 335.9466, 340.9746, 345.1672, 349.3474, 354.3028, + 358.8912, 364.114, 368.4646, 372.9744, 378.4092, 382.6022, 387.843, 392.5684, 397.1652, + 402.5426, 407.4152, 412.5388, 417.3592, 422.1366, 427.486, 432.3918, 437.5076, 442.509, + 447.3834, 453.3498, 458.0668, 463.7346, 469.1228, 473.4528, 479.7, 484.644, 491.0518, + 495.5774, 500.9068, 506.432, 512.1666, 517.434, 522.6644, 527.4894, 533.6312, 538.3804, + 544.292, 550.5496, 556.0234, 562.8206, 566.6146, 572.4188, 579.117, 583.6762, 590.6576, + 595.7864, 601.509, 607.5334, 612.9204, 619.772, 624.2924, 630.8654, 636.1836, 642.745, + 649.1316, 655.0386, 660.0136, 666.6342, 671.6196, 678.1866, 684.4282, 689.3324, 695.4794, + 702.5038, 708.129, 713.528, 720.3204, 726.463, 732.7928, 739.123, 744.7418, 751.2192, + 756.5102, 762.6066, 769.0184, 
775.2224, 781.4014, 787.7618, 794.1436, 798.6506, 805.6378, + 811.766, 819.7514, 824.5776, 828.7322, 837.8048, 843.6302, 849.9336, 854.4798, 861.3388, + 867.9894, 873.8196, 880.3136, 886.2308, 892.4588, 899.0816, 905.4076, 912.0064, 917.3878, + 923.619, 929.998, 937.3482, 943.9506, 947.991, 955.1144, 962.203, 968.8222, 975.7324, + 981.7826, 988.7666, 994.2648, 1000.3128, 1007.4082, 1013.7536, 1020.3376, 1026.7156, + 1031.7478, 1037.4292, 1045.393, 1051.2278, 1058.3434, 1062.8726, 1071.884, 1076.806, + 1082.9176, 1089.1678, 1095.5032, 1102.525, 1107.2264, 1115.315, 1120.93, 1127.252, + 1134.1496, 1139.0408, 1147.5448, 1153.3296, 1158.1974, 1166.5262, 1174.3328, 1175.657, + 1184.4222, 1190.9172, 1197.1292, 1204.4606, 1210.4578, 1218.8728, 1225.3336, 1226.6592, + 1236.5768, 1241.363, 1249.4074, 1254.6566, 1260.8014, 1266.5454, 1274.5192, + ], + &[ + 369.0, 374.8294, 381.2452, 387.6698, 394.1464, 400.2024, 406.8782, 413.6598, 420.462, + 427.2826, 433.7102, 440.7416, 447.9366, 455.1046, 462.285, 469.0668, 476.306, 483.8448, + 491.301, 498.9886, 506.2422, 513.8138, 521.7074, 529.7428, 537.8402, 545.1664, 553.3534, + 561.594, 569.6886, 577.7876, 585.65, 594.228, 602.8036, 611.1666, 620.0818, 628.0824, + 637.2574, 646.302, 655.1644, 664.0056, 672.3802, 681.7192, 690.5234, 700.2084, 708.831, + 718.485, 728.1112, 737.4764, 746.76, 756.3368, 766.5538, 775.5058, 785.2646, 795.5902, + 804.3818, 814.8998, 824.9532, 835.2062, 845.2798, 854.4728, 864.9582, 875.3292, 886.171, + 896.781, 906.5716, 916.7048, 927.5322, 937.875, 949.3972, 958.3464, 969.7274, 980.2834, + 992.1444, 1003.4264, 1013.0166, 1024.018, 1035.0438, 1046.34, 1057.6856, 1068.9836, + 1079.0312, 1091.677, 1102.3188, 1113.4846, 1124.4424, 1135.739, 1147.1488, 1158.9202, + 1169.406, 1181.5342, 1193.2834, 1203.8954, 1216.3286, 1226.2146, 1239.6684, 1251.9946, + 1262.123, 1275.4338, 1285.7378, 1296.076, 1308.9692, 1320.4964, 1333.0998, 1343.9864, + 1357.7754, 1368.3208, 1380.4838, 1392.7388, 1406.0758, 1416.9098, 
1428.9728, 1440.9228, + 1453.9292, 1462.617, 1476.05, 1490.2996, 1500.6128, 1513.7392, 1524.5174, 1536.6322, + 1548.2584, 1562.3766, 1572.423, 1587.1232, 1596.5164, 1610.5938, 1622.5972, 1633.1222, + 1647.7674, 1658.5044, 1671.57, 1683.7044, 1695.4142, 1708.7102, 1720.6094, 1732.6522, + 1747.841, 1756.4072, 1769.9786, 1782.3276, 1797.5216, 1808.3186, 1819.0694, 1834.354, + 1844.575, 1856.2808, 1871.1288, 1880.7852, 1893.9622, 1906.3418, 1920.6548, 1932.9302, + 1945.8584, 1955.473, 1968.8248, 1980.6446, 1995.9598, 2008.349, 2019.8556, 2033.0334, + 2044.0206, 2059.3956, 2069.9174, 2082.6084, 2093.7036, 2106.6108, 2118.9124, 2132.301, + 2144.7628, 2159.8422, 2171.0212, 2183.101, 2193.5112, 2208.052, 2221.3194, 2233.3282, + 2247.295, 2257.7222, 2273.342, 2286.5638, 2299.6786, 2310.8114, 2322.3312, 2335.516, + 2349.874, 2363.5968, 2373.865, 2387.1918, 2401.8328, 2414.8496, 2424.544, 2436.7592, + 2447.1682, 2464.1958, 2474.3438, 2489.0006, 2497.4526, 2513.6586, 2527.19, 2540.7028, + 2553.768, + ], + &[ + 738.1256, 750.4234, 763.1064, 775.4732, 788.4636, 801.0644, 814.488, 827.9654, 841.0832, + 854.7864, 868.1992, 882.2176, 896.5228, 910.1716, 924.7752, 938.899, 953.6126, 968.6492, + 982.9474, 998.5214, 1013.1064, 1028.6364, 1044.2468, 1059.4588, 1075.3832, 1091.0584, + 1106.8606, 1123.3868, 1139.5062, 1156.1862, 1172.463, 1189.339, 1206.1936, 1223.1292, + 1240.1854, 1257.2908, 1275.3324, 1292.8518, 1310.5204, 1328.4854, 1345.9318, 1364.552, + 1381.4658, 1400.4256, 1419.849, 1438.152, 1456.8956, 1474.8792, 1494.118, 1513.62, + 1532.5132, 1551.9322, 1570.7726, 1590.6086, 1610.5332, 1630.5918, 1650.4294, 1669.7662, + 1690.4106, 1710.7338, 1730.9012, 1750.4486, 1770.1556, 1791.6338, 1812.7312, 1833.6264, + 1853.9526, 1874.8742, 1896.8326, 1918.1966, 1939.5594, 1961.07, 1983.037, 2003.1804, + 2026.071, 2047.4884, 2070.0848, 2091.2944, 2114.333, 2135.9626, 2158.2902, 2181.0814, + 2202.0334, 2224.4832, 2246.39, 2269.7202, 2292.1714, 2314.2358, 2338.9346, 2360.891, + 2384.0264, 
2408.3834, 2430.1544, 2454.8684, 2476.9896, 2501.4368, 2522.8702, 2548.0408, + 2570.6738, 2593.5208, 2617.0158, 2640.2302, 2664.0962, 2687.4986, 2714.2588, 2735.3914, + 2759.6244, 2781.8378, 2808.0072, 2830.6516, 2856.2454, 2877.2136, 2903.4546, 2926.785, + 2951.2294, 2976.468, 3000.867, 3023.6508, 3049.91, 3073.5984, 3098.162, 3121.5564, + 3146.2328, 3170.9484, 3195.5902, 3221.3346, 3242.7032, 3271.6112, 3296.5546, 3317.7376, + 3345.072, 3369.9518, 3394.326, 3418.1818, 3444.6926, 3469.086, 3494.2754, 3517.8698, + 3544.248, 3565.3768, 3588.7234, 3616.979, 3643.7504, 3668.6812, 3695.72, 3719.7392, + 3742.6224, 3770.4456, 3795.6602, 3819.9058, 3844.002, 3869.517, 3895.6824, 3920.8622, + 3947.1364, 3973.985, 3995.4772, 4021.62, 4046.628, 4074.65, 4096.2256, 4121.831, 4146.6406, + 4173.276, 4195.0744, 4223.9696, 4251.3708, 4272.9966, 4300.8046, 4326.302, 4353.1248, + 4374.312, 4403.0322, 4426.819, 4450.0598, 4478.5206, 4504.8116, 4528.8928, 4553.9584, + 4578.8712, 4603.8384, 4632.3872, 4655.5128, 4675.821, 4704.6222, 4731.9862, 4755.4174, + 4781.2628, 4804.332, 4832.3048, 4862.8752, 4883.4148, 4906.9544, 4935.3516, 4954.3532, + 4984.0248, 5011.217, 5035.3258, 5057.3672, 5084.1828, + ], + &[ + 1477.0, 1501.6014, 1526.5802, 1551.7942, 1577.3042, 1603.2062, 1629.8402, 1656.2292, + 1682.9462, 1709.9926, 1737.3026, 1765.4252, 1793.0578, 1821.6092, 1849.626, 1878.5568, + 1908.527, 1937.5154, 1967.1874, 1997.3878, 2027.37, 2058.1972, 2089.5728, 2120.1012, + 2151.9668, 2183.292, 2216.0772, 2247.8578, 2280.6562, 2313.041, 2345.714, 2380.3112, + 2414.1806, 2447.9854, 2481.656, 2516.346, 2551.5154, 2586.8378, 2621.7448, 2656.6722, + 2693.5722, 2729.1462, 2765.4124, 2802.8728, 2838.898, 2876.408, 2913.4926, 2951.4938, + 2989.6776, 3026.282, 3065.7704, 3104.1012, 3143.7388, 3181.6876, 3221.1872, 3261.5048, + 3300.0214, 3339.806, 3381.409, 3421.4144, 3461.4294, 3502.2286, 3544.651, 3586.6156, + 3627.337, 3670.083, 3711.1538, 3753.5094, 3797.01, 3838.6686, 3882.1678, 3922.8116, + 
3967.9978, 4009.9204, 4054.3286, 4097.5706, 4140.6014, 4185.544, 4229.5976, 4274.583, + 4316.9438, 4361.672, 4406.2786, 4451.8628, 4496.1834, 4543.505, 4589.1816, 4632.5188, + 4678.2294, 4724.8908, 4769.0194, 4817.052, 4861.4588, 4910.1596, 4956.4344, 5002.5238, + 5048.13, 5093.6374, 5142.8162, 5187.7894, 5237.3984, 5285.6078, 5331.0858, 5379.1036, + 5428.6258, 5474.6018, 5522.7618, 5571.5822, 5618.59, 5667.9992, 5714.88, 5763.454, + 5808.6982, 5860.3644, 5910.2914, 5953.571, 6005.9232, 6055.1914, 6104.5882, 6154.5702, + 6199.7036, 6251.1764, 6298.7596, 6350.0302, 6398.061, 6448.4694, 6495.933, 6548.0474, + 6597.7166, 6646.9416, 6695.9208, 6742.6328, 6793.5276, 6842.1934, 6894.2372, 6945.3864, + 6996.9228, 7044.2372, 7094.1374, 7142.2272, 7192.2942, 7238.8338, 7288.9006, 7344.0908, + 7394.8544, 7443.5176, 7490.4148, 7542.9314, 7595.6738, 7641.9878, 7694.3688, 7743.0448, + 7797.522, 7845.53, 7899.594, 7950.3132, 7996.455, 8050.9442, 8092.9114, 8153.1374, + 8197.4472, 8252.8278, 8301.8728, 8348.6776, 8401.4698, 8453.551, 8504.6598, 8553.8944, + 8604.1276, 8657.6514, 8710.3062, 8758.908, 8807.8706, 8862.1702, 8910.4668, 8960.77, + 9007.2766, 9063.164, 9121.0534, 9164.1354, 9218.1594, 9267.767, 9319.0594, 9372.155, + 9419.7126, 9474.3722, 9520.1338, 9572.368, 9622.7702, 9675.8448, 9726.5396, 9778.7378, + 9827.6554, 9878.1922, 9928.7782, 9978.3984, 10026.578, 10076.5626, 10137.1618, 10177.5244, + 10229.9176, + ], + &[ + 2954.0, 3003.4782, 3053.3568, 3104.3666, 3155.324, 3206.9598, 3259.648, 3312.539, + 3366.1474, 3420.2576, 3474.8376, 3530.6076, 3586.451, 3643.38, 3700.4104, 3757.5638, + 3815.9676, 3875.193, 3934.838, 3994.8548, 4055.018, 4117.1742, 4178.4482, 4241.1294, + 4304.4776, 4367.4044, 4431.8724, 4496.3732, 4561.4304, 4627.5326, 4693.949, 4761.5532, + 4828.7256, 4897.6182, 4965.5186, 5034.4528, 5104.865, 5174.7164, 5244.6828, 5316.6708, + 5387.8312, 5459.9036, 5532.476, 5604.8652, 5679.6718, 5753.757, 5830.2072, 5905.2828, + 5980.0434, 6056.6264, 6134.3192, 
6211.5746, 6290.0816, 6367.1176, 6447.9796, 6526.5576, + 6606.1858, 6686.9144, 6766.1142, 6847.0818, 6927.9664, 7010.9096, 7091.0816, 7175.3962, + 7260.3454, 7344.018, 7426.4214, 7511.3106, 7596.0686, 7679.8094, 7765.818, 7852.4248, + 7936.834, 8022.363, 8109.5066, 8200.4554, 8288.5832, 8373.366, 8463.4808, 8549.7682, + 8642.0522, 8728.3288, 8820.9528, 8907.727, 9001.0794, 9091.2522, 9179.988, 9269.852, + 9362.6394, 9453.642, 9546.9024, 9640.6616, 9732.6622, 9824.3254, 9917.7484, 10007.9392, + 10106.7508, 10196.2152, 10289.8114, 10383.5494, 10482.3064, 10576.8734, 10668.7872, + 10764.7156, 10862.0196, 10952.793, 11049.9748, 11146.0702, 11241.4492, 11339.2772, + 11434.2336, 11530.741, 11627.6136, 11726.311, 11821.5964, 11918.837, 12015.3724, + 12113.0162, 12213.0424, 12306.9804, 12408.4518, 12504.8968, 12604.586, 12700.9332, + 12798.705, 12898.5142, 12997.0488, 13094.788, 13198.475, 13292.7764, 13392.9698, + 13486.8574, 13590.1616, 13686.5838, 13783.6264, 13887.2638, 13992.0978, 14081.0844, + 14189.9956, 14280.0912, 14382.4956, 14486.4384, 14588.1082, 14686.2392, 14782.276, + 14888.0284, 14985.1864, 15088.8596, 15187.0998, 15285.027, 15383.6694, 15495.8266, + 15591.3736, 15694.2008, 15790.3246, 15898.4116, 15997.4522, 16095.5014, 16198.8514, + 16291.7492, 16402.6424, 16499.1266, 16606.2436, 16697.7186, 16796.3946, 16902.3376, + 17005.7672, 17100.814, 17206.8282, 17305.8262, 17416.0744, 17508.4092, 17617.0178, + 17715.4554, 17816.758, 17920.1748, 18012.9236, 18119.7984, 18223.2248, 18324.2482, + 18426.6276, 18525.0932, 18629.8976, 18733.2588, 18831.0466, 18940.1366, 19032.2696, + 19131.729, 19243.4864, 19349.6932, 19442.866, 19547.9448, 19653.2798, 19754.4034, + 19854.0692, 19965.1224, 20065.1774, 20158.2212, 20253.353, 20366.3264, 20463.22, + ], + &[ + 5908.5052, 6007.2672, 6107.347, 6208.5794, 6311.2622, 6414.5514, 6519.3376, 6625.6952, + 6732.5988, 6841.3552, 6950.5972, 7061.3082, 7173.5646, 7287.109, 7401.8216, 7516.4344, + 7633.3802, 7751.2962, 7870.3784, 
7990.292, 8110.79, 8233.4574, 8356.6036, 8482.2712, + 8607.7708, 8735.099, 8863.1858, 8993.4746, 9123.8496, 9255.6794, 9388.5448, 9522.7516, + 9657.3106, 9792.6094, 9930.5642, 10068.794, 10206.7256, 10347.81, 10490.3196, 10632.0778, + 10775.9916, 10920.4662, 11066.124, 11213.073, 11358.0362, 11508.1006, 11659.1716, + 11808.7514, 11959.4884, 12112.1314, 12265.037, 12420.3756, 12578.933, 12734.311, + 12890.0006, 13047.2144, 13207.3096, 13368.5144, 13528.024, 13689.847, 13852.7528, + 14018.3168, 14180.5372, 14346.9668, 14513.5074, 14677.867, 14846.2186, 15017.4186, + 15184.9716, 15356.339, 15529.2972, 15697.3578, 15871.8686, 16042.187, 16216.4094, + 16389.4188, 16565.9126, 16742.3272, 16919.0042, 17094.7592, 17273.965, 17451.8342, + 17634.4254, 17810.5984, 17988.9242, 18171.051, 18354.7938, 18539.466, 18721.0408, + 18904.9972, 19081.867, 19271.9118, 19451.8694, 19637.9816, 19821.2922, 20013.1292, + 20199.3858, 20387.8726, 20572.9514, 20770.7764, 20955.1714, 21144.751, 21329.9952, + 21520.709, 21712.7016, 21906.3868, 22096.2626, 22286.0524, 22475.051, 22665.5098, + 22862.8492, 23055.5294, 23249.6138, 23437.848, 23636.273, 23826.093, 24020.3296, + 24213.3896, 24411.7392, 24602.9614, 24805.7952, 24998.1552, 25193.9588, 25389.0166, + 25585.8392, 25780.6976, 25981.2728, 26175.977, 26376.5252, 26570.1964, 26773.387, + 26962.9812, 27163.0586, 27368.164, 27565.0534, 27758.7428, 27961.1276, 28163.2324, + 28362.3816, 28565.7668, 28758.644, 28956.9768, 29163.4722, 29354.7026, 29561.1186, + 29767.9948, 29959.9986, 30164.0492, 30366.9818, 30562.5338, 30762.9928, 30976.1592, + 31166.274, 31376.722, 31570.3734, 31770.809, 31974.8934, 32179.5286, 32387.5442, + 32582.3504, 32794.076, 32989.9528, 33191.842, 33392.4684, 33595.659, 33801.8672, + 34000.3414, 34200.0922, 34402.6792, 34610.0638, 34804.0084, 35011.13, 35218.669, + 35418.6634, 35619.0792, 35830.6534, 36028.4966, 36229.7902, 36438.6422, 36630.7764, + 36833.3102, 37048.6728, 37247.3916, 37453.5904, 37669.3614, 37854.5526, 
38059.305, + 38268.0936, 38470.2516, 38674.7064, 38876.167, 39068.3794, 39281.9144, 39492.8566, + 39684.8628, 39898.4108, 40093.1836, 40297.6858, 40489.7086, 40717.2424, + ], + &[ + 11817.475, 12015.0046, 12215.3792, 12417.7504, 12623.1814, 12830.0086, 13040.0072, + 13252.503, 13466.178, 13683.2738, 13902.0344, 14123.9798, 14347.394, 14573.7784, + 14802.6894, 15033.6824, 15266.9134, 15502.8624, 15741.4944, 15980.7956, 16223.8916, + 16468.6316, 16715.733, 16965.5726, 17217.204, 17470.666, 17727.8516, 17986.7886, + 18247.6902, 18510.9632, 18775.304, 19044.7486, 19314.4408, 19587.202, 19862.2576, + 20135.924, 20417.0324, 20697.9788, 20979.6112, 21265.0274, 21550.723, 21841.6906, + 22132.162, 22428.1406, 22722.127, 23020.5606, 23319.7394, 23620.4014, 23925.2728, + 24226.9224, 24535.581, 24845.505, 25155.9618, 25470.3828, 25785.9702, 26103.7764, + 26420.4132, 26742.0186, 27062.8852, 27388.415, 27714.6024, 28042.296, 28365.4494, + 28701.1526, 29031.8008, 29364.2156, 29704.497, 30037.1458, 30380.111, 30723.8168, + 31059.5114, 31404.9498, 31751.6752, 32095.2686, 32444.7792, 32794.767, 33145.204, + 33498.4226, 33847.6502, 34209.006, 34560.849, 34919.4838, 35274.9778, 35635.1322, + 35996.3266, 36359.1394, 36722.8266, 37082.8516, 37447.7354, 37815.9606, 38191.0692, + 38559.4106, 38924.8112, 39294.6726, 39663.973, 40042.261, 40416.2036, 40779.2036, + 41161.6436, 41540.9014, 41921.1998, 42294.7698, 42678.5264, 43061.3464, 43432.375, + 43818.432, 44198.6598, 44583.0138, 44970.4794, 45353.924, 45729.858, 46118.2224, + 46511.5724, 46900.7386, 47280.6964, 47668.1472, 48055.6796, 48446.9436, 48838.7146, + 49217.7296, 49613.7796, 50010.7508, 50410.0208, 50793.7886, 51190.2456, 51583.1882, + 51971.0796, 52376.5338, 52763.319, 53165.5534, 53556.5594, 53948.2702, 54346.352, + 54748.7914, 55138.577, 55543.4824, 55941.1748, 56333.7746, 56745.1552, 57142.7944, + 57545.2236, 57935.9956, 58348.5268, 58737.5474, 59158.5962, 59542.6896, 59958.8004, + 60349.3788, 60755.0212, 61147.6144, 
61548.194, 61946.0696, 62348.6042, 62763.603, + 63162.781, 63560.635, 63974.3482, 64366.4908, 64771.5876, 65176.7346, 65597.3916, + 65995.915, 66394.0384, 66822.9396, 67203.6336, 67612.2032, 68019.0078, 68420.0388, + 68821.22, 69235.8388, 69640.0724, 70055.155, 70466.357, 70863.4266, 71276.2482, 71677.0306, + 72080.2006, 72493.0214, 72893.5952, 73314.5856, 73714.9852, 74125.3022, 74521.2122, + 74933.6814, 75341.5904, 75743.0244, 76166.0278, 76572.1322, 76973.1028, 77381.6284, + 77800.6092, 78189.328, 78607.0962, 79012.2508, 79407.8358, 79825.725, 80238.701, 80646.891, + 81035.6436, 81460.0448, 81876.3884, + ], + &[ + 23635.0036, + 24030.8034, + 24431.4744, + 24837.1524, + 25246.7928, + 25661.326, + 26081.3532, + 26505.2806, + 26933.9892, + 27367.7098, + 27805.318, + 28248.799, + 28696.4382, + 29148.8244, + 29605.5138, + 30066.8668, + 30534.2344, + 31006.32, + 31480.778, + 31962.2418, + 32447.3324, + 32938.0232, + 33432.731, + 33930.728, + 34433.9896, + 34944.1402, + 35457.5588, + 35974.5958, + 36497.3296, + 37021.9096, + 37554.326, + 38088.0826, + 38628.8816, + 39171.3192, + 39723.2326, + 40274.5554, + 40832.3142, + 41390.613, + 41959.5908, + 42532.5466, + 43102.0344, + 43683.5072, + 44266.694, + 44851.2822, + 45440.7862, + 46038.0586, + 46640.3164, + 47241.064, + 47846.155, + 48454.7396, + 49076.9168, + 49692.542, + 50317.4778, + 50939.65, + 51572.5596, + 52210.2906, + 52843.7396, + 53481.3996, + 54127.236, + 54770.406, + 55422.6598, + 56078.7958, + 56736.7174, + 57397.6784, + 58064.5784, + 58730.308, + 59404.9784, + 60077.0864, + 60751.9158, + 61444.1386, + 62115.817, + 62808.7742, + 63501.4774, + 64187.5454, + 64883.6622, + 65582.7468, + 66274.5318, + 66976.9276, + 67688.7764, + 68402.138, + 69109.6274, + 69822.9706, + 70543.6108, + 71265.5202, + 71983.3848, + 72708.4656, + 73433.384, + 74158.4664, + 74896.4868, + 75620.9564, + 76362.1434, + 77098.3204, + 77835.7662, + 78582.6114, + 79323.9902, + 80067.8658, + 80814.9246, + 81567.0136, + 82310.8536, + 
83061.9952, + 83821.4096, + 84580.8608, + 85335.547, + 86092.5802, + 86851.6506, + 87612.311, + 88381.2016, + 89146.3296, + 89907.8974, + 90676.846, + 91451.4152, + 92224.5518, + 92995.8686, + 93763.5066, + 94551.2796, + 95315.1944, + 96096.1806, + 96881.0918, + 97665.679, + 98442.68, + 99229.3002, + 100011.0994, + 100790.6386, + 101580.1564, + 102377.7484, + 103152.1392, + 103944.2712, + 104730.216, + 105528.6336, + 106324.9398, + 107117.6706, + 107890.3988, + 108695.2266, + 109485.238, + 110294.7876, + 111075.0958, + 111878.0496, + 112695.2864, + 113464.5486, + 114270.0474, + 115068.608, + 115884.3626, + 116673.2588, + 117483.3716, + 118275.097, + 119085.4092, + 119879.2808, + 120687.5868, + 121499.9944, + 122284.916, + 123095.9254, + 123912.5038, + 124709.0454, + 125503.7182, + 126323.259, + 127138.9412, + 127943.8294, + 128755.646, + 129556.5354, + 130375.3298, + 131161.4734, + 131971.1962, + 132787.5458, + 133588.1056, + 134431.351, + 135220.2906, + 136023.398, + 136846.6558, + 137667.0004, + 138463.663, + 139283.7154, + 140074.6146, + 140901.3072, + 141721.8548, + 142543.2322, + 143356.1096, + 144173.7412, + 144973.0948, + 145794.3162, + 146609.5714, + 147420.003, + 148237.9784, + 149050.5696, + 149854.761, + 150663.1966, + 151494.0754, + 152313.1416, + 153112.6902, + 153935.7206, + 154746.9262, + 155559.547, + 156401.9746, + 157228.7036, + 158008.7254, + 158820.75, + 159646.9184, + 160470.4458, + 161279.5348, + 162093.3114, + 162918.542, + 163729.2842, + ], + &[ + 47271.0, + 48062.3584, + 48862.7074, + 49673.152, + 50492.8416, + 51322.9514, + 52161.03, + 53009.407, + 53867.6348, + 54734.206, + 55610.5144, + 56496.2096, + 57390.795, + 58297.268, + 59210.6448, + 60134.665, + 61068.0248, + 62010.4472, + 62962.5204, + 63923.5742, + 64895.0194, + 65876.4182, + 66862.6136, + 67862.6968, + 68868.8908, + 69882.8544, + 70911.271, + 71944.0924, + 72990.0326, + 74040.692, + 75100.6336, + 76174.7826, + 77252.5998, + 78340.2974, + 79438.2572, + 80545.4976, + 81657.2796, 
+ 82784.6336, + 83915.515, + 85059.7362, + 86205.9368, + 87364.4424, + 88530.3358, + 89707.3744, + 90885.9638, + 92080.197, + 93275.5738, + 94479.391, + 95695.918, + 96919.2236, + 98148.4602, + 99382.3474, + 100625.6974, + 101878.0284, + 103141.6278, + 104409.4588, + 105686.2882, + 106967.5402, + 108261.6032, + 109548.1578, + 110852.0728, + 112162.231, + 113479.0072, + 114806.2626, + 116137.9072, + 117469.5048, + 118813.5186, + 120165.4876, + 121516.2556, + 122875.766, + 124250.5444, + 125621.2222, + 127003.2352, + 128387.848, + 129775.2644, + 131181.7776, + 132577.3086, + 133979.9458, + 135394.1132, + 136800.9078, + 138233.217, + 139668.5308, + 141085.212, + 142535.2122, + 143969.0684, + 145420.2872, + 146878.1542, + 148332.7572, + 149800.3202, + 151269.66, + 152743.6104, + 154213.0948, + 155690.288, + 157169.4246, + 158672.1756, + 160160.059, + 161650.6854, + 163145.7772, + 164645.6726, + 166159.1952, + 167682.1578, + 169177.3328, + 170700.0118, + 172228.8964, + 173732.6664, + 175265.5556, + 176787.799, + 178317.111, + 179856.6914, + 181400.865, + 182943.4612, + 184486.742, + 186033.4698, + 187583.7886, + 189148.1868, + 190688.4526, + 192250.1926, + 193810.9042, + 195354.2972, + 196938.7682, + 198493.5898, + 200079.2824, + 201618.912, + 203205.5492, + 204765.5798, + 206356.1124, + 207929.3064, + 209498.7196, + 211086.229, + 212675.1324, + 214256.7892, + 215826.2392, + 217412.8474, + 218995.6724, + 220618.6038, + 222207.1166, + 223781.0364, + 225387.4332, + 227005.7928, + 228590.4336, + 230217.8738, + 231805.1054, + 233408.9, + 234995.3432, + 236601.4956, + 238190.7904, + 239817.2548, + 241411.2832, + 243002.4066, + 244640.1884, + 246255.3128, + 247849.3508, + 249479.9734, + 251106.8822, + 252705.027, + 254332.9242, + 255935.129, + 257526.9014, + 259154.772, + 260777.625, + 262390.253, + 264004.4906, + 265643.59, + 267255.4076, + 268873.426, + 270470.7252, + 272106.4804, + 273722.4456, + 275337.794, + 276945.7038, + 278592.9154, + 280204.3726, + 281841.1606, + 
283489.171, + 285130.1716, + 286735.3362, + 288364.7164, + 289961.1814, + 291595.5524, + 293285.683, + 294899.6668, + 296499.3434, + 298128.0462, + 299761.8946, + 301394.2424, + 302997.6748, + 304615.1478, + 306269.7724, + 307886.114, + 309543.1028, + 311153.2862, + 312782.8546, + 314421.2008, + 316033.2438, + 317692.9636, + 319305.2648, + 320948.7406, + 322566.3364, + 324228.4224, + 325847.1542, + ], + &[ + 94542.0, + 96125.811, + 97728.019, + 99348.558, + 100987.9705, + 102646.7565, + 104324.5125, + 106021.7435, + 107736.7865, + 109469.272, + 111223.9465, + 112995.219, + 114787.432, + 116593.152, + 118422.71, + 120267.2345, + 122134.6765, + 124020.937, + 125927.2705, + 127851.255, + 129788.9485, + 131751.016, + 133726.8225, + 135722.592, + 137736.789, + 139770.568, + 141821.518, + 143891.343, + 145982.1415, + 148095.387, + 150207.526, + 152355.649, + 154515.6415, + 156696.05, + 158887.7575, + 161098.159, + 163329.852, + 165569.053, + 167837.4005, + 170121.6165, + 172420.4595, + 174732.6265, + 177062.77, + 179412.502, + 181774.035, + 184151.939, + 186551.6895, + 188965.691, + 191402.8095, + 193857.949, + 196305.0775, + 198774.6715, + 201271.2585, + 203764.78, + 206299.3695, + 208818.1365, + 211373.115, + 213946.7465, + 216532.076, + 219105.541, + 221714.5375, + 224337.5135, + 226977.5125, + 229613.0655, + 232270.2685, + 234952.2065, + 237645.3555, + 240331.1925, + 243034.517, + 245756.0725, + 248517.6865, + 251232.737, + 254011.3955, + 256785.995, + 259556.44, + 262368.335, + 265156.911, + 267965.266, + 270785.583, + 273616.0495, + 276487.4835, + 279346.639, + 282202.509, + 285074.3885, + 287942.2855, + 290856.018, + 293774.0345, + 296678.5145, + 299603.6355, + 302552.6575, + 305492.9785, + 308466.8605, + 311392.581, + 314347.538, + 317319.4295, + 320285.9785, + 323301.7325, + 326298.3235, + 329301.3105, + 332301.987, + 335309.791, + 338370.762, + 341382.923, + 344431.1265, + 347464.1545, + 350507.28, + 353619.2345, + 356631.2005, + 359685.203, + 362776.7845, + 
365886.488, + 368958.2255, + 372060.6825, + 375165.4335, + 378237.935, + 381328.311, + 384430.5225, + 387576.425, + 390683.242, + 393839.648, + 396977.8425, + 400101.9805, + 403271.296, + 406409.8425, + 409529.5485, + 412678.7, + 415847.423, + 419020.8035, + 422157.081, + 425337.749, + 428479.6165, + 431700.902, + 434893.1915, + 438049.582, + 441210.5415, + 444379.2545, + 447577.356, + 450741.931, + 453959.548, + 457137.0935, + 460329.846, + 463537.4815, + 466732.3345, + 469960.5615, + 473164.681, + 476347.6345, + 479496.173, + 482813.1645, + 486025.6995, + 489249.4885, + 492460.1945, + 495675.8805, + 498908.0075, + 502131.802, + 505374.3855, + 508550.9915, + 511806.7305, + 515026.776, + 518217.0005, + 521523.9855, + 524705.9855, + 527950.997, + 531210.0265, + 534472.497, + 537750.7315, + 540926.922, + 544207.094, + 547429.4345, + 550666.3745, + 553975.3475, + 557150.7185, + 560399.6165, + 563662.697, + 566916.7395, + 570146.1215, + 573447.425, + 576689.6245, + 579874.5745, + 583202.337, + 586503.0255, + 589715.635, + 592910.161, + 596214.3885, + 599488.035, + 602740.92, + 605983.0685, + 609248.67, + 612491.3605, + 615787.912, + 619107.5245, + 622307.9555, + 625577.333, + 628840.4385, + 632085.2155, + 635317.6135, + 638691.7195, + 641887.467, + 645139.9405, + 648441.546, + 651666.252, + 654941.845, + ], + &[ + 189084.0, + 192250.913, + 195456.774, + 198696.946, + 201977.762, + 205294.444, + 208651.754, + 212042.099, + 215472.269, + 218941.91, + 222443.912, + 225996.845, + 229568.199, + 233193.568, + 236844.457, + 240543.233, + 244279.475, + 248044.27, + 251854.588, + 255693.2, + 259583.619, + 263494.621, + 267445.385, + 271454.061, + 275468.769, + 279549.456, + 283646.446, + 287788.198, + 291966.099, + 296181.164, + 300431.469, + 304718.618, + 309024.004, + 313393.508, + 317760.803, + 322209.731, + 326675.061, + 331160.627, + 335654.47, + 340241.442, + 344841.833, + 349467.132, + 354130.629, + 358819.432, + 363574.626, + 368296.587, + 373118.482, + 377914.93, + 
382782.301, + 387680.669, + 392601.981, + 397544.323, + 402529.115, + 407546.018, + 412593.658, + 417638.657, + 422762.865, + 427886.169, + 433017.167, + 438213.273, + 443441.254, + 448692.421, + 453937.533, + 459239.049, + 464529.569, + 469910.083, + 475274.03, + 480684.473, + 486070.26, + 491515.237, + 496995.651, + 502476.617, + 507973.609, + 513497.19, + 519083.233, + 524726.509, + 530305.505, + 535945.728, + 541584.404, + 547274.055, + 552967.236, + 558667.862, + 564360.216, + 570128.148, + 575965.08, + 581701.952, + 587532.523, + 593361.144, + 599246.128, + 605033.418, + 610958.779, + 616837.117, + 622772.818, + 628672.04, + 634675.369, + 640574.831, + 646585.739, + 652574.547, + 658611.217, + 664642.684, + 670713.914, + 676737.681, + 682797.313, + 688837.897, + 694917.874, + 701009.882, + 707173.648, + 713257.254, + 719415.392, + 725636.761, + 731710.697, + 737906.209, + 744103.074, + 750313.39, + 756504.185, + 762712.579, + 768876.985, + 775167.859, + 781359.0, + 787615.959, + 793863.597, + 800245.477, + 806464.582, + 812785.294, + 819005.925, + 825403.057, + 831676.197, + 837936.284, + 844266.968, + 850642.711, + 856959.756, + 863322.774, + 869699.931, + 876102.478, + 882355.787, + 888694.463, + 895159.952, + 901536.143, + 907872.631, + 914293.672, + 920615.14, + 927130.974, + 933409.404, + 939922.178, + 946331.47, + 952745.93, + 959209.264, + 965590.224, + 972077.284, + 978501.961, + 984953.19, + 991413.271, + 997817.479, + 1004222.658, + 1010725.676, + 1017177.138, + 1023612.529, + 1030098.236, + 1036493.719, + 1043112.207, + 1049537.036, + 1056008.096, + 1062476.184, + 1068942.337, + 1075524.95, + 1081932.864, + 1088426.025, + 1094776.005, + 1101327.448, + 1107901.673, + 1114423.639, + 1120884.602, + 1127324.923, + 1133794.24, + 1140328.886, + 1146849.376, + 1153346.682, + 1159836.502, + 1166478.703, + 1172953.304, + 1179391.502, + 1185950.982, + 1192544.052, + 1198913.41, + 1205430.994, + 1212015.525, + 1218674.042, + 1225121.683, + 1231551.101, + 
1238126.379, + 1244673.795, + 1251260.649, + 1257697.86, + 1264320.983, + 1270736.319, + 1277274.694, + 1283804.95, + 1290211.514, + 1296858.568, + 1303455.691, + ], +]; + +static BIAS_DATA: &[&[f64]] = &[ + &[ + 10.0, + 9.717, + 9.207, + 8.7896, + 8.2882, + 7.8204, + 7.3772, + 6.9342, + 6.5202, + 6.161, + 5.7722, + 5.4636, + 5.0396, + 4.6766, + 4.3566, + 4.0454, + 3.7936, + 3.4856, + 3.2666, + 2.9946, + 2.766, + 2.4692, + 2.3638, + 2.0764, + 1.7864, + 1.7602, + 1.4814, + 1.433, + 1.2926, + 1.0664, + 0.999600000000001, + 0.7956, + 0.5366, + 0.589399999999998, + 0.573799999999999, + 0.269799999999996, + 0.368200000000002, + 0.0544000000000011, + 0.234200000000001, + 0.0108000000000033, + -0.203400000000002, + -0.0701999999999998, + -0.129600000000003, + -0.364199999999997, + -0.480600000000003, + -0.226999999999997, + -0.322800000000001, + -0.382599999999996, + -0.511200000000002, + -0.669600000000003, + -0.749400000000001, + -0.500399999999999, + -0.617600000000003, + -0.6922, + -0.601599999999998, + -0.416200000000003, + -0.338200000000001, + -0.782600000000002, + -0.648600000000002, + -0.919800000000002, + -0.851799999999997, + -0.962400000000002, + -0.6402, + -1.1922, + -1.0256, + -1.086, + -1.21899999999999, + -0.819400000000002, + -0.940600000000003, + -1.1554, + -1.2072, + -1.1752, + -1.16759999999999, + -1.14019999999999, + -1.3754, + -1.29859999999999, + -1.607, + -1.3292, + -1.7606, + ], + &[ + 22.0, + 21.1194, + 20.8208, + 20.2318, + 19.77, + 19.2436, + 18.7774, + 18.2848, + 17.8224, + 17.3742, + 16.9336, + 16.503, + 16.0494, + 15.6292, + 15.2124, + 14.798, + 14.367, + 13.9728, + 13.5944, + 13.217, + 12.8438, + 12.3696, + 12.0956, + 11.7044, + 11.324, + 11.0668, + 10.6698, + 10.3644, + 10.049, + 9.6918, + 9.4146, + 9.082, + 8.687, + 8.5398, + 8.2462, + 7.857, + 7.6606, + 7.4168, + 7.1248, + 6.9222, + 6.6804, + 6.447, + 6.3454, + 5.9594, + 5.7636, + 5.5776, + 5.331, + 5.19, + 4.9676, + 4.7564, + 4.5314, + 4.4442, + 4.3708, + 3.9774, + 3.9624, + 3.8796, + 
3.755, + 3.472, + 3.2076, + 3.1024, + 2.8908, + 2.7338, + 2.7728, + 2.629, + 2.413, + 2.3266, + 2.1524, + 2.2642, + 2.1806, + 2.0566, + 1.9192, + 1.7598, + 1.3516, + 1.5802, + 1.43859999999999, + 1.49160000000001, + 1.1524, + 1.1892, + 0.841399999999993, + 0.879800000000003, + 0.837599999999995, + 0.469800000000006, + 0.765600000000006, + 0.331000000000003, + 0.591399999999993, + 0.601200000000006, + 0.701599999999999, + 0.558199999999999, + 0.339399999999998, + 0.354399999999998, + 0.491200000000006, + 0.308000000000007, + 0.355199999999996, + -0.0254000000000048, + 0.205200000000005, + -0.272999999999996, + 0.132199999999997, + 0.394400000000005, + -0.241200000000006, + 0.242000000000004, + 0.191400000000002, + 0.253799999999998, + -0.122399999999999, + -0.370800000000003, + 0.193200000000004, + -0.0848000000000013, + 0.0867999999999967, + -0.327200000000005, + -0.285600000000002, + 0.311400000000006, + -0.128399999999999, + -0.754999999999995, + -0.209199999999996, + -0.293599999999998, + -0.364000000000004, + -0.253600000000006, + -0.821200000000005, + -0.253600000000006, + -0.510400000000004, + -0.383399999999995, + -0.491799999999998, + -0.220200000000006, + -0.0972000000000008, + -0.557400000000001, + -0.114599999999996, + -0.295000000000002, + -0.534800000000004, + 0.346399999999988, + -0.65379999999999, + 0.0398000000000138, + 0.0341999999999985, + -0.995800000000003, + -0.523400000000009, + -0.489000000000004, + -0.274799999999999, + -0.574999999999989, + -0.482799999999997, + 0.0571999999999946, + -0.330600000000004, + -0.628800000000012, + -0.140199999999993, + -0.540600000000012, + -0.445999999999998, + -0.599400000000003, + -0.262599999999992, + 0.163399999999996, + -0.100599999999986, + -0.39500000000001, + -1.06960000000001, + -0.836399999999998, + -0.753199999999993, + -0.412399999999991, + -0.790400000000005, + -0.29679999999999, + -0.28540000000001, + -0.193000000000012, + -0.0772000000000048, + -0.962799999999987, + -0.414800000000014, + ], + &[ 
+ 45.0, + 44.1902, + 43.271, + 42.8358, + 41.8142, + 41.2854, + 40.317, + 39.354, + 38.8924, + 37.9436, + 37.4596, + 36.5262, + 35.6248, + 35.1574, + 34.2822, + 33.837, + 32.9636, + 32.074, + 31.7042, + 30.7976, + 30.4772, + 29.6564, + 28.7942, + 28.5004, + 27.686, + 27.291, + 26.5672, + 25.8556, + 25.4982, + 24.8204, + 24.4252, + 23.7744, + 23.0786, + 22.8344, + 22.0294, + 21.8098, + 21.0794, + 20.5732, + 20.1878, + 19.5648, + 19.2902, + 18.6784, + 18.3352, + 17.8946, + 17.3712, + 17.0852, + 16.499, + 16.2686, + 15.6844, + 15.2234, + 14.9732, + 14.3356, + 14.2286, + 13.7262, + 13.3284, + 13.1048, + 12.5962, + 12.3562, + 12.1272, + 11.4184, + 11.4974, + 11.0822, + 10.856, + 10.48, + 10.2834, + 10.0208, + 9.637, + 9.51739999999999, + 9.05759999999999, + 8.74760000000001, + 8.42700000000001, + 8.1326, + 8.2372, + 8.2788, + 7.6776, + 7.79259999999999, + 7.1952, + 6.9564, + 6.6454, + 6.87, + 6.5428, + 6.19999999999999, + 6.02940000000001, + 5.62780000000001, + 5.6782, + 5.792, + 5.35159999999999, + 5.28319999999999, + 5.0394, + 5.07480000000001, + 4.49119999999999, + 4.84899999999999, + 4.696, + 4.54040000000001, + 4.07300000000001, + 4.37139999999999, + 3.7216, + 3.7328, + 3.42080000000001, + 3.41839999999999, + 3.94239999999999, + 3.27719999999999, + 3.411, + 3.13079999999999, + 2.76900000000001, + 2.92580000000001, + 2.68279999999999, + 2.75020000000001, + 2.70599999999999, + 2.3886, + 3.01859999999999, + 2.45179999999999, + 2.92699999999999, + 2.41720000000001, + 2.41139999999999, + 2.03299999999999, + 2.51240000000001, + 2.5564, + 2.60079999999999, + 2.41720000000001, + 1.80439999999999, + 1.99700000000001, + 2.45480000000001, + 1.8948, + 2.2346, + 2.30860000000001, + 2.15479999999999, + 1.88419999999999, + 1.6508, + 0.677199999999999, + 1.72540000000001, + 1.4752, + 1.72280000000001, + 1.66139999999999, + 1.16759999999999, + 1.79300000000001, + 1.00059999999999, + 0.905200000000008, + 0.659999999999997, + 1.55879999999999, + 1.1636, + 0.688199999999995, + 
0.712600000000009, + 0.450199999999995, + 1.1978, + 0.975599999999986, + 0.165400000000005, + 1.727, + 1.19739999999999, + -0.252600000000001, + 1.13460000000001, + 1.3048, + 1.19479999999999, + 0.313400000000001, + 0.878999999999991, + 1.12039999999999, + 0.853000000000009, + 1.67920000000001, + 0.856999999999999, + 0.448599999999999, + 1.2362, + 0.953399999999988, + 1.02859999999998, + 0.563199999999995, + 0.663000000000011, + 0.723000000000013, + 0.756599999999992, + 0.256599999999992, + -0.837600000000009, + 0.620000000000005, + 0.821599999999989, + 0.216600000000028, + 0.205600000000004, + 0.220199999999977, + 0.372599999999977, + 0.334400000000016, + 0.928400000000011, + 0.972800000000007, + 0.192400000000021, + 0.487199999999973, + -0.413000000000011, + 0.807000000000016, + 0.120600000000024, + 0.769000000000005, + 0.870799999999974, + 0.66500000000002, + 0.118200000000002, + 0.401200000000017, + 0.635199999999998, + 0.135400000000004, + 0.175599999999974, + 1.16059999999999, + 0.34620000000001, + 0.521400000000028, + -0.586599999999976, + -1.16480000000001, + 0.968399999999974, + 0.836999999999989, + 0.779600000000016, + 0.985799999999983, + ], + &[ + 91.0, + 89.4934, + 87.9758, + 86.4574, + 84.9718, + 83.4954, + 81.5302, + 80.0756, + 78.6374, + 77.1782, + 75.7888, + 73.9522, + 72.592, + 71.2532, + 69.9086, + 68.5938, + 66.9474, + 65.6796, + 64.4394, + 63.2176, + 61.9768, + 60.4214, + 59.2528, + 58.0102, + 56.8658, + 55.7278, + 54.3044, + 53.1316, + 52.093, + 51.0032, + 49.9092, + 48.6306, + 47.5294, + 46.5756, + 45.6508, + 44.662, + 43.552, + 42.3724, + 41.617, + 40.5754, + 39.7872, + 38.8444, + 37.7988, + 36.8606, + 36.2118, + 35.3566, + 34.4476, + 33.5882, + 32.6816, + 32.0824, + 31.0258, + 30.6048, + 29.4436, + 28.7274, + 27.957, + 27.147, + 26.4364, + 25.7592, + 25.3386, + 24.781, + 23.8028, + 23.656, + 22.6544, + 21.996, + 21.4718, + 21.1544, + 20.6098, + 19.5956, + 19.0616, + 18.5758, + 18.4878, + 17.5244, + 17.2146, + 16.724, + 15.8722, + 15.5198, + 
15.0414, + 14.941, + 14.9048, + 13.87, + 13.4304, + 13.028, + 12.4708, + 12.37, + 12.0624, + 11.4668, + 11.5532, + 11.4352, + 11.2564, + 10.2744, + 10.2118, + 9.74720000000002, + 10.1456, + 9.2928, + 8.75040000000001, + 8.55279999999999, + 8.97899999999998, + 8.21019999999999, + 8.18340000000001, + 7.3494, + 7.32499999999999, + 7.66140000000001, + 6.90300000000002, + 7.25439999999998, + 6.9042, + 7.21499999999997, + 6.28640000000001, + 6.08139999999997, + 6.6764, + 6.30099999999999, + 5.13900000000001, + 5.65800000000002, + 5.17320000000001, + 4.59019999999998, + 4.9538, + 5.08280000000002, + 4.92200000000003, + 4.99020000000002, + 4.7328, + 5.4538, + 4.11360000000002, + 4.22340000000003, + 4.08780000000002, + 3.70800000000003, + 4.15559999999999, + 4.18520000000001, + 3.63720000000001, + 3.68220000000002, + 3.77960000000002, + 3.6078, + 2.49160000000001, + 3.13099999999997, + 2.5376, + 3.19880000000001, + 3.21100000000001, + 2.4502, + 3.52820000000003, + 2.91199999999998, + 3.04480000000001, + 2.7432, + 2.85239999999999, + 2.79880000000003, + 2.78579999999999, + 1.88679999999999, + 2.98860000000002, + 2.50639999999999, + 1.91239999999999, + 2.66160000000002, + 2.46820000000002, + 1.58199999999999, + 1.30399999999997, + 2.27379999999999, + 2.68939999999998, + 1.32900000000001, + 3.10599999999999, + 1.69080000000002, + 2.13740000000001, + 2.53219999999999, + 1.88479999999998, + 1.33240000000001, + 1.45119999999997, + 1.17899999999997, + 2.44119999999998, + 1.60659999999996, + 2.16700000000003, + 0.77940000000001, + 2.37900000000002, + 2.06700000000001, + 1.46000000000004, + 2.91160000000002, + 1.69200000000001, + 0.954600000000028, + 2.49300000000005, + 2.2722, + 1.33500000000004, + 2.44899999999996, + 1.20140000000004, + 3.07380000000001, + 2.09739999999999, + 2.85640000000001, + 2.29960000000005, + 2.40899999999999, + 1.97040000000004, + 0.809799999999996, + 1.65279999999996, + 2.59979999999996, + 0.95799999999997, + 2.06799999999998, + 2.32780000000002, + 
4.20159999999998, + 1.96320000000003, + 1.86400000000003, + 1.42999999999995, + 3.77940000000001, + 1.27200000000005, + 1.86440000000005, + 2.20600000000002, + 3.21900000000005, + 1.5154, + 2.61019999999996, + ], + &[ + 183.2152, + 180.2454, + 177.2096, + 173.6652, + 170.6312, + 167.6822, + 164.249, + 161.3296, + 158.0038, + 155.2074, + 152.4612, + 149.27, + 146.5178, + 143.4412, + 140.8032, + 138.1634, + 135.1688, + 132.6074, + 129.6946, + 127.2664, + 124.8228, + 122.0432, + 119.6824, + 116.9464, + 114.6268, + 112.2626, + 109.8376, + 107.4034, + 104.8956, + 102.8522, + 100.7638, + 98.3552, + 96.3556, + 93.7526, + 91.9292, + 89.8954, + 87.8198, + 85.7668, + 83.298, + 81.6688, + 79.9466, + 77.9746, + 76.1672, + 74.3474, + 72.3028, + 70.8912, + 69.114, + 67.4646, + 65.9744, + 64.4092, + 62.6022, + 60.843, + 59.5684, + 58.1652, + 56.5426, + 55.4152, + 53.5388, + 52.3592, + 51.1366, + 49.486, + 48.3918, + 46.5076, + 45.509, + 44.3834, + 43.3498, + 42.0668, + 40.7346, + 40.1228, + 38.4528, + 37.7, + 36.644, + 36.0518, + 34.5774, + 33.9068, + 32.432, + 32.1666, + 30.434, + 29.6644, + 28.4894, + 27.6312, + 26.3804, + 26.292, + 25.5496000000001, + 25.0234, + 24.8206, + 22.6146, + 22.4188, + 22.117, + 20.6762, + 20.6576, + 19.7864, + 19.509, + 18.5334, + 17.9204, + 17.772, + 16.2924, + 16.8654, + 15.1836, + 15.745, + 15.1316, + 15.0386, + 14.0136, + 13.6342, + 12.6196, + 12.1866, + 12.4281999999999, + 11.3324, + 10.4794000000001, + 11.5038, + 10.129, + 9.52800000000002, + 10.3203999999999, + 9.46299999999997, + 9.79280000000006, + 9.12300000000005, + 8.74180000000001, + 9.2192, + 7.51020000000005, + 7.60659999999996, + 7.01840000000004, + 7.22239999999999, + 7.40139999999997, + 6.76179999999999, + 7.14359999999999, + 5.65060000000005, + 5.63779999999997, + 5.76599999999996, + 6.75139999999999, + 5.57759999999996, + 3.73220000000003, + 5.8048, + 5.63019999999995, + 4.93359999999996, + 3.47979999999995, + 4.33879999999999, + 3.98940000000005, + 3.81960000000004, + 
3.31359999999995, + 3.23080000000004, + 3.4588, + 3.08159999999998, + 3.4076, + 3.00639999999999, + 2.38779999999997, + 2.61900000000003, + 1.99800000000005, + 3.34820000000002, + 2.95060000000001, + 0.990999999999985, + 2.11440000000005, + 2.20299999999997, + 2.82219999999995, + 2.73239999999998, + 2.7826, + 3.76660000000004, + 2.26480000000004, + 2.31280000000004, + 2.40819999999997, + 2.75360000000001, + 3.33759999999995, + 2.71559999999999, + 1.7478000000001, + 1.42920000000004, + 2.39300000000003, + 2.22779999999989, + 2.34339999999997, + 0.87259999999992, + 3.88400000000001, + 1.80600000000004, + 1.91759999999999, + 1.16779999999994, + 1.50320000000011, + 2.52500000000009, + 0.226400000000012, + 2.31500000000005, + 0.930000000000064, + 1.25199999999995, + 2.14959999999996, + 0.0407999999999902, + 2.5447999999999, + 1.32960000000003, + 0.197400000000016, + 2.52620000000002, + 3.33279999999991, + -1.34300000000007, + 0.422199999999975, + 0.917200000000093, + 1.12920000000008, + 1.46060000000011, + 1.45779999999991, + 2.8728000000001, + 3.33359999999993, + -1.34079999999994, + 1.57680000000005, + 0.363000000000056, + 1.40740000000005, + 0.656600000000026, + 0.801400000000058, + -0.454600000000028, + 1.51919999999996, + ], + &[ + 368.0, + 361.8294, + 355.2452, + 348.6698, + 342.1464, + 336.2024, + 329.8782, + 323.6598, + 317.462, + 311.2826, + 305.7102, + 299.7416, + 293.9366, + 288.1046, + 282.285, + 277.0668, + 271.306, + 265.8448, + 260.301, + 254.9886, + 250.2422, + 244.8138, + 239.7074, + 234.7428, + 229.8402, + 225.1664, + 220.3534, + 215.594, + 210.6886, + 205.7876, + 201.65, + 197.228, + 192.8036, + 188.1666, + 184.0818, + 180.0824, + 176.2574, + 172.302, + 168.1644, + 164.0056, + 160.3802, + 156.7192, + 152.5234, + 149.2084, + 145.831, + 142.485, + 139.1112, + 135.4764, + 131.76, + 129.3368, + 126.5538, + 122.5058, + 119.2646, + 116.5902, + 113.3818, + 110.8998, + 107.9532, + 105.2062, + 102.2798, + 99.4728, + 96.9582, + 94.3292, + 92.171, + 
89.7809999999999, + 87.5716, + 84.7048, + 82.5322, + 79.875, + 78.3972, + 75.3464, + 73.7274, + 71.2834, + 70.1444, + 68.4263999999999, + 66.0166, + 64.018, + 62.0437999999999, + 60.3399999999999, + 58.6856, + 57.9836, + 55.0311999999999, + 54.6769999999999, + 52.3188, + 51.4846, + 49.4423999999999, + 47.739, + 46.1487999999999, + 44.9202, + 43.4059999999999, + 42.5342000000001, + 41.2834, + 38.8954000000001, + 38.3286000000001, + 36.2146, + 36.6684, + 35.9946, + 33.123, + 33.4338, + 31.7378000000001, + 29.076, + 28.9692, + 27.4964, + 27.0998, + 25.9864, + 26.7754, + 24.3208, + 23.4838, + 22.7388000000001, + 24.0758000000001, + 21.9097999999999, + 20.9728, + 19.9228000000001, + 19.9292, + 16.617, + 17.05, + 18.2996000000001, + 15.6128000000001, + 15.7392, + 14.5174, + 13.6322, + 12.2583999999999, + 13.3766000000001, + 11.423, + 13.1232, + 9.51639999999998, + 10.5938000000001, + 9.59719999999993, + 8.12220000000002, + 9.76739999999995, + 7.50440000000003, + 7.56999999999994, + 6.70440000000008, + 6.41419999999994, + 6.71019999999999, + 5.60940000000005, + 4.65219999999999, + 6.84099999999989, + 3.4072000000001, + 3.97859999999991, + 3.32760000000007, + 5.52160000000003, + 3.31860000000006, + 2.06940000000009, + 4.35400000000004, + 1.57500000000005, + 0.280799999999999, + 2.12879999999996, + -0.214799999999968, + -0.0378000000000611, + -0.658200000000079, + 0.654800000000023, + -0.0697999999999865, + 0.858400000000074, + -2.52700000000004, + -2.1751999999999, + -3.35539999999992, + -1.04019999999991, + -0.651000000000067, + -2.14439999999991, + -1.96659999999997, + -3.97939999999994, + -0.604400000000169, + -3.08260000000018, + -3.39159999999993, + -5.29640000000018, + -5.38920000000007, + -5.08759999999984, + -4.69900000000007, + -5.23720000000003, + -3.15779999999995, + -4.97879999999986, + -4.89899999999989, + -7.48880000000008, + -5.94799999999987, + -5.68060000000014, + -6.67180000000008, + -4.70499999999993, + -7.27779999999984, + -4.6579999999999, + 
-4.4362000000001, + -4.32139999999981, + -5.18859999999995, + -6.66879999999992, + -6.48399999999992, + -5.1260000000002, + -4.4032000000002, + -6.13500000000022, + -5.80819999999994, + -4.16719999999987, + -4.15039999999999, + -7.45600000000013, + -7.24080000000004, + -9.83179999999993, + -5.80420000000004, + -8.6561999999999, + -6.99940000000015, + -10.5473999999999, + -7.34139999999979, + -6.80999999999995, + -6.29719999999998, + -6.23199999999997, + ], + &[ + 737.1256, + 724.4234, + 711.1064, + 698.4732, + 685.4636, + 673.0644, + 660.488, + 647.9654, + 636.0832, + 623.7864, + 612.1992, + 600.2176, + 588.5228, + 577.1716, + 565.7752, + 554.899, + 543.6126, + 532.6492, + 521.9474, + 511.5214, + 501.1064, + 490.6364, + 480.2468, + 470.4588, + 460.3832, + 451.0584, + 440.8606, + 431.3868, + 422.5062, + 413.1862, + 404.463, + 395.339, + 386.1936, + 378.1292, + 369.1854, + 361.2908, + 353.3324, + 344.8518, + 337.5204, + 329.4854, + 321.9318, + 314.552, + 306.4658, + 299.4256, + 292.849, + 286.152, + 278.8956, + 271.8792, + 265.118, + 258.62, + 252.5132, + 245.9322, + 239.7726, + 233.6086, + 227.5332, + 222.5918, + 216.4294, + 210.7662, + 205.4106, + 199.7338, + 194.9012, + 188.4486, + 183.1556, + 178.6338, + 173.7312, + 169.6264, + 163.9526, + 159.8742, + 155.8326, + 151.1966, + 147.5594, + 143.07, + 140.037, + 134.1804, + 131.071, + 127.4884, + 124.0848, + 120.2944, + 117.333, + 112.9626, + 110.2902, + 107.0814, + 103.0334, + 99.4832000000001, + 96.3899999999999, + 93.7202000000002, + 90.1714000000002, + 87.2357999999999, + 85.9346, + 82.8910000000001, + 80.0264000000002, + 78.3834000000002, + 75.1543999999999, + 73.8683999999998, + 70.9895999999999, + 69.4367999999999, + 64.8701999999998, + 65.0408000000002, + 61.6738, + 59.5207999999998, + 57.0158000000001, + 54.2302, + 53.0962, + 50.4985999999999, + 52.2588000000001, + 47.3914, + 45.6244000000002, + 42.8377999999998, + 43.0072, + 40.6516000000001, + 40.2453999999998, + 35.2136, + 36.4546, + 33.7849999999999, + 
33.2294000000002, + 32.4679999999998, + 30.8670000000002, + 28.6507999999999, + 28.9099999999999, + 27.5983999999999, + 26.1619999999998, + 24.5563999999999, + 23.2328000000002, + 21.9484000000002, + 21.5902000000001, + 21.3346000000001, + 17.7031999999999, + 20.6111999999998, + 19.5545999999999, + 15.7375999999999, + 17.0720000000001, + 16.9517999999998, + 15.326, + 13.1817999999998, + 14.6925999999999, + 13.0859999999998, + 13.2754, + 10.8697999999999, + 11.248, + 7.3768, + 4.72339999999986, + 7.97899999999981, + 8.7503999999999, + 7.68119999999999, + 9.7199999999998, + 7.73919999999998, + 5.6224000000002, + 7.44560000000001, + 6.6601999999998, + 5.9058, + 4.00199999999995, + 4.51699999999983, + 4.68240000000014, + 3.86220000000003, + 5.13639999999987, + 5.98500000000013, + 2.47719999999981, + 2.61999999999989, + 1.62800000000016, + 4.65000000000009, + 0.225599999999758, + 0.831000000000131, + -0.359400000000278, + 1.27599999999984, + -2.92559999999958, + -0.0303999999996449, + 2.37079999999969, + -2.0033999999996, + 0.804600000000391, + 0.30199999999968, + 1.1247999999996, + -2.6880000000001, + 0.0321999999996478, + -1.18099999999959, + -3.9402, + -1.47940000000017, + -0.188400000000001, + -2.10720000000038, + -2.04159999999956, + -3.12880000000041, + -4.16160000000036, + -0.612799999999879, + -3.48719999999958, + -8.17900000000009, + -5.37780000000021, + -4.01379999999972, + -5.58259999999973, + -5.73719999999958, + -7.66799999999967, + -5.69520000000011, + -1.1247999999996, + -5.58520000000044, + -8.04560000000038, + -4.64840000000004, + -11.6468000000004, + -7.97519999999986, + -5.78300000000036, + -7.67420000000038, + -10.6328000000003, + -9.81720000000041, + ], + &[ + 1476.0, + 1449.6014, + 1423.5802, + 1397.7942, + 1372.3042, + 1347.2062, + 1321.8402, + 1297.2292, + 1272.9462, + 1248.9926, + 1225.3026, + 1201.4252, + 1178.0578, + 1155.6092, + 1132.626, + 1110.5568, + 1088.527, + 1066.5154, + 1045.1874, + 1024.3878, + 1003.37, + 982.1972, + 962.5728, + 
942.1012, + 922.9668, + 903.292, + 884.0772, + 864.8578, + 846.6562, + 828.041, + 809.714, + 792.3112, + 775.1806, + 757.9854, + 740.656, + 724.346, + 707.5154, + 691.8378, + 675.7448, + 659.6722, + 645.5722, + 630.1462, + 614.4124, + 600.8728, + 585.898, + 572.408, + 558.4926, + 544.4938, + 531.6776, + 517.282, + 505.7704, + 493.1012, + 480.7388, + 467.6876, + 456.1872, + 445.5048, + 433.0214, + 420.806, + 411.409, + 400.4144, + 389.4294, + 379.2286, + 369.651, + 360.6156, + 350.337, + 342.083, + 332.1538, + 322.5094, + 315.01, + 305.6686, + 298.1678, + 287.8116, + 280.9978, + 271.9204, + 265.3286, + 257.5706, + 249.6014, + 242.544, + 235.5976, + 229.583, + 220.9438, + 214.672, + 208.2786, + 201.8628, + 195.1834, + 191.505, + 186.1816, + 178.5188, + 172.2294, + 167.8908, + 161.0194, + 158.052, + 151.4588, + 148.1596, + 143.4344, + 138.5238, + 133.13, + 127.6374, + 124.8162, + 118.7894, + 117.3984, + 114.6078, + 109.0858, + 105.1036, + 103.6258, + 98.6018000000004, + 95.7618000000002, + 93.5821999999998, + 88.5900000000001, + 86.9992000000002, + 82.8800000000001, + 80.4539999999997, + 74.6981999999998, + 74.3644000000004, + 73.2914000000001, + 65.5709999999999, + 66.9232000000002, + 65.1913999999997, + 62.5882000000001, + 61.5702000000001, + 55.7035999999998, + 56.1764000000003, + 52.7596000000003, + 53.0302000000001, + 49.0609999999997, + 48.4694, + 44.933, + 46.0474000000004, + 44.7165999999997, + 41.9416000000001, + 39.9207999999999, + 35.6328000000003, + 35.5276000000003, + 33.1934000000001, + 33.2371999999996, + 33.3864000000003, + 33.9228000000003, + 30.2371999999996, + 29.1373999999996, + 25.2272000000003, + 24.2942000000003, + 19.8338000000003, + 18.9005999999999, + 23.0907999999999, + 21.8544000000002, + 19.5176000000001, + 15.4147999999996, + 16.9314000000004, + 18.6737999999996, + 12.9877999999999, + 14.3688000000002, + 12.0447999999997, + 15.5219999999999, + 12.5299999999997, + 14.5940000000001, + 14.3131999999996, + 9.45499999999993, + 
12.9441999999999, + 3.91139999999996, + 13.1373999999996, + 5.44720000000052, + 9.82779999999912, + 7.87279999999919, + 3.67760000000089, + 5.46980000000076, + 5.55099999999948, + 5.65979999999945, + 3.89439999999922, + 3.1275999999998, + 5.65140000000065, + 6.3062000000009, + 3.90799999999945, + 1.87060000000019, + 5.17020000000048, + 2.46680000000015, + 0.770000000000437, + -3.72340000000077, + 1.16400000000067, + 8.05340000000069, + 0.135399999999208, + 2.15940000000046, + 0.766999999999825, + 1.0594000000001, + 3.15500000000065, + -0.287399999999252, + 2.37219999999979, + -2.86620000000039, + -1.63199999999961, + -2.22979999999916, + -0.15519999999924, + -1.46039999999994, + -0.262199999999211, + -2.34460000000036, + -2.8078000000005, + -3.22179999999935, + -5.60159999999996, + -8.42200000000048, + -9.43740000000071, + 0.161799999999857, + -10.4755999999998, + -10.0823999999993, + ], + &[ + 2953.0, + 2900.4782, + 2848.3568, + 2796.3666, + 2745.324, + 2694.9598, + 2644.648, + 2595.539, + 2546.1474, + 2498.2576, + 2450.8376, + 2403.6076, + 2357.451, + 2311.38, + 2266.4104, + 2221.5638, + 2176.9676, + 2134.193, + 2090.838, + 2048.8548, + 2007.018, + 1966.1742, + 1925.4482, + 1885.1294, + 1846.4776, + 1807.4044, + 1768.8724, + 1731.3732, + 1693.4304, + 1657.5326, + 1621.949, + 1586.5532, + 1551.7256, + 1517.6182, + 1483.5186, + 1450.4528, + 1417.865, + 1385.7164, + 1352.6828, + 1322.6708, + 1291.8312, + 1260.9036, + 1231.476, + 1201.8652, + 1173.6718, + 1145.757, + 1119.2072, + 1092.2828, + 1065.0434, + 1038.6264, + 1014.3192, + 988.5746, + 965.0816, + 940.1176, + 917.9796, + 894.5576, + 871.1858, + 849.9144, + 827.1142, + 805.0818, + 783.9664, + 763.9096, + 742.0816, + 724.3962, + 706.3454, + 688.018, + 667.4214, + 650.3106, + 633.0686, + 613.8094, + 597.818, + 581.4248, + 563.834, + 547.363, + 531.5066, + 520.455400000001, + 505.583199999999, + 488.366, + 476.480799999999, + 459.7682, + 450.0522, + 434.328799999999, + 423.952799999999, + 408.727000000001, + 
399.079400000001, + 387.252200000001, + 373.987999999999, + 360.852000000001, + 351.6394, + 339.642, + 330.902400000001, + 322.661599999999, + 311.662200000001, + 301.3254, + 291.7484, + 279.939200000001, + 276.7508, + 263.215200000001, + 254.811400000001, + 245.5494, + 242.306399999999, + 234.8734, + 223.787200000001, + 217.7156, + 212.0196, + 200.793, + 195.9748, + 189.0702, + 182.449199999999, + 177.2772, + 170.2336, + 164.741, + 158.613600000001, + 155.311, + 147.5964, + 142.837, + 137.3724, + 132.0162, + 130.0424, + 121.9804, + 120.451800000001, + 114.8968, + 111.585999999999, + 105.933199999999, + 101.705, + 98.5141999999996, + 95.0488000000005, + 89.7880000000005, + 91.4750000000004, + 83.7764000000006, + 80.9698000000008, + 72.8574000000008, + 73.1615999999995, + 67.5838000000003, + 62.6263999999992, + 63.2638000000006, + 66.0977999999996, + 52.0843999999997, + 58.9956000000002, + 47.0912000000008, + 46.4956000000002, + 48.4383999999991, + 47.1082000000006, + 43.2392, + 37.2759999999998, + 40.0283999999992, + 35.1864000000005, + 35.8595999999998, + 32.0998, + 28.027, + 23.6694000000007, + 33.8266000000003, + 26.3736000000008, + 27.2008000000005, + 21.3245999999999, + 26.4115999999995, + 23.4521999999997, + 19.5013999999992, + 19.8513999999996, + 10.7492000000002, + 18.6424000000006, + 13.1265999999996, + 18.2436000000016, + 6.71860000000015, + 3.39459999999963, + 6.33759999999893, + 7.76719999999841, + 0.813999999998487, + 3.82819999999992, + 0.826199999999517, + 8.07440000000133, + -1.59080000000176, + 5.01780000000144, + 0.455399999998917, + -0.24199999999837, + 0.174800000000687, + -9.07640000000174, + -4.20160000000033, + -3.77520000000004, + -4.75179999999818, + -5.3724000000002, + -8.90680000000066, + -6.10239999999976, + -5.74120000000039, + -9.95339999999851, + -3.86339999999836, + -13.7304000000004, + -16.2710000000006, + -7.51359999999841, + -3.30679999999847, + -13.1339999999982, + -10.0551999999989, + -6.72019999999975, + -8.59660000000076, + 
-10.9307999999983, + -1.8775999999998, + -4.82259999999951, + -13.7788, + -21.6470000000008, + -10.6735999999983, + -15.7799999999988, + ], + &[ + 5907.5052, + 5802.2672, + 5697.347, + 5593.5794, + 5491.2622, + 5390.5514, + 5290.3376, + 5191.6952, + 5093.5988, + 4997.3552, + 4902.5972, + 4808.3082, + 4715.5646, + 4624.109, + 4533.8216, + 4444.4344, + 4356.3802, + 4269.2962, + 4183.3784, + 4098.292, + 4014.79, + 3932.4574, + 3850.6036, + 3771.2712, + 3691.7708, + 3615.099, + 3538.1858, + 3463.4746, + 3388.8496, + 3315.6794, + 3244.5448, + 3173.7516, + 3103.3106, + 3033.6094, + 2966.5642, + 2900.794, + 2833.7256, + 2769.81, + 2707.3196, + 2644.0778, + 2583.9916, + 2523.4662, + 2464.124, + 2406.073, + 2347.0362, + 2292.1006, + 2238.1716, + 2182.7514, + 2128.4884, + 2077.1314, + 2025.037, + 1975.3756, + 1928.933, + 1879.311, + 1831.0006, + 1783.2144, + 1738.3096, + 1694.5144, + 1649.024, + 1606.847, + 1564.7528, + 1525.3168, + 1482.5372, + 1443.9668, + 1406.5074, + 1365.867, + 1329.2186, + 1295.4186, + 1257.9716, + 1225.339, + 1193.2972, + 1156.3578, + 1125.8686, + 1091.187, + 1061.4094, + 1029.4188, + 1000.9126, + 972.3272, + 944.004199999999, + 915.7592, + 889.965, + 862.834200000001, + 840.4254, + 812.598399999999, + 785.924200000001, + 763.050999999999, + 741.793799999999, + 721.466, + 699.040799999999, + 677.997200000002, + 649.866999999998, + 634.911800000002, + 609.8694, + 591.981599999999, + 570.2922, + 557.129199999999, + 538.3858, + 521.872599999999, + 502.951400000002, + 495.776399999999, + 475.171399999999, + 459.751, + 439.995200000001, + 426.708999999999, + 413.7016, + 402.3868, + 387.262599999998, + 372.0524, + 357.050999999999, + 342.5098, + 334.849200000001, + 322.529399999999, + 311.613799999999, + 295.848000000002, + 289.273000000001, + 274.093000000001, + 263.329600000001, + 251.389599999999, + 245.7392, + 231.9614, + 229.7952, + 217.155200000001, + 208.9588, + 199.016599999999, + 190.839199999999, + 180.6976, + 176.272799999999, + 166.976999999999, 
+ 162.5252, + 151.196400000001, + 149.386999999999, + 133.981199999998, + 130.0586, + 130.164000000001, + 122.053400000001, + 110.7428, + 108.1276, + 106.232400000001, + 100.381600000001, + 98.7668000000012, + 86.6440000000002, + 79.9768000000004, + 82.4722000000002, + 68.7026000000005, + 70.1186000000016, + 71.9948000000004, + 58.998599999999, + 59.0492000000013, + 56.9818000000014, + 47.5338000000011, + 42.9928, + 51.1591999999982, + 37.2740000000013, + 42.7220000000016, + 31.3734000000004, + 26.8090000000011, + 25.8934000000008, + 26.5286000000015, + 29.5442000000003, + 19.3503999999994, + 26.0760000000009, + 17.9527999999991, + 14.8419999999969, + 10.4683999999979, + 8.65899999999965, + 9.86720000000059, + 4.34139999999752, + -0.907800000000861, + -3.32080000000133, + -0.936199999996461, + -11.9916000000012, + -8.87000000000262, + -6.33099999999831, + -11.3366000000024, + -15.9207999999999, + -9.34659999999712, + -15.5034000000014, + -19.2097999999969, + -15.357799999998, + -28.2235999999975, + -30.6898000000001, + -19.3271999999997, + -25.6083999999973, + -24.409599999999, + -13.6385999999984, + -33.4473999999973, + -32.6949999999997, + -28.9063999999998, + -31.7483999999968, + -32.2935999999972, + -35.8329999999987, + -47.620600000002, + -39.0855999999985, + -33.1434000000008, + -46.1371999999974, + -37.5892000000022, + -46.8164000000033, + -47.3142000000007, + -60.2914000000019, + -37.7575999999972, + ], + &[ + 11816.475, + 11605.0046, + 11395.3792, + 11188.7504, + 10984.1814, + 10782.0086, + 10582.0072, + 10384.503, + 10189.178, + 9996.2738, + 9806.0344, + 9617.9798, + 9431.394, + 9248.7784, + 9067.6894, + 8889.6824, + 8712.9134, + 8538.8624, + 8368.4944, + 8197.7956, + 8031.8916, + 7866.6316, + 7703.733, + 7544.5726, + 7386.204, + 7230.666, + 7077.8516, + 6926.7886, + 6778.6902, + 6631.9632, + 6487.304, + 6346.7486, + 6206.4408, + 6070.202, + 5935.2576, + 5799.924, + 5671.0324, + 5541.9788, + 5414.6112, + 5290.0274, + 5166.723, + 5047.6906, + 4929.162, + 
4815.1406, + 4699.127, + 4588.5606, + 4477.7394, + 4369.4014, + 4264.2728, + 4155.9224, + 4055.581, + 3955.505, + 3856.9618, + 3761.3828, + 3666.9702, + 3575.7764, + 3482.4132, + 3395.0186, + 3305.8852, + 3221.415, + 3138.6024, + 3056.296, + 2970.4494, + 2896.1526, + 2816.8008, + 2740.2156, + 2670.497, + 2594.1458, + 2527.111, + 2460.8168, + 2387.5114, + 2322.9498, + 2260.6752, + 2194.2686, + 2133.7792, + 2074.767, + 2015.204, + 1959.4226, + 1898.6502, + 1850.006, + 1792.849, + 1741.4838, + 1687.9778, + 1638.1322, + 1589.3266, + 1543.1394, + 1496.8266, + 1447.8516, + 1402.7354, + 1361.9606, + 1327.0692, + 1285.4106, + 1241.8112, + 1201.6726, + 1161.973, + 1130.261, + 1094.2036, + 1048.2036, + 1020.6436, + 990.901400000002, + 961.199800000002, + 924.769800000002, + 899.526400000002, + 872.346400000002, + 834.375, + 810.432000000001, + 780.659800000001, + 756.013800000001, + 733.479399999997, + 707.923999999999, + 673.858, + 652.222399999999, + 636.572399999997, + 615.738599999997, + 586.696400000001, + 564.147199999999, + 541.679600000003, + 523.943599999999, + 505.714599999999, + 475.729599999999, + 461.779600000002, + 449.750800000002, + 439.020799999998, + 412.7886, + 400.245600000002, + 383.188199999997, + 362.079599999997, + 357.533799999997, + 334.319000000003, + 327.553399999997, + 308.559399999998, + 291.270199999999, + 279.351999999999, + 271.791400000002, + 252.576999999997, + 247.482400000001, + 236.174800000001, + 218.774599999997, + 220.155200000001, + 208.794399999999, + 201.223599999998, + 182.995600000002, + 185.5268, + 164.547400000003, + 176.5962, + 150.689599999998, + 157.8004, + 138.378799999999, + 134.021200000003, + 117.614399999999, + 108.194000000003, + 97.0696000000025, + 89.6042000000016, + 95.6030000000028, + 84.7810000000027, + 72.635000000002, + 77.3482000000004, + 59.4907999999996, + 55.5875999999989, + 50.7346000000034, + 61.3916000000027, + 50.9149999999936, + 39.0384000000049, + 58.9395999999979, + 29.633600000001, + 
28.2032000000036, + 26.0078000000067, + 17.0387999999948, + 9.22000000000116, + 13.8387999999977, + 8.07240000000456, + 14.1549999999988, + 15.3570000000036, + 3.42660000000615, + 6.24820000000182, + -2.96940000000177, + -8.79940000000352, + -5.97860000000219, + -14.4048000000039, + -3.4143999999942, + -13.0148000000045, + -11.6977999999945, + -25.7878000000055, + -22.3185999999987, + -24.409599999999, + -31.9756000000052, + -18.9722000000038, + -22.8678000000073, + -30.8972000000067, + -32.3715999999986, + -22.3907999999938, + -43.6720000000059, + -35.9038, + -39.7492000000057, + -54.1641999999993, + -45.2749999999942, + -42.2989999999991, + -44.1089999999967, + -64.3564000000042, + -49.9551999999967, + -42.6116000000038, + ], + &[ + 23634.0036, + 23210.8034, + 22792.4744, + 22379.1524, + 21969.7928, + 21565.326, + 21165.3532, + 20770.2806, + 20379.9892, + 19994.7098, + 19613.318, + 19236.799, + 18865.4382, + 18498.8244, + 18136.5138, + 17778.8668, + 17426.2344, + 17079.32, + 16734.778, + 16397.2418, + 16063.3324, + 15734.0232, + 15409.731, + 15088.728, + 14772.9896, + 14464.1402, + 14157.5588, + 13855.5958, + 13559.3296, + 13264.9096, + 12978.326, + 12692.0826, + 12413.8816, + 12137.3192, + 11870.2326, + 11602.5554, + 11340.3142, + 11079.613, + 10829.5908, + 10583.5466, + 10334.0344, + 10095.5072, + 9859.694, + 9625.2822, + 9395.7862, + 9174.0586, + 8957.3164, + 8738.064, + 8524.155, + 8313.7396, + 8116.9168, + 7913.542, + 7718.4778, + 7521.65, + 7335.5596, + 7154.2906, + 6968.7396, + 6786.3996, + 6613.236, + 6437.406, + 6270.6598, + 6107.7958, + 5945.7174, + 5787.6784, + 5635.5784, + 5482.308, + 5337.9784, + 5190.0864, + 5045.9158, + 4919.1386, + 4771.817, + 4645.7742, + 4518.4774, + 4385.5454, + 4262.6622, + 4142.74679999999, + 4015.5318, + 3897.9276, + 3790.7764, + 3685.13800000001, + 3573.6274, + 3467.9706, + 3368.61079999999, + 3271.5202, + 3170.3848, + 3076.4656, + 2982.38400000001, + 2888.4664, + 2806.4868, + 2711.9564, + 2634.1434, + 2551.3204, + 
2469.7662, + 2396.61139999999, + 2318.9902, + 2243.8658, + 2171.9246, + 2105.01360000001, + 2028.8536, + 1960.9952, + 1901.4096, + 1841.86079999999, + 1777.54700000001, + 1714.5802, + 1654.65059999999, + 1596.311, + 1546.2016, + 1492.3296, + 1433.8974, + 1383.84600000001, + 1339.4152, + 1293.5518, + 1245.8686, + 1193.50659999999, + 1162.27959999999, + 1107.19439999999, + 1069.18060000001, + 1035.09179999999, + 999.679000000004, + 957.679999999993, + 925.300199999998, + 888.099400000006, + 848.638600000006, + 818.156400000007, + 796.748399999997, + 752.139200000005, + 725.271200000003, + 692.216, + 671.633600000001, + 647.939799999993, + 621.670599999998, + 575.398799999995, + 561.226599999995, + 532.237999999998, + 521.787599999996, + 483.095799999996, + 467.049599999998, + 465.286399999997, + 415.548599999995, + 401.047399999996, + 380.607999999993, + 377.362599999993, + 347.258799999996, + 338.371599999999, + 310.096999999994, + 301.409199999995, + 276.280799999993, + 265.586800000005, + 258.994399999996, + 223.915999999997, + 215.925399999993, + 213.503800000006, + 191.045400000003, + 166.718200000003, + 166.259000000005, + 162.941200000001, + 148.829400000002, + 141.645999999993, + 123.535399999993, + 122.329800000007, + 89.473399999988, + 80.1962000000058, + 77.5457999999926, + 59.1056000000099, + 83.3509999999951, + 52.2906000000075, + 36.3979999999865, + 40.6558000000077, + 42.0003999999899, + 19.6630000000005, + 19.7153999999864, + -8.38539999999921, + -0.692799999989802, + 0.854800000000978, + 3.23219999999856, + -3.89040000000386, + -5.25880000001052, + -24.9052000000083, + -22.6837999999989, + -26.4286000000138, + -34.997000000003, + -37.0216000000073, + -43.430400000012, + -58.2390000000014, + -68.8034000000043, + -56.9245999999985, + -57.8583999999973, + -77.3097999999882, + -73.2793999999994, + -81.0738000000129, + -87.4530000000086, + -65.0254000000132, + -57.296399999992, + -96.2746000000043, + -103.25, + -96.081600000005, + -91.5542000000132, + 
-102.465200000006, + -107.688599999994, + -101.458000000013, + -109.715800000005, + ], + &[ + 47270.0, + 46423.3584, + 45585.7074, + 44757.152, + 43938.8416, + 43130.9514, + 42330.03, + 41540.407, + 40759.6348, + 39988.206, + 39226.5144, + 38473.2096, + 37729.795, + 36997.268, + 36272.6448, + 35558.665, + 34853.0248, + 34157.4472, + 33470.5204, + 32793.5742, + 32127.0194, + 31469.4182, + 30817.6136, + 30178.6968, + 29546.8908, + 28922.8544, + 28312.271, + 27707.0924, + 27114.0326, + 26526.692, + 25948.6336, + 25383.7826, + 24823.5998, + 24272.2974, + 23732.2572, + 23201.4976, + 22674.2796, + 22163.6336, + 21656.515, + 21161.7362, + 20669.9368, + 20189.4424, + 19717.3358, + 19256.3744, + 18795.9638, + 18352.197, + 17908.5738, + 17474.391, + 17052.918, + 16637.2236, + 16228.4602, + 15823.3474, + 15428.6974, + 15043.0284, + 14667.6278, + 14297.4588, + 13935.2882, + 13578.5402, + 13234.6032, + 12882.1578, + 12548.0728, + 12219.231, + 11898.0072, + 11587.2626, + 11279.9072, + 10973.5048, + 10678.5186, + 10392.4876, + 10105.2556, + 9825.766, + 9562.5444, + 9294.2222, + 9038.2352, + 8784.848, + 8533.2644, + 8301.7776, + 8058.30859999999, + 7822.94579999999, + 7599.11319999999, + 7366.90779999999, + 7161.217, + 6957.53080000001, + 6736.212, + 6548.21220000001, + 6343.06839999999, + 6156.28719999999, + 5975.15419999999, + 5791.75719999999, + 5621.32019999999, + 5451.66, + 5287.61040000001, + 5118.09479999999, + 4957.288, + 4798.4246, + 4662.17559999999, + 4512.05900000001, + 4364.68539999999, + 4220.77720000001, + 4082.67259999999, + 3957.19519999999, + 3842.15779999999, + 3699.3328, + 3583.01180000001, + 3473.8964, + 3338.66639999999, + 3233.55559999999, + 3117.799, + 3008.111, + 2909.69140000001, + 2814.86499999999, + 2719.46119999999, + 2624.742, + 2532.46979999999, + 2444.7886, + 2370.1868, + 2272.45259999999, + 2196.19260000001, + 2117.90419999999, + 2023.2972, + 1969.76819999999, + 1885.58979999999, + 1833.2824, + 1733.91200000001, + 1682.54920000001, + 
1604.57980000001, + 1556.11240000001, + 1491.3064, + 1421.71960000001, + 1371.22899999999, + 1322.1324, + 1264.7892, + 1196.23920000001, + 1143.8474, + 1088.67240000001, + 1073.60380000001, + 1023.11660000001, + 959.036400000012, + 927.433199999999, + 906.792799999996, + 853.433599999989, + 841.873800000001, + 791.1054, + 756.899999999994, + 704.343200000003, + 672.495599999995, + 622.790399999998, + 611.254799999995, + 567.283200000005, + 519.406599999988, + 519.188400000014, + 495.312800000014, + 451.350799999986, + 443.973399999988, + 431.882199999993, + 392.027000000002, + 380.924200000009, + 345.128999999986, + 298.901400000002, + 287.771999999997, + 272.625, + 247.253000000026, + 222.490600000019, + 223.590000000026, + 196.407599999977, + 176.425999999978, + 134.725199999986, + 132.4804, + 110.445599999977, + 86.7939999999944, + 56.7038000000175, + 64.915399999998, + 38.3726000000024, + 37.1606000000029, + 46.170999999973, + 49.1716000000015, + 15.3362000000197, + 6.71639999997569, + -34.8185999999987, + -39.4476000000141, + 12.6830000000191, + -12.3331999999937, + -50.6565999999875, + -59.9538000000175, + -65.1054000000004, + -70.7576000000117, + -106.325200000021, + -126.852200000023, + -110.227599999984, + -132.885999999999, + -113.897200000007, + -142.713800000027, + -151.145399999979, + -150.799200000009, + -177.756200000003, + -156.036399999983, + -182.735199999996, + -177.259399999981, + -198.663600000029, + -174.577600000019, + -193.84580000001, + ], + &[ + 94541.0, + 92848.811, + 91174.019, + 89517.558, + 87879.9705, + 86262.7565, + 84663.5125, + 83083.7435, + 81521.7865, + 79977.272, + 78455.9465, + 76950.219, + 75465.432, + 73994.152, + 72546.71, + 71115.2345, + 69705.6765, + 68314.937, + 66944.2705, + 65591.255, + 64252.9485, + 62938.016, + 61636.8225, + 60355.592, + 59092.789, + 57850.568, + 56624.518, + 55417.343, + 54231.1415, + 53067.387, + 51903.526, + 50774.649, + 49657.6415, + 48561.05, + 47475.7575, + 46410.159, + 45364.852, + 44327.053, + 
43318.4005, + 42325.6165, + 41348.4595, + 40383.6265, + 39436.77, + 38509.502, + 37594.035, + 36695.939, + 35818.6895, + 34955.691, + 34115.8095, + 33293.949, + 32465.0775, + 31657.6715, + 30877.2585, + 30093.78, + 29351.3695, + 28594.1365, + 27872.115, + 27168.7465, + 26477.076, + 25774.541, + 25106.5375, + 24452.5135, + 23815.5125, + 23174.0655, + 22555.2685, + 21960.2065, + 21376.3555, + 20785.1925, + 20211.517, + 19657.0725, + 19141.6865, + 18579.737, + 18081.3955, + 17578.995, + 17073.44, + 16608.335, + 16119.911, + 15651.266, + 15194.583, + 14749.0495, + 14343.4835, + 13925.639, + 13504.509, + 13099.3885, + 12691.2855, + 12328.018, + 11969.0345, + 11596.5145, + 11245.6355, + 10917.6575, + 10580.9785, + 10277.8605, + 9926.58100000001, + 9605.538, + 9300.42950000003, + 8989.97850000003, + 8728.73249999998, + 8448.3235, + 8175.31050000002, + 7898.98700000002, + 7629.79100000003, + 7413.76199999999, + 7149.92300000001, + 6921.12650000001, + 6677.1545, + 6443.28000000003, + 6278.23450000002, + 6014.20049999998, + 5791.20299999998, + 5605.78450000001, + 5438.48800000001, + 5234.2255, + 5059.6825, + 4887.43349999998, + 4682.935, + 4496.31099999999, + 4322.52250000002, + 4191.42499999999, + 4021.24200000003, + 3900.64799999999, + 3762.84250000003, + 3609.98050000001, + 3502.29599999997, + 3363.84250000003, + 3206.54849999998, + 3079.70000000001, + 2971.42300000001, + 2867.80349999998, + 2727.08100000001, + 2630.74900000001, + 2496.6165, + 2440.902, + 2356.19150000002, + 2235.58199999999, + 2120.54149999999, + 2012.25449999998, + 1933.35600000003, + 1820.93099999998, + 1761.54800000001, + 1663.09350000002, + 1578.84600000002, + 1509.48149999999, + 1427.3345, + 1379.56150000001, + 1306.68099999998, + 1212.63449999999, + 1084.17300000001, + 1124.16450000001, + 1060.69949999999, + 1007.48849999998, + 941.194499999983, + 879.880500000028, + 836.007500000007, + 782.802000000025, + 748.385499999975, + 647.991500000004, + 626.730500000005, + 570.776000000013, + 
484.000500000024, + 513.98550000001, + 418.985499999952, + 386.996999999974, + 370.026500000036, + 355.496999999974, + 356.731499999994, + 255.92200000002, + 259.094000000041, + 205.434499999974, + 165.374500000034, + 197.347500000033, + 95.718499999959, + 67.6165000000037, + 54.6970000000438, + 31.7395000000251, + -15.8784999999916, + 8.42500000004657, + -26.3754999999655, + -118.425500000012, + -66.6629999999423, + -42.9745000000112, + -107.364999999991, + -189.839000000036, + -162.611499999999, + -164.964999999967, + -189.079999999958, + -223.931499999948, + -235.329999999958, + -269.639500000048, + -249.087999999989, + -206.475499999942, + -283.04449999996, + -290.667000000016, + -304.561499999953, + -336.784499999951, + -380.386500000022, + -283.280499999993, + -364.533000000054, + -389.059499999974, + -364.454000000027, + -415.748000000021, + -417.155000000028, + ], + &[ + 189083.0, + 185696.913, + 182348.774, + 179035.946, + 175762.762, + 172526.444, + 169329.754, + 166166.099, + 163043.269, + 159958.91, + 156907.912, + 153906.845, + 150924.199, + 147996.568, + 145093.457, + 142239.233, + 139421.475, + 136632.27, + 133889.588, + 131174.2, + 128511.619, + 125868.621, + 123265.385, + 120721.061, + 118181.769, + 115709.456, + 113252.446, + 110840.198, + 108465.099, + 106126.164, + 103823.469, + 101556.618, + 99308.004, + 97124.508, + 94937.803, + 92833.731, + 90745.061, + 88677.627, + 86617.47, + 84650.442, + 82697.833, + 80769.132, + 78879.629, + 77014.432, + 75215.626, + 73384.587, + 71652.482, + 69895.93, + 68209.301, + 66553.669, + 64921.981, + 63310.323, + 61742.115, + 60205.018, + 58698.658, + 57190.657, + 55760.865, + 54331.169, + 52908.167, + 51550.273, + 50225.254, + 48922.421, + 47614.533, + 46362.049, + 45098.569, + 43926.083, + 42736.03, + 41593.473, + 40425.26, + 39316.237, + 38243.651, + 37170.617, + 36114.609, + 35084.19, + 34117.233, + 33206.509, + 32231.505, + 31318.728, + 30403.404, + 29540.0550000001, + 28679.236, + 27825.862, + 26965.216, + 
26179.148, + 25462.08, + 24645.952, + 23922.523, + 23198.144, + 22529.128, + 21762.4179999999, + 21134.779, + 20459.117, + 19840.818, + 19187.04, + 18636.3689999999, + 17982.831, + 17439.7389999999, + 16874.547, + 16358.2169999999, + 15835.684, + 15352.914, + 14823.681, + 14329.313, + 13816.897, + 13342.874, + 12880.882, + 12491.648, + 12021.254, + 11625.392, + 11293.7610000001, + 10813.697, + 10456.209, + 10099.074, + 9755.39000000001, + 9393.18500000006, + 9047.57900000003, + 8657.98499999999, + 8395.85900000005, + 8033.0, + 7736.95900000003, + 7430.59699999995, + 7258.47699999996, + 6924.58200000005, + 6691.29399999999, + 6357.92500000005, + 6202.05700000003, + 5921.19700000004, + 5628.28399999999, + 5404.96799999999, + 5226.71100000001, + 4990.75600000005, + 4799.77399999998, + 4622.93099999998, + 4472.478, + 4171.78700000001, + 3957.46299999999, + 3868.95200000005, + 3691.14300000004, + 3474.63100000005, + 3341.67200000002, + 3109.14000000001, + 3071.97400000005, + 2796.40399999998, + 2756.17799999996, + 2611.46999999997, + 2471.93000000005, + 2382.26399999997, + 2209.22400000005, + 2142.28399999999, + 2013.96100000001, + 1911.18999999994, + 1818.27099999995, + 1668.47900000005, + 1519.65800000005, + 1469.67599999998, + 1367.13800000004, + 1248.52899999998, + 1181.23600000003, + 1022.71900000004, + 1088.20700000005, + 959.03600000008, + 876.095999999903, + 791.183999999892, + 703.337000000058, + 731.949999999953, + 586.86400000006, + 526.024999999907, + 323.004999999888, + 320.448000000091, + 340.672999999952, + 309.638999999966, + 216.601999999955, + 102.922999999952, + 19.2399999999907, + -0.114000000059605, + -32.6240000000689, + -89.3179999999702, + -153.497999999905, + -64.2970000000205, + -143.695999999996, + -259.497999999905, + -253.017999999924, + -213.948000000091, + -397.590000000084, + -434.006000000052, + -403.475000000093, + -297.958000000101, + -404.317000000039, + -528.898999999976, + -506.621000000043, + -513.205000000075, + -479.351000000024, 
+ -596.139999999898, + -527.016999999993, + -664.681000000099, + -680.306000000099, + -704.050000000047, + -850.486000000034, + -757.43200000003, + -713.308999999892, + ], +]; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to write hll to bytes, err:{}", source))] + WriteHll { source: bytes::Error }, + + #[snafu(display("Failed to write hll to bytes, err:{}", source))] + ReadHll { source: bytes::Error }, +} + +pub type Result = std::result::Result; + +pub struct HyperLogLog { + alpha: f64, + p: u8, + m: usize, + M: Vec, + sip: SipHasher13, +} + +impl HyperLogLog { + pub fn new(error_rate: f64) -> Self { + Self::new_with_keys(error_rate, rand::random(), rand::random()) + } + + pub fn new_with_keys(error_rate: f64, key0: u64, key1: u64) -> Self { + assert!(error_rate > 0.0 && error_rate < 1.0); + let sr = 1.04 / error_rate; + let p = f64::ln(sr * sr).ceil() as u8; + assert!(p <= 64); + let alpha = Self::get_alpha(p); + let m = 1usize << p; + HyperLogLog { + alpha, + p, + m, + M: repeat(0u8).take(m).collect(), + sip: SipHasher13::new_with_keys(key0, key1), + } + } + + pub fn new_from_template(hll: &HyperLogLog) -> Self { + HyperLogLog { + alpha: hll.alpha, + p: hll.p, + m: hll.m, + M: repeat(0u8).take(hll.m).collect(), + sip: hll.sip, + } + } + + pub fn insert(&mut self, value: &V) { + let sip = &mut self.sip.clone(); + value.hash(sip); + let x = sip.finish(); + self.insert_by_hash_value(x); + } + + pub fn insert_by_hash_value(&mut self, x: u64) { + let j = x as usize & (self.m - 1); + let w = x >> self.p; + let rho = Self::get_rho(w, 64 - self.p); + let mjr = &mut self.M[j]; + if rho > *mjr { + *mjr = rho; + } + } + + pub fn len(&self) -> f64 { + let V = Self::vec_count_zero(&self.M); + if V > 0 { + let H = self.m as f64 * (self.m as f64 / V as f64).ln(); + if H <= Self::get_treshold(self.p) { + H + } else { + self.ep() + } + } else { + self.ep() + } + } + + pub fn is_empty(&self) -> bool { + self.len() == 0.0 + } + + pub fn merge(&mut self, 
src: &HyperLogLog) { + assert!(src.p == self.p); + assert!(src.m == self.m); + let sip1 = &mut src.sip.clone(); + let sip2 = &mut self.sip.clone(); + 42.hash(sip1); + 42.hash(sip2); + assert!(sip1.finish() == sip2.finish()); + for i in 0..self.m { + let (src_mir, mir) = (src.M[i], &mut self.M[i]); + if src_mir > *mir { + *mir = src_mir; + } + } + } + + pub fn clear(&mut self) { + self.M.iter_mut().all(|x| { + *x = 0; + true + }); + } + + pub fn write_to_buf(&self, buf: &mut B) -> Result<()> { + buf.write_f64(self.alpha).context(WriteHll)?; + buf.write_u8(self.p).context(WriteHll)?; + // self.m is the length of self.M + buf.write_u64(self.m as u64).context(WriteHll)?; + buf.write_slice(&self.M).context(WriteHll)?; + // Store keys of hasher + let (key0, key1) = self.sip.keys(); + buf.write_u64(key0).context(WriteHll)?; + buf.write_u64(key1).context(WriteHll) + } + + pub fn read_from_buf(buf: &mut B) -> Result { + let alpha = buf.read_f64().context(ReadHll)?; + let p = buf.read_u8().context(ReadHll)?; + let m = buf.read_u64().context(ReadHll)? 
as usize; + let mut m_buf = vec![0u8; m]; + buf.read_to_slice(&mut m_buf).context(ReadHll)?; + let key0 = buf.read_u64().context(ReadHll)?; + let key1 = buf.read_u64().context(ReadHll)?; + + Ok(HyperLogLog { + alpha, + p, + m, + M: m_buf, + sip: SipHasher13::new_with_keys(key0, key1), + }) + } + + fn get_treshold(p: u8) -> f64 { + TRESHOLD_DATA[p as usize] + } + + fn get_alpha(p: u8) -> f64 { + assert!((4..=16).contains(&p)); + match p { + 4 => 0.673, + 5 => 0.697, + 6 => 0.709, + _ => 0.7213 / (1.0 + 1.079 / (1usize << (p as usize)) as f64), + } + } + + fn bit_length(x: u64) -> u8 { + let mut bits: u8 = 0; + let mut xm = x; + while xm != 0 { + bits += 1; + xm >>= 1; + } + bits + } + + fn get_rho(w: u64, max_width: u8) -> u8 { + let rho = max_width - Self::bit_length(w) + 1; + assert!(rho > 0); + rho + } + + fn vec_count_zero(v: &[u8]) -> usize { + bytecount::count(v, 0) + } + + fn estimate_bias(E: f64, p: u8) -> f64 { + let bias_vector = BIAS_DATA[(p - 4) as usize]; + let nearest_neighbors = Self::get_nearest_neighbors(E, RAW_ESTIMATE_DATA[(p - 4) as usize]); + let sum = nearest_neighbors + .iter() + .fold(0.0, |acc, &neighbor| acc + bias_vector[neighbor]); + sum / nearest_neighbors.len() as f64 + } + + fn get_nearest_neighbors(E: f64, estimate_vector: &[f64]) -> Vec { + let ev_len = estimate_vector.len(); + let mut r: Vec<(f64, usize)> = repeat((0.0f64, 0usize)).take(ev_len).collect(); + for i in 0..ev_len { + let dr = E - estimate_vector[i]; + r[i] = (dr * dr, i); + } + r.sort_by(|a, b| { + if a < b { + Less + } else if a > b { + Greater + } else { + Equal + } + }); + r.truncate(6); + r.iter() + .map(|&ez| { + let (_, b) = ez; + b + }) + .collect() + } + + fn ep(&self) -> f64 { + let sum = self + .M + .iter() + .fold(0.0, |acc, &x| acc + 2.0f64.powi(-(x as i32))); + let E = self.alpha * (self.m * self.m) as f64 / sum; + if E <= (5 * self.m) as f64 { + E - Self::estimate_bias(E, self.p) + } else { + E + } + } +} + +#[test] +fn hyperloglog_test_simple() { + let 
mut hll = HyperLogLog::new(0.00408); + let keys = ["test1", "test2", "test3", "test2", "test2", "test2"]; + for k in &keys { + hll.insert(k); + } + assert!((hll.len().round() - 3.0).abs() < std::f64::EPSILON); + assert!(!hll.is_empty()); + hll.clear(); + assert!(hll.is_empty()); + assert!(hll.len() == 0.0); +} + +#[test] +fn hyperloglog_test_merge() { + let mut hll = HyperLogLog::new(0.00408); + let keys = ["test1", "test2", "test3", "test2", "test2", "test2"]; + for k in &keys { + hll.insert(k); + } + assert!((hll.len().round() - 3.0).abs() < std::f64::EPSILON); + + let mut hll2 = HyperLogLog::new_from_template(&hll); + let keys2 = ["test3", "test4", "test4", "test4", "test4", "test1"]; + for k in &keys2 { + hll2.insert(k); + } + assert!((hll2.len().round() - 3.0).abs() < std::f64::EPSILON); + + hll.merge(&hll2); + assert!((hll.len().round() - 4.0).abs() < std::f64::EPSILON); +} + +#[test] +fn hyperloglog_test_write_read() { + let mut hll = HyperLogLog::new(0.00408); + hll.insert(&123); + + let mut write_buf = Vec::new(); + hll.write_to_buf(&mut write_buf).unwrap(); + + let mut buf = &write_buf[..]; + let hll2 = HyperLogLog::read_from_buf(&mut buf).unwrap(); + + let error_margin = f64::EPSILON; + assert!((hll.alpha - hll2.alpha).abs() < error_margin); + assert_eq!(hll.p, hll2.p); + assert_eq!(hll.m, hll2.m); + assert_eq!(hll.M, hll2.M); + assert_eq!(hll.sip.keys(), hll2.sip.keys()); +} diff --git a/components/skiplist/Cargo.toml b/components/skiplist/Cargo.toml new file mode 100644 index 0000000000..f56e48d122 --- /dev/null +++ b/components/skiplist/Cargo.toml @@ -0,0 +1,21 @@ +[package] +name = "skiplist" +version = "0.1.0" +authors = ["Jay Lee "] +edition = "2018" + +[dependencies] +rand = "0.7" +bytes = "1.0" +arena = { path = "../arena" } + +[dev-dependencies] +yatp = { git = "https://github.com/tikv/yatp.git", rev = "4b71f8abd86890f0d1e95778c2b6bf5a9ee4c502" } +criterion = "0.3" + +# [target.'cfg(not(target_env = "msvc"))'.dev-dependencies] +# 
tikv-jemallocator = "0.4.0" + +[[bench]] +name = "bench" +harness = false diff --git a/components/skiplist/benches/bench.rs b/components/skiplist/benches/bench.rs new file mode 100644 index 0000000000..4744bb558c --- /dev/null +++ b/components/skiplist/benches/bench.rs @@ -0,0 +1,181 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::{ + collections::*, + sync::{atomic::*, *}, + thread, +}; + +use arena::MonoIncArena; +use bytes::*; +use criterion::*; +use rand::prelude::*; +use skiplist::*; + +// #[cfg(not(target_env = "msvc"))] +// use tikv_jemallocator::Jemalloc; + +// #[cfg(not(target_env = "msvc"))] +// #[global_allocator] +// static GLOBAL: Jemalloc = Jemalloc; + +fn skiplist_round( + l: &Skiplist, + case: &(Bytes, bool), + exp: &Bytes, +) { + if case.1 { + if let Some(v) = l.get(&case.0) { + assert_eq!(v, exp); + } + } else { + l.put(&case.0, exp); + } +} + +fn append_ts(key: &mut BytesMut, ts: u64) { + key.put_u64(ts); +} + +fn random_key(rng: &mut ThreadRng) -> Bytes { + let mut key = BytesMut::with_capacity(16); + unsafe { + rng.fill_bytes(&mut *(&mut key.chunk_mut()[..8] as *mut _ as *mut [u8])); + key.advance_mut(8); + } + append_ts(&mut key, 0); + key.freeze() +} + +fn bench_read_write_skiplist_frac(b: &mut Bencher<'_>, frac: &usize) { + let frac = *frac; + let value = Bytes::from_static(b"00123"); + let comp = FixedLengthSuffixComparator::new(8); + let arena = MonoIncArena::new(1 << 10); + let list = Skiplist::with_arena(comp, arena); + let l = list.clone(); + let stop = Arc::new(AtomicBool::new(false)); + let s = stop.clone(); + let v = value.clone(); + let handle = thread::spawn(move || { + let mut rng = rand::thread_rng(); + while !s.load(Ordering::SeqCst) { + let key = random_key(&mut rng); + let case = (key, frac > rng.gen_range(0, 11)); + skiplist_round(&l, &case, &v); + } + }); + let mut rng = rand::thread_rng(); + b.iter_batched_ref( + || (random_key(&mut rng), frac > rng.gen_range(0, 11)), + |case| 
skiplist_round(&list, case, &value), + BatchSize::SmallInput, + ); + stop.store(true, Ordering::SeqCst); + handle.join().unwrap(); +} + +fn bench_read_write_skiplist(c: &mut Criterion) { + let mut group = c.benchmark_group("skiplist_read_write"); + for i in 0..=10 { + group.bench_with_input( + BenchmarkId::from_parameter(i), + &i, + bench_read_write_skiplist_frac, + ); + } + group.finish(); +} + +fn map_round(m: &Mutex>, case: &(Bytes, bool), exp: &Bytes) { + if case.1 { + let rm = m.lock().unwrap(); + let value = rm.get(&case.0); + if let Some(v) = value { + assert_eq!(v, exp); + } + } else { + let mut rm = m.lock().unwrap(); + rm.insert(case.0.clone(), exp.clone()); + } +} + +fn bench_read_write_map_frac(b: &mut Bencher<'_>, frac: &usize) { + let frac = *frac; + let value = Bytes::from_static(b"00123"); + let map = Arc::new(Mutex::new(HashMap::with_capacity(512 << 10))); + let map_in_thread = map.clone(); + let stop = Arc::new(AtomicBool::new(false)); + let thread_stop = stop.clone(); + + let v = value.clone(); + let handle = thread::spawn(move || { + let mut rng = rand::thread_rng(); + while !thread_stop.load(Ordering::SeqCst) { + let f = rng.gen_range(0, 11); + let case = (random_key(&mut rng), f < frac); + map_round(&map_in_thread, &case, &v); + } + }); + let mut rng = rand::thread_rng(); + b.iter_batched_ref( + || { + let f = rng.gen_range(0, 11); + (random_key(&mut rng), f < frac) + }, + |case| map_round(&map, case, &value), + BatchSize::SmallInput, + ); + stop.store(true, Ordering::SeqCst); + handle.join().unwrap(); +} + +fn bench_read_write_map(c: &mut Criterion) { + let mut group = c.benchmark_group("map_read_write"); + for i in 0..=10 { + group.bench_with_input( + BenchmarkId::from_parameter(i), + &i, + bench_read_write_map_frac, + ); + } + group.finish(); +} + +fn bench_write_skiplist(c: &mut Criterion) { + let comp = FixedLengthSuffixComparator::new(8); + let arena = MonoIncArena::new(1 << 10); + let list = Skiplist::with_arena(comp, arena); + let 
value = Bytes::from_static(b"00123"); + let l = list.clone(); + let stop = Arc::new(AtomicBool::new(false)); + let s = stop.clone(); + let v = value.clone(); + let handle = thread::spawn(move || { + let mut rng = rand::thread_rng(); + while !s.load(Ordering::SeqCst) { + let case = (random_key(&mut rng), false); + skiplist_round(&l, &case, &v); + } + }); + let mut rng = rand::thread_rng(); + c.bench_function("skiplist_write", |b| { + b.iter_batched( + || random_key(&mut rng), + |key| { + list.put(&key, &value); + }, + BatchSize::SmallInput, + ) + }); + stop.store(true, Ordering::SeqCst); + handle.join().unwrap(); +} + +criterion_group!( + benches, + bench_read_write_skiplist, + bench_read_write_map, + bench_write_skiplist +); +criterion_main!(benches); diff --git a/components/skiplist/src/key.rs b/components/skiplist/src/key.rs new file mode 100644 index 0000000000..297e4e446d --- /dev/null +++ b/components/skiplist/src/key.rs @@ -0,0 +1,55 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +use std::cmp::Ordering; + +use bytes::Bytes; + +pub trait KeyComparator: Clone { + fn compare_key(&self, lhs: &[u8], rhs: &[u8]) -> Ordering; + fn same_key(&self, lhs: &[u8], rhs: &[u8]) -> bool; +} + +#[derive(Default, Debug, Clone, Copy)] +pub struct FixedLengthSuffixComparator { + len: usize, +} + +impl FixedLengthSuffixComparator { + pub const fn new(len: usize) -> FixedLengthSuffixComparator { + FixedLengthSuffixComparator { len } + } +} + +impl KeyComparator for FixedLengthSuffixComparator { + #[inline] + fn compare_key(&self, lhs: &[u8], rhs: &[u8]) -> Ordering { + if lhs.len() < self.len { + panic!( + "cannot compare with suffix {}: {:?}", + self.len, + Bytes::copy_from_slice(lhs) + ); + } + if rhs.len() < self.len { + panic!( + "cannot compare with suffix {}: {:?}", + self.len, + Bytes::copy_from_slice(rhs) + ); + } + let (l_p, l_s) = lhs.split_at(lhs.len() - self.len); + let (r_p, r_s) = rhs.split_at(rhs.len() - self.len); + let res = l_p.cmp(r_p); + match res { + Ordering::Greater | Ordering::Less => res, + Ordering::Equal => l_s.cmp(r_s), + } + } + + #[inline] + fn same_key(&self, lhs: &[u8], rhs: &[u8]) -> bool { + let (l_p, _) = lhs.split_at(lhs.len() - self.len); + let (r_p, _) = rhs.split_at(rhs.len() - self.len); + l_p == r_p + } +} diff --git a/components/skiplist/src/lib.rs b/components/skiplist/src/lib.rs new file mode 100644 index 0000000000..ca7d13b1a8 --- /dev/null +++ b/components/skiplist/src/lib.rs @@ -0,0 +1,21 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Forked from +//! +//! Differences: +//! 1. Inline key and value in Node, so all memory of skiplist is allocated from +//! arena. Drawback: we have to copy the content of key/value +//! 2. Tower stores pointer to Node instead of offset, so we can use other arena +//! implementation +//! 3. Use [ArenaSlice] to replace Bytes +//! 4. 
//!    impl Send/Sync for the iterator

mod key;
mod list;
mod slice;

/// Maximum number of levels a node tower can have (valid indices 0..MAX_HEIGHT).
const MAX_HEIGHT: usize = 20;

pub use key::{FixedLengthSuffixComparator, KeyComparator};
pub use list::{IterRef, Skiplist};
pub use slice::ArenaSlice;

// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0.

use std::{
    alloc::Layout,
    convert::TryInto,
    mem, ptr,
    ptr::NonNull,
    slice,
    sync::{
        atomic::{AtomicPtr, AtomicUsize, Ordering},
        Arc,
    },
};

use arena::{Arena, BasicStats};
use rand::Rng;

use super::{slice::ArenaSlice, KeyComparator, MAX_HEIGHT};

/// Promotion probability scaled to u32::MAX: a node keeps growing one more
/// level with probability ~1/3 per level (see `random_height`).
const HEIGHT_INCREASE: u32 = u32::MAX / 3;

// Length prefixes written in front of the inlined key/value bytes.
type KeySize = u16;
type ValueSize = u32;

/// The layout of Node
/// 1. height: usize
/// 2. tower: AtomicPtr<Node> x (height + 1)
/// 3. key_size: KeySize
/// 4. key: u8 x key_size
/// 5. value_size: ValueSize
/// 6. value: u8 x value_size
// Uses C layout to make sure tower is at the bottom
#[derive(Debug)]
#[repr(C)]
pub struct Node {
    /// Height of node, different from badger, The valid range of tower is [0,
    /// height]
    height: usize,
    /// The node tower
    ///
    /// Only [0, height] parts is utilized to store node pointer, the key and
    /// value block are start from tower[height + 1]
    tower: [AtomicPtr<Node>; MAX_HEIGHT],
}

impl Node {
    /// Allocate a new node from the arena, and copy the content of key/value
    /// into the node
    ///
    /// # Safety
    /// - from_size_align_unchecked: align is got from [mem::align_of].
    ///
    /// # Notice
    /// This will only allocate the *exact* amount of memory needed within the
    /// given height: unused tower slots are trimmed and the key/value bytes
    /// live directly behind the used part of the tower.
    fn alloc<A>(arena: &A, key: &[u8], value: &[u8], height: usize) -> *mut Node
    where
        A: Arena<Stats = BasicStats>,
    {
        // Calculate node size to alloc
        let size = mem::size_of::<Node>();
        // Not all values in Node::tower will be utilized.
        let not_used = (MAX_HEIGHT - height - 1) * mem::size_of::<AtomicPtr<Node>>();
        // Space to store key/value: (key size) + key + (value size) + value
        let kv_used =
            mem::size_of::<KeySize>() + key.len() + mem::size_of::<ValueSize>() + value.len();
        // UB in fact: the `not_used` size is able to be access in a "safe" way.
        // It is guaranteed by the user to not use those memory.
        let alloc_size = size - not_used + kv_used;
        let layout =
            unsafe { Layout::from_size_align_unchecked(alloc_size, mem::align_of::<Node>()) };
        let node_ptr = arena.alloc(layout).as_ptr() as *mut Node;
        unsafe {
            let node = &mut *node_ptr;
            node.height = height;
            // Zero only the used tower slots; slots above `height` were trimmed.
            ptr::write_bytes(node.tower.as_mut_ptr(), 0, height + 1);
            Self::init_key_value(node, key, value);

            node_ptr
        }
    }

    /// Fetch next node ptr in given height
    fn next_ptr(&self, height: usize) -> *mut Node {
        self.tower[height].load(Ordering::SeqCst)
    }

    /// Get key
    ///
    /// REQUIRE: This Node is created via `Node::alloc()`
    unsafe fn key(&self) -> &[u8] {
        let (key_block, key_size) = self.load_key_size();

        slice::from_raw_parts(key_block, key_size as usize)
    }

    /// Get value
    ///
    /// REQUIRE: This Node is created via `Node::alloc()`
    unsafe fn value(&self) -> &[u8] {
        let (key_block, key_size) = self.load_key_size();
        let (value_block, value_size) = self.load_value_size(key_block, key_size);

        slice::from_raw_parts(value_block, value_size as usize)
    }

    /// Set key and value parts of Node during creating Node
    ///
    /// Will copy the content of key and value to the Node
    ///
    /// REQUIRE: This Node is created via Arena and node.tower and node.height
    /// is already set to correct value
    ///
    /// Panic: The size of key/value must less than max value of
    /// KeySize/ValueSize (u16/u32), otherwise this function will panic
    unsafe fn init_key_value(node: &mut Node, key: &[u8], value: &[u8]) {
        // The key block starts right after the last used tower slot.
        let key_block = node.tower.as_mut_ptr().add(node.height + 1) as *mut u8;
        let key_size: KeySize = key.len().try_into().unwrap();
        let key_size_bytes = key_size.to_ne_bytes();

        ptr::copy_nonoverlapping(
            key_size_bytes.as_ptr(),
            key_block,
            mem::size_of::<KeySize>(),
        );
        let key_block = key_block.add(mem::size_of::<KeySize>());
        ptr::copy_nonoverlapping(key.as_ptr(), key_block, key.len());

        let value_block = key_block.add(key.len());
        let value_size: ValueSize = value.len().try_into().unwrap();
        let value_size_bytes = value_size.to_ne_bytes();

        ptr::copy_nonoverlapping(
            value_size_bytes.as_ptr(),
            value_block,
            mem::size_of::<ValueSize>(),
        );
        let value_block = value_block.add(mem::size_of::<ValueSize>());
        ptr::copy_nonoverlapping(value.as_ptr(), value_block, value.len());
    }

    /// Load key pointer and size of key
    ///
    /// REQUIRE: This Node is created via `Node::alloc()`
    unsafe fn load_key_size(&self) -> (*const u8, KeySize) {
        let tower = self.tower.as_ptr();
        // Move to key block
        let key_block = tower.add(self.height + 1) as *const u8;
        // Load key size from key block (native endian, same as written above)
        let key_size = u16::from_ne_bytes(*(key_block as *const [u8; mem::size_of::<KeySize>()]));
        // Move key block to the start of key
        let key_block = key_block.add(mem::size_of::<KeySize>());

        (key_block, key_size)
    }

    /// Load value pointer and size of value
    ///
    /// Given key_block and key_size returned from `load_key_size()`, loads
    /// value pointer and value size
    ///
    /// REQUIRE: This Node is created via `Node::alloc()`
    unsafe fn load_value_size(
        &self,
        key_block: *const u8,
        key_size: KeySize,
    ) -> (*const u8, ValueSize) {
        // Move to value block
        let value_block = key_block.add(key_size as usize);
        // Load value size from value block
        let value_size =
            u32::from_ne_bytes(*(value_block as *const [u8; mem::size_of::<ValueSize>()]));
        // Move value block to the start of value
        let value_block = value_block.add(mem::size_of::<ValueSize>());

        (value_block, value_size)
    }

    /// Get key with arena
    ///
    /// The returned slice keeps a clone of the arena alive, so it may outlive
    /// this node reference.
    ///
    /// REQUIRE: This Node is created via `Node::alloc()`
    unsafe fn key_with_arena<A>(&self, arena: A) -> ArenaSlice<A>
    where
        A: Arena<Stats = BasicStats>,
    {
        let (key_block, key_size) = self.load_key_size();

        ArenaSlice::from_raw_parts(arena, key_block, key_size as usize)
    }

    /// Get value with arena
    ///
    /// REQUIRE: This Node is created via `Node::alloc()`
    unsafe fn value_with_arena<A>(&self, arena: A) -> ArenaSlice<A>
    where
        A: Arena<Stats = BasicStats>,
    {
        let (key_block, key_size) = self.load_key_size();
        let (value_block, value_size) = self.load_value_size(key_block, key_size);

        ArenaSlice::from_raw_parts(arena, value_block, value_size as usize)
    }
}

/// Shared state of a skiplist: current height, sentinel head node and the
/// backing arena all nodes are allocated from.
struct SkiplistCore<A: Arena<Stats = BasicStats>> {
    height: AtomicUsize,
    head: NonNull<Node>,
    arena: A,
}

/// FIXME(yingwen): Modify the skiplist to support arena that supports growth,
/// otherwise it is hard to avoid memory usage not out of the arena capacity
#[derive(Clone)]
pub struct Skiplist<C, A: Arena<Stats = BasicStats> + Clone> {
    core: Arc<SkiplistCore<A>>,
    c: C,
}

impl<C, A: Arena<Stats = BasicStats> + Clone> Skiplist<C, A> {
    pub fn with_arena(c: C, arena: A) -> Skiplist<C, A> {
        // Head is a max-height sentinel with empty key/value; find functions
        // never return it.
        let head = Node::alloc(&arena, &[], &[], MAX_HEIGHT - 1);
        let head = unsafe { NonNull::new_unchecked(head) };
        Skiplist {
            core: Arc::new(SkiplistCore {
                height: AtomicUsize::new(0),
                head,
                arena,
            }),
            c,
        }
    }

    /// Draw a random tower height: level h is reached with probability
    /// ~(1/3)^h, capped at MAX_HEIGHT - 1.
    fn random_height(&self) -> usize {
        let mut rng = rand::thread_rng();
        for h in 0..(MAX_HEIGHT - 1) {
            if !rng.gen_ratio(HEIGHT_INCREASE, u32::MAX) {
                return h;
            }
        }
        MAX_HEIGHT - 1
    }

    fn height(&self) -> usize {
        self.core.height.load(Ordering::SeqCst)
    }
}

impl<C: KeyComparator, A: Arena<Stats = BasicStats> + Clone> Skiplist<C, A> {
    /// Finds the node near to key.
    ///
    /// If less=true, it finds rightmost node such that node.key < key (if
    /// allow_equal=false) or node.key <= key (if allow_equal=true).
    /// If less=false, it finds leftmost node such that node.key > key (if
    /// allowEqual=false) or node.key >= key (if allow_equal=true).
    /// Returns the node found.
    ///
    /// # Safety
    /// The returned raw pointer targets a node allocated from this list's
    /// arena; the caller must not use it after the list (and its arena) is
    /// dropped.
    unsafe fn find_near(&self, key: &[u8], less: bool, allow_equal: bool) -> *const Node {
        // Standard skiplist descent: move right while next.key < key,
        // otherwise drop one level, starting from the current top level.
        let mut cursor: *const Node = self.core.head.as_ptr();
        let mut level = self.height();
        loop {
            // Assume cursor.key < key
            let next_ptr = (&*cursor).next_ptr(level);
            if next_ptr.is_null() {
                // cursor.key < key < END OF LIST
                if level > 0 {
                    // Can descend further to iterate closer to the end
                    level -= 1;
                    continue;
                }
                // 1. Level=0. Cannot descend further. Let's return something that makes sense
                // 2. Try to return cursor. Make sure it is not a head node
                if !less || cursor == self.core.head.as_ptr() {
                    return ptr::null();
                }
                return cursor;
            }

            let next = &*next_ptr;
            let res = self.c.compare_key(key, next.key());
            if res == std::cmp::Ordering::Greater {
                // cursor.key < next.key < key. We can continue to move right
                cursor = next_ptr;
                continue;
            }
            if res == std::cmp::Ordering::Equal {
                // cursor.key < key == next.key
                if allow_equal {
                    return next;
                }
                if !less {
                    // We want >, so go to base level to grab the next bigger node
                    return next.next_ptr(0);
                }
                // We want <. If not base level, we should go closer in the next level.
                if level > 0 {
                    level -= 1;
                    continue;
                }
                // On base level. Return cursor
                if cursor == self.core.head.as_ptr() {
                    return ptr::null();
                }
                return cursor;
            }
            // cursor.key < key < next.key
            if level > 0 {
                level -= 1;
                continue;
            }
            // At base level. Need to return something
            if !less {
                return next;
            }
            // Try to return cursor. Make sure it is not a head node
            if cursor == self.core.head.as_ptr() {
                return ptr::null();
            }
            return cursor;
        }
    }

    /// Returns (out_before, out_after) with out_before.key <= key <=
    /// out_after.key
    ///
    /// The input `before` tells us where to start looking
    /// If we found a node with the same key, then we return out_before =
    /// out_after.
    /// Otherwise, out_before.key < key < out_after.key
    unsafe fn find_splice_for_level(
        &self,
        key: &[u8],
        mut before: *mut Node,
        level: usize,
    ) -> (*mut Node, *mut Node) {
        loop {
            // Assume before.key < key
            let next_ptr = (&*before).next_ptr(level);
            if next_ptr.is_null() {
                return (before, ptr::null_mut());
            }
            let next_node = &*next_ptr;
            match self.c.compare_key(key, next_node.key()) {
                // Equality case: both sides of the splice are the match
                std::cmp::Ordering::Equal => return (next_ptr, next_ptr),
                // before.key < key < next.key. We are done for this level
                std::cmp::Ordering::Less => return (before, next_ptr),
                // Keep moving right on this level
                _ => before = next_ptr,
            }
        }
    }

    /// Put the key-value into the skiplist if the key does not exists.
    ///
    /// The content of key and value will be copied into the list. Returns true
    /// if the node is inserted, otherwise return false (key is duplicated)
    ///
    /// Panic: The skiplist will panic if the allocated memory
    /// out of the capacity
    pub fn put(&self, key: &[u8], value: &[u8]) -> bool {
        let mut list_height = self.height();
        // prev[i]/next[i] bracket the insert position on level i.
        let mut prev = [ptr::null_mut(); MAX_HEIGHT + 1];
        let mut next = [ptr::null_mut(); MAX_HEIGHT + 1];
        prev[list_height + 1] = self.core.head.as_ptr();
        // Recompute splice levels
        for i in (0..=list_height).rev() {
            // Use higher level to speed up for current level
            let (p, n) = unsafe { self.find_splice_for_level(key, prev[i + 1], i) };
            prev[i] = p;
            next[i] = n;
            if p == n {
                // Key already exists
                return false;
            }
        }

        // Create a new node
        let height = self.random_height();
        let node_ptr = Node::alloc(&self.core.arena, key, value, height);

        // Try to increase skiplist height via CAS
        while height > list_height {
            match self.core.height.compare_exchange_weak(
                list_height,
                height,
                Ordering::SeqCst,
                Ordering::SeqCst,
            ) {
                // Successfully increased skiplist height
                Ok(_) => break,
                Err(h) => list_height = h,
            }
        }

        // We always insert from the base level and up. After you add a node in base
        // leve, we cannot create a node in the level above because it would
        // have discovered the node in the base level
        let x: &mut Node = unsafe { &mut *node_ptr };
        for i in 0..=height {
            loop {
                if prev[i].is_null() {
                    // This cannot happen in base level
                    assert!(i > 1);
                    // We haven't computed prev, next for this level because height exceeds old
                    // list_height. For these levels, we expect the lists to be
                    // sparse, so we can just search from head.
                    let (p, n) =
                        unsafe { self.find_splice_for_level(x.key(), self.core.head.as_ptr(), i) };
                    prev[i] = p;
                    next[i] = n;
                    // Someone adds the exact same key before we are able to do so. This can only
                    // happen on the base level. But we know we are not on the
                    // base level.
                    assert_ne!(p, n);
                }
                x.tower[i].store(next[i], Ordering::SeqCst);
                match unsafe { &*prev[i] }.tower[i].compare_exchange(
                    next[i],
                    node_ptr,
                    Ordering::SeqCst,
                    Ordering::SeqCst,
                ) {
                    // Managed to insert x between prev[i] and next[i]. Go to the next level.
                    Ok(_) => break,
                    Err(_) => {
                        // CAS failed. We need to recompute prev and next.
                        // It is unlikely to be helpful to try to use a different level as we redo
                        // the search, because it is unlikely that lots of
                        // nodes are inserted between prev[i] and next[i].
                        let (p, n) = unsafe { self.find_splice_for_level(x.key(), prev[i], i) };
                        if p == n {
                            // Lost the race to an identical key; only possible on level 0,
                            // where the node is not yet linked, so it is simply leaked
                            // into the arena.
                            assert_eq!(i, 0);
                            return false;
                        }
                        prev[i] = p;
                        next[i] = n;
                    }
                }
            }
        }
        true
    }

    /// Returns if the skiplist is empty
    pub fn is_empty(&self) -> bool {
        let node = self.core.head.as_ptr();
        let next_ptr = unsafe { (&*node).next_ptr(0) };
        next_ptr.is_null()
    }

    /// Returns len of the skiplist
    ///
    /// NOTE: O(n) — walks the entire base level; no cached counter.
    pub fn len(&self) -> usize {
        let mut node = self.core.head.as_ptr();
        let mut count = 0;
        loop {
            let next_ptr = unsafe { (&*node).next_ptr(0) };
            if !next_ptr.is_null() {
                count += 1;
                node = next_ptr;
                continue;
            }
            return count;
        }
    }

    /// Returns the last element. If head (empty list), we return null. All the
    /// find functions will NEVER return the head nodes
    fn find_last(&self) -> *const Node {
        let mut node = self.core.head.as_ptr();
        let mut level = self.height();
        loop {
            let next_ptr = unsafe { (&*node).next_ptr(level) };
            if !next_ptr.is_null() {
                node = next_ptr;
                continue;
            }
            // next is null
            if level == 0 {
                if node == self.core.head.as_ptr() {
                    return ptr::null();
                }
                return node;
            }
            level -= 1;
        }
    }

    /// Gets the value associated with the key. It returns a valid value if it
    /// finds equal or earlier version of the same key.
    pub fn get(&self, key: &[u8]) -> Option<&[u8]> {
        if let Some((_, value)) = self.get_with_key(key) {
            Some(value)
        } else {
            None
        }
    }

    /// Gets the key and value associated with the key. It returns a valid value
    /// if it finds equal or earlier version of the same key.
    pub fn get_with_key(&self, key: &[u8]) -> Option<(&[u8], &[u8])> {
        // Find greater or equal
        let node = unsafe { self.find_near(key, false, true) };
        if node.is_null() {
            return None;
        }
        // Accept the hit only when it is the same logical key (per comparator).
        if self.c.same_key(unsafe { (*node).key() }, key) {
            return Some(unsafe { ((*node).key(), (*node).value()) });
        }
        None
    }

    /// Returns a skiplist iterator that borrows this list
    pub fn iter_ref(&self) -> IterRef<&Skiplist<C, A>, C, A> {
        IterRef {
            list: self,
            cursor: ptr::null(),
            _key_cmp: std::marker::PhantomData,
            _arena: std::marker::PhantomData,
        }
    }

    /// Returns a skiplist iterator that owns a clone of the (Arc-backed) list
    pub fn iter(&self) -> IterRef<Skiplist<C, A>, C, A> {
        IterRef {
            list: self.clone(),
            cursor: ptr::null(),
            _key_cmp: std::marker::PhantomData,
            _arena: std::marker::PhantomData,
        }
    }

    /// Consider the total bytes allocated by the arena (not the bytes used).
    pub fn mem_size(&self) -> u32 {
        self.core.arena.stats().bytes_allocated() as u32
    }
}

impl<C, A: Arena<Stats = BasicStats> + Clone> AsRef<Skiplist<C, A>> for Skiplist<C, A> {
    fn as_ref(&self) -> &Skiplist<C, A> {
        self
    }
}

// SAFETY(review): nodes are never mutated after being linked and all tower
// links are atomics, so sharing across threads appears sound provided the
// arena itself is Send/Sync — confirm against the arena implementation.
unsafe impl<C: Send, A: Arena<Stats = BasicStats> + Clone + Send> Send for Skiplist<C, A> {}
unsafe impl<C: Sync, A: Arena<Stats = BasicStats> + Clone + Sync> Sync for Skiplist<C, A> {}

/// Cursor over a skiplist. `T` is either a borrowed or an owned list (see
/// `iter_ref`/`iter`); a null `cursor` means the iterator is invalid.
pub struct IterRef<T, C, A>
where
    T: AsRef<Skiplist<C, A>>,
    A: Arena<Stats = BasicStats> + Clone,
{
    list: T,
    cursor: *const Node,
    _key_cmp: std::marker::PhantomData<C>,
    _arena: std::marker::PhantomData<A>,
}

impl<T: AsRef<Skiplist<C, A>>, C: KeyComparator, A: Arena<Stats = BasicStats> + Clone>
    IterRef<T, C, A>
{
    /// Whether the cursor currently points at a node.
    pub fn valid(&self) -> bool {
        !self.cursor.is_null()
    }

    /// Key of the current node. Panics when the iterator is invalid.
    pub fn key(&self) -> &[u8] {
        assert!(self.valid());
        unsafe { (*self.cursor).key() }
    }

    /// Value of the current node. Panics when the iterator is invalid.
    pub fn value(&self) -> &[u8] {
        assert!(self.valid());
        unsafe { (*self.cursor).value() }
    }

    /// Advance to the next node on the base level.
    pub fn next(&mut self) {
        assert!(self.valid());
        unsafe {
            self.cursor = (&*self.cursor).next_ptr(0);
        }
    }

    /// Step back to the rightmost node with key < current key.
    pub fn prev(&mut self) {
        assert!(self.valid());
        unsafe {
            self.cursor = self.list.as_ref().find_near(self.key(), true, false);
        }
    }

    /// Position at the leftmost node with key >= target.
    pub fn seek(&mut self, target: &[u8]) {
        unsafe {
            self.cursor = self.list.as_ref().find_near(target, false, true);
        }
    }

    /// Position at the rightmost node with key <= target.
    pub fn seek_for_prev(&mut self, target: &[u8]) {
        unsafe {
            self.cursor = self.list.as_ref().find_near(target, true, true);
        }
    }

    pub fn seek_to_first(&mut self) {
        unsafe {
            self.cursor = (&*self.list.as_ref().core.head.as_ptr()).next_ptr(0);
        }
    }

    pub fn seek_to_last(&mut self) {
        self.cursor = self.list.as_ref().find_last();
    }

    /// Key of the current node as an [ArenaSlice] that keeps the arena alive.
    pub fn key_with_arena(&self) -> ArenaSlice<A> {
        assert!(self.valid());
        unsafe { (*self.cursor).key_with_arena(self.list.as_ref().core.arena.clone()) }
    }

    /// Value of the current node as an [ArenaSlice] that keeps the arena alive.
    pub fn value_with_arena(&self) -> ArenaSlice<A> {
        assert!(self.valid());
        unsafe { (*self.cursor).value_with_arena(self.list.as_ref().core.arena.clone()) }
    }
}

unsafe impl<T: AsRef<Skiplist<C, A>>, C: Send, A: Arena<Stats = BasicStats> + Clone + Send> Send
    for IterRef<T, C, A>
{
}
unsafe impl<T: AsRef<Skiplist<C, A>>, C: Sync, A: Arena<Stats = BasicStats> + Clone + Sync> Sync
    for IterRef<T, C, A>
{
}

#[cfg(test)]
mod tests {
    use arena::MonoIncArena;
    use bytes::Bytes;

    use super::*;
    use crate::FixedLengthSuffixComparator;

    #[test]
    fn test_node_alloc() {
        let arena = MonoIncArena::new(1 << 10);
        let key = b"key of node";
        let value = b"value of node";
        let node_ptr = Node::alloc(&arena, key, value, 5);
        unsafe {
            let node = &*node_ptr;
            assert_eq!(5, node.height);
            for i in 0..=node.height {
                assert!(node.tower[i].load(Ordering::SeqCst).is_null());
            }
            assert_eq!(key, node.key());
            assert_eq!(value, node.value());
        }
    }

    #[test]
    fn test_find_near() {
        let comp = FixedLengthSuffixComparator::new(8);
        let arena = MonoIncArena::new(1 << 10);
        let list = Skiplist::with_arena(comp, arena);
        for i in 0..1000 {
            let key = Bytes::from(format!("{:05}{:08}", i * 10 + 5, 0));
            let value = Bytes::from(format!("{:05}", i));
            list.put(&key, &value);
        }
        // (seek key, less, allow_equal, expected key prefix)
        let mut cases = vec![
            ("00001", false, false, Some("00005")),
            ("00001", false, true, Some("00005")),
            ("00001", true, false, None),
            ("00001", true, true, None),
            ("00005", false, false, Some("00015")),
            ("00005", false, true, Some("00005")),
            ("00005", true, false, None),
            ("00005", true, true, Some("00005")),
            ("05555", false, false, Some("05565")),
            ("05555", false, true, Some("05555")),
            ("05555", true, false, Some("05545")),
            ("05555", true, true, Some("05555")),
            ("05558", false, false, Some("05565")),
            ("05558", false, true, Some("05565")),
            ("05558", true, false, Some("05555")),
            ("05558", true, true, Some("05555")),
            ("09995", false, false, None),
            ("09995", false, true, Some("09995")),
            ("09995", true, false, Some("09985")),
            ("09995", true, true, Some("09995")),
            ("59995", false, false, None),
            ("59995", false, true, None),
            ("59995", true, false, Some("09995")),
            ("59995", true, true, Some("09995")),
        ];
        for (i, (key, less, allow_equal, exp)) in cases.drain(..).enumerate() {
            let seek_key = Bytes::from(format!("{}{:08}", key, 0));
            let res = unsafe { list.find_near(&seek_key, less, allow_equal) };
            if exp.is_none() {
                assert!(res.is_null(), "{}", i);
                continue;
            }
            let e = format!("{}{:08}", exp.unwrap(), 0);
            assert_eq!(unsafe { (*res).key() }, e.as_bytes(), "{}", i);
        }
    }
}

// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0.

//! Slice with arena

use std::{fmt, ops::Deref, slice};

use arena::{Arena, BasicStats};

/// Arena slice
///
/// A slice allocated from the arena, it will holds the reference to the arena
/// so it is safe to clone and deref the slice
#[derive(Clone)]
pub struct ArenaSlice<A: Arena<Stats = BasicStats>> {
    /// Arena the slice memory allocated from.
    _arena: A,
    /// The slice pointer.
    slice_ptr: *const u8,
    /// The slice len.
    slice_len: usize,
}

impl<A: Arena<Stats = BasicStats>> ArenaSlice<A> {
    /// Create a [ArenaSlice]
    ///
    /// See the documentation of [`slice::from_raw_parts`] for slice safety
    /// requirements.
    pub(crate) unsafe fn from_raw_parts(_arena: A, slice_ptr: *const u8, slice_len: usize) -> Self {
        Self {
            _arena,
            slice_ptr,
            slice_len,
        }
    }
}

// SAFETY(review): the pointed-to bytes are kept alive by `_arena`, which the
// slice holds, so Send/Sync follow the arena's own Send/Sync bounds — confirm
// the arena never frees individual allocations.
unsafe impl<A: Arena<Stats = BasicStats> + Send> Send for ArenaSlice<A> {}
unsafe impl<A: Arena<Stats = BasicStats> + Sync> Sync for ArenaSlice<A> {}

impl<A: Arena<Stats = BasicStats>> Deref for ArenaSlice<A> {
    type Target = [u8];

    fn deref(&self) -> &[u8] {
        unsafe { slice::from_raw_parts(self.slice_ptr, self.slice_len) }
    }
}

impl<A: Arena<Stats = BasicStats>> fmt::Debug for ArenaSlice<A> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Debug-print as a list of bytes, like `[u8]` would.
        f.debug_list().entries(self.iter()).finish()
    }
}

#[cfg(test)]
mod tests {
    use std::{alloc::Layout, mem, ptr};

    use arena::MonoIncArena;

    use super::*;

    #[test]
    fn test_arena_slice() {
        let hello = b"hello";
        let arena = MonoIncArena::new(1 << 10);
        let slice = unsafe {
            let data_ptr = arena
                .alloc(Layout::from_size_align(hello.len(), mem::align_of_val(hello)).unwrap());
            ptr::copy_nonoverlapping(hello.as_ptr(), data_ptr.as_ptr(), hello.len());
            ArenaSlice::from_raw_parts(arena, data_ptr.as_ptr(), hello.len())
        };
        assert_eq!(hello, &slice[..]);
    }
}

// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0.
+ +use std::{ + str, + sync::{atomic::*, *}, + thread::yield_now, + time::Duration, +}; + +use arena::MonoIncArena; +use bytes::*; +use skiplist::*; +use yatp::task::callback::Handle; + +fn new_value(v: usize) -> Bytes { + Bytes::from(format!("{:05}", v)) +} + +fn key_with_ts(key: &str, ts: u64) -> Bytes { + Bytes::from(format!("{}{:08}", key, ts)) +} + +#[test] +fn test_empty() { + let key = key_with_ts("aaa", 0); + let comp = FixedLengthSuffixComparator::new(8); + let arena = MonoIncArena::new(1 << 10); + let list = Skiplist::with_arena(comp, arena); + let v = list.get(&key); + assert!(v.is_none()); + + let mut iter = list.iter_ref(); + assert!(!iter.valid()); + iter.seek_to_first(); + assert!(!iter.valid()); + iter.seek_to_last(); + assert!(!iter.valid()); + iter.seek(&key); + assert!(!iter.valid()); + assert!(list.is_empty()); +} + +#[test] +fn test_basic() { + let comp = FixedLengthSuffixComparator::new(8); + let arena = MonoIncArena::new(1 << 10); + let list = Skiplist::with_arena(comp, arena); + let table = vec![ + ("key1", new_value(42)), + ("key2", new_value(52)), + ("key3", new_value(62)), + ("key5", Bytes::from(format!("{:0102400}", 1))), + ("key4", new_value(72)), + ]; + + for (key, value) in &table { + list.put(&key_with_ts(*key, 0), value); + } + + assert_eq!(list.get(&key_with_ts("key", 0)), None); + assert_eq!(list.len(), 5); + assert!(!list.is_empty()); + for (key, value) in &table { + let get_key = key_with_ts(*key, 0); + assert_eq!(list.get(&get_key), Some(&value[..]), "{}", key); + } +} + +fn test_concurrent_basic(n: usize, value_len: usize) { + let pool = yatp::Builder::new("concurrent_basic").build_callback_pool(); + let comp = FixedLengthSuffixComparator::new(8); + let arena = MonoIncArena::new(1 << 10); + let list = Skiplist::with_arena(comp, arena); + let kvs: Vec<_> = (0..n) + .map(|i| { + ( + key_with_ts(format!("{:05}", i).as_str(), 0), + Bytes::from(format!("{1:00$}", value_len, i)), + ) + }) + .collect(); + let (tx, rx) = 
mpsc::channel(); + for (k, v) in kvs.clone() { + let tx = tx.clone(); + let list = list.clone(); + pool.spawn(move |_: &mut Handle<'_>| { + list.put(&k, &v); + tx.send(()).unwrap(); + }) + } + for _ in 0..n { + rx.recv_timeout(Duration::from_secs(3)).unwrap(); + } + for (k, v) in kvs { + let tx = tx.clone(); + let list = list.clone(); + pool.spawn(move |_: &mut Handle<'_>| { + let val = list.get(&k); + assert_eq!(val, Some(&v[..]), "{:?}", k); + tx.send(()).unwrap(); + }); + } + for _ in 0..n { + rx.recv_timeout(Duration::from_secs(3)).unwrap(); + } + assert_eq!(list.len(), n); +} + +#[test] +fn test_concurrent_basic_small_value() { + test_concurrent_basic(1000, 5); +} + +#[test] +fn test_concurrent_basic_big_value() { + test_concurrent_basic(100, 1048576); +} + +#[test] +fn test_one_key() { + let n = 10000; + let write_pool = yatp::Builder::new("one_key_write").build_callback_pool(); + let read_pool = yatp::Builder::new("one_key_read").build_callback_pool(); + let comp = FixedLengthSuffixComparator::new(8); + let arena = MonoIncArena::new(1 << 10); + let list = Skiplist::with_arena(comp, arena); + let key = key_with_ts("thekey", 0); + let (tx, rx) = mpsc::channel(); + list.put(&key, &new_value(0)); + for i in 0..n { + let tx = tx.clone(); + let list = list.clone(); + let key = key.clone(); + let value = new_value(i); + write_pool.spawn(move |_: &mut Handle<'_>| { + list.put(&key, &value); + tx.send("w").unwrap(); + yield_now(); + }) + } + let mark = Arc::new(AtomicBool::new(false)); + for _ in 0..n { + let tx = tx.clone(); + let list = list.clone(); + let mark = mark.clone(); + let key = key.clone(); + read_pool.spawn(move |_: &mut Handle<'_>| { + let val = list.get(&key); + if val.is_none() { + return; + } + let s = unsafe { str::from_utf8_unchecked(val.unwrap()) }; + let val: usize = s.parse().unwrap(); + assert!(val < n); + mark.store(true, Ordering::SeqCst); + tx.send("r").unwrap(); + yield_now(); + }); + } + let mut r = 0; + let mut w = 0; + for _ in 0..(n * 
2) { + match rx.recv_timeout(Duration::from_secs(3)) { + Ok("w") => w += 1, + Ok("r") => r += 1, + Err(err) => panic!("timeout on receiving r{} w{} msg {:?}", r, w, err), + _ => panic!("unexpected value"), + } + } + assert_eq!(list.len(), 1); + assert!(mark.load(Ordering::SeqCst)); +} + +#[test] +fn test_iterator_next() { + let n = 100; + let comp = FixedLengthSuffixComparator::new(8); + let arena = MonoIncArena::new(1 << 10); + let list = Skiplist::with_arena(comp, arena); + let mut iter_ref = list.iter_ref(); + assert!(!iter_ref.valid()); + iter_ref.seek_to_first(); + assert!(!iter_ref.valid()); + for i in (0..n).rev() { + let key = key_with_ts(format!("{:05}", i).as_str(), 0); + list.put(&key, &new_value(i)); + } + iter_ref.seek_to_first(); + for i in 0..n { + assert!(iter_ref.valid()); + let v = iter_ref.value(); + assert_eq!(v, &new_value(i)); + iter_ref.next(); + } + assert!(!iter_ref.valid()); +} + +#[test] +fn test_iterator_prev() { + let n = 100; + let comp = FixedLengthSuffixComparator::new(8); + let arena = MonoIncArena::new(1 << 10); + let list = Skiplist::with_arena(comp, arena); + let mut iter_ref = list.iter_ref(); + assert!(!iter_ref.valid()); + iter_ref.seek_to_last(); + assert!(!iter_ref.valid()); + for i in (0..n).rev() { + let key = key_with_ts(format!("{:05}", i).as_str(), 0); + list.put(&key, &new_value(i)); + } + iter_ref.seek_to_last(); + for i in (0..n).rev() { + assert!(iter_ref.valid()); + let v = iter_ref.value(); + assert_eq!(v, &new_value(i)); + iter_ref.prev(); + } + assert!(!iter_ref.valid()); +} + +#[test] +fn test_iterator_seek() { + let n = 100; + let comp = FixedLengthSuffixComparator::new(8); + let arena = MonoIncArena::new(1 << 10); + let list = Skiplist::with_arena(comp, arena); + let mut iter_ref = list.iter_ref(); + assert!(!iter_ref.valid()); + iter_ref.seek_to_first(); + assert!(!iter_ref.valid()); + for i in (0..n).rev() { + let v = i * 10 + 1000; + let key = key_with_ts(format!("{:05}", v).as_str(), 0); + list.put(&key, 
&new_value(v)); + } + iter_ref.seek_to_first(); + assert!(iter_ref.valid()); + assert_eq!(iter_ref.value(), b"01000" as &[u8]); + + let cases = vec![ + ("00000", Some(b"01000"), None), + ("01000", Some(b"01000"), Some(b"01000")), + ("01005", Some(b"01010"), Some(b"01000")), + ("01010", Some(b"01010"), Some(b"01010")), + ("99999", None, Some(b"01990")), + ]; + for (key, seek_expect, for_prev_expect) in cases { + let key = key_with_ts(key, 0); + iter_ref.seek(&key); + assert_eq!(iter_ref.valid(), seek_expect.is_some()); + if let Some(v) = seek_expect { + assert_eq!(iter_ref.value(), &v[..]); + } + iter_ref.seek_for_prev(&key); + assert_eq!(iter_ref.valid(), for_prev_expect.is_some()); + if let Some(v) = for_prev_expect { + assert_eq!(iter_ref.value(), &v[..]); + } + } +} diff --git a/components/tracing/Cargo.toml b/components/tracing/Cargo.toml new file mode 100644 index 0000000000..dc493f02cc --- /dev/null +++ b/components/tracing/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "tracing" +version = "0.1.0" +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +upstream = { version = "0.1.26", package = "tracing" } diff --git a/components/tracing/src/lib.rs b/components/tracing/src/lib.rs new file mode 100644 index 0000000000..5cdff967b6 --- /dev/null +++ b/components/tracing/src/lib.rs @@ -0,0 +1,5 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Tracing core lib re-export. 
+ +pub use upstream::*; diff --git a/components/tracing_examples/Cargo.toml b/components/tracing_examples/Cargo.toml new file mode 100644 index 0000000000..b8bea30722 --- /dev/null +++ b/components/tracing_examples/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "trace_examples" +version = "0.1.0" +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +tracing = { path = "../tracing" } +tracing_util = { path = "../tracing_util" } diff --git a/components/tracing_examples/examples/init_tracing_with_file.rs b/components/tracing_examples/examples/init_tracing_with_file.rs new file mode 100644 index 0000000000..75f89f6dca --- /dev/null +++ b/components/tracing_examples/examples/init_tracing_with_file.rs @@ -0,0 +1,41 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use tracing_util::{init_tracing_with_file, tracing_appender::rolling::Rotation}; + +#[tracing::instrument(level = "debug")] +fn nth_fibonacci(n: u64) -> u64 { + if n == 0 || n == 1 { + 1 + } else { + nth_fibonacci(n - 1) + nth_fibonacci(n - 2) + } +} + +// default leve info +#[tracing::instrument] +fn fibonacci_seq(to: u64) -> Vec { + let mut sequence = vec![]; + + for n in 0..=to { + sequence.push(nth_fibonacci(n)); + } + + sequence +} + +// cargo run --example init_tracing_with_file +// log file: /tmp/test_logs/init_tracing_with_file +// 2021-09-28T22:41:30.362078+08:00 INFO main ThreadId(01) fibonacci_seq{to=5}: +// init_tracing_with_file: enter 2021-09-28T22:41:30.364181+08:00 INFO main +// ThreadId(01) fibonacci_seq{to=5}: init_tracing_with_file: close +// time.busy=2.13ms time.idle=34.8µs +fn main() { + let _g = init_tracing_with_file( + "init_tracing_with_file", + "/tmp/test_logs", + "info", + Rotation::NEVER, + ); + let ret = fibonacci_seq(5); + println!("{:?}", ret); +} diff --git a/components/tracing_util/Cargo.toml b/components/tracing_util/Cargo.toml new file mode 100644 index 
0000000000..15eb11520a --- /dev/null +++ b/components/tracing_util/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "tracing_util" +version = "0.1.0" +authors = ["Databend Authors "] +license = "Apache-2.0" +publish = false +edition = "2018" + +[dependencies] # In alphabetical order +lazy_static = "1.4.0" +tracing = "0.1.26" +tracing-appender = "0.1.2" +tracing-subscriber = "0.2.20" diff --git a/components/tracing_util/src/lib.rs b/components/tracing_util/src/lib.rs new file mode 100644 index 0000000000..69c7432fd4 --- /dev/null +++ b/components/tracing_util/src/lib.rs @@ -0,0 +1,22 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +// Copyright 2020 Datafuse Labs. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// fork from:https://github.com/datafuselabs/databend/tree/master/common/tracing + +mod logging; + +pub use logging::{init_default_tracing, init_default_ut_tracing, init_tracing_with_file}; +pub use tracing_appender; diff --git a/components/tracing_util/src/logging.rs b/components/tracing_util/src/logging.rs new file mode 100644 index 0000000000..7a314608f5 --- /dev/null +++ b/components/tracing_util/src/logging.rs @@ -0,0 +1,147 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +// Copyright 2020 Datafuse Labs. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::{ + fs::OpenOptions, + path::Path, + sync::{Arc, Mutex, Once}, +}; + +use fmt::format::FmtSpan; +use lazy_static::lazy_static; +use tracing::Subscriber; +use tracing_appender::{ + non_blocking::WorkerGuard, + rolling::{RollingFileAppender, Rotation}, +}; +use tracing_subscriber::{ + fmt, + fmt::{time::ChronoLocal, Layer}, + prelude::*, + registry::Registry, + EnvFilter, +}; + +/// Write logs to stdout. +pub fn init_default_tracing() { + static START: Once = Once::new(); + + START.call_once(|| { + init_tracing_stdout(); + }); +} + +/// Init tracing for unittest. +/// Write logs to file `unittest`. +pub fn init_default_ut_tracing() { + static START: Once = Once::new(); + + START.call_once(|| { + let mut g = GLOBAL_UT_LOG_GUARD.as_ref().lock().unwrap(); + let (work_guard, sub) = init_file_subscriber("unittest", "_logs"); + tracing::subscriber::set_global_default(sub) + .expect("error setting global tracing subscriber"); + + tracing::info!("init default ut tracing"); + *g = Some(work_guard); + }); +} + +lazy_static! { + static ref GLOBAL_UT_LOG_GUARD: Arc>> = Arc::new(Mutex::new(None)); +} + +fn init_tracing_stdout() { + let fmt_layer = Layer::default() + .with_thread_ids(true) + .with_thread_names(false) + .with_ansi(false) + .with_span_events(fmt::format::FmtSpan::FULL); + + let subscriber = Registry::default() + .with(EnvFilter::from_default_env()) + .with(fmt_layer); + + tracing::subscriber::set_global_default(subscriber) + .expect("error setting global tracing subscriber"); +} + +/// Write logs to file and rotation. 
+pub fn init_tracing_with_file( + app_name: &str, + dir: impl AsRef, + level: &str, + rotation: Rotation, +) -> WorkerGuard { + let file_appender = RollingFileAppender::new(rotation, dir, app_name); + let (file_writer, file_guard) = tracing_appender::non_blocking(file_appender); + let f_layer = Layer::new() + .with_timer(ChronoLocal::rfc3339()) + .with_writer(file_writer) + .with_thread_ids(true) + .with_thread_names(true) + .with_ansi(false) + .with_span_events(FmtSpan::ENTER | FmtSpan::CLOSE); + + let subscriber = Registry::default() + .with(EnvFilter::new(level)) + .with(f_layer); + + tracing::subscriber::set_global_default(subscriber) + .expect("error setting global tracing subscriber"); + + file_guard +} + +/// Create a file based tracing/logging subscriber. +/// A guard must be held during using the logging. +fn init_file_subscriber(app_name: &str, dir: &str) -> (WorkerGuard, impl Subscriber) { + let path_str = dir.to_string() + "/" + app_name; + let path: &Path = path_str.as_ref(); + + // open log file + + let mut open_options = OpenOptions::new(); + open_options.append(true).create(true); + + let mut open_res = open_options.open(path); + if open_res.is_err() { + if let Some(parent) = path.parent() { + std::fs::create_dir_all(parent).unwrap(); + open_res = open_options.open(path); + } + } + + let f = open_res.unwrap(); + + // build subscriber + + let (writer, writer_guard) = tracing_appender::non_blocking(f); + + let f_layer = Layer::new() + .with_timer(ChronoLocal::rfc3339()) + .with_writer(writer) + .with_thread_ids(true) + .with_thread_names(false) + .with_ansi(false) + .with_span_events(FmtSpan::ENTER | FmtSpan::CLOSE); + + let subscriber = Registry::default() + .with(EnvFilter::from_default_env()) + .with(f_layer); + + (writer_guard, subscriber) +} diff --git a/configs/ceresdb.toml b/configs/ceresdb.toml new file mode 100644 index 0000000000..7b1a216dfc --- /dev/null +++ b/configs/ceresdb.toml @@ -0,0 +1,23 @@ +bind_addr = "0.0.0.0" +http_port = 
${HTTP_PORT} +grpc_port = ${GRPC_PORT} +log_level = "info" + +[analytic] +data_path = "${DATA_PATH}" + +[[meta_client.cluster_view.schema_shards]] +schema = 'public' +auto_create_tables = true + +[[meta_client.cluster_view.schema_shards.shard_views]] +shard_id = 0 + +[meta_client.cluster_view.schema_shards.shard_views.node] +addr = "${NODE_ADDR}" +port = ${GRPC_PORT} + +[[route_rules.prefix_rules]] +schema = 'public' +prefix = 'special_prefix' +shard = 0 diff --git a/docker/entrypoint.py b/docker/entrypoint.py new file mode 100755 index 0000000000..35b3e12cdf --- /dev/null +++ b/docker/entrypoint.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python2 +import os +import sys +import commands + +ENABLE_DATA_NODE = os.getenv('ENABLE_DATANODE') == 'true' +HTTP_PORT = os.getenv('CERESDB_HTTP_PORT', '5000') +GRPC_PORT = os.getenv('CERESDB_GRPC_PORT', '8831') +DATA_PATH = '/home/admin/data/ceresdb' + +# hostname may return several IPs (an array); take the first one +def get_local_ip(): + return commands.getoutput('/usr/bin/localip').strip().split()[0] + +def create_datanode_config(): + config = open('/etc/ceresdb/ceresdb.toml', 'r').read() + config = config.replace("${HTTP_PORT}", HTTP_PORT) + config = config.replace("${GRPC_PORT}", GRPC_PORT) + config = config.replace("${NODE_ADDR}", get_local_ip()) + config = config.replace("${DATA_PATH}", DATA_PATH) + open('/etc/ceresdb/ceresdb.toml', 'w').write(config) + +def start_datanode(): + create_datanode_config() + + cmd = ''' +# load env +. 
/ceresdb.env +env +exec /usr/bin/ceresdb-server --config /etc/ceresdb/ceresdb.toml +''' + open('/usr/bin/ceresdb-start.sh', 'w').write(cmd) + +def start_supervisord(): + port = int(os.getenv('SUPERVISORD_HTTP_PORT', '9001')) + conf = '/etc/supervisor/supervisord.conf' + if port: + os.system(''' sed -i 's/:9001/:%d/g' %s ''' % (port, conf)) + open('/etc/supervisor/conf.d/touch-admin-cron.conf', 'a').write('\nkillasgroup=true\nstopasgroup=true\n') + os.system('/usr/bin/supervisord -c %s --nodaemon' % conf) + +def copy_environ(): + envs = [] + for k, v in os.environ.items(): + envs.append('export %s="%s"' % (k, v)) + # copy DATANODE_ to CSE_ + if 'DATANODE_' in k: + envs.append('export %s="%s"' % (k.replace('DATANODE_', 'CSE_'), v)) + + envs.append('export LOCAL_IP=%s' % get_local_ip()) + # support register ceres meta + envs.append('export CSE_CERES_META_NODE_ADDR=%s' % (get_local_ip())) + + envs.append('export MALLOC_CONF=prof:true,prof_active:false,lg_prof_sample:19') + + open('/ceresdb.env', 'w').write('\n'.join(envs)) + +def init_dir(): + cmd = ''' +mkdir -p /home/admin/logs /home/admin/data + +# set logdir +mkdir -p /home/admin/logs/ceresdb + +ln -nsf /data /home/admin/data + +chmod +777 -R /data /home/admin/data /home/admin/logs +chown -R admin.admin /data /home/admin/data /home/admin/logs +''' + open('/ceresdb-init.sh', 'w').write(cmd) + os.system('sh /ceresdb-init.sh') + +def main(): + print "copy_environ" + copy_environ() + + print "init_dir" + init_dir() + + if ENABLE_DATA_NODE: + print "start_datanode" + start_datanode() + + print "start_datanode" + start_supervisord() + +if __name__ == '__main__': + main() diff --git a/docker/supervisor/conf.d/ceresdb.conf b/docker/supervisor/conf.d/ceresdb.conf new file mode 100644 index 0000000000..3b956c3118 --- /dev/null +++ b/docker/supervisor/conf.d/ceresdb.conf @@ -0,0 +1,17 @@ +[program:ceresdbx] +command=sh /usr/bin/ceresdb-start.sh +autostart=true +startsecs=3 +startretries=9999 +autorestart=true +;exitcodes=0,2 
+;stopsignal=QUIT +;stopwaitsecs=10 +stopasgroup=true +killasgroup=true +user=admin +redirect_stderr=true +stdout_logfile=/home/admin/logs/ceresdb/out.log +stdout_logfile_maxbytes=200MB +stdout_logfile_backups=5 +;environment=A="1",B="2" diff --git a/docker/supervisor/supervisord.conf b/docker/supervisor/supervisord.conf new file mode 100644 index 0000000000..401fb2e363 --- /dev/null +++ b/docker/supervisor/supervisord.conf @@ -0,0 +1,24 @@ +[unix_http_server] +file=/tmp/supervisor.sock ; (the path to the socket file) + +[inet_http_server] ; inet (TCP) server disabled by default +port=127.0.0.1:9001 ; (ip_address:port specifier, *:port for all iface) + +[supervisord] +logfile=/tmp/supervisord.log ; (main log file;default $CWD/supervisord.log) +logfile_maxbytes=500MB ; (max main logfile bytes b4 rotation;default 50MB) +logfile_backups=10 ; (num of main logfile rotation backups;default 10) +loglevel=info ; (log level;default info; others: debug,warn,trace) +pidfile=/tmp/supervisord.pid ; (supervisord pidfile;default supervisord.pid) +nodaemon=false ; (start in foreground if true;default false) +minfds=1024 ; (min. avail startup file descriptors;default 1024) +minprocs=200 ; (min. avail process descriptors;default 200) + +[rpcinterface:supervisor] +supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface + +[supervisorctl] +serverurl=http://127.0.0.1:9001 + +[include] +files = /etc/supervisor/conf.d/*.conf diff --git a/docker/tini b/docker/tini new file mode 100644 index 0000000000..03af82f09e Binary files /dev/null and b/docker/tini differ diff --git a/docs/crate-deps.dot b/docs/crate-deps.dot new file mode 100644 index 0000000000..95622dc25d --- /dev/null +++ b/docs/crate-deps.dot @@ -0,0 +1,93 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// to update svg, run: +// ```bash +// dot -Tsvg crate-deps.dot > ./crate-deps.svg +// ``` + +digraph G { + + arrow_deps + + analytic_engine -> arrow_deps + analytic_engine -> proto + analytic_engine -> table_engine + analytic_engine -> wal + + catalog -> table_engine + + catalog_impls -> catalog + catalog_impls -> system_catalog + catalog_impls -> table_engine + + cluster -> analytic_engine + cluster -> catalog + cluster -> meta_client_v2 + + interpreters -> catalog + interpreters -> sql + interpreters -> table_engine + interpreters -> udf + interpreters -> query_engine + interpreters -> arrow_deps + + meta_client -> catalog + meta_client -> table_engine + + meta_client_v2 -> catalog + meta_client_v2 -> table_engine + + query_engine -> arrow_deps + query_engine -> sql + query_engine -> table_engine + query_engine -> udf + + server -> analytic_engine + server -> arrow_deps + server -> catalog + server -> interpreters + server -> meta_client + server -> query_engine + server -> sql + server -> system_catalog + server -> table_engine + server -> udf + + sql -> arrow_deps + sql -> catalog + sql -> table_engine + sql -> udf + + system_catalog -> arrow_deps + system_catalog -> catalog + system_catalog -> proto + system_catalog -> table_engine + + table_engine -> arrow_deps + table_engine -> proto + + udf -> arrow_deps + + ceresdb -> analytic_engine + ceresdb -> catalog + ceresdb -> 
catalog_impls + ceresdb -> query_engine + ceresdb -> server + ceresdb -> table_engine + ceresdb -> udf +} diff --git a/docs/crate-deps.svg b/docs/crate-deps.svg new file mode 100644 index 0000000000..a52863ea12 --- /dev/null +++ b/docs/crate-deps.svg @@ -0,0 +1,433 @@ + + + + + + +G + + + +arrow_deps + +arrow_deps + + + +analytic_engine + +analytic_engine + + + +analytic_engine->arrow_deps + + + + + +proto + +proto + + + +analytic_engine->proto + + + + + +table_engine + +table_engine + + + +analytic_engine->table_engine + + + + + +wal + +wal + + + +analytic_engine->wal + + + + + +table_engine->arrow_deps + + + + + +table_engine->proto + + + + + +catalog + +catalog + + + +catalog->table_engine + + + + + +catalog_impls + +catalog_impls + + + +catalog_impls->table_engine + + + + + +catalog_impls->catalog + + + + + +system_catalog + +system_catalog + + + +catalog_impls->system_catalog + + + + + +system_catalog->arrow_deps + + + + + +system_catalog->proto + + + + + +system_catalog->table_engine + + + + + +system_catalog->catalog + + + + + +cluster + +cluster + + + +cluster->analytic_engine + + + + + +cluster->catalog + + + + + +meta_client_v2 + +meta_client_v2 + + + +cluster->meta_client_v2 + + + + + +meta_client_v2->table_engine + + + + + +meta_client_v2->catalog + + + + + +interpreters + +interpreters + + + +interpreters->arrow_deps + + + + + +interpreters->table_engine + + + + + +interpreters->catalog + + + + + +sql + +sql + + + +interpreters->sql + + + + + +udf + +udf + + + +interpreters->udf + + + + + +query_engine + +query_engine + + + +interpreters->query_engine + + + + + +sql->arrow_deps + + + + + +sql->table_engine + + + + + +sql->catalog + + + + + +sql->udf + + + + + +udf->arrow_deps + + + + + +query_engine->arrow_deps + + + + + +query_engine->table_engine + + + + + +query_engine->sql + + + + + +query_engine->udf + + + + + +meta_client + +meta_client + + + +meta_client->table_engine + + + + + +meta_client->catalog + + + + + +server + +server + + + 
+server->arrow_deps + + + + + +server->analytic_engine + + + + + +server->table_engine + + + + + +server->catalog + + + + + +server->system_catalog + + + + + +server->interpreters + + + + + +server->sql + + + + + +server->udf + + + + + +server->query_engine + + + + + +server->meta_client + + + + + +ceresdb + +ceresdb + + + +ceresdb->analytic_engine + + + + + +ceresdb->table_engine + + + + + +ceresdb->catalog + + + + + +ceresdb->catalog_impls + + + + + +ceresdb->udf + + + + + +ceresdb->query_engine + + + + + +ceresdb->server + + + + + diff --git a/docs/example.toml b/docs/example.toml new file mode 100644 index 0000000000..2e0fdc5064 --- /dev/null +++ b/docs/example.toml @@ -0,0 +1,20 @@ +bind_addr = "0.0.0.0" +http_port = 5440 +grpc_port = 8831 +log_level = "info" +enable_cluster = true + +[analytic] +data_path = "/tmp/ceresdbx" +sst_data_cache_cap = 10000 +sst_meta_cache_cap = 10000 + +[[meta_client.cluster_view.schema_shards]] +schema = 'public' + +[[meta_client.cluster_view.schema_shards.shard_views]] +shard_id = 0 + +[meta_client.cluster_view.schema_shards.shard_views.node] +addr = "127.0.0.1" +port = 8831 diff --git a/etc/license.template b/etc/license.template new file mode 100644 index 0000000000..377ec98bed --- /dev/null +++ b/etc/license.template @@ -0,0 +1 @@ +// Copyright {\d+} CeresDB Project Authors. Licensed under Apache-2.0. 
\ No newline at end of file diff --git a/grpcio/Cargo.toml b/grpcio/Cargo.toml new file mode 100644 index 0000000000..09a147a0d1 --- /dev/null +++ b/grpcio/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "grpcio" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +# Rename to workaround doctest bug +# See: https://github.com/rust-lang/cargo/issues/6819 + +[target.'cfg(target_os = "macos")'.dependencies] +upstream = { version = "0.9.1", package = "grpcio" } + +[target.'cfg(target_os = "linux")'.dependencies] +upstream = { version = "0.9.1", package = "grpcio", features = ["openssl"] } diff --git a/grpcio/src/lib.rs b/grpcio/src/lib.rs new file mode 100644 index 0000000000..99d9172ad7 --- /dev/null +++ b/grpcio/src/lib.rs @@ -0,0 +1,3 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +pub use upstream::*; diff --git a/interpreters/Cargo.toml b/interpreters/Cargo.toml new file mode 100644 index 0000000000..8d28241eef --- /dev/null +++ b/interpreters/Cargo.toml @@ -0,0 +1,27 @@ +[package] +name = "interpreters" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +# In alphabetical order +async-trait = "0.1.41" +catalog = { path = "../catalog" } +common_types = { path = "../common_types" } +common_util = { path = "../common_util" } +log = "0.4" +snafu = { version ="0.6.10", features = ["backtraces"]} +sql = { path = "../sql" } +table_engine = { path = "../table_engine" } +udf = { path = "../udf" } +query_engine = { path = "../query_engine" } +arrow_deps = { path = "../arrow_deps" } + +[dev-dependencies] +analytic_engine = { path = "../analytic_engine", features = ["test"] } +catalog_impls = { path = "../catalog_impls" } +sql = { path = "../sql", features = ["test"] } +tokio = { version = "1.0", 
features = ["sync", "time"] } diff --git a/interpreters/src/alter_table.rs b/interpreters/src/alter_table.rs new file mode 100644 index 0000000000..acfce81adc --- /dev/null +++ b/interpreters/src/alter_table.rs @@ -0,0 +1,132 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Interpreter for insert statement + +use async_trait::async_trait; +use common_types::{ + column_schema::{self, ColumnSchema}, + schema::{self, Schema}, +}; +use common_util::define_result; +use snafu::{ensure, ResultExt, Snafu}; +use sql::plan::{AlterTableOperation, AlterTablePlan}; +use table_engine::table::AlterSchemaRequest; + +use crate::interpreter::{self, AlterTable, Interpreter, InterpreterPtr, Output}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to add column to schema, err:{}", source))] + AddColumnSchema { source: common_types::schema::Error }, + + #[snafu(display("Failed to build schema, err:{}", source))] + BuildSchema { source: common_types::schema::Error }, + + #[snafu(display("Failed to alter table schema, err:{}", source))] + AlterSchema { source: table_engine::table::Error }, + + #[snafu(display("Failed to alter table options, err:{}", source))] + AlterOptions { source: table_engine::table::Error }, + + #[snafu(display("Not allow to add a not null column, name:{}", name))] + AddNotNull { name: String }, +} + +define_result!(Error); + +pub struct AlterTableInterpreter { + plan: AlterTablePlan, +} + +impl AlterTableInterpreter { + pub fn create(plan: AlterTablePlan) -> InterpreterPtr { + Box::new(Self { plan }) + } +} + +#[async_trait] +impl Interpreter for AlterTableInterpreter { + async fn execute(self: Box) -> interpreter::Result { + self.execute_alter().await.context(AlterTable) + } +} + +impl AlterTableInterpreter { + async fn execute_alter(self: Box) -> Result { + let AlterTablePlan { table, operations } = self.plan; + + match operations { + AlterTableOperation::AddColumn(columns) => { + let current_schema = 
table.schema(); + let new_schema = build_new_schema(¤t_schema, columns)?; + + let request = AlterSchemaRequest { + schema: new_schema, + pre_schema_version: current_schema.version(), + }; + + let num_rows = table.alter_schema(request).await.context(AlterSchema)?; + + Ok(Output::AffectedRows(num_rows)) + } + AlterTableOperation::ModifySetting(options) => { + let num_rows = table.alter_options(options).await.context(AlterOptions)?; + Ok(Output::AffectedRows(num_rows)) + } + } + } +} + +fn build_new_schema(current_schema: &Schema, column_schemas: Vec) -> Result { + let current_version = current_schema.version(); + + let mut builder = + schema::Builder::with_capacity(current_schema.num_columns() + column_schemas.len()) + // Increment the schema version. + .version(current_version + 1); + // Add existing columns to builder. + for key_column in current_schema.key_columns() { + builder = builder + .add_key_column(key_column.clone()) + .context(AddColumnSchema)?; + } + for normal_column in current_schema.normal_columns() { + builder = builder + .add_normal_column(normal_column.clone()) + .context(AddColumnSchema)?; + } + + builder = builder + // Enable column id generation of the schema builder. + .auto_increment_column_id(true) + .enable_tsid_primary_key(current_schema.index_of_tsid().is_some()); + + // Add new columns + for mut column_schema in column_schemas { + // Uninit the id of the column schema. + column_schema.id = column_schema::COLUMN_ID_UNINIT; + + validate_add_column(&column_schema)?; + + // Only allow to add normal column. + builder = builder + .add_normal_column(column_schema) + .context(AddColumnSchema)?; + } + + // Build the final schema. 
+ let new_schema = builder.build().context(BuildSchema)?; + + Ok(new_schema) +} + +fn validate_add_column(column_schema: &ColumnSchema) -> Result<()> { + ensure!( + column_schema.is_nullable, + AddNotNull { + name: &column_schema.name + } + ); + + Ok(()) +} diff --git a/interpreters/src/context.rs b/interpreters/src/context.rs new file mode 100644 index 0000000000..2e46f07082 --- /dev/null +++ b/interpreters/src/context.rs @@ -0,0 +1,79 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Interpreter context + +use std::sync::Arc; + +use common_types::request_id::RequestId; +use query_engine::context::{Context as QueryContext, ContextRef as QueryContextRef}; +use snafu::Snafu; + +#[derive(Debug, Snafu)] +pub enum Error {} + +define_result!(Error); + +/// Interpreter context +/// +/// Contains information that all interpreters need +pub struct Context { + request_id: RequestId, + default_catalog: String, + default_schema: String, +} + +impl Context { + pub fn builder(request_id: RequestId) -> Builder { + Builder { + request_id, + default_catalog: String::new(), + default_schema: String::new(), + } + } + + /// Create a new context of query executor + pub fn new_query_context(&self) -> Result { + let ctx = QueryContext::builder(self.request_id) + .default_catalog_and_schema(self.default_catalog.clone(), self.default_schema.clone()) + .build(); + Ok(Arc::new(ctx)) + } + + #[inline] + pub fn default_catalog(&self) -> &str { + &self.default_catalog + } + + #[inline] + pub fn default_schema(&self) -> &str { + &self.default_schema + } + + #[inline] + pub fn request_id(&self) -> RequestId { + self.request_id + } +} + +#[must_use] +pub struct Builder { + request_id: RequestId, + default_catalog: String, + default_schema: String, +} + +impl Builder { + pub fn default_catalog_and_schema(mut self, catalog: String, schema: String) -> Self { + self.default_catalog = catalog; + self.default_schema = schema; + self + } + + pub fn build(self) -> Context { 
+ Context { + request_id: self.request_id, + default_catalog: self.default_catalog, + default_schema: self.default_schema, + } + } +} diff --git a/interpreters/src/create.rs b/interpreters/src/create.rs new file mode 100644 index 0000000000..252b459732 --- /dev/null +++ b/interpreters/src/create.rs @@ -0,0 +1,137 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Interpreter for create statements + +use async_trait::async_trait; +use catalog::{manager::Manager, schema::CreateOptions}; +use snafu::{Backtrace, OptionExt, ResultExt, Snafu}; +use sql::plan::CreateTablePlan; +use table_engine::engine::{CreateTableRequest, TableEngineRef, TableState}; + +use crate::{ + context::Context, + interpreter::{Create, Interpreter, InterpreterPtr, Output, Result as InterpreterResult}, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to find catalog, name:{}, err:{}", name, source))] + FindCatalog { + name: String, + source: catalog::manager::Error, + }, + + #[snafu(display("Catalog not exists, name:{}.\nBacktrace:\n{}", name, backtrace))] + CatalogNotExists { name: String, backtrace: Backtrace }, + + #[snafu(display("Failed to find schema, name:{}, err:{}", name, source))] + FindSchema { + name: String, + source: catalog::Error, + }, + + #[snafu(display("Schema not exists, name:{}.\nBacktrace:\n{}", name, backtrace))] + SchemaNotExists { name: String, backtrace: Backtrace }, + + #[snafu(display("Failed to create table, name:{}, err:{}", table, source))] + SchemaCreateTable { + table: String, + source: catalog::schema::Error, + }, + + #[snafu(display("Failed to allocate table id, err:{}", source))] + AllocTableId { source: catalog::schema::Error }, +} + +define_result!(Error); + +/// Create interpreter +pub struct CreateInterpreter { + ctx: Context, + plan: CreateTablePlan, + catalog_manager: C, + table_engine: TableEngineRef, +} + +impl CreateInterpreter { + pub fn create( + ctx: Context, + plan: CreateTablePlan, + 
catalog_manager: C, + table_engine: TableEngineRef, + ) -> InterpreterPtr { + Box::new(Self { + ctx, + plan, + catalog_manager, + table_engine, + }) + } +} + +impl CreateInterpreter { + async fn execute_create(self: Box) -> Result { + let default_catalog = self.ctx.default_catalog(); + let catalog = self + .catalog_manager + .catalog_by_name(default_catalog) + .context(FindCatalog { + name: default_catalog, + })? + .context(CatalogNotExists { + name: default_catalog, + })?; + + let default_schema = self.ctx.default_schema(); + let schema = catalog + .schema_by_name(default_schema) + .context(FindSchema { + name: default_schema, + })? + .context(SchemaNotExists { + name: default_schema, + })?; + + let CreateTablePlan { + engine, + table, + table_schema, + if_not_exists, + options, + } = self.plan; + + let table_id = schema.alloc_table_id(&table).context(AllocTableId)?; + let request = CreateTableRequest { + catalog_name: catalog.name().to_string(), + schema_name: schema.name().to_string(), + table_id, + table_name: table.clone(), + table_schema, + partition_info: None, + engine, + options, + state: TableState::Stable, + }; + + let opts = CreateOptions { + table_engine: self.table_engine, + create_if_not_exists: if_not_exists, + }; + + schema + .create_table(request, opts) + .await + .context(SchemaCreateTable { table })?; + + Ok(Output::AffectedRows(1)) + } +} + +// TODO(yingwen): Wrap a method that returns self::Result, simplify some code to +// converting self::Error to super::Error +#[async_trait] +impl Interpreter for CreateInterpreter { + async fn execute(self: Box) -> InterpreterResult { + self.execute_create().await.context(Create) + } +} diff --git a/interpreters/src/describe.rs b/interpreters/src/describe.rs new file mode 100644 index 0000000000..ca6266a872 --- /dev/null +++ b/interpreters/src/describe.rs @@ -0,0 +1,89 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +use std::{convert::TryInto, sync::Arc}; + +use arrow_deps::arrow::{ + array::{BooleanArray, StringArray}, + datatypes::{DataType, Field, Schema}, + record_batch::RecordBatch, +}; +use async_trait::async_trait; +use query_engine::executor::RecordBatchVec; +use snafu::{ResultExt, Snafu}; +use sql::plan::DescribeTablePlan; +use table_engine::table::TableRef; + +use crate::interpreter::{ + Describe, Interpreter, InterpreterPtr, Output, Result as InterpreterResult, +}; + +#[derive(Debug, Snafu)] +pub enum Error {} + +define_result!(Error); + +pub struct DescribeInterpreter { + plan: DescribeTablePlan, +} + +impl DescribeInterpreter { + pub fn create(plan: DescribeTablePlan) -> InterpreterPtr { + Box::new(Self { plan }) + } + + async fn execute_describe(self: Box) -> Result { + let DescribeTablePlan { table } = self.plan; + + Self::table_ref_to_record_batch(table).map(Output::Records) + } + + fn table_ref_to_record_batch(table_ref: TableRef) -> Result { + let table_schema = table_ref.schema(); + let num_columns = table_schema.num_columns(); + let num_key_columns = table_schema.num_key_columns(); + + let mut names = Vec::with_capacity(num_columns); + let mut types = Vec::with_capacity(num_columns); + let mut is_primary_keys = Vec::with_capacity(num_columns); + let mut is_nullables = Vec::with_capacity(num_columns); + let mut is_tags = Vec::with_capacity(num_columns); + for (idx, col) in table_schema.columns().iter().enumerate() { + names.push(col.name.to_string()); + types.push(col.data_type.to_string()); + is_primary_keys.push(idx < num_key_columns); + is_nullables.push(col.is_nullable); + is_tags.push(col.is_tag); + } + + let schema = Schema::new(vec![ + Field::new("name", DataType::Utf8, false), + Field::new("type", DataType::Utf8, false), + Field::new("is_primary", DataType::Boolean, false), + Field::new("is_nullable", DataType::Boolean, false), + Field::new("is_tag", DataType::Boolean, false), + ]); + + let arrow_record_batch = RecordBatch::try_new( + 
Arc::new(schema), + vec![ + Arc::new(StringArray::from(names)), + Arc::new(StringArray::from(types)), + Arc::new(BooleanArray::from(is_primary_keys)), + Arc::new(BooleanArray::from(is_nullables)), + Arc::new(BooleanArray::from(is_tags)), + ], + ) + .unwrap(); + + let record_batch = arrow_record_batch.try_into().unwrap(); + + Ok(vec![record_batch]) + } +} + +#[async_trait] +impl Interpreter for DescribeInterpreter { + async fn execute(self: Box) -> InterpreterResult { + self.execute_describe().await.context(Describe) + } +} diff --git a/interpreters/src/drop.rs b/interpreters/src/drop.rs new file mode 100644 index 0000000000..7282ae3bc2 --- /dev/null +++ b/interpreters/src/drop.rs @@ -0,0 +1,126 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Interpreter for drop statements + +use async_trait::async_trait; +use catalog::{manager::Manager, schema::DropOptions}; +use snafu::{Backtrace, OptionExt, ResultExt, Snafu}; +use sql::plan::DropTablePlan; +use table_engine::engine::{DropTableRequest, TableEngineRef}; + +use crate::{ + context::Context, + interpreter::{Drop, Interpreter, InterpreterPtr, Output, Result as InterpreterResult}, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to find catalog, name:{}, err:{}", name, source))] + FindCatalog { + name: String, + source: catalog::manager::Error, + }, + + #[snafu(display("Catalog not exists, name:{}.\nBacktrace:\n{}", name, backtrace))] + CatalogNotExists { name: String, backtrace: Backtrace }, + + #[snafu(display("Failed to find schema, name:{}, err:{}", name, source))] + FindSchema { + name: String, + source: catalog::Error, + }, + + #[snafu(display("Schema not exists, name:{}.\nBacktrace:\n{}", name, backtrace))] + SchemaNotExists { name: String, backtrace: Backtrace }, + + #[snafu(display("Failed to drop table in schema, name:{}, err:{}", table, source))] + SchemaDropTable { + table: String, + source: catalog::schema::Error, + }, + + #[snafu(display("Failed 
to drop table, name:{}, err:{}", table, source))] + DropTable { + table: String, + source: table_engine::engine::Error, + }, +} + +define_result!(Error); + +/// Drop interpreter +pub struct DropInterpreter { + ctx: Context, + plan: DropTablePlan, + catalog_manager: C, + table_engine: TableEngineRef, +} + +impl DropInterpreter { + pub fn create( + ctx: Context, + plan: DropTablePlan, + catalog_manager: C, + table_engine: TableEngineRef, + ) -> InterpreterPtr { + Box::new(Self { + ctx, + plan, + catalog_manager, + table_engine, + }) + } +} + +impl DropInterpreter { + async fn execute_drop(self: Box) -> Result { + let default_catalog = self.ctx.default_catalog(); + let catalog = self + .catalog_manager + .catalog_by_name(default_catalog) + .context(FindCatalog { + name: default_catalog, + })? + .context(CatalogNotExists { + name: default_catalog, + })?; + + let default_schema = self.ctx.default_schema(); + let schema = catalog + .schema_by_name(default_schema) + .context(FindSchema { + name: default_schema, + })? 
+ .context(SchemaNotExists { + name: default_schema, + })?; + + let table = self.plan.table; + let request = DropTableRequest { + catalog_name: catalog.name().to_string(), + schema_name: schema.name().to_string(), + table_name: table.clone(), + engine: self.plan.engine, + }; + + let opts = DropOptions { + table_engine: self.table_engine, + }; + + let dropped = schema + .drop_table(request, opts) + .await + .context(SchemaDropTable { table: &table })?; + + Ok(Output::AffectedRows(if dropped { 1 } else { 0 })) + } +} + +// TODO(yingwen): Wrap a method that returns self::Result, simplify some code to +// converting self::Error to super::Error +#[async_trait] +impl Interpreter for DropInterpreter { + async fn execute(self: Box) -> InterpreterResult { + self.execute_drop().await.context(Drop) + } +} diff --git a/interpreters/src/exists.rs b/interpreters/src/exists.rs new file mode 100644 index 0000000000..f926a700c4 --- /dev/null +++ b/interpreters/src/exists.rs @@ -0,0 +1,62 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +use std::{convert::TryInto, sync::Arc}; + +use arrow_deps::arrow::{ + array::UInt8Array, + datatypes::{DataType, Field, Schema}, + record_batch::RecordBatch, +}; +use async_trait::async_trait; +use query_engine::executor::RecordBatchVec; +use snafu::{ResultExt, Snafu}; +use sql::plan::ExistsTablePlan; + +use crate::interpreter::{ + Exists, Interpreter, InterpreterPtr, Output, Result as InterpreterResult, +}; + +#[derive(Debug, Snafu)] +pub enum Error {} + +define_result!(Error); + +pub struct ExistsInterpreter { + plan: ExistsTablePlan, +} + +impl ExistsInterpreter { + pub fn create(plan: ExistsTablePlan) -> InterpreterPtr { + Box::new(Self { plan }) + } + + async fn execute_exists(self: Box) -> Result { + let ExistsTablePlan { exists } = self.plan; + + exists_table_result(exists).map(Output::Records) + } +} + +fn exists_table_result(exists: bool) -> Result { + let schema = Schema::new(vec![Field::new("result", DataType::UInt8, false)]); + + let arrow_record_batch = RecordBatch::try_new( + Arc::new(schema), + vec![Arc::new(UInt8Array::from_value( + if exists { 1u8 } else { 0u8 }, + 1, + ))], + ) + .unwrap(); + + let record_batch = arrow_record_batch.try_into().unwrap(); + + Ok(vec![record_batch]) +} + +#[async_trait] +impl Interpreter for ExistsInterpreter { + async fn execute(self: Box) -> InterpreterResult { + self.execute_exists().await.context(Exists) + } +} diff --git a/interpreters/src/factory.rs b/interpreters/src/factory.rs new file mode 100644 index 0000000000..26b858723c --- /dev/null +++ b/interpreters/src/factory.rs @@ -0,0 +1,49 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Interpreter factory + +use catalog::manager::Manager as CatalogManager; +use query_engine::executor::Executor; +use sql::plan::Plan; +use table_engine::engine::TableEngineRef; + +use crate::{ + alter_table::AlterTableInterpreter, context::Context, create::CreateInterpreter, + describe::DescribeInterpreter, drop::DropInterpreter, exists::ExistsInterpreter, + insert::InsertInterpreter, interpreter::InterpreterPtr, select::SelectInterpreter, + show_create::ShowCreateInInterpreter, +}; + +/// A factory to create interpreters +pub struct Factory { + query_executor: Q, + catalog_manager: C, + table_engine: TableEngineRef, +} + +impl Factory { + pub fn new(query_executor: Q, catalog_manager: C, table_engine: TableEngineRef) -> Self { + Self { + query_executor, + catalog_manager, + table_engine, + } + } + + pub fn create(self, ctx: Context, plan: Plan) -> InterpreterPtr { + match plan { + Plan::Query(p) => SelectInterpreter::create(ctx, p, self.query_executor), + Plan::Insert(p) => InsertInterpreter::create(ctx, p), + Plan::Create(p) => { + CreateInterpreter::create(ctx, p, self.catalog_manager, self.table_engine) + } + Plan::Drop(p) => { + DropInterpreter::create(ctx, p, self.catalog_manager, self.table_engine) + } + Plan::Describe(p) => DescribeInterpreter::create(p), + Plan::AlterTable(p) => AlterTableInterpreter::create(p), + Plan::ShowCreate(p) => ShowCreateInInterpreter::create(p), + Plan::Exists(p) => ExistsInterpreter::create(p), + } + } +} diff --git a/interpreters/src/insert.rs b/interpreters/src/insert.rs new file mode 100644 index 0000000000..c2a2ddf636 --- /dev/null +++ b/interpreters/src/insert.rs @@ -0,0 +1,138 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Interpreter for insert statement + +use async_trait::async_trait; +use common_types::{column_schema::ColumnId, datum::Datum, hash::hash64}; +use common_util::codec::{compact::MemCompactEncoder, Encoder}; +use snafu::{ResultExt, Snafu}; +use sql::plan::InsertPlan; +use table_engine::table::WriteRequest; + +use crate::{ + context::Context, + interpreter::{Insert, Interpreter, InterpreterPtr, Output, Result}, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to write table, err:{}", source))] + WriteTable { source: table_engine::table::Error }, + + #[snafu(display("Failed to encode tsid, err:{}", source))] + EncodeTsid { + source: common_util::codec::compact::Error, + }, +} + +pub struct InsertInterpreter { + ctx: Context, + plan: InsertPlan, +} + +impl InsertInterpreter { + pub fn create(ctx: Context, plan: InsertPlan) -> InterpreterPtr { + Box::new(Self { ctx, plan }) + } +} + +#[async_trait] +impl Interpreter for InsertInterpreter { + async fn execute(mut self: Box) -> Result { + // Generate tsid if needed. + self.maybe_generate_tsid()?; + let InsertPlan { table, rows } = self.plan; + + // Context is unused now + let _ctx = self.ctx; + + let request = WriteRequest { row_group: rows }; + + let num_rows = table + .write(request) + .await + .context(WriteTable) + .context(Insert)?; + + Ok(Output::AffectedRows(num_rows)) + } +} + +impl InsertInterpreter { + fn maybe_generate_tsid(&mut self) -> Result<()> { + let schema = self.plan.rows.schema(); + let tsid_idx = schema.index_of_tsid(); + + if let Some(idx) = tsid_idx { + // Vec of (`index of tag`, `column id of tag`). 
+ let tag_idx_column_ids: Vec<_> = schema + .columns() + .iter() + .enumerate() + .filter_map(|(i, column)| { + if column.is_tag { + Some((i, column.id)) + } else { + None + } + }) + .collect(); + + let mut hash_bytes = Vec::new(); + for i in 0..self.plan.rows.num_rows() { + let row = self.plan.rows.get_row_mut(i).unwrap(); + + let mut tsid_builder = TsidBuilder::new(&mut hash_bytes); + + for (idx, column_id) in &tag_idx_column_ids { + tsid_builder.maybe_write_datum(*column_id, &row[*idx])?; + } + + let tsid = tsid_builder.finish(); + row[idx] = Datum::UInt64(tsid); + } + } + Ok(()) + } +} + +struct TsidBuilder<'a> { + encoder: MemCompactEncoder, + hash_bytes: &'a mut Vec, +} + +impl<'a> TsidBuilder<'a> { + fn new(hash_bytes: &'a mut Vec) -> Self { + // Clear the bytes buffer. + hash_bytes.clear(); + + Self { + encoder: MemCompactEncoder, + hash_bytes, + } + } + + fn maybe_write_datum(&mut self, column_id: ColumnId, datum: &Datum) -> Result<()> { + // Null datum will be ignored, so tsid remains unchanged after adding a null + // column. + if datum.is_null() { + return Ok(()); + } + + // Write column id first. + self.encoder + .encode(self.hash_bytes, &Datum::UInt64(u64::from(column_id))) + .context(EncodeTsid) + .context(Insert)?; + // Write datum. + self.encoder + .encode(self.hash_bytes, datum) + .context(EncodeTsid) + .context(Insert)?; + Ok(()) + } + + fn finish(self) -> u64 { + hash64(self.hash_bytes) + } +} diff --git a/interpreters/src/interpreter.rs b/interpreters/src/interpreter.rs new file mode 100644 index 0000000000..4591eb5df5 --- /dev/null +++ b/interpreters/src/interpreter.rs @@ -0,0 +1,56 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Interpreter trait + +use async_trait::async_trait; +use query_engine::executor::RecordBatchVec; +use snafu::Snafu; + +// Make the variant closer to actual error code like invalid arguments. 
+#[derive(Debug, Snafu)] +#[snafu(visibility(pub(crate)))] +pub enum Error { + #[snafu(display("Failed to execute select, err:{}", source))] + Select { source: crate::select::Error }, + + #[snafu(display("Failed to execute create table, err:{}", source))] + Create { source: crate::create::Error }, + + #[snafu(display("Failed to execute drop table, err:{}", source))] + Drop { source: crate::drop::Error }, + + #[snafu(display("Failed to execute insert, err:{}", source))] + Insert { source: crate::insert::Error }, + + #[snafu(display("Failed to execute describe, err:{}", source))] + Describe { source: crate::describe::Error }, + + #[snafu(display("Failed to execute alter table, err:{}", source))] + AlterTable { source: crate::alter_table::Error }, + + #[snafu(display("Failed to show create table, err:{}", source))] + ShowCreate { source: crate::show_create::Error }, + + #[snafu(display("Failed to execute exists, err:{}", source))] + Exists { source: crate::exists::Error }, +} + +define_result!(Error); + +// TODO(yingwen): Maybe add a stream variant for streaming result +/// The interpreter output +pub enum Output { + /// Affected rows number + AffectedRows(usize), + /// A vec of RecordBatch + Records(RecordBatchVec), +} + +/// Interpreter executes the plan it holds +#[async_trait] +pub trait Interpreter { + async fn execute(self: Box) -> Result; +} + +/// A pointer to Interpreter +pub type InterpreterPtr = Box; diff --git a/interpreters/src/lib.rs b/interpreters/src/lib.rs new file mode 100644 index 0000000000..6f3b888e6e --- /dev/null +++ b/interpreters/src/lib.rs @@ -0,0 +1,23 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Interpreters of query/insert/update/delete commands +//! +//! 
Inspired by fuse-query: and ClickHouse + +#[macro_use] +extern crate common_util; + +pub mod alter_table; +pub mod context; +pub mod create; +pub mod describe; +pub mod drop; +pub mod exists; +pub mod factory; +pub mod insert; +pub mod interpreter; +pub mod select; +pub mod show_create; + +#[cfg(test)] +mod tests; diff --git a/interpreters/src/select.rs b/interpreters/src/select.rs new file mode 100644 index 0000000000..97a0f84a57 --- /dev/null +++ b/interpreters/src/select.rs @@ -0,0 +1,75 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Interpreter for select statement + +use async_trait::async_trait; +use log::debug; +use query_engine::executor::{Executor, Query}; +use snafu::{ResultExt, Snafu}; +use sql::plan::QueryPlan; + +use crate::{ + context::Context, + interpreter::{Interpreter, InterpreterPtr, Output, Result as InterpreterResult, Select}, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to create query context, err:{}", source))] + CreateQueryContext { source: crate::context::Error }, + + #[snafu(display("Failed to execute logical plan, err:{}", source))] + ExecutePlan { + source: query_engine::executor::Error, + }, +} + +define_result!(Error); + +/// Select interpreter +pub struct SelectInterpreter { + ctx: Context, + plan: QueryPlan, + executor: T, +} + +impl SelectInterpreter { + pub fn create(ctx: Context, plan: QueryPlan, executor: T) -> InterpreterPtr { + Box::new(Self { + ctx, + plan, + executor, + }) + } +} + +#[async_trait] +impl Interpreter for SelectInterpreter { + async fn execute(self: Box) -> InterpreterResult { + let request_id = self.ctx.request_id(); + debug!( + "Interpreter execute select begin, request_id:{}, plan:{:?}", + request_id, self.plan + ); + + let query_ctx = self + .ctx + .new_query_context() + .context(CreateQueryContext) + .context(Select)?; + let query = Query::new(self.plan); + let record_batches = self + .executor + .execute_logical_plan(query_ctx, query) + 
.await + .context(ExecutePlan) + .context(Select)?; + + debug!( + "Interpreter execute select finish, request_id:{}", + request_id + ); + + Ok(Output::Records(record_batches)) + } +} diff --git a/interpreters/src/show_create.rs b/interpreters/src/show_create.rs new file mode 100644 index 0000000000..38d1747ab8 --- /dev/null +++ b/interpreters/src/show_create.rs @@ -0,0 +1,136 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::{collections::HashMap, convert::TryInto, sync::Arc}; + +use arrow_deps::arrow::{ + array::StringArray, + datatypes::{DataType, Field, Schema}, + record_batch::RecordBatch, +}; +use async_trait::async_trait; +use query_engine::executor::RecordBatchVec; +use snafu::{ensure, Backtrace, ResultExt, Snafu}; +use sql::{ast::ShowCreateObject, plan::ShowCreatePlan}; +use table_engine::table::TableRef; + +use crate::interpreter::{ + Interpreter, InterpreterPtr, Output, Result as InterpreterResult, ShowCreate, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "Unsupported show create type, type: {:?}, err:{}", + obj_type, + backtrace + ))] + UnsupportedType { + obj_type: ShowCreateObject, + backtrace: Backtrace, + }, +} + +define_result!(Error); + +pub struct ShowCreateInInterpreter { + plan: ShowCreatePlan, +} + +impl ShowCreateInInterpreter { + pub fn create(plan: ShowCreatePlan) -> InterpreterPtr { + Box::new(Self { plan }) + } + + async fn execute_show_create(self: Box) -> Result { + let ShowCreatePlan { table, obj_type } = self.plan; + + ensure!( + obj_type == ShowCreateObject::Table, + UnsupportedType { obj_type } + ); + + Self::table_ref_to_record_batch(table).map(Output::Records) + } + + fn table_ref_to_record_batch(table_ref: TableRef) -> Result { + let tables = vec![table_ref.name().to_string()]; + let sqls = vec![Self::render_table_sql(table_ref)]; + + let schema = Schema::new(vec![ + Field::new("Table", DataType::Utf8, false), + Field::new("Create Table", DataType::Utf8, false), + ]); + + 
let arrow_record_batch = RecordBatch::try_new( + Arc::new(schema), + vec![ + Arc::new(StringArray::from(tables)), + Arc::new(StringArray::from(sqls)), + ], + ) + .unwrap(); + + let record_batch = arrow_record_batch.try_into().unwrap(); + + Ok(vec![record_batch]) + } + + fn render_table_sql(table_ref: TableRef) -> String { + //TODO(boyan) pretty output + format!( + "CREATE TABLE `{}` ({}) ENGINE={}{}", + table_ref.name(), + Self::render_columns_and_constrains(&table_ref), + table_ref.engine_type(), + Self::render_options(table_ref.options()) + ) + } + + fn render_columns_and_constrains(table_ref: &TableRef) -> String { + let table_schema = table_ref.schema(); + let key_columns = table_schema.key_columns(); + let timestamp_key = table_schema.timestamp_name(); + + let mut res = String::new(); + for col in table_schema.columns() { + res += format!("`{}` {}", col.name, col.data_type).as_str(); + if col.is_tag { + res += " TAG"; + } + if !col.is_nullable { + res += " NOT NULL"; + } + + if !col.comment.is_empty() { + res += format!(" COMMENT '{}'", col.comment).as_str(); + } + res += ", "; + } + let keys: Vec = key_columns.iter().map(|col| col.name.to_string()).collect(); + res += format!("PRIMARY KEY({}), ", keys.join(",")).as_str(); + res += format!("TIMESTAMP KEY({})", timestamp_key).as_str(); + + res + } + + fn render_options(opts: HashMap) -> String { + if !opts.is_empty() { + let mut v: Vec = opts + .into_iter() + .map(|(k, v)| format!("{}='{}'", k, v)) + .collect(); + // sorted by option name + v.sort(); + format!(" WITH({})", v.join(", ")) + } else { + "".to_string() + } + } +} + +#[async_trait] +impl Interpreter for ShowCreateInInterpreter { + async fn execute(self: Box) -> InterpreterResult { + self.execute_show_create().await.context(ShowCreate) + } +} diff --git a/interpreters/src/tests.rs b/interpreters/src/tests.rs new file mode 100644 index 0000000000..4b05a239f8 --- /dev/null +++ b/interpreters/src/tests.rs @@ -0,0 +1,236 @@ +// Copyright 2022 CeresDB 
Project Authors. Licensed under Apache-2.0. + +use std::sync::Arc; + +use analytic_engine::tests::util::TestEnv; +use catalog::consts::{DEFAULT_CATALOG, DEFAULT_SCHEMA}; +use catalog_impls::table_based::TableBasedManager; +use common_types::request_id::RequestId; +use query_engine::executor::ExecutorImpl; +use sql::{ + parser::Parser, plan::Plan, planner::Planner, provider::MetaProvider, tests::MockMetaProvider, +}; +use table_engine::engine::TableEngine; + +use crate::{ + context::Context, + factory::Factory, + interpreter::{Output, Result}, +}; + +async fn build_catalog_manager(analytic: E) -> TableBasedManager +where + E: TableEngine + Clone + Send + Sync + 'static, +{ + // Create catalog manager, use analytic table as backend + TableBasedManager::new(&analytic.clone(), Arc::new(analytic)) + .await + .unwrap_or_else(|e| { + panic!("Failed to create catalog manager, err:{}", e); + }) +} + +fn sql_to_plan(meta_provider: &M, sql: &str) -> Plan { + let planner = Planner::new(meta_provider, RequestId::next_id(), 1); + let mut statements = Parser::parse_sql(sql).unwrap(); + assert_eq!(statements.len(), 1); + planner.statement_to_plan(statements.remove(0)).unwrap() +} + +async fn build_factory(env: &Env) -> Factory +where + E: TableEngine + Clone + Send + Sync + 'static, + M: MetaProvider, +{ + let catalog_manager = build_catalog_manager(env.engine()).await; + Factory::new(ExecutorImpl::new(), catalog_manager, Arc::new(env.engine())) +} + +async fn sql_to_output(env: &Env, sql: &str) -> Result +where + E: TableEngine + Clone + Send + Sync + 'static, + M: MetaProvider, +{ + let plan = sql_to_plan(&env.meta_provider, sql); + + let ctx = Context::builder(RequestId::next_id()) + .default_catalog_and_schema(DEFAULT_CATALOG.to_string(), DEFAULT_SCHEMA.to_string()) + .build(); + + let factory = build_factory(env).await; + let interpreter = factory.create(ctx, plan); + interpreter.execute().await +} + +async fn test_create_table(env: &Env) +where + E: TableEngine + Clone + 
Send + Sync + 'static, + M: MetaProvider, +{ + let sql="CREATE TABLE IF NOT EXISTS test_table(c1 string tag not null,ts timestamp not null, c3 string, timestamp key(ts),primary key(c1, ts)) \ + ENGINE=Analytic WITH (ttl='70d',update_mode='overwrite',arena_block_size='1KB')"; + + let output = sql_to_output(env, sql).await.unwrap(); + if let Output::AffectedRows(v) = output { + assert_eq!(v, 1); + } else { + panic!(); + } +} + +async fn test_desc_table(env: &Env) +where + E: TableEngine + Clone + Send + Sync + 'static, + M: MetaProvider, +{ + let sql = "desc table test_table"; + let output = sql_to_output(env, sql).await.unwrap(); + if let Output::Records(v) = output { + assert_eq!(v.len(), 1); + } else { + panic!(); + } +} + +async fn test_exists_table(env: &Env) +where + E: TableEngine + Clone + Send + Sync + 'static, + M: MetaProvider, +{ + let sql = "exists table test_table"; + let output = sql_to_output(env, sql).await.unwrap(); + if let Output::Records(v) = output { + assert_eq!(v.len(), 1); + } else { + panic!(); + } +} + +async fn test_insert_table(env: &Env) +where + E: TableEngine + Clone + Send + Sync + 'static, + M: MetaProvider, +{ + let sql = "INSERT INTO test_table(key1, key2, field1,field2) VALUES('tagk', 1638428434000,100, 'hello3'),('tagk2', 1638428434000,100, 'hello3');"; + let output = sql_to_output(env, sql).await.unwrap(); + if let Output::AffectedRows(v) = output { + assert_eq!(v, 2); + } else { + panic!(); + } +} + +async fn test_select_table(env: &Env) +where + E: TableEngine + Clone + Send + Sync + 'static, + M: MetaProvider, +{ + let sql = "select * from test_table"; + let output = sql_to_output(env, sql).await.unwrap(); + if let Output::Records(v) = output { + assert_eq!(v.len(), 1); + assert_eq!(v[0].num_rows(), 2); + } else { + panic!(); + } + + let sql = "select count(*) from test_table"; + let output = sql_to_output(env, sql).await.unwrap(); + if let Output::Records(v) = output { + assert_eq!(v.len(), 1); + assert_eq!(v[0].num_rows(), 
1); + } else { + panic!(); + } +} + +async fn test_show_create_table(env: &Env) +where + E: TableEngine + Clone + Send + Sync + 'static, + M: MetaProvider, +{ + let sql = "show create table test_table"; + let output = sql_to_output(env, sql).await.unwrap(); + if let Output::Records(v) = output { + assert_eq!(v.len(), 1); + assert_eq!(v[0].num_rows(), 1); + } else { + panic!(); + } +} + +async fn test_alter_table(env: &Env) +where + E: TableEngine + Clone + Send + Sync + 'static, + M: MetaProvider, +{ + let sql = "alter table test_table add column add_col string"; + let output = sql_to_output(env, sql).await.unwrap(); + if let Output::AffectedRows(v) = output { + assert_eq!(v, 1); + } else { + panic!(); + } + + let sql = "alter table test_table modify SETTING ttl='9d'"; + let output = sql_to_output(env, sql).await.unwrap(); + if let Output::AffectedRows(v) = output { + assert_eq!(v, 1); + } else { + panic!(); + } +} + +async fn test_drop_table(env: &Env) +where + E: TableEngine + Clone + Send + Sync + 'static, + M: MetaProvider, +{ + let sql = "drop table test_table"; + let output = sql_to_output(env, sql).await.unwrap(); + if let Output::AffectedRows(v) = output { + assert_eq!(v, 1); + } else { + panic!(); + } +} + +struct Env +where + E: TableEngine + Clone + Send + Sync + 'static, + M: MetaProvider, +{ + pub engine: E, + pub meta_provider: M, +} + +impl Env +where + E: TableEngine + Clone + Send + Sync + 'static, + M: MetaProvider, +{ + fn engine(&self) -> E { + self.engine.clone() + } +} + +#[tokio::test] +async fn test_interpreters() { + let env = TestEnv::builder().build(); + let mut test_ctx = env.new_context(); + test_ctx.open().await; + let mock = MockMetaProvider::default(); + let env = Env { + engine: test_ctx.engine(), + meta_provider: mock, + }; + + test_create_table(&env).await; + test_desc_table(&env).await; + test_exists_table(&env).await; + test_insert_table(&env).await; + test_select_table(&env).await; + test_show_create_table(&env).await; + 
test_alter_table(&env).await; + test_drop_table(&env).await; +} diff --git a/meta_client/Cargo.toml b/meta_client/Cargo.toml new file mode 100644 index 0000000000..bd544c5d7f --- /dev/null +++ b/meta_client/Cargo.toml @@ -0,0 +1,26 @@ +[package] +name = "meta_client" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +async-trait = "0.1.41" +catalog = { path = "../catalog" } +common_types = { path = "../common_types" } +table_engine = { path = "../table_engine" } +common_util = { path = "../common_util" } +ceresdbproto = { git = "https://github.com/CeresDB/ceresdbproto.git"} +futures = "0.3" +grpcio = { path = "../grpcio" } +log = "0.4" +rand = "0.7" +reqwest = "0.11" +serde = "1.0" +serde_derive = "1.0.81" +serde_json = "1.0.60" +snafu = { version ="0.6.10", features = ["backtraces"]} +tokio = { version = "1.0", features = ["full"] } +url = "2.2" diff --git a/meta_client/src/lib.rs b/meta_client/src/lib.rs new file mode 100644 index 0000000000..34563a1e71 --- /dev/null +++ b/meta_client/src/lib.rs @@ -0,0 +1,705 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Client to communicate with meta + +use std::{ + collections::HashMap, + convert::TryFrom, + sync::{Arc, RwLock}, + time::Duration, +}; + +use async_trait::async_trait; +use ceresdbproto::{ + meta::{CommonNodeInfo, NodeType}, + metagrpc::{ + ClusterViewResponse, FetchClusterViewRequest, NameSpace, RegisterNodeRequest, + RegisterNodeResponse, + }, + metagrpc_grpc::CeresmetaRpcServiceClient, +}; +use common_types::{bytes::Bytes, schema::TIMESTAMP_COLUMN}; +use common_util::{config::ReadableDuration, define_result, runtime::Runtime}; +use futures::TryStreamExt; +use grpcio::{ChannelBuilder, Environment}; +use load_balance::{LoadBalancer, RandomLoadBalancer}; +use log::{error, info}; +use reqwest::{self, StatusCode, Url}; +use serde::de::DeserializeOwned; +use serde_derive::Deserialize; +use snafu::{Backtrace, OptionExt, ResultExt, Snafu}; +use table_engine::ANALYTIC_ENGINE_TYPE; +use tokio::time; + +use crate::static_client::StaticMetaClient; + +mod load_balance; +mod static_client; + +#[derive(Debug, Snafu)] +#[snafu(visibility = "pub")] +pub enum Error { + #[snafu(display("Build http client failed, err:{}.\nBacktrace:\n{}", source, backtrace))] + BuildHttpClient { + source: reqwest::Error, + backtrace: Backtrace, + }, + + #[snafu(display( + "Invalid meta addr, addr:{}, err:{}.\nBacktrace:\n{}", + meta_addr, + source, + backtrace + ))] + InvalidMetaAddr { + meta_addr: String, + source: url::ParseError, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to join url, input:{}, err:{}.\nBacktrace:\n{}", + input, + source, + backtrace + ))] + JoinUrl { + input: String, + source: url::ParseError, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to send http request, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + SendHttp { + source: reqwest::Error, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to parse http text, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + ParseText { + source: reqwest::Error, + backtrace: Backtrace, + 
}, + + #[snafu(display( + "Bad http status, status:{}, url:{}, text:{:?}.\nBacktrace:\n{}", + status, + url, + text, + backtrace + ))] + BadHttpStatus { + status: StatusCode, + url: String, + text: Bytes, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to parse json text, text:{:?}, err:{}.\nBacktrace:\n{}", + text, + source, + backtrace + ))] + ParseJson { + text: Bytes, + source: serde_json::Error, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to fetch cluster view, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + FetchClusterViewError { + source: grpcio::Error, + backtrace: Backtrace, + }, + + #[snafu(display( + "Encountered register node, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + RegisterNodeError { + source: grpcio::Error, + backtrace: Backtrace, + }, + + #[snafu(display("Encountered build rpc client, err:{}", source))] + BuildRpcClientError { source: load_balance::Error }, + + #[snafu(display( + "Invalid node addr of cluster view, node:{}.\nBacktrace:\n{}", + node, + backtrace + ))] + InvalidNodeAddr { node: String, backtrace: Backtrace }, + + #[snafu(display( + "Invalid node port of cluster view, node:{}, err:{}.\nBacktrace:\n{}", + node, + source, + backtrace + ))] + InvalidNodePort { + node: String, + source: std::num::ParseIntError, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to create schema:{}, catalog:{}, err:{}", + schema, + catalog, + source + ))] + FailOnChangeView { + schema: String, + catalog: String, + source: Box, + }, + + #[snafu(display("Failed to get catalog:{}, err:{}", catalog, source))] + FailGetCatalog { + catalog: String, + source: Box, + }, +} + +define_result!(Error); + +type ShardViewMap = HashMap; + +#[async_trait] +pub trait MetaWatcher { + async fn on_change(&self, view: ClusterViewRef) -> Result<()>; +} + +pub type MetaWatcherPtr = Box; + +/// Meta client abstraction +#[async_trait] +pub trait MetaClient { + /// Start the meta client + async fn start(&self) -> 
Result<()>; + + /// Get current cluster view. + /// + /// The cluster view is updated by background workers periodically + fn get_cluster_view(&self) -> ClusterViewRef; +} + +// TODO(yingwen): Now meta use i32 as shard id, maybe switch to unsigned number +pub type ShardId = i32; + +#[derive(Debug, Clone, Deserialize)] +pub struct Node { + pub addr: String, + pub port: u32, +} + +#[derive(Debug, Clone, Deserialize)] +pub struct ShardView { + pub shard_id: ShardId, + pub node: Node, +} + +fn default_engine_type() -> String { + ANALYTIC_ENGINE_TYPE.to_string() +} + +#[derive(Debug, Clone, Deserialize)] +#[serde(default)] +pub struct SchemaConfig { + pub auto_create_tables: bool, + pub default_engine_type: String, + pub default_timestamp_column_name: String, +} + +impl Default for SchemaConfig { + fn default() -> Self { + Self { + auto_create_tables: false, + default_engine_type: default_engine_type(), + default_timestamp_column_name: default_timestamp_column_name(), + } + } +} + +impl From for SchemaConfig { + fn from(view: SchemaShardView) -> Self { + Self { + auto_create_tables: view.auto_create_tables, + default_engine_type: view.default_engine_type, + default_timestamp_column_name: view.default_timestamp_column_name, + } + } +} + +#[derive(Debug, Default, Clone, Deserialize)] +pub struct ClusterView { + pub schema_shards: HashMap, + pub schema_configs: HashMap, +} + +impl TryFrom for ClusterView { + type Error = Error; + + fn try_from(result: ClusterViewResponse) -> Result { + let mut schema_shards = HashMap::with_capacity(result.schema_shards.len()); + let mut schema_configs = HashMap::with_capacity(result.schema_shards.len()); + + for (schema, shard_view) in result.schema_shards { + let mut schema_view = HashMap::with_capacity(shard_view.shard_nodes.len()); + for (shard_id, shard_node) in shard_view.shard_nodes { + let mut addr_port = shard_node.split(':'); + let addr = addr_port + .next() + .context(InvalidNodeAddr { node: &shard_node })?; + let port = 
addr_port + .next() + .context(InvalidNodeAddr { node: &shard_node })? + .parse() + .context(InvalidNodePort { node: &shard_node })?; + let node = Node { + addr: addr.to_string(), + port, + }; + schema_view.insert(shard_id, ShardView { shard_id, node }); + } + schema_shards.insert(schema.clone(), schema_view); + // TODO(boyan) support config in ClusterViewResponse + schema_configs.insert(schema, SchemaConfig::default()); + } + + Ok(ClusterView { + schema_shards, + schema_configs, + }) + } +} + +pub type ClusterViewRef = Arc; + +#[derive(Debug, Deserialize)] +#[serde(default)] +pub struct MetaClientConfig { + pub cluster: String, + pub meta_addr: String, + pub meta_version: String, + /// Local ip address of this node, used as endpoint ip in meta. + pub node: String, + /// Grpc port of this node, also used as endpoint port in meta. + pub port: u16, + pub meta_members_url: String, + pub lease: ReadableDuration, + pub timeout: ReadableDuration, + pub cq_count: usize, + /// + /// - If `enable_meta` is true, the client will fetch cluster view from + /// remote meta node. + /// - If `enable_meta` is false, the client will try to read cluster view + /// from `cluster_view`. + pub enable_meta: bool, + /// The static cluster view used by static meta client.
+ pub cluster_view: ClusterViewConfig, +} + +impl Default for MetaClientConfig { + fn default() -> Self { + Self { + cluster: String::new(), + meta_addr: "http://127.0.0.1:8080".to_string(), + meta_version: String::from("v1"), + node: String::new(), + port: 8831, + meta_members_url: "ceresmeta/members".to_string(), + lease: ReadableDuration::secs(10), + timeout: ReadableDuration::secs(5), + cq_count: 8, + enable_meta: false, + cluster_view: ClusterViewConfig { + schema_shards: Vec::new(), + }, + } + } +} + +impl From<&MetaClientConfig> for RegisterNodeRequest { + fn from(meta_config: &MetaClientConfig) -> Self { + let mut req = RegisterNodeRequest::new(); + req.set_node_type(NodeType::Data); + req.set_ns(NameSpace { + cluster: meta_config.cluster.to_string(), + version: meta_config.meta_version.to_string(), + ..Default::default() + }); + req.set_node_info(CommonNodeInfo { + node: format!("{}:{}", meta_config.node, meta_config.port), + lease: meta_config.lease.as_secs() as i32, + ..Default::default() + }); + req + } +} + +#[derive(Debug, Deserialize, Clone)] +#[serde(default)] +pub struct SchemaShardView { + schema: String, + auto_create_tables: bool, + pub default_engine_type: String, + default_timestamp_column_name: String, + shard_views: Vec, +} + +impl Default for SchemaShardView { + fn default() -> Self { + Self { + schema: "".to_string(), + auto_create_tables: false, + default_engine_type: default_engine_type(), + default_timestamp_column_name: default_timestamp_column_name(), + shard_views: Vec::default(), + } + } +} + +#[inline] +fn default_timestamp_column_name() -> String { + TIMESTAMP_COLUMN.to_string() +} + +#[derive(Debug, Deserialize, Clone)] +pub struct ClusterViewConfig { + schema_shards: Vec, +} + +impl ClusterViewConfig { + pub(crate) fn to_cluster_view(&self) -> ClusterView { + let mut schema_configs = HashMap::with_capacity(self.schema_shards.len()); + let mut schema_shards = HashMap::with_capacity(self.schema_shards.len()); + + for 
schema_shard_view in self.schema_shards.clone() { + let schema = schema_shard_view.schema.clone(); + schema_shards.insert( + schema.clone(), + schema_shard_view + .shard_views + .iter() + .map(|shard| (shard.shard_id, shard.clone())) + .collect(), + ); + schema_configs.insert(schema, SchemaConfig::from(schema_shard_view)); + } + ClusterView { + schema_shards, + schema_configs, + } + } +} + +struct MetaClientImplInner { + meta_grpc_address: RwLock>, + http_client: reqwest::Client, + balancer: Box, + meta_config: MetaClientConfig, + cluster_view: RwLock, + members_url: Url, + watcher: Option, +} + +impl MetaClientImplInner { + fn new(meta_config: MetaClientConfig, watcher: Option) -> Result { + let http_client = reqwest::Client::builder() + .timeout(Duration::from(meta_config.timeout)) + .build() + .context(BuildHttpClient)?; + + let members_url = Url::parse(&meta_config.meta_addr) + .context(InvalidMetaAddr { + meta_addr: &meta_config.meta_addr, + })? + .join(format!("{}/", meta_config.meta_version).as_str()) + .context(JoinUrl { + input: &meta_config.meta_version, + })? + .join(&meta_config.meta_members_url) + .context(JoinUrl { + input: &meta_config.meta_members_url, + })?; + + Ok(Self { + meta_grpc_address: RwLock::new(Vec::new()), + http_client, + balancer: Box::new(RandomLoadBalancer), + meta_config, + cluster_view: RwLock::new(Arc::new(ClusterView::default())), + members_url, + watcher, + }) + } + + async fn fetch_cluster_view(&self) -> Result<()> { + let client = self.build_rpc_client()?; + let mut req = FetchClusterViewRequest::new(); + req.set_ns(NameSpace { + cluster: self.meta_config.cluster.to_string(), + version: self.meta_config.meta_version.to_string(), + ..Default::default() + }); + let mut receiver = client + .fetch_cluster_view(&req) + .context(FetchClusterViewError)?; + + while let Some(result) = receiver.try_next().await.context(FetchClusterViewError)? 
{ + self.update_cluster_view_by_result(result).await?; + + info!( + "Fetch cluster view from meta, cluster:{}, view:{:#?}", + self.meta_config.cluster, + *self.cluster_view.read().unwrap(), + ); + } + + Ok(()) + } + + async fn update_cluster_view_by_result(&self, view_result: ClusterViewResponse) -> Result<()> { + let view = Arc::new(ClusterView::try_from(view_result)?); + + { + let mut cluster_view = self.cluster_view.write().unwrap(); + *cluster_view = view.clone(); + } + + if let Some(w) = &self.watcher { + w.on_change(view).await?; + } + + Ok(()) + } + + fn meta_addresses(&self) -> Vec { + self.meta_grpc_address.read().unwrap().clone() + } + + fn build_rpc_client(&self) -> Result { + let meta_addresses = self.meta_addresses(); + let meta_rpc_addr = self + .balancer + .select(&meta_addresses) + .context(BuildRpcClientError)?; + + let cb = ChannelBuilder::new(Arc::new(Environment::new(self.meta_config.cq_count))); + Ok(CeresmetaRpcServiceClient::new(cb.connect(meta_rpc_addr))) + } + + async fn register(&self, client: &CeresmetaRpcServiceClient) -> Result { + let req = RegisterNodeRequest::from(&self.meta_config); + client.register_node(&req).context(RegisterNodeError) + } + + async fn get_bytes_from_url(&self, url: Url) -> Result { + let resp = self + .http_client + .get(self.members_url.clone()) + .send() + .await + .context(SendHttp)?; + let status = resp.status(); + let text = resp.bytes().await.context(ParseText)?; + + if status.is_success() { + info!( + "Get bytes from url success, status:{}, url:{}, bytes:{:?}", + status, url, text + ); + + Ok(text) + } else { + error!( + "Failed to get bytes from url, status:{}, url:{}, bytes:{:?}", + status, url, text + ); + + BadHttpStatus { status, url, text }.fail() + } + } + + async fn get_from_url(&self, url: Url) -> Result { + let full = self.get_bytes_from_url(url).await?; + + serde_json::from_slice(&full).context(ParseJson { text: full }) + } + + async fn pull_meta_grpc_address(&self) -> Result<()> { + let 
addresses: Vec = self.get_from_url(self.members_url.clone()).await?; + + *self.meta_grpc_address.write().unwrap() = addresses; + + Ok(()) + } + + // TODO(yingwen): Store the value in field + fn error_wait_lease(&self) -> Duration { + Duration::from_secs(self.meta_config.lease.as_secs() / 2) + } + + // Register node every 2/3 lease + fn register_interval(&self) -> Duration { + Duration::from_secs(self.meta_config.lease.as_secs() * 2 / 3) + } + + fn fetch_view_interval(&self) -> Duration { + Duration::from_secs(self.meta_config.lease.as_secs() * 3) + } + + async fn start_fetch_cluster_view(&self) { + loop { + match self.fetch_cluster_view().await { + Ok(()) => { + info!( + "Fetch cluster view finished, cluster:{}", + self.meta_config.cluster + ); + } + Err(e) => { + error!( + "Failed to fetch cluster view from meta, cluster:{}, error:{}", + self.meta_config.cluster, e + ); + } + } + + time::sleep(self.error_wait_lease()).await; + } + } + + async fn register_loop(&self) -> Result<()> { + let mut interval = time::interval(self.register_interval()); + let rpc_client = self.build_rpc_client()?; + + loop { + let resp = self.register(&rpc_client).await?; + info!( + "Register node successfully, cluster:{}, response:{:#?}", + self.meta_config.cluster, resp + ); + + interval.tick().await; + } + } + + async fn start_register(&self) { + loop { + if let Err(e) = self.register_loop().await { + error!( + "Failed to register node to meta, cluster:{}, error:{}", + self.meta_config.cluster, e + ); + + time::sleep(self.error_wait_lease()).await; + } + } + } + + async fn start_refresh_meta_addresses(&self) { + let mut interval = time::interval(self.fetch_view_interval()); + + loop { + match self.pull_meta_grpc_address().await { + Ok(()) => { + interval.tick().await; + } + Err(e) => { + error!( + "Failed to refresh meta addresses from meta, url:{}, error:{}", + self.members_url, e + ); + + time::sleep(self.error_wait_lease()).await + } + } + } + } +} + +/// Default meta client impl, 
#[async_trait]
impl MetaClient for MetaClientImpl {
    /// Start the meta client: fetch meta grpc addresses synchronously, then
    /// spawn the three background tasks (address refresh, node registration,
    /// cluster-view fetching) on `runtime`.
    async fn start(&self) -> Result<()> {
        info!(
            "Meta client is starting, config:{:?}",
            self.inner.meta_config
        );

        // Fail fast if the member list cannot be fetched at startup.
        self.inner.pull_meta_grpc_address().await?;

        let inner = self.inner.clone();
        self.runtime.spawn(async move {
            inner.start_refresh_meta_addresses().await;
        });

        let inner = self.inner.clone();
        self.runtime.spawn(async move {
            inner.start_register().await;
        });

        let inner = self.inner.clone();
        self.runtime.spawn(async move {
            inner.start_fetch_cluster_view().await;
        });

        info!("Meta client has started");

        Ok(())
    }

    /// Return the latest cluster view (cheap Arc clone of the shared state).
    fn get_cluster_view(&self) -> ClusterViewRef {
        self.inner.cluster_view.read().unwrap().clone()
    }
}
Load balancer + +use common_util::define_result; +use rand::Rng; +use snafu::{Backtrace, Snafu}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Meta Addresses empty.\nBacktrace:\n{}", backtrace))] + MetaAddressesEmpty { backtrace: Backtrace }, +} + +define_result!(Error); + +pub trait LoadBalancer { + fn select<'a>(&self, addresses: &'a [String]) -> Result<&'a String>; +} + +pub struct RandomLoadBalancer; + +impl LoadBalancer for RandomLoadBalancer { + fn select<'a>(&self, addresses: &'a [String]) -> Result<&'a String> { + if addresses.is_empty() { + return MetaAddressesEmpty.fail(); + } + + let len = addresses.len(); + if len == 1 { + return Ok(&addresses[0]); + } + let mut rng = rand::thread_rng(); + let idx = rng.gen_range(0, len); + + Ok(&addresses[idx]) + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_random_loadbalancer() { + let lb = RandomLoadBalancer; + let addresses = vec![ + "127.0.0.1:8080".to_string(), + "127.0.0.2:8080".to_string(), + "127.0.0.3:8080".to_string(), + "127.0.0.4:8080".to_string(), + "127.0.0.5:8080".to_string(), + ]; + for _idx in 0..100 { + let addr = lb.select(&addresses).unwrap(); + assert!(addresses.contains(addr)); + } + + // Empty case + assert!(lb.select(&[]).is_err()); + + let addresses = ["127.0.0.1:5000".to_string()]; + assert_eq!(&addresses[0], lb.select(&addresses).unwrap()); + } +} diff --git a/meta_client/src/static_client.rs b/meta_client/src/static_client.rs new file mode 100644 index 0000000000..8639100f53 --- /dev/null +++ b/meta_client/src/static_client.rs @@ -0,0 +1,86 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Static meta client. + +use std::{collections::HashMap, sync::Arc}; + +use async_trait::async_trait; +use log::info; + +use crate::{ + ClusterView, ClusterViewConfig, ClusterViewRef, MetaClient, MetaClientConfig, MetaWatcherPtr, + Node, Result, ShardView, +}; + +/// Static meta client. 
+pub struct StaticMetaClient { + cluster_view: ClusterViewRef, + watcher: Option, +} + +impl StaticMetaClient { + pub fn new(config: MetaClientConfig, watcher: Option) -> Self { + let cluster_view = match new_cluster_view(&config.cluster_view) { + Some(v) => v, + None => cluster_view_without_meta(&config.node, config.port), + }; + + Self { + cluster_view: Arc::new(cluster_view), + watcher, + } + } +} + +#[async_trait] +impl MetaClient for StaticMetaClient { + async fn start(&self) -> Result<()> { + info!( + "File meta client is starting, cluster_view:{:?}", + self.cluster_view + ); + + info!("File meta client invoke watcher"); + + if let Some(w) = &self.watcher { + w.on_change(self.cluster_view.clone()).await?; + } + + info!("File meta client has started"); + + Ok(()) + } + + fn get_cluster_view(&self) -> ClusterViewRef { + self.cluster_view.clone() + } +} + +fn new_cluster_view(config: &ClusterViewConfig) -> Option { + if config.schema_shards.is_empty() { + return None; + } + + Some(config.to_cluster_view()) +} + +fn cluster_view_without_meta(addr: &str, port: u16) -> ClusterView { + let shard_id = 0; + let mut static_shards = HashMap::new(); + static_shards.insert( + shard_id, + ShardView { + shard_id, + node: Node { + addr: addr.to_string(), + port: u32::from(port), + }, + }, + ); + let mut schema_shards = HashMap::new(); + schema_shards.insert(catalog::consts::DEFAULT_SCHEMA.to_string(), static_shards); + ClusterView { + schema_shards, + schema_configs: HashMap::default(), + } +} diff --git a/meta_client_v2/Cargo.toml b/meta_client_v2/Cargo.toml new file mode 100644 index 0000000000..6ca7a6338a --- /dev/null +++ b/meta_client_v2/Cargo.toml @@ -0,0 +1,26 @@ +[package] +name = "meta_client_v2" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +async-trait = "0.1.41" +catalog = { path = "../catalog" } +common_types = { path = "../common_types" } +table_engine = 
{ path = "../table_engine" } +common_util = { path = "../common_util" } +ceresdbproto = { git = "https://github.com/CeresDB/ceresdbproto.git"} +futures = "0.3" +grpcio = { path = "../grpcio" } +log = "0.4" +protobuf = "2.20" +rand = "0.7" +reqwest = "0.11" +serde = "1.0" +serde_derive = "1.0.81" +serde_json = "1.0.60" +snafu = { version ="0.6.10", features = ["backtraces"]} +tokio = { version = "1.0", features = ["full"] } +url = "2.2" diff --git a/meta_client_v2/src/lib.rs b/meta_client_v2/src/lib.rs new file mode 100644 index 0000000000..4dd4244c12 --- /dev/null +++ b/meta_client_v2/src/lib.rs @@ -0,0 +1,676 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Client to communicate with meta + +use std::{ + sync::{Arc, RwLock as StdRwLock}, + time::Duration, +}; + +use async_trait::async_trait; +use ceresdbproto::{ + metagrpcV2::{ + AllocSchemaIdRequest as PbAllocSchemaIdRequest, + AllocTableIdRequest as PbAllocTableIdRequest, DropTableRequest as PbDropTableRequest, + GetTablesRequest as PbGetTablesRequest, NodeHeartbeatRequest as PbNodeHeartbeatRequest, + NodeHeartbeatResponse as PbNodeHeartbeatResponse, + }, + metagrpcV2_grpc::CeresmetaRpcServiceClient, +}; +use common_types::bytes::Bytes; +use common_util::{config::ReadableDuration, define_result, runtime::Runtime}; +use futures::{SinkExt, TryStreamExt}; +use grpcio::{ + CallOption, ChannelBuilder, ClientDuplexReceiver, ClientDuplexSender, Environment, WriteFlags, +}; +use load_balance::{LoadBalancer, RandomLoadBalancer}; +use log::{error, info, warn}; +use reqwest::{self, StatusCode, Url}; +use serde::de::DeserializeOwned; +use serde_derive::Deserialize; +use snafu::{Backtrace, ResultExt, Snafu}; +use tokio::{ + sync::{mpsc::Sender, RwLock}, + time, +}; +pub use types::*; + +mod load_balance; +mod types; + +#[derive(Debug, Snafu)] +#[snafu(visibility = "pub")] +pub enum Error { + #[snafu(display("Build http client failed, err:{}.\nBacktrace:\n{}", source, backtrace))] + 
BuildHttpClient { + source: reqwest::Error, + backtrace: Backtrace, + }, + + #[snafu(display( + "Invalid meta addr, addr:{}, err:{}.\nBacktrace:\n{}", + meta_addr, + source, + backtrace + ))] + InvalidMetaAddr { + meta_addr: String, + source: url::ParseError, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to join url, input:{}, err:{}.\nBacktrace:\n{}", + input, + source, + backtrace + ))] + JoinUrl { + input: String, + source: url::ParseError, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to send http request, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + SendHttp { + source: reqwest::Error, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to parse http text, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + ParseText { + source: reqwest::Error, + backtrace: Backtrace, + }, + + #[snafu(display( + "Bad http status, status:{}, url:{}, text:{:?}.\nBacktrace:\n{}", + status, + url, + text, + backtrace + ))] + BadHttpStatus { + status: StatusCode, + url: String, + text: Bytes, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to parse json text, text:{:?}, err:{}.\nBacktrace:\n{}", + text, + source, + backtrace + ))] + ParseJson { + text: Bytes, + source: serde_json::Error, + backtrace: Backtrace, + }, + + #[snafu(display( + "Failed to fetch action cmd, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + FetchActionCmdError { + source: grpcio::Error, + backtrace: Backtrace, + }, + + #[snafu(display("Encountered build rpc client, err:{}", source))] + BuildRpcClientError { source: load_balance::Error }, + + #[snafu(display("Failed to get grpc client, grpc client is none, msg:{}", msg))] + FailGetGrpcClient { msg: String }, + + #[snafu(display("Failed to send heartbeat, cluster:{}, err:{}", cluster, source))] + FailSendHeartbeat { + cluster: String, + source: Box, + }, + + #[snafu(display( + "Failed to notify action cmd, action cmd:{:?}, err:{}", + action_cmd, + source + ))] + FailNotifyActionCmd { + 
/// Meta client abstraction
#[async_trait]
pub trait MetaClient {
    /// Start the meta client
    async fn start(&self) -> Result<()>;

    /// Allocate a schema id for the schema named in the request.
    async fn alloc_schema_id(&self, _: AllocSchemaIdRequest) -> Result;

    /// Allocate a table id (and shard assignment) for the named table.
    async fn alloc_table_id(&self, _: AllocTableIdRequest) -> Result;

    /// Drop the named table on the meta side.
    async fn drop_table(&self, _: DropTableRequest) -> Result;

    /// Fetch the tables belonging to the given shards.
    async fn get_tables(&self, _: GetTablesRequest) -> Result;

    /// Report this node's shard info to meta over the heartbeat channel.
    async fn send_heartbeat(&self, _: Vec) -> Result<()>;
}
StdRwLock>, + http_client: reqwest::Client, + balancer: Box, + meta_config: MetaClientConfig, + node_meta_info: NodeMetaInfo, + members_url: Url, + + grpc_client: RwLock>, + + notify_sender: Option>, +} + +impl MetaClientImplInner { + fn new( + meta_config: MetaClientConfig, + node_meta_info: NodeMetaInfo, + sender: Option>, + ) -> Result { + let http_client = reqwest::Client::builder() + .timeout(Duration::from(meta_config.timeout)) + .build() + .context(BuildHttpClient)?; + + let members_url = Url::parse(&meta_config.meta_addr) + .context(InvalidMetaAddr { + meta_addr: &meta_config.meta_addr, + })? + .join(format!("{}/", DEFAULT_META_URL_VERSION).as_str()) + .unwrap() + .join(&meta_config.meta_members_url) + .context(JoinUrl { + input: &meta_config.meta_members_url, + })?; + + let client = Self { + meta_grpc_address: StdRwLock::new(Vec::new()), + http_client, + balancer: Box::new(RandomLoadBalancer), + meta_config, + node_meta_info, + members_url, + grpc_client: RwLock::new(None), + notify_sender: sender, + }; + + Ok(client) + } + + fn request_header(&self) -> RequestHeader { + RequestHeader { + node: self.node_meta_info.node.to_string(), + cluster_name: self.meta_config.cluster_name.clone(), + } + } + + fn node_meta_info(&self) -> NodeMetaInfo { + self.node_meta_info.clone() + } + + fn get_cluster_name(&self) -> &str { + // let a :Option=None; + + self.meta_config.cluster_name.as_str() + } + + fn connect_grpc_client(&self) -> Result { + let client = self.build_rpc_client()?; + let (sender, receiver) = client + .node_heartbeat_opt(CallOption::default()) + .context(FetchActionCmdError)?; + Ok(GrpcClient { + client, + heartbeat_channel: NodeHeartbeatChannel { + heartbeat_sender: sender, + action_cmd_receiver: Some(receiver), + }, + }) + } + + async fn reconnect_heartbeat_channel(&self) { + let grpc_client = &mut *self.grpc_client.write().await; + loop { + match self.connect_grpc_client() { + Ok(client) => { + *grpc_client = Some(client); + return; + } + Err(e) => { 
+ error!("Grpc reconnect failed, error:{}", e); + time::sleep(self.error_wait_lease()).await; + } + } + } + } + + fn meta_addresses(&self) -> Vec { + self.meta_grpc_address.read().unwrap().clone() + } + + fn build_rpc_client(&self) -> Result { + let meta_addresses = self.meta_addresses(); + let meta_rpc_addr = self + .balancer + .select(&meta_addresses) + .context(BuildRpcClientError)?; + + let cb = ChannelBuilder::new(Arc::new(Environment::new(self.meta_config.cq_count))); + Ok(CeresmetaRpcServiceClient::new(cb.connect(meta_rpc_addr))) + } + + async fn get_bytes_from_url(&self, url: Url) -> Result { + let resp = self + .http_client + .get(self.members_url.clone()) + .send() + .await + .context(SendHttp)?; + let status = resp.status(); + let text = resp.bytes().await.context(ParseText)?; + + if status.is_success() { + info!( + "Get bytes from url success, status:{}, url:{}, bytes:{:?}", + status, url, text + ); + + Ok(text) + } else { + error!( + "Failed to get bytes from url, status:{}, url:{}, bytes:{:?}", + status, url, text + ); + + BadHttpStatus { status, url, text }.fail() + } + } + + async fn get_from_url(&self, url: Url) -> Result { + let full = self.get_bytes_from_url(url).await?; + + serde_json::from_slice(&full).context(ParseJson { text: full }) + } + + async fn pull_meta_grpc_address(&self) -> Result<()> { + let addresses: Vec = self.get_from_url(self.members_url.clone()).await?; + + *self.meta_grpc_address.write().unwrap() = addresses; + + Ok(()) + } + + // TODO(yingwen): Store the value in field + fn error_wait_lease(&self) -> Duration { + Duration::from_secs(self.meta_config.lease.as_secs() / 2) + } + + fn fetch_view_interval(&self) -> Duration { + Duration::from_secs(self.meta_config.lease.as_secs() * 3) + } + + async fn start_refresh_meta_addresses(&self) { + let mut interval = time::interval(self.fetch_view_interval()); + + loop { + match self.pull_meta_grpc_address().await { + Ok(()) => { + interval.tick().await; + } + Err(e) => { + error!( + 
    /// Drain heartbeat responses from `receiver` until the stream ends,
    /// forwarding each contained action command to `notify_sender`.
    ///
    /// Responses with a failed header are logged and skipped; a broken
    /// stream surfaces as `FetchActionCmdError`.
    async fn fetch_action_cmd(
        &self,
        mut receiver: ClientDuplexReceiver,
    ) -> Result<()> {
        while let Some(resp) = receiver.try_next().await.context(FetchActionCmdError)? {
            info!(
                "Fetch action cmd from meta, cluster:{}, action_cmd:{:?}",
                self.get_cluster_name(),
                resp,
            );
            // Without a notify sender there is nobody to forward commands to,
            // so the response is only logged above.
            if let Some(notify_sender) = &self.notify_sender {
                let resp: NodeHeartbeatResponse = resp.into();
                if let Err(e) = check_response_header(&resp.header) {
                    error!("Fetch action cmd failed, err:{}", e);
                    continue;
                }
                if let Some(action_cmd) = resp.action_cmd {
                    // The clone keeps `action_cmd` available for the error
                    // log below if the channel send fails.
                    if let Err(e) = notify_sender.send(action_cmd.clone()).await {
                        error!(
                            "Notify sender send failed, action cmd:{:?}, err:{}",
                            action_cmd, e
                        );
                    }
                } else {
                    warn!("Fetch action cmd is empty, resp:{:?}", resp)
                }
            }
        }

        Ok(())
    }
    /// Start the background meta-client tasks.
    ///
    /// Synchronously pulls the meta grpc addresses and (re)connects the
    /// heartbeat channel, then spawns two long-running tasks on `runtime`:
    /// one refreshing meta addresses, one consuming action commands.
    async fn start(&self) -> Result<()> {
        info!(
            "Meta client is starting, config:{:?}",
            self.inner.meta_config
        );

        // Fail fast if the member list cannot be fetched at startup;
        // reconnect_heartbeat_channel retries internally until it succeeds.
        self.inner.pull_meta_grpc_address().await?;
        self.inner.reconnect_heartbeat_channel().await;

        let inner = self.inner.clone();
        self.runtime.spawn(async move {
            inner.start_refresh_meta_addresses().await;
        });

        let inner = self.inner.clone();
        self.runtime.spawn(async move {
            inner.start_fetch_action_cmd().await;
        });

        info!("Meta client has started");

        Ok(())
    }
+ .await + .map_err(|e| Box::new(e) as _) + .context(FailAllocSchemaId)?; + let resp: AllocSchemaIdResponse = pb_resp.into(); + check_response_header(&resp.header)?; + Ok(resp) + } else { + FailGetGrpcClient { + msg: "alloc schema id".to_string(), + } + .fail() + } + } + + async fn alloc_table_id(&self, req: AllocTableIdRequest) -> Result { + if let Some(grpc_client) = &mut *self.inner.grpc_client.write().await { + let mut pb_req: PbAllocTableIdRequest = req.into(); + pb_req.set_header(self.inner.request_header().into()); + let pb_resp = grpc_client + .client + .alloc_table_id_async_opt(&pb_req, CallOption::default()) + .map_err(|e| Box::new(e) as _) + .context(FailAllocTableId)? + .await + .map_err(|e| Box::new(e) as _) + .context(FailAllocTableId)?; + let resp: AllocTableIdResponse = pb_resp.into(); + check_response_header(&resp.header)?; + Ok(resp) + } else { + FailGetGrpcClient { + msg: "alloc table id".to_string(), + } + .fail() + } + } + + async fn drop_table(&self, req: DropTableRequest) -> Result { + if let Some(grpc_client) = &mut *self.inner.grpc_client.write().await { + let mut pb_req: PbDropTableRequest = req.into(); + pb_req.set_header(self.inner.request_header().into()); + let pb_resp = grpc_client + .client + .drop_table_async_opt(&pb_req, CallOption::default()) + .map_err(|e| Box::new(e) as _) + .context(FailDropTable)? 
+ .await + .map_err(|e| Box::new(e) as _) + .context(FailDropTable)?; + let resp: DropTableResponse = pb_resp.into(); + check_response_header(&resp.header)?; + Ok(resp) + } else { + FailGetGrpcClient { + msg: "drop table".to_string(), + } + .fail() + } + } + + async fn get_tables(&self, req: GetTablesRequest) -> Result { + if let Some(grpc_client) = &mut *self.inner.grpc_client.write().await { + let mut pb_req: PbGetTablesRequest = req.into(); + pb_req.set_header(self.inner.request_header().into()); + let pb_resp = grpc_client + .client + .get_tables_async_opt(&pb_req, CallOption::default()) + .map_err(|e| Box::new(e) as _) + .context(FailGetTables)? + .await + .map_err(|e| Box::new(e) as _) + .context(FailGetTables)?; + let resp: GetTablesResponse = pb_resp.into(); + check_response_header(&resp.header)?; + Ok(resp) + } else { + FailGetGrpcClient { + msg: "get tables".to_string(), + } + .fail() + } + } + + async fn send_heartbeat(&self, shards_info: Vec) -> Result<()> { + if let Some(grpc_client) = &mut *self.inner.grpc_client.write().await { + info!( + "Meta client send heartbeat, cluster:{}, shards_info:{:?}", + self.inner.get_cluster_name(), + shards_info + ); + let mut pb_request = PbNodeHeartbeatRequest::new(); + pb_request.set_header(self.inner.request_header().into()); + let node_info = NodeInfo { + node_meta_info: self.inner.node_meta_info(), + shards_info, + }; + pb_request.set_info(node_info.into()); + if let Err(e) = grpc_client + .heartbeat_channel + .heartbeat_sender + .send((pb_request, WriteFlags::default())) + .await + .map_err(|e| Box::new(e) as _) + .context(FailSendHeartbeat { + cluster: self.inner.get_cluster_name(), + }) + { + self.inner.reconnect_heartbeat_channel().await; + return Err(e); + }; + } else { + error!("Grpc_client is none"); + } + + Ok(()) + } +} + +fn check_response_header(header: &ResponseHeader) -> Result<()> { + if header.success { + Ok(()) + } else { + Meta { + header: header.clone(), + } + .fail() + } +} + +/// Create a meta 
client with given `config`. +pub fn build_meta_client( + config: MetaClientConfig, + node_meta_info: NodeMetaInfo, + runtime: Arc, + sender: Option>, +) -> Result> { + let meta_client = MetaClientImpl::new(config, node_meta_info, runtime, sender)?; + Ok(Arc::new(meta_client)) +} diff --git a/meta_client_v2/src/load_balance.rs b/meta_client_v2/src/load_balance.rs new file mode 100644 index 0000000000..707fb08d98 --- /dev/null +++ b/meta_client_v2/src/load_balance.rs @@ -0,0 +1,65 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Load balancer + +use common_util::define_result; +use rand::Rng; +use snafu::{Backtrace, Snafu}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Meta Addresses empty.\nBacktrace:\n{}", backtrace))] + MetaAddressesEmpty { backtrace: Backtrace }, +} + +define_result!(Error); + +pub trait LoadBalancer { + fn select<'a>(&self, addresses: &'a [String]) -> Result<&'a String>; +} + +pub struct RandomLoadBalancer; + +impl LoadBalancer for RandomLoadBalancer { + fn select<'a>(&self, addresses: &'a [String]) -> Result<&'a String> { + if addresses.is_empty() { + return MetaAddressesEmpty.fail(); + } + + let len = addresses.len(); + if len == 1 { + return Ok(&addresses[0]); + } + let mut rng = rand::thread_rng(); + let idx = rng.gen_range(0, len); + + Ok(&addresses[idx]) + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_random_loadbalancer() { + let lb = RandomLoadBalancer; + let addresses = vec![ + "127.0.0.1:8080".to_string(), + "127.0.0.2:8080".to_string(), + "127.0.0.3:8080".to_string(), + "127.0.0.4:8080".to_string(), + "127.0.0.5:8080".to_string(), + ]; + for _idx in 0..100 { + let addr = lb.select(&addresses).unwrap(); + assert!(addresses.contains(addr)); + } + + // Empty case + assert!(lb.select(&[]).is_err()); + + let addresses = ["127.0.0.1:5000".to_string()]; + assert_eq!(&addresses[0], lb.select(&addresses).unwrap()); + } +} diff --git a/meta_client_v2/src/types.rs 
b/meta_client_v2/src/types.rs new file mode 100644 index 0000000000..7f558feec7 --- /dev/null +++ b/meta_client_v2/src/types.rs @@ -0,0 +1,458 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::collections::HashMap; + +use ceresdbproto::{ + metaV2::ShardRole as PbShardRole, + metagrpcV2::{ + AllocSchemaIdRequest as PbAllocSchemaIdRequest, + AllocSchemaIdResponse as PbAllocSchemaIdResponse, + AllocTableIdRequest as PbAllocTableIdRequest, + AllocTableIdResponse as PbAllocTableIdResponse, ChangeRoleCmd as PbChangeRoleCmd, + CloseCmd as PbCloseCmd, DropTableRequest as PbDropTableRequest, + DropTableResponse as PbDropTableResponse, Error as PbError, ErrorType as PbErrorType, + GetTablesRequest as PbGetTablesRequest, GetTablesResponse as PbGetTablesResponse, + NodeHeartbeatResponse as PbNodeHeartbeatResponse, NodeHeartbeatResponse_oneof_cmd, + NodeInfo as PbNodeInfo, NoneCmd as PbNoneCmd, OpenCmd as PbOpenCmd, + RequestHeader as PbRequestHeader, ResponseHeader as PbResponseHeader, + ShardInfo as PbShardInfo, ShardTables as PbShardTables, SplitCmd as PbSplitCmd, + TableInfo as PbTableInfo, + }, +}; +use common_util::config::ReadableDuration; +use serde_derive::Deserialize; + +pub type TableId = u64; +pub type ShardId = u32; +pub type SchemaId = u32; + +#[derive(Debug, Clone)] +pub struct RequestHeader { + pub node: String, + pub cluster_name: String, +} + +#[derive(Debug, Clone)] +pub struct ResponseHeader { + pub success: bool, + pub error: ResponseError, +} + +#[derive(Debug, Clone)] +pub struct ResponseError { + pub error_type: ErrorType, + pub message: String, +} + +#[derive(Debug, Clone)] +pub enum ErrorType { + UNKNOWN, +} + +pub struct AllocSchemaIdRequest { + pub name: String, +} + +pub struct AllocSchemaIdResponse { + pub header: ResponseHeader, + + pub name: String, + pub id: SchemaId, +} + +pub struct AllocTableIdRequest { + pub schema_name: String, + pub name: String, +} + +pub struct AllocTableIdResponse { + pub header: 
ResponseHeader, + + pub schema_name: String, + pub name: String, + pub shard_id: ShardId, + pub schema_id: SchemaId, + pub id: TableId, +} + +pub struct DropTableRequest { + pub schema_name: String, + pub name: String, +} + +pub struct DropTableResponse { + pub header: ResponseHeader, +} + +#[derive(Clone, Debug)] +pub struct GetTablesRequest { + pub shard_ids: Vec, +} + +#[derive(Clone, Debug)] +pub struct GetTablesResponse { + pub header: ResponseHeader, + + pub tables_map: HashMap, +} + +#[derive(Clone, Debug)] +pub struct TableInfo { + pub id: TableId, + pub name: String, + pub schema_id: SchemaId, + pub schema_name: String, +} + +#[derive(Clone, Debug)] +pub struct ShardTables { + pub role: ShardRole, + pub tables: Vec, +} + +#[derive(Debug)] +struct NodeHeartbeatRequest { + info: NodeInfo, +} + +#[derive(Debug, Clone, Default, Deserialize)] +pub struct Node { + pub addr: String, + pub port: u16, +} + +impl ToString for Node { + fn to_string(&self) -> String { + format!("{}:{}", self.addr, self.port) + } +} + +#[derive(Debug, Default, Clone, Deserialize)] +pub struct NodeMetaInfo { + pub node: String, + pub zone: String, + pub idc: String, + pub binary_version: String, +} + +#[derive(Debug, Clone)] +pub struct NodeInfo { + pub node_meta_info: NodeMetaInfo, + pub shards_info: Vec, +} + +#[derive(Debug)] +pub struct NodeHeartbeatResponse { + pub header: ResponseHeader, + + pub timestamp: u64, + pub action_cmd: Option, +} + +#[derive(Debug, Clone)] +pub struct ShardInfo { + pub shard_id: ShardId, + pub role: ShardRole, +} + +#[derive(Debug, Copy, Clone)] +pub enum ShardRole { + LEADER, + FOLLOWER, +} + +#[derive(Debug, Clone)] +pub enum ActionCmd { + NoneCmd(NoneCmd), + OpenCmd(OpenCmd), + SplitCmd(SplitCmd), + CloseCmd(CloseCmd), + ChangeRoleCmd(ChangeRoleCmd), +} + +#[derive(Debug, Clone)] +pub struct NoneCmd {} + +#[derive(Debug, Clone)] +pub struct OpenCmd { + pub shard_ids: Vec, +} + +#[derive(Debug, Clone)] +pub struct SplitCmd {} + +#[derive(Debug, Clone)] 
+pub struct CloseCmd { + pub shard_ids: Vec, +} + +#[derive(Debug, Clone)] +pub struct ChangeRoleCmd {} + +#[derive(Debug, Deserialize, Clone)] +#[serde(default)] +pub struct MetaClientConfig { + pub cluster_name: String, + pub meta_addr: String, + pub meta_members_url: String, + pub lease: ReadableDuration, + pub timeout: ReadableDuration, + pub cq_count: usize, + + /// + /// - If `enable_meta` is true, the client will fetch cluster view from + /// remote meta ndoe. + /// - If `enable_meta` is false, the client will try to read cluster view + /// from `cluster_view`. + pub enable_meta: bool, +} + +impl Default for MetaClientConfig { + fn default() -> Self { + Self { + cluster_name: String::new(), + meta_addr: "http://127.0.0.1:8080".to_string(), + meta_members_url: "ceresmeta/members".to_string(), + lease: ReadableDuration::secs(10), + timeout: ReadableDuration::secs(5), + cq_count: 8, + enable_meta: true, + } + } +} + +impl From for PbNodeInfo { + fn from(node_info: NodeInfo) -> Self { + let mut pb_node_info = PbNodeInfo::new(); + pb_node_info.set_node(node_info.node_meta_info.node.to_string()); + pb_node_info.set_zone(node_info.node_meta_info.zone); + pb_node_info.set_binary_version(node_info.node_meta_info.binary_version); + pb_node_info.set_shardsInfo(protobuf::RepeatedField::from_vec( + node_info + .shards_info + .into_iter() + .map(|v| v.into()) + .collect(), + )); + pb_node_info + } +} + +impl From for PbShardInfo { + fn from(shard_info: ShardInfo) -> Self { + let mut pb_shard_info = PbShardInfo::new(); + pb_shard_info.set_shard_id(shard_info.shard_id); + pb_shard_info.set_role(shard_info.role.into()); + pb_shard_info + } +} + +impl From for PbShardRole { + fn from(shard_role: ShardRole) -> Self { + match shard_role { + ShardRole::LEADER => PbShardRole::LEADER, + ShardRole::FOLLOWER => PbShardRole::FOLLOWER, + } + } +} + +impl From for ShardRole { + fn from(pb: PbShardRole) -> Self { + match pb { + PbShardRole::LEADER => ShardRole::LEADER, + 
PbShardRole::FOLLOWER => ShardRole::FOLLOWER, + } + } +} + +impl From for NodeHeartbeatResponse { + fn from(mut pb: PbNodeHeartbeatResponse) -> Self { + let timestamp = pb.get_timestamp(); + NodeHeartbeatResponse { + header: pb.take_header().into(), + timestamp, + action_cmd: pb.cmd.map(|v| v.into()), + } + } +} + +impl From for ActionCmd { + fn from(pb: NodeHeartbeatResponse_oneof_cmd) -> Self { + match pb { + NodeHeartbeatResponse_oneof_cmd::none_cmd(_) => ActionCmd::NoneCmd(NoneCmd {}), + NodeHeartbeatResponse_oneof_cmd::open_cmd(v) => ActionCmd::OpenCmd(v.into()), + NodeHeartbeatResponse_oneof_cmd::split_cmd(v) => ActionCmd::SplitCmd(v.into()), + NodeHeartbeatResponse_oneof_cmd::close_cmd(v) => ActionCmd::CloseCmd(v.into()), + NodeHeartbeatResponse_oneof_cmd::change_role_cmd(v) => { + ActionCmd::ChangeRoleCmd(v.into()) + } + } + } +} + +impl From for NoneCmd { + fn from(_pb: PbNoneCmd) -> Self { + Self {} + } +} + +impl From for OpenCmd { + fn from(mut pb: PbOpenCmd) -> Self { + Self { + shard_ids: pb.take_shard_ids(), + } + } +} + +impl From for SplitCmd { + fn from(_pb: PbSplitCmd) -> Self { + Self {} + } +} + +impl From for CloseCmd { + fn from(mut pb: PbCloseCmd) -> Self { + Self { + shard_ids: pb.take_shard_ids(), + } + } +} + +impl From for ChangeRoleCmd { + fn from(_pb: PbChangeRoleCmd) -> Self { + Self {} + } +} + +impl From for PbGetTablesRequest { + fn from(req: GetTablesRequest) -> Self { + let mut pb = PbGetTablesRequest::new(); + pb.set_shard_id(req.shard_ids); + pb + } +} + +impl From for GetTablesResponse { + fn from(mut pb: PbGetTablesResponse) -> Self { + Self { + header: pb.take_header().into(), + tables_map: pb + .take_tables_map() + .into_iter() + .map(|(k, v)| (k, v.into())) + .collect(), + } + } +} + +impl From for ShardTables { + fn from(mut pb: PbShardTables) -> Self { + Self { + role: pb.get_role().into(), + tables: pb.take_tables().into_iter().map(|v| v.into()).collect(), + } + } +} + +impl From for TableInfo { + fn from(mut pb: 
PbTableInfo) -> Self { + TableInfo { + id: pb.get_id(), + name: pb.take_name(), + schema_id: pb.get_schema_id(), + schema_name: pb.take_schema_name(), + } + } +} + +impl From for PbRequestHeader { + fn from(req: RequestHeader) -> Self { + let mut pb = PbRequestHeader::new(); + pb.set_node(req.node); + pb.set_cluster_name(req.cluster_name); + pb + } +} + +impl From for ResponseHeader { + fn from(mut pb: PbResponseHeader) -> Self { + Self { + success: pb.get_success(), + error: pb.take_error().into(), + } + } +} + +impl From for ErrorType { + fn from(pb: PbErrorType) -> Self { + match pb { + PbErrorType::UNKNOWN => ErrorType::UNKNOWN, + } + } +} + +impl From for ResponseError { + fn from(mut pb: PbError) -> Self { + Self { + error_type: pb.get_error_type().into(), + message: pb.take_message(), + } + } +} + +impl From for PbAllocSchemaIdRequest { + fn from(req: AllocSchemaIdRequest) -> Self { + let mut pb = PbAllocSchemaIdRequest::new(); + pb.set_name(req.name); + pb + } +} + +impl From for AllocSchemaIdResponse { + fn from(mut pb: PbAllocSchemaIdResponse) -> Self { + Self { + header: pb.take_header().into(), + name: pb.take_name(), + id: pb.get_id(), + } + } +} + +impl From for PbAllocTableIdRequest { + fn from(req: AllocTableIdRequest) -> Self { + let mut pb = PbAllocTableIdRequest::new(); + pb.set_schema_name(req.schema_name); + pb.set_name(req.name); + pb + } +} + +impl From for AllocTableIdResponse { + fn from(mut pb: PbAllocTableIdResponse) -> Self { + Self { + header: pb.take_header().into(), + schema_name: pb.take_schema_name(), + name: pb.take_name(), + shard_id: pb.get_shard_id(), + schema_id: pb.get_schema_id(), + id: pb.get_id(), + } + } +} + +impl From for PbDropTableRequest { + fn from(req: DropTableRequest) -> Self { + let mut pb = PbDropTableRequest::new(); + pb.set_schema_name(req.schema_name); + pb.set_name(req.name); + pb + } +} + +impl From for DropTableResponse { + fn from(mut pb: PbDropTableResponse) -> Self { + Self { + header: 
pb.take_header().into(), + } + } +} diff --git a/proto/.gitignore b/proto/.gitignore new file mode 100644 index 0000000000..5eb2f8833d --- /dev/null +++ b/proto/.gitignore @@ -0,0 +1 @@ +src/protos diff --git a/proto/Cargo.toml b/proto/Cargo.toml new file mode 100644 index 0000000000..609680dd7f --- /dev/null +++ b/proto/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "proto" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +protobuf = "2.20" + +[build-dependencies.protobuf-builder] +git = "https://github.com/CeresDB/protobuf-builder.git" +rev = "745cc8527d1c5eb48745f5ce74b2b5bdb75c3bf2" diff --git a/proto/build.rs b/proto/build.rs new file mode 100644 index 0000000000..e992a9163c --- /dev/null +++ b/proto/build.rs @@ -0,0 +1,11 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use protobuf_builder::Builder; + +fn generate_pb() { + Builder::new().search_dir_for_protos("protos").generate(); +} + +fn main() { + generate_pb(); +} diff --git a/proto/protos/analytic_common.proto b/proto/protos/analytic_common.proto new file mode 100644 index 0000000000..c418296f99 --- /dev/null +++ b/proto/protos/analytic_common.proto @@ -0,0 +1,62 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +// Common protos of analytic engine +syntax = "proto3"; +package analytic_common; + +// Options of a table that need to persist +message TableOptions { + // Segment duration in ms. + uint64 segment_duration = 1; + bool enable_ttl = 2; + uint64 ttl = 3; + uint32 arena_block_size = 4; + uint64 num_rows_per_row_group = 5; + CompactionStrategy compaction_strategy= 6; + CompactionOptions compaction_options = 7; + UpdateMode update_mode = 8; + uint32 write_buffer_size = 9; + Compression compression = 10; + // If sampling_segment_duration is true, then the segment duration + // is still unknown. 
+ bool sampling_segment_duration = 11; +} + +enum UpdateMode { + Overwrite = 0; + Append = 1; +} + +message CompactionOptions { + // Options for STCS + float bucket_low = 1; + float bucket_high = 2; + uint32 min_sstable_size = 3; + uint32 min_threshold = 4; + uint32 max_threshold = 5; + // Options for TWCS + TimeUnit timestamp_resolution = 6; +} + +enum TimeUnit { + NANOSECONDS = 0; + MICROSECONDS = 1; + MILLISECONDS = 2; + SECONDS = 3; + MINUTES = 4; + HOURS = 5; + DAYS = 6; +} + +enum CompactionStrategy { + DEFAULT = 0; + SIZE_TIERED = 1; + TIME_WINDOW = 2; +} + +enum Compression { + UNCOMPRESSED = 0; + LZ4 = 1; + SNAPPY = 2; + ZSTD = 3; +} diff --git a/proto/protos/common.proto b/proto/protos/common.proto new file mode 100644 index 0000000000..dc917685a7 --- /dev/null +++ b/proto/protos/common.proto @@ -0,0 +1,63 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +// Common types +syntax = "proto3"; +package common; + +// Data type of column +// TODO(yingwen): Do we need a null type? 
+enum DataType { + NULL = 0; + TIMESTAMP = 1; + DOUBLE = 2; + VARBINARY = 3; + STRING = 4; + UINT64 = 5; + FLOAT = 6; + INT64 = 7; + INT32 = 8; + INT16 = 9; + INT8 = 10; + UINT32 = 11; + UINT16 = 12; + UINT8 = 13; + BOOL = 14; +} + +// Column schema +message ColumnSchema { + // Column name + string name = 1; + // Column type + DataType data_type = 2; + // Is the column nullable + bool is_nullable = 3; + // Id of the column + uint32 id = 4; + // Is the column used as tag + bool is_tag = 5; + // Comment of the column + string comment = 6; +} + +// Table Schema +message TableSchema { + // Schema of each column + repeated ColumnSchema columns = 1; + // Version of the schema + uint32 version = 2; + // Key column num + uint32 num_key_columns = 3; + // Timestamp index in columns + uint32 timestamp_index = 4; + // Enable auto generated tsid as primary key + bool enable_tsid_primary_key = 5; +} + +// Time range of [start, end) +message TimeRange { + // inclusive start + int64 start = 1; + // exclusive end + int64 end = 2; +} diff --git a/proto/protos/meta_update.proto b/proto/protos/meta_update.proto new file mode 100644 index 0000000000..64c2b384ad --- /dev/null +++ b/proto/protos/meta_update.proto @@ -0,0 +1,101 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +// Meta Updates of analytic engine +syntax = "proto3"; +package meta_update; + +import "analytic_common.proto"; +import "common.proto"; + +// Meta update for a new space +message AddSpaceMeta { + uint32 space_id = 1; + string space_name = 2; +} + +// Meta update for a new table +message AddTableMeta { + uint32 space_id = 1; + uint64 table_id = 2; + string table_name = 3; + // Schema of the table + common.TableSchema schema = 4; + // Options of the table + analytic_common.TableOptions options = 5; +} + +// Meta update for dropping a table +message DropTableMeta { + uint32 space_id = 1; + uint64 table_id = 2; + string table_name = 3; +} + +// Meta data of a sst file +message AddFileMeta { + // Level of the file + uint32 level = 1; + // Id of the file + uint64 file_id = 2; + bytes min_key = 3; + bytes max_key = 4; + uint64 max_seq = 5; + common.TimeRange time_range = 6; + common.TableSchema schema = 7; + uint64 size = 8; + uint64 row_num = 9; +} + +// Meta data of the file to delete +message DeleteFileMeta { + // Level of the file + uint32 level = 1; + // Id of the file + uint64 file_id = 2; +} + +// Meta data of version edit to table +message VersionEditMeta { + uint32 space_id = 1; + uint64 table_id = 2; + uint64 flushed_sequence = 3; + repeated AddFileMeta files_to_add = 4; + repeated DeleteFileMeta files_to_delete = 5; +} + +// Meta data of schema update. +message AlterSchemaMeta { + uint32 space_id = 1; + uint64 table_id = 2; + // New schema of the table. + common.TableSchema schema = 3; + // Previous schema version. + uint32 pre_schema_version = 4; +} + +// Meta data of schema update. +message AlterOptionsMeta { + uint32 space_id = 1; + uint64 table_id = 2; + // New options of the table. + analytic_common.TableOptions options = 3; +} + +// Meta data of manifest snapshot. 
+message SnapshotManifestMeta { + uint64 region_id = 1; + uint64 sequence = 2; +} + +// Meta update data to persist +message MetaUpdate { + oneof meta { + AddSpaceMeta add_space = 1; + AddTableMeta add_table = 2; + VersionEditMeta version_edit = 3; + AlterSchemaMeta alter_schema = 4; + AlterOptionsMeta alter_options = 5; + DropTableMeta drop_table = 6; + SnapshotManifestMeta snapshot_manifest = 7; + } +} diff --git a/proto/protos/sst.proto b/proto/protos/sst.proto new file mode 100644 index 0000000000..a1ab16e9a7 --- /dev/null +++ b/proto/protos/sst.proto @@ -0,0 +1,21 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +// Sst types +syntax = "proto3"; +package sst; + +import "common.proto"; + +message SstMetaData { + // Min key in the sst + bytes min_key = 1; + // Max key in the sst + bytes max_key = 2; + // Max sequence number in the sst + uint64 max_sequence = 3; + // The time range of the sst + common.TimeRange time_range = 4; + common.TableSchema schema = 5; + uint64 size = 6; + uint64 row_num = 7; +} diff --git a/proto/protos/sys_catalog.proto b/proto/protos/sys_catalog.proto new file mode 100644 index 0000000000..11cce62d06 --- /dev/null +++ b/proto/protos/sys_catalog.proto @@ -0,0 +1,55 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +// Types for sys catalog +syntax = "proto3"; +package sys_catalog; + +import "common.proto"; + +// Catalog entry +message CatalogEntry { + // Name of catalog + string catalog_name = 1; + // Created time: ms + int64 created_time = 2; +} + +// Schema entry +message SchemaEntry { + // Name of catalog + string catalog_name = 1; + // Name of schema + string schema_name = 2; + // Id of the schema + uint32 schema_id = 3; + // Created time: ms + int64 created_time = 4; +} + +// State of the table +enum TableState { + STABLE = 0; + DROPPING = 1; + DROPPED = 2; +} + +// Table entry +// TODO(yingwen): Add PartitionInfo +message TableEntry { + // Name of catalog + string catalog_name = 1; + // Name of schema + string schema_name = 2; + // Table id + uint64 table_id = 3; + // Table name + string table_name = 4; + // Table engine type + string engine = 5; + // The state of the table. + TableState state = 6; + // Created time: ms + int64 created_time = 7; + // Modified time: ms + int64 modified_time = 8; +} diff --git a/proto/protos/table_requests.proto b/proto/protos/table_requests.proto new file mode 100644 index 0000000000..a379299ef5 --- /dev/null +++ b/proto/protos/table_requests.proto @@ -0,0 +1,19 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +// Types for table requests +syntax = "proto3"; +package table_requests; + +import "common.proto"; + +// Write table request +message WriteRequest { + // Version of row encoding method + uint32 version = 1; + // Schema of rows + common.TableSchema schema = 2; + // Rows in bytes + // + // Each row is encoded in the same format as memtable + repeated bytes rows = 3; +} diff --git a/proto/src/lib.rs b/proto/src/lib.rs new file mode 100644 index 0000000000..d9d1e95e10 --- /dev/null +++ b/proto/src/lib.rs @@ -0,0 +1,10 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Protobuf messages + +// TODO(yingwen): All the protos need review +mod protos { + include!(concat!(env!("OUT_DIR"), "/protos/mod.rs")); +} + +pub use protos::*; diff --git a/query_engine/Cargo.toml b/query_engine/Cargo.toml new file mode 100644 index 0000000000..232992401c --- /dev/null +++ b/query_engine/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "query_engine" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +# In alphabetical order +arrow_deps = { path = "../arrow_deps" } +async-trait = "0.1.41" +common_types = { path = "../common_types" } +common_util = { path = "../common_util" } +futures = "0.3" +log = "0.4" +snafu = { version ="0.6.10", features = ["backtraces"]} +sql = { path = "../sql" } +table_engine = { path = "../table_engine" } +udf = { path = "../udf" } diff --git a/query_engine/src/context.rs b/query_engine/src/context.rs new file mode 100644 index 0000000000..9ebc825f84 --- /dev/null +++ b/query_engine/src/context.rs @@ -0,0 +1,121 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Query context + +use std::sync::Arc; + +use arrow_deps::datafusion::{ + execution::context::{ExecutionConfig, ExecutionContext}, + optimizer::{ + common_subexpr_eliminate::CommonSubexprEliminate, eliminate_limit::EliminateLimit, + filter_push_down::FilterPushDown, limit_push_down::LimitPushDown, optimizer::OptimizerRule, + projection_push_down::ProjectionPushDown, simplify_expressions::SimplifyExpressions, + }, + physical_optimizer::optimizer::PhysicalOptimizerRule, +}; +use common_types::request_id::RequestId; + +use crate::{ + df_planner_extension::QueryPlannerAdapter, + logical_optimizer::{ + order_by_primary_key::OrderByPrimaryKeyRule, type_conversion::TypeConversion, + }, + physical_optimizer, +}; + +/// Query context +pub struct Context { + request_id: RequestId, + df_exec_ctx: ExecutionContext, +} + +impl Context { + // For datafusion, internal use only + #[inline] + pub(crate) fn df_exec_ctx(&self) -> &ExecutionContext { + &self.df_exec_ctx + } + + #[inline] + pub fn request_id(&self) -> RequestId { + self.request_id + } + + pub fn builder(request_id: RequestId) -> Builder { + Builder { + request_id, + df_exec_config: ExecutionConfig::new(), + } + } +} + +pub type ContextRef = Arc; + +#[must_use] +pub struct Builder { + request_id: RequestId, + df_exec_config: ExecutionConfig, +} + +impl Builder { + /// Set default catalog and schema of this query context + pub fn default_catalog_and_schema(mut self, catalog: String, schema: String) -> Self { + self.df_exec_config = self + .df_exec_config + .with_default_catalog_and_schema(catalog, schema); + + self + } + + pub fn build(self) -> Context { + // Always create default catalog and schema now + let df_exec_config = { + let adapted_physical_optimize_rules = Self::apply_adapters_for_physical_optimize_rules( + &self.df_exec_config.physical_optimizers, + ); + let logical_optimize_rules = Self::logical_optimize_rules(); + self.df_exec_config + .with_query_planner(Arc::new(QueryPlannerAdapter)) + 
.with_optimizer_rules(logical_optimize_rules) + .with_physical_optimizer_rules(adapted_physical_optimize_rules) + }; + + Context { + request_id: self.request_id, + df_exec_ctx: ExecutionContext::with_config(df_exec_config), + } + } + + fn apply_adapters_for_physical_optimize_rules( + default_rules: &[Arc], + ) -> Vec> { + let mut new_rules = Vec::with_capacity(default_rules.len()); + for rule in default_rules { + new_rules.push(physical_optimizer::may_adapt_optimize_rule(rule.clone())) + } + + new_rules + } + + fn logical_optimize_rules() -> Vec> { + let mut optimizers: Vec> = vec![ + Arc::new(TypeConversion), + // These rules are the default settings of the datafusion. + Arc::new(SimplifyExpressions::new()), + Arc::new(CommonSubexprEliminate::new()), + Arc::new(EliminateLimit::new()), + Arc::new(ProjectionPushDown::new()), + Arc::new(FilterPushDown::new()), + Arc::new(LimitPushDown::new()), + // TODO(xikai): restore this rule after the bug of df is fixed. + // Arc::new(SingleDistinctToGroupBy::new()), + ]; + + // FIXME(xikai): use config to control the optimize rule. + if std::env::var("ENABLE_CUSTOM_OPTIMIZE").is_ok() { + optimizers.push(Arc::new(OrderByPrimaryKeyRule)); + } + + optimizers + } +} diff --git a/query_engine/src/df_execution_extension/mod.rs b/query_engine/src/df_execution_extension/mod.rs new file mode 100644 index 0000000000..746499e79a --- /dev/null +++ b/query_engine/src/df_execution_extension/mod.rs @@ -0,0 +1,4 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +pub mod prom_align; +pub use prom_align::PromAlignExec; diff --git a/query_engine/src/df_execution_extension/prom_align.rs b/query_engine/src/df_execution_extension/prom_align.rs new file mode 100644 index 0000000000..5e41f6e9af --- /dev/null +++ b/query_engine/src/df_execution_extension/prom_align.rs @@ -0,0 +1,931 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +use std::{ + any::Any, + collections::{hash_map, BTreeMap, HashMap, VecDeque}, + fmt, mem, + pin::Pin, + sync::Arc, + task::{Context, Poll}, +}; + +use arrow_deps::{ + arrow::{ + array::{ + new_empty_array, Float64Array, StringArray, TimestampMillisecondArray, UInt64Array, + }, + error::ArrowError, + record_batch::RecordBatch, + }, + datafusion::{ + error::{DataFusionError, Result as ArrowResult}, + execution::runtime_env::RuntimeEnv, + physical_plan::{ + repartition::RepartitionExec, ColumnarValue, DisplayFormatType, ExecutionPlan, + Partitioning, PhysicalExpr, RecordBatchStream, + SendableRecordBatchStream as DfSendableRecordBatchStream, Statistics, + }, + }, +}; +use async_trait::async_trait; +use common_types::{ + schema::{ArrowSchema, ArrowSchemaRef, DataType, TSID_COLUMN}, + time::{TimeRange, Timestamp}, +}; +use futures::{Stream, StreamExt}; +use log::debug; +use snafu::{OptionExt, ResultExt, Snafu}; +use sql::promql::{AlignParameter, ColumnNames, Func as PromFunc}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Internal err, source:{:?}", source))] + Internal { source: DataFusionError }, + + #[snafu(display("Invalid schema, source:{:?}", source))] + InvalidSchema { source: common_types::schema::Error }, + + #[snafu(display("Tsid column is required"))] + TsidRequired, + + #[snafu(display("Invalid column type, required:{:?}", required_type))] + InvalidColumnType { required_type: String }, + + #[snafu(display("{} column type cannot be null", name))] + NullColumn { name: String }, + + #[snafu(display("timestamp out of range"))] + TimestampOutOfRange {}, +} + +define_result!(Error); + +/// Limits Extrapolation range. 
+/// Refer to https://github.com/prometheus/prometheus/pull/1295 +const PROMTHEUS_EXTRAPOLATION_THRESHOLD_COEFFICIENT: f64 = 1.1; + +#[derive(Debug)] +struct ExtractTsidExpr {} + +impl fmt::Display for ExtractTsidExpr { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "(ExtractTsid)") + } +} + +impl PhysicalExpr for ExtractTsidExpr { + fn as_any(&self) -> &dyn Any { + &*self + } + + fn data_type(&self, _input_schema: &ArrowSchema) -> ArrowResult { + Ok(DataType::UInt64) + } + + fn nullable(&self, _input_schema: &ArrowSchema) -> ArrowResult { + Ok(false) + } + + fn evaluate(&self, batch: &RecordBatch) -> ArrowResult { + let tsid_idx = batch + .schema() + .index_of(TSID_COLUMN) + .expect("checked in plan build"); + Ok(ColumnarValue::Array(batch.column(tsid_idx).clone())) + } +} + +/// Note: caller should ensure data[tail_index] is valid +pub(crate) trait AlignFunc: fmt::Debug { + fn call( + &self, + data: &VecDeque, + tail_index: usize, + timestamp: Timestamp, + param: &AlignParameter, + ) -> Result>; +} + +/// PromAlignExec will group data by tsid and align sample based on align_param +#[derive(Debug)] +pub struct PromAlignExec { + input: Arc, + column_name: Arc, + align_func: Arc, + align_param: AlignParameter, +} + +impl PromAlignExec { + pub fn try_new( + input: Arc, + column_name: Arc, + func: PromFunc, + align_param: AlignParameter, + read_parallelism: usize, + ) -> Result { + let extract_tsid: Arc = Arc::new(ExtractTsidExpr {}); + let input = Arc::new( + RepartitionExec::try_new( + input, + Partitioning::Hash(vec![extract_tsid], read_parallelism), + ) + .context(Internal)?, + ) as Arc; + let align_func: Arc = match func { + PromFunc::Instant => Arc::new(InstantFunc {}), + PromFunc::Rate => Arc::new(RateFunc {}), + PromFunc::Irate => Arc::new(IrateFunc {}), + PromFunc::Delta => Arc::new(DeltaFunc {}), + PromFunc::Idelta => Arc::new(IdeltaFunc {}), + PromFunc::Increase => Arc::new(IncreaseFunc {}), + }; + Ok(Self { + input, + column_name, 
+ align_func, + align_param, + }) + } +} + +#[async_trait] +impl ExecutionPlan for PromAlignExec { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> ArrowSchemaRef { + self.input.schema() + } + + fn output_partitioning(&self) -> Partitioning { + self.input.output_partitioning() + } + + fn children(&self) -> Vec> { + vec![self.input.clone()] + } + + fn with_new_children( + &self, + children: Vec>, + ) -> ArrowResult> { + match children.len() { + 1 => Ok(Arc::new(PromAlignExec { + input: children[0].clone(), + column_name: self.column_name.clone(), + align_func: self.align_func.clone(), + align_param: self.align_param, + })), + _ => Err(DataFusionError::Internal( + "PromAlignExec wrong number of children".to_string(), + )), + } + } + + async fn execute( + &self, + partition: usize, + runtime: Arc, + ) -> ArrowResult { + debug!("PromAlignExec: partition:{}", partition); + Ok(Box::pin(PromAlignReader { + input: self.input.execute(partition, runtime).await?, + done: false, + column_name: self.column_name.clone(), + align_func: self.align_func.clone(), + align_param: self.align_param, + tsid_to_tags: HashMap::default(), + tsid_to_stepper: HashMap::default(), + record_schema: None, + })) + } + + fn fmt_as(&self, _t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "PromAlignExec: align_param={:?}, func={:?}, partition_count={}", + self.align_param, + self.align_func, + self.output_partitioning().partition_count(), + ) + } + + fn statistics(&self) -> Statistics { + // TODO(chenxiang) + Statistics::default() + } +} + +struct PromAlignReader { + /// The input to read data from + input: DfSendableRecordBatchStream, + /// Have we produced the output yet? 
+ done: bool, + column_name: Arc, + align_func: Arc, + align_param: AlignParameter, + tsid_to_tags: HashMap>, + tsid_to_stepper: HashMap>, + record_schema: Option, +} + +impl PromAlignReader { + fn step_helper(&mut self, tsid: u64, samples: Vec) -> Result>> { + let start_timestamp = self.align_param.align_range.inclusive_start(); + let offset = self.align_param.offset; + let stepper = self.tsid_to_stepper.entry(tsid).or_insert_with(|| { + Box::new(FixedStepper::new(start_timestamp)) as Box + }); + let samples = samples + .into_iter() + .map(|Sample { timestamp, value }| { + Ok(Sample { + timestamp: timestamp + .checked_add(offset) + .context(TimestampOutOfRange {})?, + value, + }) + }) + .collect::>>()?; + let sample_range = if samples.is_empty() { + TimeRange::min_to_max() + } else { + TimeRange::new_unchecked( + samples.front().unwrap().timestamp, // we have at least one samples here + samples + .back() + .unwrap() + .timestamp + .checked_add_i64(1) + .context(TimestampOutOfRange {})?, + ) + }; + stepper.step( + samples, + sample_range, + &self.align_param, + self.align_func.clone(), + ) + } + + fn accumulate_record_batch( + &mut self, + record_batch: RecordBatch, + ) -> Result>> { + let schema = record_batch.schema(); + let tsid_idx = schema.index_of(TSID_COLUMN).expect("checked in plan build"); + let field_idx = schema + .index_of(&self.column_name.field) + .expect("checked in plan build"); + let timestamp_idx = schema + .index_of(&self.column_name.timestamp) + .expect("checked in plan build"); + + let mut tsid_samples = HashMap::new(); + let tsid_array = record_batch + .column(tsid_idx) + .as_any() + .downcast_ref::() + .expect("checked in build plan"); + if tsid_array.is_empty() { + // empty array means end of data, but maybe there are still pending samples, so + // step one more time + let tsids = self.tsid_to_stepper.keys().cloned().collect::>(); + for tsid in tsids { + if let Some(result) = self.step_helper(tsid, vec![])? 
{ + tsid_samples.insert(tsid, result); + } + } + return Ok(tsid_samples); + } + + let mut previous_tsid = tsid_array.value(0); + let mut duplicated_tsids = vec![(previous_tsid, 0)]; + for row_idx in 1..tsid_array.len() { + let tsid = tsid_array.value(row_idx); + if tsid != previous_tsid { + previous_tsid = tsid; + duplicated_tsids.push((tsid, row_idx)); + } + } + let mut step_helper = |tsid, batch| { + if let hash_map::Entry::Vacant(e) = self.tsid_to_tags.entry(tsid) { + e.insert(Self::build_tags( + &self.column_name.tag_keys, + schema.clone(), + &batch, + )?); + } + if let Some(result) = + self.step_helper(tsid, self.build_sample(field_idx, timestamp_idx, batch)?)? + { + tsid_samples.insert(tsid, result); + } + Ok(()) + }; + if duplicated_tsids.len() == 1 { + // fast path, when there is only one tsid in record_batch + step_helper(duplicated_tsids[0].0, record_batch)?; + } else { + debug!("duplicated_tsids:{:?}", duplicated_tsids); + for i in 0..duplicated_tsids.len() { + let (tsid, offset) = duplicated_tsids[i]; + let length = if i == duplicated_tsids.len() - 1 { + tsid_array.len() - offset + } else { + duplicated_tsids[i + 1].1 - offset + }; + let current_batch = record_batch.slice(offset, length); + step_helper(tsid, current_batch)?; + } + } + + Ok(tsid_samples) + } + + fn build_tags( + tag_keys: &[String], + schema: ArrowSchemaRef, + record_batch: &RecordBatch, + ) -> Result> { + tag_keys + .iter() + .map(|key| { + let v = record_batch + .column(schema.index_of(key).expect("checked in build plan")) + .as_any() + .downcast_ref::() + .context(InvalidColumnType { + required_type: "StringArray", + })? 
+ .value(0); + Ok((key.to_owned(), v.to_string())) + }) + .collect::>>() + } + + fn build_sample( + &self, + field_idx: usize, + timestamp_idx: usize, + record_batch: RecordBatch, + ) -> Result> { + let field_array = record_batch + .column(field_idx) + .as_any() + .downcast_ref::() + .context(InvalidColumnType { + required_type: "Float64Array", + })?; + let timestamp_array = record_batch + .column(timestamp_idx) + .as_any() + .downcast_ref::() + .context(InvalidColumnType { + required_type: "TimestampMillisecondArray", + })?; + field_array + .into_iter() + .zip(timestamp_array.into_iter()) + .map(|(field, timestamp)| { + Ok(Sample { + value: field.context(NullColumn { name: "field" })?, + timestamp: Timestamp::new(timestamp.context(NullColumn { name: "timestamp" })?), + }) + }) + .collect::>>() + } + + fn samples_to_record_batch( + &self, + schema: ArrowSchemaRef, + tsid_samples: HashMap>, + ) -> std::result::Result { + let tsid_idx = schema.index_of(TSID_COLUMN).expect("checked in plan build"); + let field_idx = schema + .index_of(&self.column_name.field) + .expect("checked in plan build"); + let timestamp_idx = schema + .index_of(&self.column_name.timestamp) + .expect("checked in plan build"); + let mut batches = Vec::with_capacity(tsid_samples.len()); + for (tsid, samples) in tsid_samples { + let record_batch_len = samples.len(); + let tags = self + .tsid_to_tags + .get(&tsid) + .expect("tags are ensured in accumulated_record_batch"); + let mut arrays = vec![new_empty_array(&DataType::Int32); schema.fields().len()]; + arrays[tsid_idx] = Arc::new(UInt64Array::from(vec![tsid; record_batch_len])); + let mut fields = Vec::with_capacity(record_batch_len); + let mut timestamps = Vec::with_capacity(record_batch_len); + for Sample { + timestamp, + value: field, + } in samples + { + fields.push(field); + timestamps.push(timestamp.as_i64()); + } + arrays[timestamp_idx] = Arc::new(TimestampMillisecondArray::from(timestamps)); + arrays[field_idx] = 
Arc::new(Float64Array::from(fields)); + + for tag_key in &self.column_name.tag_keys { + let tag_idx = schema + .index_of(tag_key.as_str()) + .expect("checked in plan build"); + arrays[tag_idx] = Arc::new(StringArray::from(vec![ + tags.get(tag_key) + .expect("tag_key are ensured in accmulate_record_batch") + .to_string(); + record_batch_len + ])); + } + batches.push(RecordBatch::try_new(schema.clone(), arrays)?); + } + + RecordBatch::concat(&schema, &batches) + } +} + +impl Stream for PromAlignReader { + type Item = std::result::Result; + + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + if self.done { + return Poll::Ready(None); + } + + match self.input.poll_next_unpin(cx) { + Poll::Ready(Some(Ok(batch))) => { + let schema = batch.schema(); + if self.record_schema.is_none() { + self.record_schema = Some(schema.clone()); + } + let tsid_samples = self + .accumulate_record_batch(batch) + .map_err(|e| ArrowError::SchemaError(e.to_string()))?; // convert all Error enum to SchemaError + if !tsid_samples.is_empty() { + Poll::Ready(Some(self.samples_to_record_batch(schema, tsid_samples))) + } else { + Poll::Ready(Some(Ok(RecordBatch::new_empty(schema)))) + } + } + Poll::Ready(None) => { + self.done = true; + if let Some(schema) = mem::take(&mut self.record_schema) { + let tsid_samples = self + .accumulate_record_batch(RecordBatch::new_empty(schema.clone())) + .map_err(|e| ArrowError::SchemaError(e.to_string()))?; + if !tsid_samples.is_empty() { + return Poll::Ready(Some( + self.samples_to_record_batch(schema, tsid_samples), + )); + } + } + Poll::Ready(None) + } + other => other, + } + } +} + +impl RecordBatchStream for PromAlignReader { + fn schema(&self) -> ArrowSchemaRef { + self.input.schema() + } +} + +#[derive(Debug)] +pub(crate) struct Sample { + timestamp: Timestamp, + value: f64, +} + +/// `Stepper` is used for align samples, specified by [range queries](https://prometheus.io/docs/prometheus/latest/querying/api/#range-queries). 
+/// Note: [instant queries](https://prometheus.io/docs/prometheus/latest/querying/api/#instant-queries) are modeled as range queries with a step of 1.
+///
+/// # Diagram
+/// ```plaintext
+/// range
+/// +-------------+
+/// v |
+/// |------|-----|-----|-----|-----|-------->
+/// start step end
+/// ```
+trait Stepper: fmt::Debug {
+ /// Calculate the current sample based on new input samples.
+ /// Samples may be kept since some functions require large time range input,
+ /// such as rate(metric[1d])
+ fn step(
+ &mut self,
+ input: VecDeque,
+ range: TimeRange,
+ param: &AlignParameter,
+ align_func: Arc,
+ ) -> Result>>;
+
+ // Returns the size of samples kept during the query, mainly used for metrics
+ fn pending_column_bytes(&self) -> usize;
+}
+
+/// `FixedStepper` is one implementation of `Stepper`, which will accumulate all
+/// samples within each step before passing control to the next execution node.
+/// This implementation will consume high memory in large range queries, such as
+/// rate(metric[30d])
+
+/// TODO(chenxiang): A streaming implementation is required for those large range
+/// queries.
+#[derive(Debug)] +struct FixedStepper { + /// accumulated samples used for calculate sample for current step + entries: VecDeque, + /// tail index of entries for processing current step, which means + /// [0, tail_index] is used + tail_index: usize, + /// timestamp of current step sample + timestamp: Timestamp, +} + +impl Stepper for FixedStepper { + fn step( + &mut self, + mut column: VecDeque, + column_range: TimeRange, + param: &AlignParameter, + align_func: Arc, + ) -> Result>> { + self.entries.append(&mut column); + debug!( + "column_range:{:?}, param:{:?}, ts:{:?}", + column_range, param, self.timestamp + ); + let curr_range = param.align_range.intersected_range(column_range); + if curr_range.is_none() { + return Ok(None); + } + let curr_range = curr_range.unwrap(); + let mut result = vec![]; + + // self.timestamp = self.timestamp.max(start); + while self.timestamp < curr_range.inclusive_start() { + self.timestamp = self + .timestamp + .checked_add(param.step) + .context(TimestampOutOfRange {})?; + } + + while curr_range.contains(self.timestamp) { + // push `tail_index`. In look ahead (by increasing index by 1) way. + while self.tail_index + 1 < self.entries.len() + && self.entries[self.tail_index + 1].timestamp <= self.timestamp + { + self.tail_index += 1; + } + let mint = self + .timestamp + .checked_sub(param.lookback_delta) + .context(TimestampOutOfRange {})?; + // drop some unneeded entries from begining of `entries` + while let Some(entry) = self.entries.front() { + if entry.timestamp < mint { + self.entries.pop_front(); + if let Some(index) = self.tail_index.checked_sub(1) { + self.tail_index = index + } + } else { + break; + } + } + // [mint, self.timestamp] has no data, skip to next step. 
+ let skip = {
+ if let Some(first_entry) = self.entries.get(0) {
+ first_entry.timestamp > self.timestamp
+ } else {
+ true
+ }
+ };
+ if skip {
+ self.timestamp = self
+ .timestamp
+ .checked_add(param.step)
+ .context(TimestampOutOfRange {})?;
+ continue;
+ }
+
+ // call the range function
+ if let Some(value) =
+ align_func.call(&self.entries, self.tail_index, self.timestamp, param)?
+ {
+ result.push(value);
+ }
+
+ self.timestamp = self
+ .timestamp
+ .checked_add(param.step)
+ .context(TimestampOutOfRange {})?;
+ }
+
+ if !result.is_empty() {
+ Ok(Some(result))
+ } else {
+ Ok(None)
+ }
+ }
+
+ fn pending_column_bytes(&self) -> usize {
+ self.entries.len() * 16 // timestamp + float value
+ }
+}
+
+impl FixedStepper {
+ fn new(start_timestamp: Timestamp) -> FixedStepper {
+ Self {
+ entries: VecDeque::new(),
+ tail_index: 0,
+ timestamp: start_timestamp,
+ }
+ }
+}
+
+/// Helper for Prometheus functions which need extrapolation. [Rate][rate],
+/// [Increase][increase] and [Delta][delta] for now.
+///
+/// Since "range" is not always equal to `data_duration`, extrapolation needs
+/// to be performed to estimate absent data. Extrapolation is named by
+/// Prometheus. This function is ported from [here][prom_extrapolate_code].
+/// "extrapolate" assumes absent data follows the same distribution as
+/// existing data. Thus it simply zooms the result calculated from existing data
+/// to the required extrapolation time range.
+///
+/// [rate]: https://prometheus.io/docs/prometheus/latest/querying/functions/#rate
+/// [increase]: https://prometheus.io/docs/prometheus/latest/querying/functions/#increase
+/// [delta]: https://prometheus.io/docs/prometheus/latest/querying/functions/#delta
+/// [prom_extrapolate_code]: https://github.com/prometheus/prometheus/blob/063154eab720d8c3d495bd78312c0df090d0bf23/promql/functions.go#L59
+///
+/// This function can be roughly divided into three parts:
+/// - Calculate the result from real data.
+/// - Calculate the time range to extrapolate to.
+/// - Calculate the extrapolated result.
+///
+/// The outputs of the above three steps are `difference`, `extrapolated_duration`
+/// and `extrapolated_result`.
+///
+/// # Diagram
+/// ```plaintext
+/// range_start first_timestamp last_timestamp range_end
+/// └─────────────────────┴────────────────────┴──────────────────┘
+/// range_to_start data_duration range_to_end
+/// ```
+///
+/// Legends:
+/// - `range_end` is the timestamp passed in.
+/// - `range_start` is calculated by `timestamp` - `lookback_delta`.
+/// - "range" here stands for `range_end` - `range_start`, which is equal to
+/// `range_to_start` + `data_duration` + `range_to_end`.
+/// - `first/last_timestamp` is the timestamp of the provided data.
+/// - `data_duration` is the time range covered by data.
+fn extrapolate_fn_helper(
+ data: &VecDeque,
+ tail_index: usize,
+ timestamp: Timestamp,
+ lookback_delta: Timestamp,
+ is_counter: bool,
+ is_rate: bool,
+) -> Result> {
+ // no sense to calculate rate on a single item.
+ if tail_index < 1 {
+ return Ok(None);
+ }
+
+ let first_data = data[0].value;
+
+ // calculate `counter_reset_correction` for counter type.
+ let mut counter_reset_correction = 0.0;
+ if is_counter {
+ let mut last_data = first_data;
+ for Sample { value, ..
} in data.iter().take(tail_index + 1) { + if *value < last_data { + counter_reset_correction += last_data; + } + last_data = *value; + } + } + + let difference = data[tail_index].value - first_data + counter_reset_correction; + + // `average_duration_between_data` assumes all data is distributed evenly. + let first_timestamp = data[0].timestamp; + let last_timestamp = data[tail_index].timestamp; + let data_duration = (last_timestamp + .checked_sub(first_timestamp) + .context(TimestampOutOfRange {})?) + .as_i64() as f64; + let average_duration_between_data = data_duration / tail_index as f64; + + let range_start = timestamp + .checked_sub(lookback_delta) + .context(TimestampOutOfRange {})?; + let range_end = timestamp; + let mut range_to_start = (first_timestamp + .checked_sub(range_start) + .context(TimestampOutOfRange)?) + .as_i64() as f64; + let mut range_to_end = (range_end + .checked_sub(last_timestamp) + .context(TimestampOutOfRange {})?) + .as_i64() as f64; + + // Prometheus shorten forward-extrapolation to zero point. + if is_counter && difference > 0.0 && first_data >= 0.0 { + let range_to_zero_point = data_duration * (first_data / difference); + range_to_start = range_to_start.min(range_to_zero_point); + } + + let extrapolation_threshold = + average_duration_between_data * PROMTHEUS_EXTRAPOLATION_THRESHOLD_COEFFICIENT; + + // if lots of data is absent (`range_to_start` or `range_to_end` is longer than + // `extrapolation_threshold`), Prometheus will not estimate all time range. Use + // half of `average_duration_between_data` instead. + if range_to_start > extrapolation_threshold { + range_to_start = average_duration_between_data / 2.0; + } + if range_to_end > extrapolation_threshold { + range_to_end = average_duration_between_data / 2.0; + } + + // `difference` is the real result calculated by existing data. Prometheus will + // zoom it to `extrapolated_duration` to get extrapolated estimated result. 
+ let extrapolated_duration = data_duration + range_to_start + range_to_end; + let mut extrapolated_result = difference * extrapolated_duration / data_duration; + + if is_rate { + // `lookback_delta` here is in millisecond. + extrapolated_result /= lookback_delta.as_i64() as f64 / 1000.0; + } + + Ok(Some(Sample { + timestamp, + value: extrapolated_result, + })) +} + +/// Implementation of `Rate` function in `Prometheus`. More +/// [details](https://prometheus.io/docs/prometheus/latest/querying/functions/#rate) +#[derive(Debug)] +struct RateFunc {} + +impl AlignFunc for RateFunc { + fn call( + &self, + data: &VecDeque, + tail_index: usize, + timestamp: Timestamp, + param: &AlignParameter, + ) -> Result> { + extrapolate_fn_helper( + data, + tail_index, + timestamp, + param.lookback_delta, + true, + true, + ) + } +} + +#[derive(Debug)] +struct DeltaFunc {} + +impl AlignFunc for DeltaFunc { + fn call( + &self, + data: &VecDeque, + tail_index: usize, + timestamp: Timestamp, + param: &AlignParameter, + ) -> Result> { + extrapolate_fn_helper( + data, + tail_index, + timestamp, + param.lookback_delta, + false, + false, + ) + } +} + +#[derive(Debug)] +struct IncreaseFunc {} + +impl AlignFunc for IncreaseFunc { + fn call( + &self, + data: &VecDeque, + tail_index: usize, + timestamp: Timestamp, + param: &AlignParameter, + ) -> Result> { + extrapolate_fn_helper( + data, + tail_index, + timestamp, + param.lookback_delta, + true, + false, + ) + } +} + +// Port from https://github.com/prometheus/prometheus/blob/063154eab720d8c3d495bd78312c0df090d0bf23/promql/functions.go#L159 +fn instant_value( + data: &VecDeque, + tail_index: usize, + timestamp: Timestamp, + is_rate: bool, +) -> Result> { + if tail_index < 2 { + return Ok(None); + } + + let last_entry = &data[tail_index]; + let previous_entry = &data[tail_index - 1]; + + let mut result = if is_rate && last_entry.value < previous_entry.value { + last_entry.value + } else { + last_entry.value - previous_entry.value + }; + + let 
interval = last_entry + .timestamp + .checked_sub(previous_entry.timestamp) + .context(TimestampOutOfRange {})?; + assert!(interval.as_i64() > 0); + + if is_rate { + // Convert to per-second. + result /= interval.as_i64() as f64 / 1000.0; + } + + Ok(Some(Sample { + value: result, + timestamp, + })) +} + +#[derive(Debug)] +pub struct IdeltaFunc; + +impl AlignFunc for IdeltaFunc { + fn call( + &self, + data: &VecDeque, + tail_index: usize, + timestamp: Timestamp, + _param: &AlignParameter, + ) -> Result> { + instant_value(data, tail_index, timestamp, false) + } +} + +#[derive(Debug)] +struct IrateFunc; + +impl AlignFunc for IrateFunc { + fn call( + &self, + data: &VecDeque, + tail_index: usize, + timestamp: Timestamp, + _param: &AlignParameter, + ) -> Result> { + instant_value(data, tail_index, timestamp, true) + } +} + +/// This function is not in Promtheus' functions list. +/// +/// It simulates the behavior of `Instant Selector` by finding the newest point +/// from the input. Thus `Instant Selector` can be represented by [PromAlignOp] +/// + [InstantFn]. +#[derive(Debug)] +pub struct InstantFunc; + +impl AlignFunc for InstantFunc { + fn call( + &self, + data: &VecDeque, + tail_index: usize, + timestamp: Timestamp, + _param: &AlignParameter, + ) -> Result> { + Ok(Some(Sample { + timestamp, + value: data[tail_index].value, + })) + } +} diff --git a/query_engine/src/df_planner_extension/mod.rs b/query_engine/src/df_planner_extension/mod.rs new file mode 100644 index 0000000000..336cd128f5 --- /dev/null +++ b/query_engine/src/df_planner_extension/mod.rs @@ -0,0 +1,40 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! The query planner adapter provides some planner extensions of datafusion. 
+ +use std::sync::Arc; + +use arrow_deps::datafusion::{ + execution::context::{ExecutionContextState, QueryPlanner}, + logical_plan::LogicalPlan, + physical_plan::{ + planner::{DefaultPhysicalPlanner, ExtensionPlanner}, + ExecutionPlan, PhysicalPlanner, + }, +}; + +pub mod prom_align; +pub mod table_scan_by_primary_key; +use async_trait::async_trait; + +/// The adapter for extending the default datafusion planner. +pub struct QueryPlannerAdapter; + +#[async_trait] +impl QueryPlanner for QueryPlannerAdapter { + async fn create_physical_plan( + &self, + logical_plan: &LogicalPlan, + ctx_state: &ExecutionContextState, + ) -> arrow_deps::datafusion::error::Result> { + let extension_planners: Vec> = vec![ + Arc::new(table_scan_by_primary_key::Planner), + Arc::new(prom_align::PromAlignPlanner), + ]; + + let physical_planner = DefaultPhysicalPlanner::with_extension_planners(extension_planners); + physical_planner + .create_physical_plan(logical_plan, ctx_state) + .await + } +} diff --git a/query_engine/src/df_planner_extension/prom_align.rs b/query_engine/src/df_planner_extension/prom_align.rs new file mode 100644 index 0000000000..f55b7042e4 --- /dev/null +++ b/query_engine/src/df_planner_extension/prom_align.rs @@ -0,0 +1,53 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::sync::Arc; + +use arrow_deps::datafusion::{ + error::DataFusionError, + execution::context::ExecutionContextState, + logical_plan::{LogicalPlan, UserDefinedLogicalNode}, + physical_plan::{planner::ExtensionPlanner, ExecutionPlan, PhysicalPlanner}, +}; +use snafu::Snafu; +use sql::promql::PromAlignNode; + +use crate::df_execution_extension::prom_align::{Error as ExecError, PromAlignExec}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Build execution failed. 
err:{:?}", source))] + ExecutionError { source: ExecError }, +} + +pub struct PromAlignPlanner; + +impl ExtensionPlanner for PromAlignPlanner { + fn plan_extension( + &self, + _planner: &dyn PhysicalPlanner, + node: &dyn UserDefinedLogicalNode, + logical_inputs: &[&LogicalPlan], + physical_inputs: &[Arc], + _ctx_state: &ExecutionContextState, + ) -> arrow_deps::datafusion::error::Result>> { + Ok( + if let Some(node) = node.as_any().downcast_ref::() { + assert_eq!(logical_inputs.len(), 1, "Inconsistent number of inputs"); + assert_eq!(physical_inputs.len(), 1, "Inconsistent number of inputs"); + Some(Arc::new( + PromAlignExec::try_new( + physical_inputs[0].clone(), + node.column_name.clone(), + node.func, + node.align_param, + node.read_parallelism, + ) + // DataFusionError is lost when wrapped, use string instead. + .map_err(|e| DataFusionError::Plan(e.to_string()))?, + )) + } else { + None + }, + ) + } +} diff --git a/query_engine/src/df_planner_extension/table_scan_by_primary_key.rs b/query_engine/src/df_planner_extension/table_scan_by_primary_key.rs new file mode 100644 index 0000000000..c864270aaa --- /dev/null +++ b/query_engine/src/df_planner_extension/table_scan_by_primary_key.rs @@ -0,0 +1,141 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::{ + any::Any, + fmt::{Debug, Formatter}, + sync::Arc, +}; + +use arrow_deps::datafusion::{ + error::DataFusionError, + execution::context::ExecutionContextState, + logical_plan::{self, DFSchemaRef, Expr, LogicalPlan, TableScan, UserDefinedLogicalNode}, + physical_plan::{planner::ExtensionPlanner, ExecutionPlan, PhysicalPlanner}, +}; +use table_engine::{provider::TableProviderAdapter, table::ReadOrder}; + +/// The extension planner creates physical plan for the +/// [`TableScanByPrimaryKey`] which is a logical plan node. 
+pub struct Planner; + +impl ExtensionPlanner for Planner { + fn plan_extension( + &self, + _planner: &dyn PhysicalPlanner, + node: &dyn UserDefinedLogicalNode, + _logical_inputs: &[&LogicalPlan], + _physical_inputs: &[Arc], + _ctx_state: &ExecutionContextState, + ) -> arrow_deps::datafusion::error::Result>> { + node.as_any() + .downcast_ref::() + .map(|order_by_node| order_by_node.build_scan_table_exec_plan()) + .transpose() + } +} + +/// TableScanInPrimaryKeyOrder is a [`UserDefinedLogicalNode`] of datafusion +/// which normally is generated during logical plan optimization. +/// +/// It differs from the default [`TableScan`] in its corresponding +/// [`ExecutionPlan`] is a special [`ScanTable`] which can controls the scan +/// order. +#[derive(Clone)] +pub struct TableScanByPrimaryKey { + asc: bool, + scan_plan: Arc, +} + +impl Debug for TableScanByPrimaryKey { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + self.fmt_for_explain(f) + } +} + +impl TableScanByPrimaryKey { + /// Build the node from a [TableScan] node + /// + /// Note it panics if the plan node is not a LogicalPlan::TableScan. + pub fn new_from_scan_plan(asc: bool, scan_plan: Arc) -> Self { + // TODO(xikai): should ensure the scan_plan is a real TableScan. + Self { asc, scan_plan } + } + + /// Build the scan table [ExecutionPlan]. + fn build_scan_table_exec_plan( + &self, + ) -> arrow_deps::datafusion::error::Result> { + match self.scan_plan.as_ref() { + LogicalPlan::TableScan(TableScan { + source, + projection, + filters, + limit, + .. 
+ }) => { + let table_provider = + if let Some(v) = source.as_any().downcast_ref::() { + v + } else { + return Err(DataFusionError::Internal(format!( + "expect table provider adapter, given plan:{:?}", + self.scan_plan, + ))); + }; + + // Remove all qualifiers from the scan as the provider + // doesn't know (nor should care) how the relation was + // referred to in the query + let filters = logical_plan::unnormalize_cols(filters.iter().cloned()); + + table_provider.scan_table( + projection, + &filters, + *limit, + ReadOrder::from_is_asc(Some(self.asc)), + ) + } + _ => Err(DataFusionError::Internal(format!( + "expect scan plan, given plan:{:?}", + self.scan_plan + ))), + } + } +} + +impl UserDefinedLogicalNode for TableScanByPrimaryKey { + fn as_any(&self) -> &dyn Any { + self + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![] + } + + fn schema(&self) -> &DFSchemaRef { + self.scan_plan.schema() + } + + fn expressions(&self) -> Vec { + vec![] + } + + fn fmt_for_explain(&self, f: &mut Formatter) -> std::fmt::Result { + write!( + f, + "ScanTableInPrimaryKeyOrder, asc:{}, table_scan:{:?}", + self.asc, self.scan_plan + ) + } + + fn from_template( + &self, + _exprs: &[Expr], + _inputs: &[LogicalPlan], + ) -> Arc { + Arc::new(Self { + asc: self.asc, + scan_plan: self.scan_plan.clone(), + }) + } +} diff --git a/query_engine/src/executor.rs b/query_engine/src/executor.rs new file mode 100644 index 0000000000..99d8a637bf --- /dev/null +++ b/query_engine/src/executor.rs @@ -0,0 +1,138 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Query executor + +use std::sync::Arc; + +use async_trait::async_trait; +use common_types::record_batch::RecordBatch; +use futures::TryStreamExt; +use log::debug; +use snafu::{ResultExt, Snafu}; +use sql::{plan::QueryPlan, provider::CatalogProviderAdapter}; +use table_engine::stream::SendableRecordBatchStream; + +use crate::{ + context::ContextRef, + logical_optimizer::{LogicalOptimizer, LogicalOptimizerImpl}, + physical_optimizer::{PhysicalOptimizer, PhysicalOptimizerImpl}, + physical_plan::PhysicalPlanPtr, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to do logical optimization, err:{}", source))] + LogicalOptimize { + source: crate::logical_optimizer::Error, + }, + + #[snafu(display("Failed to do physical optimization, err:{}", source))] + PhysicalOptimize { + source: crate::physical_optimizer::Error, + }, + + #[snafu(display("Failed to execute physical plan, err:{}", source))] + ExecutePhysical { source: crate::physical_plan::Error }, + + #[snafu(display("Failed to collect record batch stream, err:{}", source,))] + Collect { source: table_engine::stream::Error }, +} + +define_result!(Error); + +// Use a type alias so that we are able to replace the implementation +pub type RecordBatchVec = Vec; + +/// Query to execute +/// +/// Contains the query plan and other infos +#[derive(Debug)] +pub struct Query { + /// The query plan + plan: QueryPlan, +} + +impl Query { + pub fn new(plan: QueryPlan) -> Self { + Self { plan } + } +} + +/// Query executor +/// +/// Executes the logical plan +#[async_trait] +pub trait Executor: Clone + Send + Sync { + // TODO(yingwen): Maybe return a stream + /// Execute the query, returning the query results as RecordBatchVec + /// + /// REQUIRE: The meta data of tables in query should be found from + /// ContextRef + async fn execute_logical_plan(&self, ctx: ContextRef, query: Query) -> Result; +} + +#[derive(Clone, Default)] +pub struct ExecutorImpl; + +impl ExecutorImpl { + pub fn new() -> Self { + 
Self::default() + } +} + +#[async_trait] +impl Executor for ExecutorImpl { + async fn execute_logical_plan(&self, ctx: ContextRef, query: Query) -> Result { + let plan = query.plan; + + // Register catalogs to datafusion execution context. + let catalogs = CatalogProviderAdapter::new_adapters(plan.tables.clone()); + let df_ctx = ctx.df_exec_ctx(); + for (name, catalog) in catalogs { + df_ctx.register_catalog(&name, Arc::new(catalog)); + } + let request_id = ctx.request_id(); + + let physical_plan = optimize_plan(ctx, plan).await?; + + debug!( + "Executor physical optimization finished, request_id:{}, physical_plan: {:?}", + request_id, physical_plan + ); + + let stream = physical_plan.execute().await.context(ExecutePhysical)?; + + // Collect all records in the pool, as the stream may perform some costly + // calculation + let record_batches = collect(stream).await?; + + debug!( + "Executor executed plan, request_id:{}, plan_and_metrics: {}", + request_id, + physical_plan.metrics_to_string() + ); + + Ok(record_batches) + } +} + +async fn optimize_plan(ctx: ContextRef, plan: QueryPlan) -> Result { + let mut logical_optimizer = LogicalOptimizerImpl::with_context(ctx.clone()); + let plan = logical_optimizer.optimize(plan).context(LogicalOptimize)?; + + debug!( + "Executor logical optimization finished, request_id:{}, plan: {:#?}", + ctx.request_id(), + plan + ); + + let mut physical_optimizer = PhysicalOptimizerImpl::with_context(ctx); + physical_optimizer + .optimize(plan) + .await + .context(PhysicalOptimize) +} + +async fn collect(stream: SendableRecordBatchStream) -> Result { + stream.try_collect().await.context(Collect) +} diff --git a/query_engine/src/lib.rs b/query_engine/src/lib.rs new file mode 100644 index 0000000000..36440dbb11 --- /dev/null +++ b/query_engine/src/lib.rs @@ -0,0 +1,19 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Query engine +//! +//! 
Optimizes and executes logical plan + +// TODO(yingwen): Maybe renamed to query_executor or query_backend? +// TODO(yingwen): Use datafusion or fuse-query as query backend + +#[macro_use] +extern crate common_util; + +pub mod context; +pub mod df_execution_extension; +pub mod df_planner_extension; +pub mod executor; +pub mod logical_optimizer; +pub mod physical_optimizer; +pub mod physical_plan; diff --git a/query_engine/src/logical_optimizer/mod.rs b/query_engine/src/logical_optimizer/mod.rs new file mode 100644 index 0000000000..2bcad7955f --- /dev/null +++ b/query_engine/src/logical_optimizer/mod.rs @@ -0,0 +1,61 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Logical optimizer + +pub mod order_by_primary_key; +#[cfg(test)] +pub mod tests; +pub mod type_conversion; + +use arrow_deps::datafusion::error::DataFusionError; +use snafu::{Backtrace, ResultExt, Snafu}; +use sql::plan::QueryPlan; + +use crate::context::ContextRef; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "DataFusion Failed to optimize logical plan, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + // TODO(yingwen): Should we carry plan in this context? 
+ DataFusionOptimize { + source: DataFusionError, + backtrace: Backtrace, + }, +} + +define_result!(Error); + +/// LogicalOptimizer transform the QueryPlan into a potentially more efficient +/// plan +pub trait LogicalOptimizer { + // TODO(yingwen): Maybe support other plans + fn optimize(&mut self, plan: QueryPlan) -> Result; +} + +pub struct LogicalOptimizerImpl { + ctx: ContextRef, +} + +impl LogicalOptimizerImpl { + pub fn with_context(ctx: ContextRef) -> Self { + Self { ctx } + } +} + +impl LogicalOptimizer for LogicalOptimizerImpl { + fn optimize(&mut self, plan: QueryPlan) -> Result { + // TODO(yingwen): Avoid clone the plan multiple times during optimization + let QueryPlan { + mut df_plan, + tables, + } = plan; + let exec_ctx = self.ctx.df_exec_ctx(); + df_plan = exec_ctx.optimize(&df_plan).context(DataFusionOptimize)?; + + Ok(QueryPlan { df_plan, tables }) + } +} diff --git a/query_engine/src/logical_optimizer/order_by_primary_key.rs b/query_engine/src/logical_optimizer/order_by_primary_key.rs new file mode 100644 index 0000000000..ef7942bbd9 --- /dev/null +++ b/query_engine/src/logical_optimizer/order_by_primary_key.rs @@ -0,0 +1,413 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +use std::{convert::TryFrom, sync::Arc}; + +use arrow_deps::datafusion::{ + execution::context::ExecutionProps, + logical_plan::{ + plan::{Extension, Filter, Projection, Sort}, + DFSchemaRef, Expr, Limit, LogicalPlan, TableScan, + }, + optimizer::optimizer::OptimizerRule, +}; +use common_types::schema::Schema; +use log::info; + +use crate::df_planner_extension::table_scan_by_primary_key::TableScanByPrimaryKey; + +/// The optimizer rule applies to the example plan: +/// Limit: 1 +/// Sort: #test.id ASC NULLS FIRST, #test.t ASC NULLS FIRST +/// Projection: #test.tsid, #test.t, #test.id, #test.tag1, #test.tag2 +/// TableScan: test projection=None +pub struct OrderByPrimaryKeyRule; + +impl OrderByPrimaryKeyRule { + /// Optimize the plan if it is the pattern: + /// Limit: + /// Sort: + /// Project: + /// (Filter): (Filer node is allowed to be not exist) + /// TableScan + fn do_optimize( + &self, + plan: &LogicalPlan, + ) -> arrow_deps::datafusion::error::Result> { + if let LogicalPlan::Limit(Limit { + n, + input: sort_plan, + }) = plan + { + if let LogicalPlan::Sort(Sort { + expr: sort_exprs, + input: projection_plan, + }) = sort_plan.as_ref() + { + if let LogicalPlan::Projection(Projection { + expr: projection_exprs, + input: scan_or_filter_plan, + schema: projection_schema, + alias, + }) = projection_plan.as_ref() + { + let (scan_plan, filter_predicate) = if let LogicalPlan::Filter(Filter { + predicate, + input: scan_plan, + }) = scan_or_filter_plan.as_ref() + { + (scan_plan, Some(predicate)) + } else { + (scan_or_filter_plan, None) + }; + + if let LogicalPlan::TableScan(TableScan { + table_name, source, .. 
+ }) = scan_plan.as_ref() + { + let schema = Schema::try_from(source.schema()).map_err(|e| { + let err_msg = format!( + "fail to convert arrow schema to schema, table:{}, err:{:?}", + table_name, e + ); + arrow_deps::datafusion::error::DataFusionError::Plan(err_msg) + })?; + if let Some(sort_in_asc_order) = + Self::detect_primary_key_order(&schema, sort_exprs.as_slice()) + { + let new_plan = Self::rewrite_plan(RewriteContext { + projection: projection_exprs.clone(), + filter_predicate: filter_predicate.cloned(), + schema: projection_schema.clone(), + alias: alias.clone(), + scan_plan: scan_plan.clone(), + sort_exprs: sort_exprs.clone(), + sort_in_asc_order, + limit: *n, + }); + return Ok(Some(new_plan)); + } + } + } + } + } + + Ok(None) + } + + /// Check: + /// - Whether `timestamp` is the first column in the primary key. + /// - Whether `sort_exprs` is equals the any prefix of primary key. + /// - Whether `sort_exprs` is in the same order. + /// + /// Returns: Some(sort_order) if the two rules above are true. + fn detect_primary_key_order(schema: &Schema, sort_exprs: &[Expr]) -> Option { + if schema.timestamp_index() != 0 { + return None; + } + + let key_cols = schema.key_columns(); + if sort_exprs.len() > key_cols.len() { + return None; + } + let sub_key_cols = &key_cols[..sort_exprs.len()]; + + let mut in_asc_order = None; + for (sort_expr, key_col) in sort_exprs.iter().zip(sub_key_cols.iter()) { + if let Expr::Sort { expr, asc, .. 
} = sort_expr { + if let Some(in_asc_order) = in_asc_order.as_mut() { + if in_asc_order != asc { + return None; + } + } + in_asc_order = Some(*asc); + + if let Expr::Column(col) = expr.as_ref() { + if col.name == key_col.name { + continue; + } + } + } + return None; + } + + in_asc_order + } + + // TODO(xikai): The topmost limit and sort plan node of the rewritten plan is + // not necessary now because now the rewrite requires the timestamp key is + // the first column in the primary key and that means the output of + // TableScanByPrimaryKey is in the correct order. And topmost two + // plan nodes is used to optimize the normal cases where the timestamp key is + // any column. + /// Rewrite the plan: + /// Limit: + /// Sort: + /// Project: + /// Filter: + /// TableScan + /// + /// Rewritten plan: + /// Limit: + /// Sort: + /// Limit: + /// Project: + /// Filter: + /// TableScanByPrimaryKey + fn rewrite_plan(rewrite_ctx: RewriteContext) -> LogicalPlan { + let order_by_primary_key_scan = Arc::new(LogicalPlan::Extension(Extension { + node: Arc::new(TableScanByPrimaryKey::new_from_scan_plan( + rewrite_ctx.sort_in_asc_order, + rewrite_ctx.scan_plan, + )), + })); + + let filter_plan = if let Some(predicate) = rewrite_ctx.filter_predicate { + Arc::new(LogicalPlan::Filter(Filter { + predicate, + input: order_by_primary_key_scan, + })) + } else { + order_by_primary_key_scan + }; + + let new_project_plan = Arc::new(LogicalPlan::Projection(Projection { + expr: rewrite_ctx.projection, + input: filter_plan, + schema: rewrite_ctx.schema, + alias: rewrite_ctx.alias, + })); + + let new_limit_plan = Arc::new(LogicalPlan::Limit(Limit { + n: rewrite_ctx.limit, + input: new_project_plan, + })); + + let new_sort_plan = Arc::new(LogicalPlan::Sort(Sort { + expr: rewrite_ctx.sort_exprs, + input: new_limit_plan, + })); + LogicalPlan::Limit(Limit { + n: rewrite_ctx.limit, + input: new_sort_plan, + }) + } +} + +impl OptimizerRule for OrderByPrimaryKeyRule { + fn optimize( + &self, + plan: 
&LogicalPlan, + _execution_props: &ExecutionProps, + ) -> arrow_deps::datafusion::error::Result { + match self.do_optimize(plan)? { + Some(new_plan) => { + info!( + "optimize plan by OrderByPrimaryKeyRule, original plan:\n{:?}\n optimized plan:\n{:?}", + plan, new_plan + ); + Ok(new_plan) + } + None => Ok(plan.clone()), + } + } + + fn name(&self) -> &str { + "order_by_primary_key" + } +} + +struct RewriteContext { + projection: Vec, + filter_predicate: Option, + schema: DFSchemaRef, + alias: Option, + scan_plan: Arc, + sort_exprs: Vec, + sort_in_asc_order: bool, + limit: usize, +} + +#[cfg(test)] +mod tests { + use arrow_deps::datafusion::{logical_plan::Column, scalar::ScalarValue}; + use common_types::{column_schema, datum::DatumKind, schema}; + + use super::*; + use crate::logical_optimizer::tests::LogicalPlanNodeBuilder; + + const TEST_TABLE_NAME: &str = "order_by_primary_key_test_table"; + + fn build_no_optimized_schema() -> Schema { + schema::Builder::new() + .auto_increment_column_id(true) + .add_key_column( + column_schema::Builder::new("key".to_string(), DatumKind::Varbinary) + .build() + .expect("Build column schema"), + ) + .unwrap() + .add_key_column( + column_schema::Builder::new("t".to_string(), DatumKind::Timestamp) + .build() + .expect("Build column schema"), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("field".to_string(), DatumKind::Double) + .build() + .expect("Build column schema"), + ) + .unwrap() + .build() + .expect("Build schema") + } + + fn build_optimized_schema() -> Schema { + schema::Builder::new() + .auto_increment_column_id(true) + .add_key_column( + column_schema::Builder::new("t".to_string(), DatumKind::Timestamp) + .build() + .expect("Build column schema"), + ) + .unwrap() + .add_key_column( + column_schema::Builder::new("key".to_string(), DatumKind::Varbinary) + .build() + .expect("Build column schema"), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("field".to_string(), DatumKind::Double) 
+ .build() + .expect("Build column schema"), + ) + .unwrap() + .build() + .expect("Build schema") + } + + fn build_sort_expr(sort_col: &str, asc: bool) -> Expr { + let col_expr = Expr::Column(Column::from(sort_col)); + Expr::Sort { + expr: Box::new(col_expr), + asc, + nulls_first: false, + } + } + + fn build_primary_key_sort_exprs(schema: &Schema, asc: bool) -> Vec { + schema + .key_columns() + .iter() + .map(|col| build_sort_expr(&col.name, asc)) + .collect() + } + + fn check_optimization_works( + schema: Schema, + sort_exprs: Vec, + filter_expr: Option, + asc: bool, + ) { + let builder = LogicalPlanNodeBuilder::new(TEST_TABLE_NAME.to_string(), schema); + + let plan = { + let mut builder = builder.clone().table_scan(); + if let Some(filter) = &filter_expr { + builder = builder.filter(filter.clone()); + } + builder + .projection(vec![]) + .sort(sort_exprs.clone()) + .limit(10) + .take_plan() + }; + + let rule = OrderByPrimaryKeyRule; + let optimized_plan = rule + .do_optimize(&*plan) + .expect("Optimize plan") + .expect("Succeed to optimize plan"); + let expected_plan = { + let mut builder = builder.table_scan().table_scan_in_primary_key_order(asc); + if let Some(filter) = filter_expr { + builder = builder.filter(filter); + } + builder + .projection(vec![]) + .limit(10) + .sort(sort_exprs) + .limit(10) + .take_plan() + }; + + crate::logical_optimizer::tests::assert_logical_plan_eq( + &optimized_plan, + expected_plan.as_ref(), + ); + } + + #[test] + fn test_optimize_applied_with_no_filter() { + let schema = build_optimized_schema(); + let sort_in_asc_order = true; + let sort_exprs = build_primary_key_sort_exprs(&schema, sort_in_asc_order); + check_optimization_works(schema, sort_exprs, None, sort_in_asc_order); + } + + #[test] + fn test_optimize_applied_with_prefix_sort_exprs() { + let schema = build_optimized_schema(); + let sort_in_asc_order = true; + let sort_exprs = build_primary_key_sort_exprs(&schema, sort_in_asc_order); + let prefix_sort_exprs = 
sort_exprs[..1].to_vec(); + check_optimization_works(schema, prefix_sort_exprs, None, sort_in_asc_order); + } + + #[test] + fn test_optimize_applied_with_filter() { + let schema = build_optimized_schema(); + let filter_expr = Expr::Literal(ScalarValue::Int8(None)); + let sort_in_asc_order = false; + let sort_exprs = build_primary_key_sort_exprs(&schema, sort_in_asc_order); + + check_optimization_works(schema, sort_exprs, Some(filter_expr), sort_in_asc_order); + } + + #[test] + fn test_optimize_fail_with_wrong_schema() { + let plan = { + let schema = build_no_optimized_schema(); + let sort_exprs = build_primary_key_sort_exprs(&schema, true); + let builder = LogicalPlanNodeBuilder::new(TEST_TABLE_NAME.to_string(), schema); + builder + .table_scan() + .projection(vec![]) + .sort(sort_exprs) + .limit(10) + .take_plan() + }; + + let rule = OrderByPrimaryKeyRule; + let optimized_plan = rule.do_optimize(&*plan).expect("Optimize plan"); + assert!(optimized_plan.is_none()); + } + + #[test] + fn test_optimize_with_wrong_plan() { + let plan = { + let schema = build_optimized_schema(); + let builder = LogicalPlanNodeBuilder::new(TEST_TABLE_NAME.to_string(), schema); + builder + .table_scan() + .projection(vec![]) + .limit(10) + .take_plan() + }; + + let rule = OrderByPrimaryKeyRule; + let optimized_plan = rule.do_optimize(&*plan).expect("Optimize plan"); + assert!(optimized_plan.is_none()); + } +} diff --git a/query_engine/src/logical_optimizer/tests.rs b/query_engine/src/logical_optimizer/tests.rs new file mode 100644 index 0000000000..7febd2283e --- /dev/null +++ b/query_engine/src/logical_optimizer/tests.rs @@ -0,0 +1,159 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
test utils for logical optimizer + +use std::{any::Any, sync::Arc}; + +use arrow_deps::{ + arrow::datatypes::SchemaRef, + datafusion::{ + datasource::TableProvider, + logical_plan::{ + plan::{Extension, Filter, Projection, Sort}, + DFSchemaRef, Expr, Limit, LogicalPlan, TableScan, ToDFSchema, + }, + physical_plan::ExecutionPlan, + }, +}; +use async_trait::async_trait; +use common_types::schema::Schema; + +use crate::df_planner_extension::table_scan_by_primary_key::TableScanByPrimaryKey; + +#[derive(Clone, Debug)] +#[must_use] +pub struct LogicalPlanNodeBuilder { + pub schema: Schema, + pub table_name: String, + pub plan: Option>, +} + +pub struct MockTableProvider { + schema: Schema, +} + +#[async_trait] +impl TableProvider for MockTableProvider { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + self.schema.to_arrow_schema_ref() + } + + async fn scan( + &self, + _projection: &Option>, + _filters: &[Expr], + _limit: Option, + ) -> arrow_deps::datafusion::error::Result> { + unimplemented!("not support") + } +} + +impl LogicalPlanNodeBuilder { + pub fn new(table_name: String, schema: Schema) -> Self { + Self { + schema, + table_name, + plan: None, + } + } + + // caller should ensure the sub plan exists. 
+ pub fn take_plan(&mut self) -> Arc { + self.plan.take().unwrap() + } + + pub fn df_schema_ref(&self) -> DFSchemaRef { + self.schema + .to_arrow_schema_ref() + .to_dfschema_ref() + .expect("Build dfschema") + } + + pub fn filter(mut self, predicate: Expr) -> Self { + let plan = LogicalPlan::Filter(Filter { + predicate, + input: self.take_plan(), + }); + + self.plan = Some(Arc::new(plan)); + + self + } + + pub fn projection(mut self, proj_exprs: Vec) -> Self { + let plan = LogicalPlan::Projection(Projection { + expr: proj_exprs, + input: self.take_plan(), + schema: self.df_schema_ref(), + alias: None, + }); + + self.plan = Some(Arc::new(plan)); + + self + } + + pub fn limit(mut self, n: usize) -> Self { + let plan = LogicalPlan::Limit(Limit { + n, + input: self.take_plan(), + }); + + self.plan = Some(Arc::new(plan)); + + self + } + + pub fn sort(mut self, sort_exprs: Vec) -> Self { + let plan = LogicalPlan::Sort(Sort { + expr: sort_exprs, + input: self.take_plan(), + }); + + self.plan = Some(Arc::new(plan)); + + self + } + + pub fn table_scan(mut self) -> Self { + let provider = MockTableProvider { + schema: self.schema.clone(), + }; + let projected_schema = self.df_schema_ref(); + + let plan = LogicalPlan::TableScan(TableScan { + table_name: self.table_name.clone(), + source: Arc::new(provider), + projection: None, + projected_schema, + filters: vec![], + limit: None, + }); + + self.plan = Some(Arc::new(plan)); + + self + } + + pub fn table_scan_in_primary_key_order(mut self, asc: bool) -> Self { + let sub_plan = self.take_plan(); + let node = TableScanByPrimaryKey::new_from_scan_plan(asc, sub_plan); + let plan = LogicalPlan::Extension(Extension { + node: Arc::new(node), + }); + self.plan = Some(Arc::new(plan)); + + self + } +} + +/// Check whether the logical plans are equal. 
+pub fn assert_logical_plan_eq(left: &LogicalPlan, right: &LogicalPlan) { + let left_plan_str = format!("{:#?}", left); + let right_plan_str = format!("{:#?}", right); + assert_eq!(left_plan_str, right_plan_str) +} diff --git a/query_engine/src/logical_optimizer/type_conversion.rs b/query_engine/src/logical_optimizer/type_conversion.rs new file mode 100644 index 0000000000..ef6aaf6d12 --- /dev/null +++ b/query_engine/src/logical_optimizer/type_conversion.rs @@ -0,0 +1,506 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::{mem, sync::Arc}; + +use arrow_deps::{ + arrow::{compute, compute::kernels::cast_utils::string_to_timestamp_nanos}, + datafusion::{ + arrow::datatypes::DataType, + error::{DataFusionError, Result}, + execution::context::ExecutionProps, + logical_plan::{ + plan::Filter, DFSchemaRef, Expr, ExprRewriter, LogicalPlan, Operator, TableScan, + }, + optimizer::{optimizer::OptimizerRule, utils}, + scalar::ScalarValue, + }, +}; +use log::debug; + +/// Optimizer that cast literal value to target column's type +/// +/// Example transformations that are applied: +/// * `expr > '5'` to `expr > 5` when `expr` is of numeric type +/// * `expr > '2021-12-02 15:00:34'` to `expr > 1638428434000(ms)` when `expr` +/// is of timestamp type +/// * `expr > 10` to `expr > '10'` when `expr` is of string type +/// * `expr = 'true'` to `expr = true` when `expr` is of boolean type +pub struct TypeConversion; + +impl OptimizerRule for TypeConversion { + fn optimize( + &self, + plan: &LogicalPlan, + execution_props: &ExecutionProps, + ) -> Result { + let mut rewriter = TypeRewriter { + schemas: plan.all_schemas(), + }; + + match plan { + LogicalPlan::Filter(Filter { predicate, input }) => Ok(LogicalPlan::Filter(Filter { + predicate: predicate.clone().rewrite(&mut rewriter)?, + input: Arc::new(self.optimize(input, execution_props)?), + })), + LogicalPlan::TableScan(TableScan { + table_name, + source, + projection, + projected_schema, + filters, 
+ limit, + }) => { + let rewrite_filters = filters + .clone() + .into_iter() + .map(|e| e.rewrite(&mut rewriter)) + .collect::>>()?; + Ok(LogicalPlan::TableScan(TableScan { + table_name: table_name.clone(), + source: source.clone(), + projection: projection.clone(), + projected_schema: projected_schema.clone(), + filters: rewrite_filters, + limit: *limit, + })) + } + LogicalPlan::Projection { .. } + | LogicalPlan::Window { .. } + | LogicalPlan::Aggregate { .. } + | LogicalPlan::Repartition { .. } + | LogicalPlan::CreateExternalTable { .. } + | LogicalPlan::Extension { .. } + | LogicalPlan::Sort { .. } + | LogicalPlan::Explain { .. } + | LogicalPlan::Limit { .. } + | LogicalPlan::Union { .. } + | LogicalPlan::Join { .. } + | LogicalPlan::CrossJoin { .. } + | LogicalPlan::CreateMemoryTable { .. } + | LogicalPlan::DropTable { .. } + | LogicalPlan::Values { .. } + | LogicalPlan::Analyze { .. } => { + let inputs = plan.inputs(); + let new_inputs = inputs + .iter() + .map(|plan| self.optimize(plan, execution_props)) + .collect::>>()?; + + let expr = plan + .expressions() + .into_iter() + .map(|e| e.rewrite(&mut rewriter)) + .collect::>>()?; + + utils::from_plan(plan, &expr, &new_inputs) + } + LogicalPlan::EmptyRelation { .. 
} => Ok(plan.clone()), + } + } + + fn name(&self) -> &str { + "type_conversion" + } +} + +struct TypeRewriter<'a> { + /// input schemas + schemas: Vec<&'a DFSchemaRef>, +} + +impl<'a> TypeRewriter<'a> { + fn column_data_type(&self, expr: &Expr) -> Option { + if let Expr::Column(_) = expr { + for schema in &self.schemas { + if let Ok(v) = expr.get_type(schema) { + return Some(v); + } + } + } + + None + } + + fn convert_type<'b>(&self, mut left: &'b Expr, mut right: &'b Expr) -> Result<(Expr, Expr)> { + let left_type = self.column_data_type(left); + let right_type = self.column_data_type(right); + + let mut reverse = false; + let left_type = match (&left_type, &right_type) { + (Some(v), None) => v, + (None, Some(v)) => { + reverse = true; + mem::swap(&mut left, &mut right); + v + } + _ => return Ok((left.clone(), right.clone())), + }; + + match (left, right) { + (Expr::Column(col), Expr::Literal(value)) => { + let casted_right = Self::cast_scalar_value(value, left_type)?; + debug!( + "TypeRewriter convert type, origin_left:{:?}, type:{}, right:{:?}, casted_right:{:?}", + col, left_type, value, casted_right + ); + if casted_right.is_null() { + return Err(DataFusionError::Plan(format!( + "column:{:?} value:{:?} is invalid", + col, value + ))); + } + if reverse { + Ok((Expr::Literal(casted_right), left.clone())) + } else { + Ok((left.clone(), Expr::Literal(casted_right))) + } + } + _ => Ok((left.clone(), right.clone())), + } + } + + fn cast_scalar_value(value: &ScalarValue, data_type: &DataType) -> Result { + if let DataType::Timestamp(_, _) = data_type { + if let ScalarValue::Utf8(Some(v)) = value { + return string_to_timestamp_ms(v); + } + } + + if let DataType::Boolean = data_type { + if let ScalarValue::Utf8(Some(v)) = value { + return match v.to_lowercase().as_str() { + "true" => Ok(ScalarValue::Boolean(Some(true))), + "false" => Ok(ScalarValue::Boolean(Some(false))), + _ => Ok(ScalarValue::Boolean(None)), + }; + } + } + + let array = value.to_array(); + 
ScalarValue::try_from_array( + &compute::cast(&array, data_type).map_err(DataFusionError::ArrowError)?, + // index: Converts a value in `array` at `index` into a ScalarValue + 0, + ) + } +} + +impl<'a> ExprRewriter for TypeRewriter<'a> { + fn mutate(&mut self, expr: Expr) -> Result { + let new_expr = match expr { + Expr::BinaryExpr { left, op, right } => match op { + Operator::Eq + | Operator::NotEq + | Operator::Lt + | Operator::LtEq + | Operator::Gt + | Operator::GtEq => { + let (left, right) = self.convert_type(&left, &right)?; + Expr::BinaryExpr { + left: Box::new(left), + op, + right: Box::new(right), + } + } + _ => Expr::BinaryExpr { left, op, right }, + }, + Expr::Between { + expr, + negated, + low, + high, + } => { + let (expr, low) = self.convert_type(&expr, &low)?; + let (expr, high) = self.convert_type(&expr, &high)?; + Expr::Between { + expr: Box::new(expr), + negated, + low: Box::new(low), + high: Box::new(high), + } + } + Expr::InList { + expr, + list, + negated, + } => { + let mut list_expr = Vec::with_capacity(list.len()); + for e in list { + let (_, expr_conversion) = self.convert_type(&expr, &e)?; + list_expr.push(expr_conversion); + } + Expr::InList { + expr, + list: list_expr, + negated, + } + } + Expr::Literal(value) => match value { + ScalarValue::TimestampSecond(Some(i), _) => { + timestamp_to_timestamp_ms_expr(TimestampType::Second, i) + } + ScalarValue::TimestampMicrosecond(Some(i), _) => { + timestamp_to_timestamp_ms_expr(TimestampType::Microsecond, i) + } + ScalarValue::TimestampNanosecond(Some(i), _) => { + timestamp_to_timestamp_ms_expr(TimestampType::Nanosecond, i) + } + _ => Expr::Literal(value), + }, + expr => { + // no rewrite possible + expr + } + }; + Ok(new_expr) + } +} + +fn string_to_timestamp_ms(string: &str) -> Result { + Ok(ScalarValue::TimestampMillisecond( + Some( + string_to_timestamp_nanos(string) + .map(|t| t / 1_000_000) + .map_err(DataFusionError::from)?, + ), + None, + )) +} + +#[allow(dead_code)] +enum TimestampType 
{ + Second, + Millisecond, + Microsecond, + Nanosecond, +} + +fn timestamp_to_timestamp_ms_expr(typ: TimestampType, timestamp: i64) -> Expr { + let timestamp = match typ { + TimestampType::Second => timestamp * 1_000, + TimestampType::Millisecond => timestamp, + TimestampType::Microsecond => timestamp / 1_000, + TimestampType::Nanosecond => timestamp / 1_000 / 1_000, + }; + + Expr::Literal(ScalarValue::TimestampMillisecond(Some(timestamp), None)) +} + +#[cfg(test)] +mod tests { + use arrow_deps::{ + arrow::datatypes::TimeUnit, + datafusion::{ + logical_plan::{DFField, DFSchema}, + prelude::col, + }, + }; + + use super::*; + + fn expr_test_schema() -> DFSchemaRef { + Arc::new( + DFSchema::new(vec![ + DFField::new(None, "c1", DataType::Utf8, true), + DFField::new(None, "c2", DataType::Int64, true), + DFField::new(None, "c3", DataType::Float64, true), + DFField::new(None, "c4", DataType::Float32, true), + DFField::new(None, "c5", DataType::Boolean, true), + DFField::new( + None, + "c6", + DataType::Timestamp(TimeUnit::Millisecond, None), + false, + ), + ]) + .unwrap(), + ) + } + + #[test] + fn test_type_conversion_int64() { + let int_value = 100; + let int_str = int_value.to_string(); + let not_int_str = "100ss".to_string(); + let schema = expr_test_schema(); + let mut rewriter = TypeRewriter { + schemas: vec![&schema], + }; + + // Int64 c2 > "100" success + let exp = col("c2").gt(Expr::Literal(ScalarValue::Utf8(Some(int_str.clone())))); + let rewrite_exp = exp.rewrite(&mut rewriter).unwrap(); + assert_eq!( + rewrite_exp, + col("c2").gt(Expr::Literal(ScalarValue::Int64(Some(int_value)),)) + ); + + // Int64 "100" > c2 success + let exp = Expr::Literal(ScalarValue::Utf8(Some(int_str))).gt(col("c2")); + let rewrite_exp = exp.rewrite(&mut rewriter).unwrap(); + assert_eq!( + rewrite_exp, + Expr::Literal(ScalarValue::Int64(Some(int_value))).gt(col("c2")) + ); + + // Int64 c2 > "100ss" fail + let exp = col("c2").gt(Expr::Literal(ScalarValue::Utf8(Some(not_int_str)))); + 
assert!(exp.rewrite(&mut rewriter).is_err()); + } + + #[test] + fn test_type_conversion_float() { + let double_value = 100.1; + let double_str = double_value.to_string(); + let not_int_str = "100ss".to_string(); + let schema = expr_test_schema(); + let mut rewriter = TypeRewriter { + schemas: vec![&schema], + }; + + // Float64 c3 > "100" success + let exp = col("c3").gt(Expr::Literal(ScalarValue::Utf8(Some(double_str.clone())))); + let rewrite_exp = exp.rewrite(&mut rewriter).unwrap(); + assert_eq!( + rewrite_exp, + col("c3").gt(Expr::Literal(ScalarValue::Float64(Some(double_value)),)) + ); + + // Float64 c3 > "100ss" fail + let exp = col("c3").gt(Expr::Literal(ScalarValue::Utf8(Some(not_int_str.clone())))); + assert!(exp.rewrite(&mut rewriter).is_err()); + + // Float32 c4 > "100" success + let exp = col("c4").gt(Expr::Literal(ScalarValue::Utf8(Some(double_str)))); + let rewrite_exp = exp.rewrite(&mut rewriter).unwrap(); + assert_eq!( + rewrite_exp, + col("c4").gt(Expr::Literal(ScalarValue::Float32(Some( + double_value as f32 + )),)) + ); + + // Float32 c4 > "100ss" fail + let exp = col("c4").gt(Expr::Literal(ScalarValue::Utf8(Some(not_int_str)))); + assert!(exp.rewrite(&mut rewriter).is_err()); + } + + #[test] + fn test_type_conversion_boolean() { + let bool_value = true; + let bool_str = bool_value.to_string(); + let not_int_str = "100ss".to_string(); + let schema = expr_test_schema(); + let mut rewriter = TypeRewriter { + schemas: vec![&schema], + }; + + // Boolean c5 > "100ss" fail + let exp = col("c5").gt(Expr::Literal(ScalarValue::Utf8(Some(not_int_str)))); + assert!(exp.rewrite(&mut rewriter).is_err()); + + // Boolean c5 > "true" success + let exp = col("c5").gt(Expr::Literal(ScalarValue::Utf8(Some(bool_str)))); + let rewrite_exp = exp.rewrite(&mut rewriter).unwrap(); + assert_eq!( + rewrite_exp, + col("c5").gt(Expr::Literal(ScalarValue::Boolean(Some(bool_value)),)) + ); + + // Boolean c5 > true success + let exp = 
col("c5").gt(Expr::Literal(ScalarValue::Boolean(Some(bool_value)))); + let rewrite_exp = exp.rewrite(&mut rewriter).unwrap(); + assert_eq!( + rewrite_exp, + col("c5").gt(Expr::Literal(ScalarValue::Boolean(Some(bool_value)),)) + ); + } + + #[test] + fn test_type_conversion_timestamp() { + let date_string = "2021-09-07 16:00:00".to_string(); + let schema = expr_test_schema(); + let mut rewriter = TypeRewriter { + schemas: vec![&schema], + }; + + // Timestamp c6 > "2021-09-07 16:00:00" + let exp = col("c6").gt(Expr::Literal(ScalarValue::Utf8(Some(date_string.clone())))); + let rewrite_exp = exp.rewrite(&mut rewriter).unwrap(); + assert_eq!( + rewrite_exp, + col("c6").gt(Expr::Literal(ScalarValue::TimestampMillisecond( + Some( + string_to_timestamp_nanos(&date_string) + .map(|t| t / 1_000_000) + .unwrap(), + ), + None + ),)) + ); + + // "2021-09-07 16:00:00" > Timestamp c6 + let exp = Expr::Literal(ScalarValue::Utf8(Some(date_string.clone()))).gt(col("c6")); + let rewrite_exp = exp.rewrite(&mut rewriter).unwrap(); + assert_eq!( + rewrite_exp, + Expr::Literal(ScalarValue::TimestampMillisecond( + Some( + string_to_timestamp_nanos(&date_string) + .map(|t| t / 1_000_000) + .unwrap(), + ), + None + ),) + .gt(col("c6")) + ); + + // Timestamp c6 > 1642141472 + let timestamp_int = 1642141472; + let exp = col("c6").gt(Expr::Literal(ScalarValue::TimestampSecond( + Some(timestamp_int), + None, + ))); + let rewrite_exp = exp.rewrite(&mut rewriter).unwrap(); + assert_eq!( + rewrite_exp, + col("c6").gt(Expr::Literal(ScalarValue::TimestampMillisecond( + Some(timestamp_int * 1000), + None + ))) + ); + + // Timestamp c6 between "2021-09-07 16:00:00" and "2021-09-07 17:00:00" + let date_string2 = "2021-09-07 17:00:00".to_string(); + let exp = Expr::Between { + expr: Box::new(col("c6")), + negated: false, + low: Box::new(Expr::Literal(ScalarValue::Utf8(Some(date_string.clone())))), + high: Box::new(Expr::Literal(ScalarValue::Utf8(Some(date_string2.clone())))), + }; + let rewrite_exp = 
exp.rewrite(&mut rewriter).unwrap(); + assert_eq!( + rewrite_exp, + Expr::Between { + expr: Box::new(col("c6")), + negated: false, + low: Box::new(Expr::Literal(ScalarValue::TimestampMillisecond( + Some( + string_to_timestamp_nanos(&date_string) + .map(|t| t / 1_000_000) + .unwrap(), + ), + None + ),)), + high: Box::new(Expr::Literal(ScalarValue::TimestampMillisecond( + Some( + string_to_timestamp_nanos(&date_string2) + .map(|t| t / 1_000_000) + .unwrap(), + ), + None + ),)) + } + ); + } +} diff --git a/query_engine/src/physical_optimizer/coalesce_batches.rs b/query_engine/src/physical_optimizer/coalesce_batches.rs new file mode 100644 index 0000000000..36645aa633 --- /dev/null +++ b/query_engine/src/physical_optimizer/coalesce_batches.rs @@ -0,0 +1,70 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::sync::Arc; + +use arrow_deps::datafusion::{ + physical_optimizer::{coalesce_batches::CoalesceBatches, optimizer::PhysicalOptimizerRule}, + physical_plan::{limit::GlobalLimitExec, ExecutionPlan}, + prelude::ExecutionConfig, +}; + +use crate::physical_optimizer::{Adapter, OptimizeRuleRef}; + +pub struct CoalesceBatchesAdapter { + original_rule: CoalesceBatches, +} + +impl Default for CoalesceBatchesAdapter { + fn default() -> Self { + Self { + original_rule: CoalesceBatches::new(), + } + } +} + +impl Adapter for CoalesceBatchesAdapter { + fn may_adapt(original_rule: OptimizeRuleRef) -> OptimizeRuleRef { + if original_rule.name() == CoalesceBatches::new().name() { + Arc::new(Self::default()) + } else { + original_rule + } + } +} + +impl CoalesceBatchesAdapter { + /// Detect the plan contains any limit plan with a small limit(smaller than + /// `batch_size`). 
+ fn detect_small_limit_plan(plan: &dyn ExecutionPlan, batch_size: usize) -> bool { + if let Some(limit_plan) = plan.as_any().downcast_ref::() { + return limit_plan.limit() < batch_size; + } + + for child_plan in plan.children() { + if Self::detect_small_limit_plan(&*child_plan, batch_size) { + return true; + } + } + + // No small limit plan is found. + false + } +} + +impl PhysicalOptimizerRule for CoalesceBatchesAdapter { + fn optimize( + &self, + plan: Arc, + config: &ExecutionConfig, + ) -> arrow_deps::datafusion::error::Result> { + if Self::detect_small_limit_plan(&*plan, config.runtime.batch_size) { + Ok(plan) + } else { + self.original_rule.optimize(plan, config) + } + } + + fn name(&self) -> &str { + "custom_coalesce_batches" + } +} diff --git a/query_engine/src/physical_optimizer/mod.rs b/query_engine/src/physical_optimizer/mod.rs new file mode 100644 index 0000000000..98571d2d6f --- /dev/null +++ b/query_engine/src/physical_optimizer/mod.rs @@ -0,0 +1,87 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Physical query optimizer + +use std::sync::Arc; + +use arrow_deps::datafusion::{ + error::DataFusionError, physical_optimizer::optimizer::PhysicalOptimizerRule, +}; +use async_trait::async_trait; +use snafu::{Backtrace, ResultExt, Snafu}; +use sql::plan::QueryPlan; + +use crate::{ + context::ContextRef, + physical_optimizer::{ + coalesce_batches::CoalesceBatchesAdapter, repartition::RepartitionAdapter, + }, + physical_plan::{DataFusionPhysicalPlan, PhysicalPlanPtr}, +}; + +pub mod coalesce_batches; +pub mod repartition; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "DataFusion Failed to optimize physical plan, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + // TODO(yingwen): Should we carry plan in this context? 
+ DataFusionOptimize { + source: DataFusionError, + backtrace: Backtrace, + }, +} + +define_result!(Error); + +/// Physical query optimizer that converts a logical plan to a +/// physical plan suitable for execution +#[async_trait] +pub trait PhysicalOptimizer { + /// Create a physical plan from a logical plan + async fn optimize(&mut self, logical_plan: QueryPlan) -> Result; +} + +pub struct PhysicalOptimizerImpl { + ctx: ContextRef, +} + +impl PhysicalOptimizerImpl { + pub fn with_context(ctx: ContextRef) -> Self { + Self { ctx } + } +} + +#[async_trait] +impl PhysicalOptimizer for PhysicalOptimizerImpl { + async fn optimize(&mut self, logical_plan: QueryPlan) -> Result { + let exec_ctx = self.ctx.df_exec_ctx(); + let exec_plan = exec_ctx + .create_physical_plan(&logical_plan.df_plan) + .await + .context(DataFusionOptimize)?; + let physical_plan = DataFusionPhysicalPlan::with_plan(exec_ctx.clone(), exec_plan); + + Ok(Box::new(physical_plan)) + } +} + +pub type OptimizeRuleRef = Arc; + +/// The default optimize rules of the datafusion is not all suitable for our +/// cases so the adapters may change the default rules(normally just decide +/// whether to apply the rule according to the specific plan). +pub trait Adapter { + /// May change the original rule into the custom one. + fn may_adapt(original_rule: OptimizeRuleRef) -> OptimizeRuleRef; +} + +pub fn may_adapt_optimize_rule( + original_rule: Arc, +) -> Arc { + CoalesceBatchesAdapter::may_adapt(RepartitionAdapter::may_adapt(original_rule)) +} diff --git a/query_engine/src/physical_optimizer/repartition.rs b/query_engine/src/physical_optimizer/repartition.rs new file mode 100644 index 0000000000..e45d2c939b --- /dev/null +++ b/query_engine/src/physical_optimizer/repartition.rs @@ -0,0 +1,59 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Adapter for the original datafusion repartiton optimization rule. 
+ +use std::sync::Arc; + +use arrow_deps::datafusion::{ + physical_optimizer::{optimizer::PhysicalOptimizerRule, repartition::Repartition}, + physical_plan::ExecutionPlan, + prelude::ExecutionConfig, +}; +use log::debug; + +use crate::physical_optimizer::{Adapter, OptimizeRuleRef}; + +pub struct RepartitionAdapter { + original_rule: Repartition, +} + +impl Default for RepartitionAdapter { + fn default() -> Self { + Self { + original_rule: Repartition::new(), + } + } +} + +impl Adapter for RepartitionAdapter { + fn may_adapt(original_rule: OptimizeRuleRef) -> OptimizeRuleRef { + if original_rule.name() == Repartition::new().name() { + Arc::new(Self::default()) + } else { + original_rule + } + } +} + +impl PhysicalOptimizerRule for RepartitionAdapter { + fn optimize( + &self, + plan: Arc, + config: &ExecutionConfig, + ) -> arrow_deps::datafusion::error::Result> { + // the underlying plan maybe requires the order of the output. + if plan.output_partitioning().partition_count() == 1 { + debug!( + "RepartitionAdapter avoid repartion optimization for plan:{:?}", + plan + ); + Ok(plan) + } else { + self.original_rule.optimize(plan, config) + } + } + + fn name(&self) -> &str { + "custom-repartition" + } +} diff --git a/query_engine/src/physical_plan.rs b/query_engine/src/physical_plan.rs new file mode 100644 index 0000000000..28e344b839 --- /dev/null +++ b/query_engine/src/physical_plan.rs @@ -0,0 +1,101 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Physical execution plan + +use std::{ + fmt::{Debug, Formatter}, + sync::Arc, +}; + +use arrow_deps::datafusion::{ + error::DataFusionError, + physical_plan::{ + coalesce_partitions::CoalescePartitionsExec, display::DisplayableExecutionPlan, + ExecutionPlan, + }, + prelude::ExecutionContext, +}; +use async_trait::async_trait; +use snafu::{Backtrace, ResultExt, Snafu}; +use table_engine::stream::{FromDfStream, SendableRecordBatchStream}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "DataFusion Failed to execute plan, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + DataFusionExec { + partition_count: usize, + source: DataFusionError, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to convert datafusion stream, err:{}", source))] + ConvertStream { source: table_engine::stream::Error }, +} + +define_result!(Error); + +#[async_trait] +pub trait PhysicalPlan: std::fmt::Debug { + /// execute this plan and returns the result + async fn execute(&self) -> Result; + + /// Convert internal metrics to string. + fn metrics_to_string(&self) -> String; +} + +pub type PhysicalPlanPtr = Box; + +pub struct DataFusionPhysicalPlan { + ctx: ExecutionContext, + plan: Arc, +} + +impl DataFusionPhysicalPlan { + pub fn with_plan(ctx: ExecutionContext, plan: Arc) -> Self { + Self { ctx, plan } + } +} + +impl Debug for DataFusionPhysicalPlan { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("DataFusionPhysicalPlan") + .field("plan", &self.plan) + .finish() + } +} + +#[async_trait] +impl PhysicalPlan for DataFusionPhysicalPlan { + async fn execute(&self) -> Result { + let runtime = self.ctx.state.lock().unwrap().runtime_env.clone(); + let partition_count = self.plan.output_partitioning().partition_count(); + let df_stream = if partition_count <= 1 { + self.plan + .execute(0, runtime) + .await + .context(DataFusionExec { partition_count })? 
+ } else { + // merge into a single partition + let plan = CoalescePartitionsExec::new(self.plan.clone()); + // MergeExec must produce a single partition + assert_eq!(1, plan.output_partitioning().partition_count()); + plan.execute(0, runtime) + .await + .context(DataFusionExec { partition_count })? + }; + + let stream = FromDfStream::new(df_stream).context(ConvertStream)?; + + Ok(Box::pin(stream)) + } + + fn metrics_to_string(&self) -> String { + DisplayableExecutionPlan::with_metrics(&*self.plan) + .indent() + .to_string() + } +} diff --git a/rust-toolchain b/rust-toolchain new file mode 100644 index 0000000000..58d0130e05 --- /dev/null +++ b/rust-toolchain @@ -0,0 +1 @@ +nightly-2022-01-06 diff --git a/rustfmt.toml b/rustfmt.toml new file mode 100644 index 0000000000..61594ccda0 --- /dev/null +++ b/rustfmt.toml @@ -0,0 +1,14 @@ +# https://github.com/rust-lang/rustfmt/blob/master/Configurations.md + +# Break comments to fit on the line +wrap_comments = true +# Merge multiple imports into a single nested import. +imports_granularity = "Crate" +# Format code snippet included in doc comments. +format_code_in_doc_comments = true +# Reorder impl items. type and const are put first, then macros and methods. 
+reorder_impl_items = true +# Discard existing import groups, and create three groups for std, external crates, crates +group_imports = "StdExternalCrate" + +license_template_path = "etc/license.template" \ No newline at end of file diff --git a/server/Cargo.toml b/server/Cargo.toml new file mode 100644 index 0000000000..5f7b349704 --- /dev/null +++ b/server/Cargo.toml @@ -0,0 +1,44 @@ +[package] +name = "server" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +analytic_engine = { path = "../analytic_engine" } +arrow_deps = { path = "../arrow_deps" } +async-trait = "0.1.41" +avro-rs = "0.13" +catalog = { path = "../catalog" } +ceresdbproto = { git = "https://github.com/CeresDB/ceresdbproto.git"} +common_types = { path = "../common_types" } +common_util = { path = "../common_util" } +futures = "0.3" +grpcio = { path = "../grpcio" } +http = "0.2" +interpreters = { path = "../interpreters" } +lazy_static = "1.4.0" +log = "0.4" +logger = { path = "../components/logger" } +meta_client = { path = "../meta_client" } +profile = { path = "../components/profile" } +protobuf = "2.20" +query_engine = { path = "../query_engine" } +prometheus = "0.12" +prometheus-static-metric = "0.5" +serde = "1.0" +serde_derive = "1.0" +serde_json = "1.0.60" +snafu = { version ="0.6.10", features = ["backtraces"]} +sql = { path = "../sql" } +system_catalog = { path = "../system_catalog" } +table_engine = { path = "../table_engine" } +tokio = { version = "1.0", features = ["full"] } +twox-hash = "1.6" +udf = { path = "../udf" } +warp = "0.3" + +[dev-dependencies] +sql = { path = "../sql" , features=["test"]} diff --git a/server/src/avro_util.rs b/server/src/avro_util.rs new file mode 100644 index 0000000000..69ab049ca3 --- /dev/null +++ b/server/src/avro_util.rs @@ -0,0 +1,166 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Avro utility + +use std::collections::HashMap; + +use avro_rs::{ + schema::{Name, RecordField, RecordFieldOrder}, + types::{Record, Value}, +}; +use common_types::{ + bytes::ByteVec, + column::ColumnBlock, + datum::{Datum, DatumKind}, + record_batch::RecordBatch, + schema::RecordSchema, +}; +use common_util::define_result; +use snafu::{Backtrace, ResultExt, Snafu}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "Failed to write avro record, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + WriteAvroRecord { + source: avro_rs::Error, + backtrace: Backtrace, + }, +} + +define_result!(Error); + +/// Create [avro_rs::Schema] with given `name` from [RecordSchema] +pub fn to_avro_schema(name: &str, schema: &RecordSchema) -> avro_rs::Schema { + let columns = schema.columns(); + let mut lookup = HashMap::with_capacity(columns.len()); + let mut avro_fields = Vec::with_capacity(columns.len()); + + for (pos, column) in columns.iter().enumerate() { + // Create avro record field + let default = if column.is_nullable { + Some(serde_json::value::Value::Null) + } else { + None + }; + + let field_schema = if column.is_nullable { + // We want to declare a schema which may be either a null or non-null value, + // for example: ["null", "string"]. + // + // However, `avro_rs` does not provide an accessible API to build a `Union`. + // We can't find a better way to do this than using JSON. 
+ let field_schema_str = format!( + r#"["null", {}]"#, + data_type_to_schema(&column.data_type).canonical_form() + ); + avro_rs::Schema::parse_str(&field_schema_str).unwrap() + } else { + data_type_to_schema(&column.data_type) + }; + + let record_field = RecordField { + name: column.name.clone(), + doc: None, + default, + schema: field_schema, + order: RecordFieldOrder::Ignore, + position: pos, + }; + + avro_fields.push(record_field); + lookup.insert(column.name.clone(), pos); + } + + avro_rs::Schema::Record { + name: Name::new(name), + doc: None, + fields: avro_fields, + lookup, + } +} + +fn data_type_to_schema(data_type: &DatumKind) -> avro_rs::Schema { + match data_type { + DatumKind::Null => avro_rs::Schema::Null, + DatumKind::Timestamp => avro_rs::Schema::TimestampMillis, + DatumKind::Double => avro_rs::Schema::Double, + DatumKind::Float => avro_rs::Schema::Float, + DatumKind::Varbinary => avro_rs::Schema::Bytes, + DatumKind::String => avro_rs::Schema::String, + DatumKind::UInt32 | DatumKind::Int64 | DatumKind::UInt64 => avro_rs::Schema::Long, + DatumKind::UInt16 + | DatumKind::UInt8 + | DatumKind::Int32 + | DatumKind::Int16 + | DatumKind::Int8 => avro_rs::Schema::Int, + DatumKind::Boolean => avro_rs::Schema::Boolean, + } +} + +/// Convert record batch to avro format +pub fn record_batch_to_avro( + record_batch: &RecordBatch, + schema: &avro_rs::Schema, + rows: &mut Vec, +) -> Result<()> { + let record_batch_schema = record_batch.schema(); + assert_eq!( + record_batch_schema.num_columns(), + record_batch.num_columns() + ); + + rows.reserve(record_batch.num_rows()); + + let column_schemas = record_batch_schema.columns(); + for row_idx in 0..record_batch.num_rows() { + let mut record = Record::new(schema).unwrap(); + for (col_idx, column_schema) in column_schemas.iter().enumerate() { + let column = record_batch.column(col_idx); + let value = column_to_value(column, row_idx, column_schema.is_nullable); + + record.put(&column_schema.name, value); + } + + let 
row_bytes = avro_rs::to_avro_datum(schema, record).context(WriteAvroRecord)?; + + rows.push(row_bytes); + } + + Ok(()) +} + +/// Panic if row_idx is out of bound. +fn column_to_value(array: &ColumnBlock, row_idx: usize, is_nullable: bool) -> Value { + let datum = array.datum(row_idx); + match datum { + Datum::Null => may_union(Value::Null, is_nullable), + Datum::Timestamp(v) => may_union(Value::TimestampMillis(v.as_i64()), is_nullable), + Datum::Double(v) => may_union(Value::Double(v), is_nullable), + Datum::Float(v) => may_union(Value::Float(v), is_nullable), + Datum::Varbinary(v) => may_union(Value::Bytes(v.to_vec()), is_nullable), + Datum::String(v) => may_union(Value::String(v.to_string()), is_nullable), + // TODO(yingwen): Should we return error if overflow? Avro does not support uint64. + Datum::UInt64(v) => may_union(Value::Long(v as i64), is_nullable), + Datum::Int64(v) => may_union(Value::Long(v), is_nullable), + Datum::UInt32(v) => may_union(Value::Long(i64::from(v)), is_nullable), + Datum::UInt16(v) => may_union(Value::Int(i32::from(v)), is_nullable), + Datum::UInt8(v) => may_union(Value::Int(i32::from(v)), is_nullable), + Datum::Int32(v) => may_union(Value::Int(v), is_nullable), + Datum::Int16(v) => may_union(Value::Int(i32::from(v)), is_nullable), + Datum::Int8(v) => may_union(Value::Int(i32::from(v)), is_nullable), + Datum::Boolean(v) => may_union(Value::Boolean(v), is_nullable), + } +} + +#[inline] +fn may_union(val: Value, is_nullable: bool) -> Value { + if is_nullable { + Value::Union(Box::new(val)) + } else { + val + } +} diff --git a/server/src/config.rs b/server/src/config.rs new file mode 100644 index 0000000000..3a62758a0d --- /dev/null +++ b/server/src/config.rs @@ -0,0 +1,88 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Server configs + +use analytic_engine; +use meta_client::MetaClientConfig; +use serde_derive::Deserialize; + +use crate::router::RuleList; + +#[derive(Debug, Deserialize)] +#[serde(default)] +pub struct RuntimeConfig { + // Runtime for reading data + pub read_thread_num: usize, + // Runtime for writing data + pub write_thread_num: usize, + // Runtime for background tasks + pub background_thread_num: usize, +} + +// TODO(yingwen): Split config into several sub configs. +#[derive(Debug, Deserialize)] +#[serde(default)] +pub struct Config { + /// The address to listen. + pub bind_addr: String, + pub http_port: u16, + pub grpc_port: u16, + pub grpc_server_cq_count: usize, + + // Engine related configs: + pub runtime: RuntimeConfig, + + // Log related configs: + pub log_level: String, + pub enable_async_log: bool, + pub async_log_channel_len: i32, + + // Tracing related configs: + pub tracing_log_dir: String, + pub tracing_log_name: String, + pub tracing_level: String, + + // Meta client related configs: + pub meta_client: MetaClientConfig, + // Config of router. 
+ pub route_rules: RuleList, + + // Analytic engine configs: + pub analytic: analytic_engine::Config, +} + +impl Default for RuntimeConfig { + fn default() -> Self { + Self { + read_thread_num: 8, + write_thread_num: 8, + background_thread_num: 8, + } + } +} + +impl Default for Config { + fn default() -> Self { + let grpc_port = 8831; + Self { + bind_addr: String::from("127.0.0.1"), + http_port: 5000, + grpc_port, + grpc_server_cq_count: 20, + runtime: RuntimeConfig::default(), + log_level: "debug".to_string(), + enable_async_log: true, + async_log_channel_len: 102400, + tracing_log_dir: String::from("/tmp/ceresdbx"), + tracing_log_name: String::from("tracing"), + tracing_level: String::from("info"), + meta_client: MetaClientConfig { + node: String::from("127.0.0.1"), + port: grpc_port, + ..Default::default() + }, + route_rules: RuleList::default(), + analytic: analytic_engine::Config::default(), + } + } +} diff --git a/server/src/consts.rs b/server/src/consts.rs new file mode 100644 index 0000000000..bbaa5c1c98 --- /dev/null +++ b/server/src/consts.rs @@ -0,0 +1,8 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Common constants + +/// Header of catalog name +pub const CATALOG_HEADER: &str = "x-ceresdbx-catalog"; +/// Header of tenant name +pub const TENANT_HEADER: &str = "x-ceresdbx-access-tenant"; diff --git a/server/src/context.rs b/server/src/context.rs new file mode 100644 index 0000000000..119c3ec984 --- /dev/null +++ b/server/src/context.rs @@ -0,0 +1,81 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Server context + +use std::sync::Arc; + +use common_util::runtime::Runtime; +use snafu::{ensure, Backtrace, OptionExt, Snafu}; + +#[allow(clippy::enum_variant_names)] +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Missing catalog.\nBacktrace:\n{}", backtrace))] + MissingCatalog { backtrace: Backtrace }, + + #[snafu(display("Missing tenant.\nBacktrace:\n{}", backtrace))] + MissingTenant { backtrace: Backtrace }, + + #[snafu(display("Missing runtime.\nBacktrace:\n{}", backtrace))] + MissingRuntime { backtrace: Backtrace }, +} + +define_result!(Error); + +/// Server request context +/// +/// Context for request, may contains +/// 1. Request context and options +/// 2. Info from http headers +pub struct RequestContext { + /// Catalog of the request + pub catalog: String, + /// Tenant of request + pub tenant: String, + /// Runtime of this request + pub runtime: Arc, +} + +impl RequestContext { + pub fn builder() -> Builder { + Builder::default() + } +} + +#[derive(Default)] +pub struct Builder { + catalog: String, + tenant: String, + runtime: Option>, +} + +impl Builder { + pub fn catalog(mut self, catalog: String) -> Self { + self.catalog = catalog; + self + } + + pub fn tenant(mut self, tenant: String) -> Self { + self.tenant = tenant; + self + } + + pub fn runtime(mut self, runtime: Arc) -> Self { + self.runtime = Some(runtime); + self + } + + pub fn build(self) -> Result { + ensure!(!self.catalog.is_empty(), MissingCatalog); + // We use tenant as schema, so we use default schema if tenant is not specific + ensure!(!self.tenant.is_empty(), MissingTenant); + + let runtime = self.runtime.context(MissingRuntime)?; + + Ok(RequestContext { + catalog: self.catalog, + tenant: self.tenant, + runtime, + }) + } +} diff --git a/server/src/error.rs b/server/src/error.rs new file mode 100644 index 0000000000..47006fde7e --- /dev/null +++ b/server/src/error.rs @@ -0,0 +1,67 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Server error + +use common_util::define_result; +use snafu::Snafu; + +/// Server status code +#[derive(Debug, Clone, Copy)] +pub enum StatusCode { + Ok = 200, + InvalidArgument = 400, + NotFound = 404, + TooManyRequests = 429, + InternalError = 500, +} + +impl StatusCode { + pub fn as_u32(&self) -> u32 { + *self as u32 + } +} + +define_result!(ServerError); + +#[derive(Snafu, Debug)] +#[snafu(visibility(pub(crate)))] +pub enum ServerError { + #[snafu(display("Rpc error, code:{}, message:{}", code.as_u32(), msg))] + ErrNoCause { code: StatusCode, msg: String }, + + #[snafu(display("Rpc error, code:{}, message:{}, cause:{}", code.as_u32(), msg, source))] + ErrWithCause { + code: StatusCode, + msg: String, + source: Box, + }, +} + +impl ServerError { + pub fn code(&self) -> StatusCode { + match *self { + ServerError::ErrNoCause { code, .. } => code, + ServerError::ErrWithCause { code, .. } => code, + } + } + + /// Get the error message returned to the user. + pub fn error_message(&self) -> String { + match self { + ServerError::ErrNoCause { msg, .. } => msg.clone(), + + ServerError::ErrWithCause { msg, source, .. } => { + let err_string = source.to_string(); + let first_line = first_line_in_error(&err_string); + format!("{}. Caused by: {}", msg, first_line) + } + } + } +} + +/// Returns first line in error message, now we use this hack to exclude +/// backtrace from error message that returned to user. +// TODO(yingwen): Consider a better way to get the error message. +pub(crate) fn first_line_in_error(err_string: &str) -> &str { + err_string.split('\n').next().unwrap_or(err_string) +} diff --git a/server/src/grpc/metrics.rs b/server/src/grpc/metrics.rs new file mode 100644 index 0000000000..aec9f7acdc --- /dev/null +++ b/server/src/grpc/metrics.rs @@ -0,0 +1,42 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. 
+ +// Grpc server metrics + +use lazy_static::lazy_static; +use prometheus::{exponential_buckets, register_histogram_vec, HistogramVec}; +use prometheus_static_metric::{auto_flush_from, make_auto_flush_static_metric}; + +// Register auto flush static metrics. +make_auto_flush_static_metric! { + pub label_enum GrpcTypeKind { + handle_route, + handle_write, + handle_query, + handle_stream_write, + handle_stream_query, + } + + pub struct GrpcHandlerDurationHistogramVec: LocalHistogram { + "type" => GrpcTypeKind, + } +} + +// Register global metrics. +lazy_static! { + pub static ref GRPC_HANDLER_DURATION_HISTOGRAM_VEC_GLOBAL: HistogramVec = + register_histogram_vec!( + "grpc_handler_duration", + "Bucketed histogram of grpc server handler", + &["type"], + exponential_buckets(0.0005, 2.0, 20).unwrap() + ) + .unwrap(); +} + +// Register thread local metrics with default flush interval (1s). +lazy_static! { + pub static ref GRPC_HANDLER_DURATION_HISTOGRAM_VEC: GrpcHandlerDurationHistogramVec = auto_flush_from!( + GRPC_HANDLER_DURATION_HISTOGRAM_VEC_GLOBAL, + GrpcHandlerDurationHistogramVec + ); +} diff --git a/server/src/grpc/mod.rs b/server/src/grpc/mod.rs new file mode 100644 index 0000000000..521400ab72 --- /dev/null +++ b/server/src/grpc/mod.rs @@ -0,0 +1,1034 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Grpc services + +use std::{ + collections::{BTreeMap, HashMap}, + sync::Arc, + time::Instant, +}; + +use async_trait::async_trait; +use catalog::{consts as catalogConst, manager::Manager as CatalogManager}; +use ceresdbproto::{ + common::ResponseHeader, + prometheus::{PrometheusQueryRequest, PrometheusQueryResponse}, + storage::{ + QueryRequest, QueryResponse, RouteRequest, RouteResponse, Value_oneof_value, WriteMetric, + WriteRequest, WriteResponse, + }, + storage_grpc::{self, StorageService}, +}; +use common_types::{ + column_schema::{self, ColumnSchema}, + datum::DatumKind, + schema::{Builder as SchemaBuilder, Error as SchemaError, Schema, TSID_COLUMN}, +}; +use common_util::{define_result, time::InstantExt}; +use futures::{stream::StreamExt, FutureExt, SinkExt, TryFutureExt}; +use grpcio::{ + ClientStreamingSink, Environment, Metadata, RequestStream, RpcContext, Server, ServerBuilder, + ServerStreamingSink, UnarySink, WriteFlags, +}; +use log::{error, info}; +use meta_client::{ + ClusterViewRef, FailGetCatalog, FailOnChangeView, MetaClient, MetaClientConfig, MetaWatcher, + SchemaConfig, +}; +use query_engine::executor::Executor as QueryExecutor; +use snafu::{ensure, Backtrace, OptionExt, ResultExt, Snafu}; +use sql::plan::CreateTablePlan; +use table_engine::engine::EngineRuntimes; +use tokio::sync::oneshot; + +use crate::{ + consts, + error::{ErrNoCause, ErrWithCause, Result as ServerResult, ServerError, StatusCode}, + grpc::metrics::GRPC_HANDLER_DURATION_HISTOGRAM_VEC, + instance::InstanceRef, + router::{Router, RouterRef, RuleBasedRouter, RuleList}, +}; + +mod metrics; +mod prom_query; +mod query; +mod route; +mod write; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display( + "Failed to build rpc server, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + BuildRpcServer { + source: grpcio::Error, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to build meta client, err:{}", source))] + BuildMetaClient { source: meta_client::Error }, + + 
#[snafu(display("Failed to start meta client, err:{}", source))] + StartMetaClient { source: meta_client::Error }, + + #[snafu(display("Missing meta client config.\nBacktrace:\n{}", backtrace))] + MissingMetaClientConfig { backtrace: Backtrace }, + + #[snafu(display("Missing grpc environment.\nBacktrace:\n{}", backtrace))] + MissingEnv { backtrace: Backtrace }, + + #[snafu(display("Missing runtimes.\nBacktrace:\n{}", backtrace))] + MissingRuntimes { backtrace: Backtrace }, + + #[snafu(display("Missing instance.\nBacktrace:\n{}", backtrace))] + MissingInstance { backtrace: Backtrace }, + + #[snafu(display("Catalog name is not utf8.\nBacktrace:\n{}", backtrace))] + ParseCatalogName { + source: std::string::FromUtf8Error, + backtrace: Backtrace, + }, + + #[snafu(display("Schema name is not utf8.\nBacktrace:\n{}", backtrace))] + ParseSchemaName { + source: std::string::FromUtf8Error, + backtrace: Backtrace, + }, + + #[snafu(display("Fail to build table schema for metric: {}, err:{}", metric, source))] + BuildTableSchema { metric: String, source: SchemaError }, + + #[snafu(display( + "Fail to build column schema from column: {}, err:{}", + column_name, + source + ))] + BuildColumnSchema { + column_name: String, + source: column_schema::Error, + }, + #[snafu(display("Invalid column: {} schema, err:{}", column_name, source))] + InvalidColumnSchema { + column_name: String, + source: column_schema::Error, + }, + + #[snafu(display("Invalid argument: {}", msg))] + InvalidArgument { msg: String }, + + #[snafu(display( + "Failed to send response to grpc sink, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + GrpcSink { + source: grpcio::Error, + backtrace: Backtrace, + }, +} + +const STREAM_QUERY_CHANNEL_LEN: usize = 20; + +define_result!(Error); + +/// Rpc request header +#[derive(Debug, Default)] +pub struct RequestHeader { + metas: HashMap>, +} + +impl From<&Metadata> for RequestHeader { + fn from(meta: &Metadata) -> Self { + let metas = meta + .iter() + .map(|(k, v)| 
(k.to_string(), v.to_vec())) + .collect(); + + Self { metas } + } +} + +impl RequestHeader { + pub fn get(&self, key: &str) -> Option<&[u8]> { + self.metas.get(key).map(|v| v.as_slice()) + } +} + +pub struct HandlerContext<'a, C, Q> { + #[allow(dead_code)] + header: RequestHeader, + router: RouterRef, + instance: InstanceRef, + catalog: String, + schema: String, + schema_config: Option<&'a SchemaConfig>, +} + +impl<'a, C: CatalogManager, Q> HandlerContext<'a, C, Q> { + fn new( + header: RequestHeader, + router: Arc, + instance: InstanceRef, + cluster_view: &'a ClusterViewRef, + ) -> Result { + let default_catalog = instance.catalog_manager.default_catalog_name(); + let default_schema = instance.catalog_manager.default_schema_name(); + + let catalog = header + .get(consts::CATALOG_HEADER) + .map(|v| String::from_utf8(v.to_vec())) + .transpose() + .context(ParseCatalogName)? + .unwrap_or_else(|| default_catalog.to_string()); + + let schema = header + .get(consts::TENANT_HEADER) + .map(|v| String::from_utf8(v.to_vec())) + .transpose() + .context(ParseSchemaName)? + .unwrap_or_else(|| default_schema.to_string()); + + let schema_config = cluster_view.schema_configs.get(&schema); + + Ok(Self { + header, + router, + instance, + catalog, + schema, + schema_config, + }) + } + + #[inline] + fn catalog(&self) -> &str { + &self.catalog + } + + #[inline] + fn tenant(&self) -> &str { + &self.schema + } +} + +/// Rpc services manages all grpc services of the server. 
+pub struct RpcServices { + /// The grpc server + rpc_server: Server, + /// Meta client + meta_client: Arc, +} + +impl RpcServices { + /// Start the rpc services + pub async fn start(&mut self) -> Result<()> { + self.meta_client.start().await.context(StartMetaClient)?; + + self.rpc_server.start(); + for (host, port) in self.rpc_server.bind_addrs() { + info!("Grpc server listening on {}:{}", host, port); + } + + Ok(()) + } + + pub fn shutdown(&mut self) { + self.rpc_server.shutdown(); + } +} + +pub struct Builder { + bind_addr: String, + port: u16, + meta_client_config: Option, + env: Option>, + runtimes: Option>, + instance: Option>, + route_rules: RuleList, +} + +impl Builder { + pub fn new() -> Self { + Self { + bind_addr: String::from("0.0.0.0"), + port: 38081, + meta_client_config: None, + env: None, + runtimes: None, + instance: None, + route_rules: RuleList::default(), + } + } + + pub fn bind_addr(mut self, addr: String) -> Self { + self.bind_addr = addr; + self + } + + pub fn port(mut self, port: u16) -> Self { + self.port = port; + self + } + + pub fn meta_client_config(mut self, config: MetaClientConfig) -> Self { + self.meta_client_config = Some(config); + self + } + + pub fn env(mut self, env: Arc) -> Self { + self.env = Some(env); + self + } + + pub fn runtimes(mut self, runtimes: Arc) -> Self { + self.runtimes = Some(runtimes); + self + } + + pub fn instance(mut self, instance: InstanceRef) -> Self { + self.instance = Some(instance); + self + } + + pub fn route_rules(mut self, route_rules: RuleList) -> Self { + self.route_rules = route_rules; + self + } +} + +impl Builder { + pub fn build(self) -> Result { + let meta_client_config = self.meta_client_config.context(MissingMetaClientConfig)?; + let runtimes = self.runtimes.context(MissingRuntimes)?; + let instance = self.instance.context(MissingInstance)?; + + let watcher = Box::new(SchemaWatcher { + catalog_manager: instance.catalog_manager.clone(), + }); + + let meta_client = 
meta_client::build_meta_client( + meta_client_config, + runtimes.bg_runtime.clone(), + Some(watcher), + ) + .context(BuildMetaClient)?; + let router = Arc::new(RuleBasedRouter::new(meta_client.clone(), self.route_rules)); + let storage_service = StorageServiceImpl { + router, + instance, + runtimes, + meta_client: meta_client.clone(), + }; + let rpc_service = storage_grpc::create_storage_service(storage_service); + + let env = self.env.context(MissingEnv)?; + + let rpc_server = ServerBuilder::new(env) + .register_service(rpc_service) + .bind(self.bind_addr, self.port) + .build() + .context(BuildRpcServer)?; + + Ok(RpcServices { + rpc_server, + meta_client, + }) + } +} + +struct SchemaWatcher { + catalog_manager: C, +} + +#[async_trait] +impl MetaWatcher for SchemaWatcher { + async fn on_change(&self, view: ClusterViewRef) -> meta_client::Result<()> { + for schema in view.schema_shards.keys() { + let default_catalog = catalogConst::DEFAULT_CATALOG; + if let Some(catalog) = self + .catalog_manager + .catalog_by_name(default_catalog) + .map_err(|e| Box::new(e) as _) + .context(FailGetCatalog { + catalog: default_catalog, + })? 
+ { + catalog + .create_schema(schema) + .await + .map_err(|e| Box::new(e) as _) + .context(FailOnChangeView { + schema, + catalog: default_catalog, + })?; + } + } + Ok(()) + } +} + +fn build_err_header(err: ServerError) -> ResponseHeader { + let mut header = ResponseHeader::new(); + header.set_code(err.code().as_u32()); + header.set_error(err.error_message()); + + header +} + +fn build_ok_header() -> ResponseHeader { + let mut header = ResponseHeader::new(); + header.set_code(StatusCode::Ok.as_u32()); + + header +} + +struct StorageServiceImpl { + router: Arc, + instance: InstanceRef, + runtimes: Arc, + meta_client: Arc, +} + +impl Clone for StorageServiceImpl { + fn clone(&self) -> Self { + Self { + router: self.router.clone(), + instance: self.instance.clone(), + runtimes: self.runtimes.clone(), + meta_client: self.meta_client.clone(), + } + } +} + +macro_rules! handle_request { + ($mod_name: ident, $handle_fn: ident, $req_ty: ident, $resp_ty: ident) => { + fn $mod_name(&mut self, ctx: RpcContext<'_>, req: $req_ty, sink: UnarySink<$resp_ty>) { + let begin_instant = Instant::now(); + + let router = self.router.clone(); + let header = RequestHeader::from(ctx.request_headers()); + let instance = self.instance.clone(); + let (tx, rx) = oneshot::channel(); + + // The future spawned by tokio cannot be executed by other executor/runtime, so + + let runtime = match stringify!($mod_name) { + "query" => &self.runtimes.read_runtime, + "write" => &self.runtimes.write_runtime, + _ => &self.runtimes.bg_runtime, + }; + + let cluster_view = self.meta_client.get_cluster_view(); + // we need to pass the result via channel + runtime.spawn( + async move { + let handler_ctx = HandlerContext::new(header, router, instance, &cluster_view) + .map_err(|e| Box::new(e) as _) + .context(ErrWithCause { + code: StatusCode::InvalidArgument, + msg: "Invalid header", + })?; + $mod_name::$handle_fn(&handler_ctx, req).await.map_err(|e| { + error!( + "Failed to handle request, mod:{}, handler:{}, 
err:{}", + stringify!($mod_name), + stringify!($handle_fn), + e + ); + e + }) + } + .then(|resp_result| async move { + if tx.send(resp_result).is_err() { + error!( + "Failed to send handler result, mod:{}, handler:{}", + stringify!($mod_name), + stringify!($handle_fn), + ) + } + }), + ); + + let task = async move { + let resp_result = match rx.await { + Ok(resp_result) => resp_result, + Err(_e) => ErrNoCause { + code: StatusCode::InternalError, + msg: "Result channel disconnected", + } + .fail(), + }; + + let resp = match resp_result { + Ok(resp) => resp, + Err(e) => { + let mut resp = $resp_ty::new(); + resp.set_header(build_err_header(e)); + resp + } + }; + let ret = sink.success(resp).await.context(GrpcSink); + + GRPC_HANDLER_DURATION_HISTOGRAM_VEC + .$handle_fn + .observe(begin_instant.saturating_elapsed().as_secs_f64()); + + ret?; + + Result::Ok(()) + } + .map_err(move |e| { + error!( + "Failed to reply grpc resp, mod:{}, handler:{}, err:{:?}", + stringify!($mod_name), + stringify!($handle_fn), + e + ) + }) + .map(|_| ()); + + ctx.spawn(task); + } + }; +} + +impl StorageService + for StorageServiceImpl +{ + handle_request!(route, handle_route, RouteRequest, RouteResponse); + + handle_request!(write, handle_write, WriteRequest, WriteResponse); + + handle_request!(query, handle_query, QueryRequest, QueryResponse); + + handle_request!( + prom_query, + handle_query, + PrometheusQueryRequest, + PrometheusQueryResponse + ); + + fn stream_write( + &mut self, + ctx: RpcContext<'_>, + mut stream_req: RequestStream, + sink: ClientStreamingSink, + ) { + let begin_instant = Instant::now(); + let router = self.router.clone(); + let header = RequestHeader::from(ctx.request_headers()); + let instance = self.instance.clone(); + let cluster_view = self.meta_client.get_cluster_view(); + + let (tx, rx) = oneshot::channel(); + self.runtimes.write_runtime.spawn(async move { + let handler_ctx = HandlerContext::new(header, router, instance, &cluster_view) + .map_err(|e| Box::new(e) 
as _) + .context(ErrWithCause { + code: StatusCode::InvalidArgument, + msg: "Invalid header", + })?; + let mut total_success = 0; + let mut resp = WriteResponse::new(); + let mut has_err = false; + while let Some(req) = stream_req.next().await { + let write_result = write::handle_write( + &handler_ctx, + req.map_err(|e| Box::new(e) as _).context(ErrWithCause { + code: StatusCode::InternalError, + msg: "Failed to fetch request", + })?, + ) + .await + .map_err(|e| { + error!("Failed to handle request, mod:stream_write, handler:handle_stream_write, err:{}", e); + e + }); + + match write_result { + Ok(write_resp) => total_success += write_resp.success, + Err(e) => { + resp.set_header(build_err_header(e)); + has_err = true; + break; + } + } + } + if !has_err { + resp.set_header(build_ok_header()); + resp.set_success(total_success as u32); + } + + ServerResult::Ok(resp) + }.then(|resp_result| async move { + if tx.send(resp_result).is_err() { + error!("Failed to send handler result, mod:stream_write, handler:handle_stream_write"); + } + }), + ); + + let task = async move { + let resp_result = match rx.await { + Ok(resp_result) => resp_result, + Err(_e) => ErrNoCause { + code: StatusCode::InternalError, + msg: "Result channel disconnected", + } + .fail(), + }; + + let resp = match resp_result { + Ok(resp) => resp, + Err(e) => { + let mut resp = WriteResponse::new(); + resp.set_header(build_err_header(e)); + resp + } + }; + sink.success(resp).await.context(GrpcSink)?; + + GRPC_HANDLER_DURATION_HISTOGRAM_VEC + .handle_stream_write + .observe(begin_instant.saturating_elapsed().as_secs_f64()); + + Result::Ok(()) + } + .map_err(move |e| { + error!( + "Failed to reply grpc resp, mod:stream_write, handler:handle_stream_write, err:{}", + e + ) + }) + .map(|_| ()); + + ctx.spawn(task); + } + + fn stream_query( + &mut self, + ctx: RpcContext<'_>, + req: QueryRequest, + mut sink: ServerStreamingSink, + ) { + let begin_instant = Instant::now(); + let router = self.router.clone(); + 
let header = RequestHeader::from(ctx.request_headers()); + let instance = self.instance.clone(); + let cluster_view = self.meta_client.get_cluster_view(); + let (tx, mut rx) = tokio::sync::mpsc::channel(STREAM_QUERY_CHANNEL_LEN); + self.runtimes.read_runtime.spawn(async move { + let handler_ctx = HandlerContext::new(header, router, instance, &cluster_view) + .map_err(|e| Box::new(e) as _) + .context(ErrWithCause { + code: StatusCode::InvalidArgument, + msg: "Invalid header", + })?; + let output = query::fetch_query_output(&handler_ctx, &req) + .await + .map_err(|e| { + error!("Failed to handle request, mod:stream_query, handler:handle_stream_query, err:{}", e); + e + })?; + if let Some(batch) = query::get_record_batch(&output) { + for i in 0..batch.len() { + let resp = query::convert_records(&batch[i..i + 1]); + if tx.send(resp).await.is_err() { + error!("Failed to send handler result, mod:stream_query, handler:handle_stream_query"); + break; + } + } + } else { + let mut resp = QueryResponse::new(); + resp.set_header(build_ok_header()); + + if tx.send(ServerResult::Ok(resp)).await.is_err() { + error!("Failed to send handler result, mod:stream_query, handler:handle_stream_query"); + } + } + ServerResult::Ok(()) + }); + + let mut has_err = false; + let task = async move { + while let Some(result) = rx.recv().await { + let resp = match result { + Ok(resp) => resp, + Err(e) => { + has_err = true; + let mut resp = QueryResponse::new(); + resp.set_header(build_err_header(e)); + resp + } + }; + sink.send((resp, WriteFlags::default())) + .await + .context(GrpcSink)?; + if has_err { + break; + } + } + sink.flush().await.context(GrpcSink)?; + sink.close().await.context(GrpcSink)?; + GRPC_HANDLER_DURATION_HISTOGRAM_VEC + .handle_stream_query + .observe(begin_instant.saturating_elapsed().as_secs_f64()); + Result::Ok(()) + } + .map_err(move |e| { + error!( + "Failed to reply grpc resp, mod:stream_query, handler:handle_stream_query, err:{}", + e + ); + }) + .map(|_| ()); + + 
ctx.spawn(task); + } +} + +/// Create CreateTablePlan from a write metric. +// The caller must ENSURE that the HandlerContext's schema_config is not None. +pub fn write_metric_to_create_table_plan< + C: CatalogManager + 'static, + Q: QueryExecutor + 'static, +>( + ctx: &HandlerContext, + write_metric: &WriteMetric, +) -> Result { + let schema_config = ctx.schema_config.unwrap(); + Ok(CreateTablePlan { + engine: schema_config.default_engine_type.clone(), + if_not_exists: true, + table: write_metric.get_metric().to_string(), + table_schema: build_schema_from_metric(schema_config, write_metric)?, + options: HashMap::default(), + }) +} + +fn build_column_schema( + column_name: &str, + data_type: DatumKind, + is_tag: bool, +) -> Result { + let builder = column_schema::Builder::new(column_name.to_string(), data_type) + .is_nullable(true) + .is_tag(is_tag); + + builder.build().context(BuildColumnSchema { column_name }) +} + +fn build_schema_from_metric(schema_config: &SchemaConfig, metric: &WriteMetric) -> Result { + let field_names = metric.get_field_names(); + let tag_names = metric.get_tag_names(); + let table_name = metric.get_metric(); + + let mut schema_builder = + SchemaBuilder::with_capacity(field_names.len()).auto_increment_column_id(true); + + let write_entries = metric.get_entries(); + + ensure!( + !write_entries.is_empty(), + InvalidArgument { + msg: format!("Emtpy write entires to write table:{}", table_name,), + } + ); + + let mut name_column_map: BTreeMap<_, ColumnSchema> = BTreeMap::new(); + for write_entry in write_entries { + // parse tags + for tag in write_entry.get_tags() { + let name_index = tag.name_index as usize; + ensure!( + name_index < tag_names.len(), + InvalidArgument { + msg: format!( + "tag index {} is not found in tag_names:{:?}, table:{}", + name_index, tag_names, table_name, + ), + } + ); + + let tag_name = &tag_names[name_index]; + + let tag_value = tag + .get_value() + .value + .as_ref() + .with_context(|| InvalidArgument { + msg: 
format!("Tag value is needed, tag_name:{} ", tag_name), + })?; + + let data_type = try_get_data_type_from_value(tag_value)?; + + if let Some(column_schema) = name_column_map.get(tag_name) { + ensure_data_type_compatible(table_name, tag_name, true, data_type, column_schema)?; + } + let column_schema = build_column_schema(tag_name, data_type, true)?; + name_column_map.insert(tag_name, column_schema); + } + + // parse fields + for field_group in write_entry.get_field_groups().iter() { + for field in field_group.get_fields() { + if (field.name_index as usize) < field_names.len() { + let field_name = &field_names[field.name_index as usize]; + let field_value = + field + .get_value() + .value + .as_ref() + .with_context(|| InvalidArgument { + msg: format!( + "Field: {} value is needed, table:{}", + field_name, table_name + ), + })?; + + let data_type = try_get_data_type_from_value(field_value)?; + + if let Some(column_schema) = name_column_map.get(field_name) { + ensure_data_type_compatible( + table_name, + field_name, + false, + data_type, + column_schema, + )?; + } + + let column_schema = build_column_schema(field_name, data_type, false)?; + name_column_map.insert(field_name, column_schema); + } + } + } + } + + // Timestamp column will be the last column + let timestamp_column_schema = column_schema::Builder::new( + schema_config.default_timestamp_column_name.clone(), + DatumKind::Timestamp, + ) + .is_nullable(false) + .build() + .context(InvalidColumnSchema { + column_name: TSID_COLUMN, + })?; + + // Use (timestamp, tsid) as primary key. + let tsid_column_schema = + column_schema::Builder::new(TSID_COLUMN.to_string(), DatumKind::UInt64) + .is_nullable(false) + .build() + .context(InvalidColumnSchema { + column_name: TSID_COLUMN, + })?; + + schema_builder = schema_builder + .enable_tsid_primary_key(true) + .add_key_column(timestamp_column_schema) + .with_context(|| BuildTableSchema { metric: table_name })? 
+ .add_key_column(tsid_column_schema) + .with_context(|| BuildTableSchema { metric: table_name })?; + + for col in name_column_map.into_values() { + schema_builder = schema_builder + .add_normal_column(col) + .with_context(|| BuildTableSchema { metric: table_name })?; + } + + schema_builder.build().with_context(|| BuildTableSchema { + metric: metric.get_metric(), + }) +} + +fn ensure_data_type_compatible( + table_name: &str, + column_name: &str, + is_tag: bool, + data_type: DatumKind, + column_schema: &ColumnSchema, +) -> Result<()> { + ensure!( + column_schema.is_tag == is_tag, + InvalidArgument { + msg: format!( + "Duplicated column: {} in fields and tags for table: {}", + column_name, table_name, + ), + } + ); + ensure!( + column_schema.data_type == data_type, + InvalidArgument { + msg: format!( + "Column: {} in table: {} data type is not same, expected: {}, actual: {}", + column_name, table_name, column_schema.data_type, data_type, + ), + } + ); + Ok(()) +} + +fn try_get_data_type_from_value(value: &Value_oneof_value) -> Result { + match value { + Value_oneof_value::float64_value(_) => Ok(DatumKind::Double), + Value_oneof_value::string_value(_) => Ok(DatumKind::String), + Value_oneof_value::int64_value(_) => Ok(DatumKind::Int64), + Value_oneof_value::float32_value(_) => Ok(DatumKind::Float), + Value_oneof_value::int32_value(_) => Ok(DatumKind::Int32), + Value_oneof_value::int16_value(_) => Ok(DatumKind::Int16), + Value_oneof_value::int8_value(_) => Ok(DatumKind::Int8), + Value_oneof_value::bool_value(_) => Ok(DatumKind::Boolean), + Value_oneof_value::uint64_value(_) => Ok(DatumKind::UInt64), + Value_oneof_value::uint32_value(_) => Ok(DatumKind::UInt32), + Value_oneof_value::uint16_value(_) => Ok(DatumKind::UInt16), + Value_oneof_value::uint8_value(_) => Ok(DatumKind::UInt8), + Value_oneof_value::timestamp_value(_) => Ok(DatumKind::Timestamp), + Value_oneof_value::varbinary_value(_) => Ok(DatumKind::Varbinary), + } +} + +#[cfg(test)] +mod tests { + use 
ceresdbproto::storage::{Field, FieldGroup, Tag, Value, WriteEntry, WriteMetric}; + use common_types::datum::DatumKind; + use meta_client::SchemaConfig; + + use super::*; + + const TAG1: &str = "host"; + const TAG2: &str = "idc"; + const FIELD1: &str = "cpu"; + const FIELD2: &str = "memory"; + const FIELD3: &str = "log"; + const FIELD4: &str = "ping_ok"; + const METRIC: &str = "pod_system_metric"; + const TIMESTAMP_COLUMN: &str = "custom_timestamp"; + + fn generate_write_metric() -> WriteMetric { + let mut write_metric = WriteMetric::default(); + write_metric.set_metric(METRIC.to_string()); + + let tag_names = vec![TAG1.to_string(), TAG2.to_string()]; + let field_names = vec![ + FIELD1.to_string(), + FIELD2.to_string(), + FIELD3.to_string(), + FIELD4.to_string(), + ]; + + write_metric.set_field_names(field_names.into()); + write_metric.set_tag_names(tag_names.into()); + + //tags + let mut tag1 = Tag::new(); + tag1.set_name_index(0); + let mut tag_val1 = Value::new(); + tag_val1.set_string_value("test.host".to_string()); + tag1.set_value(tag_val1); + let mut tag2 = Tag::new(); + tag2.set_name_index(1); + let mut tag_val2 = Value::new(); + tag_val2.set_string_value("test.idc".to_string()); + tag2.set_value(tag_val2); + let tags = vec![tag1, tag2]; + + //fields + let mut field1 = Field::new(); + field1.set_name_index(0); + let mut field_val1 = Value::new(); + field_val1.set_float64_value(100.0); + field1.set_value(field_val1); + let mut field2 = Field::new(); + field2.set_name_index(1); + let mut field_val2 = Value::new(); + field_val2.set_float64_value(1024.0); + field2.set_value(field_val2); + let mut field3 = Field::new(); + field3.set_name_index(2); + let mut field_val3 = Value::new(); + field_val3.set_string_value("test log".to_string()); + field3.set_value(field_val3); + let mut field4 = Field::new(); + field4.set_name_index(3); + let mut field_val4 = Value::new(); + field_val4.set_bool_value(true); + field4.set_value(field_val4); + + let mut field_group1 = 
FieldGroup::new(); + field_group1.set_timestamp(1000); + field_group1.set_fields(vec![field1.clone(), field4].into()); + + let mut field_group2 = FieldGroup::new(); + field_group2.set_timestamp(2000); + field_group2.set_fields(vec![field1.clone(), field2.clone()].into()); + + let mut field_group3 = FieldGroup::new(); + field_group3.set_timestamp(3000); + field_group3.set_fields(vec![field3].into()); + + let mut write_entry = WriteEntry::new(); + write_entry.set_tags(tags.into()); + write_entry.set_field_groups(vec![field_group1, field_group2, field_group3].into()); + + write_metric.set_entries(vec![write_entry].into()); + + write_metric + } + + #[test] + fn test_build_schema_from_metric() { + let schema_config = SchemaConfig { + auto_create_tables: true, + default_timestamp_column_name: TIMESTAMP_COLUMN.to_string(), + ..SchemaConfig::default() + }; + let write_metric = generate_write_metric(); + + let schema = build_schema_from_metric(&schema_config, &write_metric); + assert!(schema.is_ok()); + + let schema = schema.unwrap(); + + assert_eq!(8, schema.num_columns()); + assert_eq!(2, schema.num_key_columns()); + assert_eq!(TIMESTAMP_COLUMN, schema.timestamp_name()); + let tsid = schema.tsid_column(); + assert!(tsid.is_some()); + + let key_columns = schema.key_columns(); + assert_eq!(2, key_columns.len()); + assert_eq!(TIMESTAMP_COLUMN, key_columns[0].name); + assert_eq!("tsid", key_columns[1].name); + + let columns = schema.normal_columns(); + assert_eq!(6, columns.len()); + + // sorted by column names because of btree + assert_eq!(FIELD1, columns[0].name); + assert!(!columns[0].is_tag); + assert_eq!(DatumKind::Double, columns[0].data_type); + assert_eq!(TAG1, columns[1].name); + assert!(columns[1].is_tag); + assert_eq!(DatumKind::String, columns[1].data_type); + assert_eq!(TAG2, columns[2].name); + assert!(columns[2].is_tag); + assert_eq!(DatumKind::String, columns[2].data_type); + assert_eq!(FIELD3, columns[3].name); + assert!(!columns[3].is_tag); + 
assert_eq!(DatumKind::String, columns[3].data_type); + assert_eq!(FIELD2, columns[4].name); + assert!(!columns[4].is_tag); + assert_eq!(DatumKind::Double, columns[4].data_type); + assert_eq!(FIELD4, columns[5].name); + assert!(!columns[5].is_tag); + assert_eq!(DatumKind::Boolean, columns[5].data_type); + + for column in columns { + assert!(column.is_nullable); + } + } +} diff --git a/server/src/grpc/prom_query.rs b/server/src/grpc/prom_query.rs new file mode 100644 index 0000000000..44916788ff --- /dev/null +++ b/server/src/grpc/prom_query.rs @@ -0,0 +1,467 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::{ + collections::{BTreeMap, HashMap}, + sync::Arc, +}; + +use catalog::manager::Manager as CatalogManager; +use ceresdbproto::{ + common::ResponseHeader, + prometheus::{Label, PrometheusQueryRequest, PrometheusQueryResponse, Sample, TimeSeries}, +}; +use common_types::{ + datum::DatumKind, + record_batch::RecordBatch, + request_id::RequestId, + schema::{RecordSchema, TSID_COLUMN}, +}; +use interpreters::{context::Context as InterpreterContext, factory::Factory, interpreter::Output}; +use log::debug; +use query_engine::executor::{Executor as QueryExecutor, RecordBatchVec}; +use snafu::{ensure, OptionExt, ResultExt}; +use sql::{ + frontend::{Context as SqlContext, Frontend}, + promql::ColumnNames, + provider::CatalogMetaProvider, +}; + +use crate::{ + error::{ErrNoCause, ErrWithCause, Result, ServerError, StatusCode}, + grpc::HandlerContext, +}; + +pub async fn handle_query( + ctx: &HandlerContext<'_, C, Q>, + req: PrometheusQueryRequest, +) -> Result +where + C: CatalogManager + 'static, + Q: QueryExecutor + 'static, +{ + let request_id = RequestId::next_id(); + + debug!( + "Grpc handle query begin, catalog:{}, tenant:{}, request_id:{}, request:{:?}", + ctx.catalog(), + ctx.tenant(), + request_id, + req, + ); + + let instance = &ctx.instance; + // We use tenant as schema + // TODO(yingwen): Privilege check, cannot access data of 
other tenant + // TODO(yingwen): Maybe move MetaProvider to instance + let provider = CatalogMetaProvider { + manager: &instance.catalog_manager, + default_catalog: ctx.catalog(), + default_schema: ctx.tenant(), + function_registry: &*instance.function_registry, + }; + let frontend = Frontend::new(provider); + + let mut sql_ctx = SqlContext::new(request_id); + let expr = frontend + .parse_promql(&mut sql_ctx, req) + .map_err(|e| Box::new(e) as _) + .context(ErrWithCause { + code: StatusCode::InvalidArgument, + msg: "Invalid request", + })?; + + let (plan, column_name) = frontend + .promql_expr_to_plan(&mut sql_ctx, expr) + .map_err(|e| { + // TODO(chenxiang): improve error match + let code = if e.to_string().contains("Table not found") { + StatusCode::NotFound + } else { + StatusCode::InternalError + }; + ServerError::ErrWithCause { + code, + msg: "Failed to create plan".to_string(), + source: Box::new(e), + } + })?; + + if ctx.instance.limiter.should_limit(&plan) { + ErrNoCause { + code: StatusCode::TooManyRequests, + msg: "Query limited by reject list", + } + .fail()?; + } + + // Execute in interpreter + let interpreter_ctx = InterpreterContext::builder(request_id) + // Use current ctx's catalog and tenant as default catalog and tenant + .default_catalog_and_schema(ctx.catalog().to_string(), ctx.tenant().to_string()) + .build(); + let interpreter_factory = Factory::new( + instance.query_executor.clone(), + instance.catalog_manager.clone(), + instance.table_engine.clone(), + ); + let interpreter = interpreter_factory.create(interpreter_ctx, plan); + + let output = interpreter + .execute() + .await + .map_err(|e| Box::new(e) as _) + .with_context(|| ErrWithCause { + code: StatusCode::InternalError, + msg: "Failed to execute interpreter", + })?; + + let resp = convert_output(output, column_name) + .map_err(|e| Box::new(e) as _) + .with_context(|| ErrWithCause { + code: StatusCode::InternalError, + msg: "Failed to convert output", + })?; + + Ok(resp) +} + +fn 
convert_output( + output: Output, + column_name: Arc, +) -> Result { + match output { + Output::Records(records) => convert_records(records, column_name), + _ => unreachable!(), + } +} + +fn convert_records( + records: RecordBatchVec, + column_name: Arc, +) -> Result { + if records.is_empty() { + return Ok(empty_ok_resp()); + } + + let mut resp = empty_ok_resp(); + let mut tsid_to_tags = HashMap::new(); + let mut tsid_to_samples = HashMap::new(); + + // TODO(chenxiang): benchmark iterator by columns + for record_batch in records { + let converter = RecordConverter::try_new(&column_name, record_batch.schema())?; + + for (tsid, samples) in converter.convert_to_samples(record_batch, &mut tsid_to_tags) { + tsid_to_samples + .entry(tsid) + .or_insert_with(Vec::new) + .extend(samples) + } + } + + let series_set = tsid_to_samples + .into_iter() + .map(|(tsid, samples)| { + let tags = tsid_to_tags + .get(&tsid) + .expect("ensured in convert_to_samples"); + let mut timeseries = TimeSeries::new(); + timeseries.set_labels( + tags.iter() + .map(|(k, v)| { + let mut label = Label::new(); + label.set_name(k.clone()); + label.set_value(v.clone()); + label + }) + .collect::>() + .into(), + ); + timeseries.set_samples(samples.into()); + timeseries + }) + .collect::>(); + + resp.set_timeseries(series_set.into()); + Ok(resp) +} + +fn empty_ok_resp() -> PrometheusQueryResponse { + let mut header = ResponseHeader::new(); + header.code = StatusCode::Ok.as_u32(); + + let mut resp = PrometheusQueryResponse::new(); + resp.set_header(header); + + resp +} + +/// RecordConverter convert RecordBatch to time series format required by PromQL +struct RecordConverter { + tsid_idx: usize, + timestamp_idx: usize, + tags_idx: BTreeMap, // tag_key -> column_index + field_idx: usize, +} + +impl RecordConverter { + fn try_new(column_name: &ColumnNames, record_schema: &RecordSchema) -> Result { + let tsid_idx = record_schema + .index_of(TSID_COLUMN) + .with_context(|| ErrNoCause { + code: 
StatusCode::InvalidArgument, + msg: "Failed to find Tsid column".to_string(), + })?; + let timestamp_idx = record_schema + .index_of(&column_name.timestamp) + .with_context(|| ErrNoCause { + code: StatusCode::InvalidArgument, + msg: "Failed to find Timestamp column".to_string(), + })?; + ensure!( + record_schema.column(timestamp_idx).data_type == DatumKind::Timestamp, + ErrNoCause { + code: StatusCode::InvalidArgument, + msg: "Timestamp column should be timestamp type" + } + ); + let field_idx = record_schema + .index_of(&column_name.field) + .with_context(|| ErrNoCause { + code: StatusCode::InvalidArgument, + msg: format!("Failed to find {} column", column_name.field), + })?; + let field_type = record_schema.column(field_idx).data_type; + ensure!( + field_type.is_f64_castable(), + ErrNoCause { + code: StatusCode::InvalidArgument, + msg: format!( + "Field type must be f64-compatibile type, current:{}", + field_type + ) + } + ); + + let tags_idx: BTreeMap<_, _> = column_name + .tag_keys + .iter() + .filter_map(|tag_key| { + record_schema + .index_of(tag_key) + .map(|idx| (tag_key.to_string(), idx)) + }) + .collect(); + + Ok(Self { + tsid_idx, + timestamp_idx, + tags_idx, + field_idx, + }) + } + + fn convert_to_samples( + &self, + record_batch: RecordBatch, + tsid_to_tags: &mut HashMap>, + ) -> HashMap> { + let mut tsid_to_samples = HashMap::new(); + + let tsid_cols = record_batch.column(self.tsid_idx); + let timestamp_cols = record_batch.column(self.timestamp_idx); + let field_cols = record_batch.column(self.field_idx); + for row_idx in 0..record_batch.num_rows() { + let timestamp = timestamp_cols + .datum(row_idx) + .as_timestamp() + .expect("checked in try_new") + .as_i64(); + let field = field_cols + .datum(row_idx) + .as_f64() + .expect("checked in try_new"); + let tsid = tsid_cols + .datum(row_idx) + .as_u64() + .expect("checked in try_new"); + + tsid_to_tags.entry(tsid).or_insert_with(|| { + self.tags_idx + .iter() + .filter_map(|(tag_key, col_idx)| { + // 
TODO(chenxiang): avoid clone? + record_batch + .column(*col_idx) + .datum(row_idx) + .as_str() + .and_then(|tag_value| { + // filter empty tag value out, since Prometheus don't allow it. + if tag_value.is_empty() { + None + } else { + Some((tag_key.clone(), tag_value.to_string())) + } + }) + }) + .collect::>() + }); + + let samples = tsid_to_samples.entry(tsid).or_insert_with(Vec::new); + let mut sample = Sample::new(); + sample.set_value(field); + sample.set_timestamp(timestamp); + samples.push(sample); + } + + tsid_to_samples + } +} + +#[cfg(test)] +mod tests { + + use common_types::{ + column::{ColumnBlock, ColumnBlockBuilder}, + column_schema, + datum::Datum, + row::Row, + schema, + string::StringBytes, + time::Timestamp, + }; + + use super::*; + + fn build_schema() -> schema::Schema { + schema::Builder::new() + .auto_increment_column_id(true) + .enable_tsid_primary_key(true) + .add_key_column( + column_schema::Builder::new("timestamp".to_string(), DatumKind::Timestamp) + .build() + .unwrap(), + ) + .unwrap() + .add_key_column( + column_schema::Builder::new(TSID_COLUMN.to_string(), DatumKind::UInt64) + .build() + .unwrap(), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("field1".to_string(), DatumKind::Double) + .build() + .unwrap(), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("tag1".to_string(), DatumKind::String) + .is_tag(true) + .build() + .unwrap(), + ) + .unwrap() + .build() + .unwrap() + } + + fn build_column_block() -> Vec { + let build_row = |ts: i64, tsid: u64, field1: f64, field2: &str| -> Row { + let datums = vec![ + Datum::Timestamp(Timestamp::new(ts)), + Datum::UInt64(tsid), + Datum::Double(field1), + Datum::String(StringBytes::from(field2)), + ]; + + Row::from_datums(datums) + }; + + let rows = vec![ + build_row(1000001, 1, 10.0, "v5"), + build_row(1000002, 1, 11.0, "v5"), + build_row(1000000, 2, 10.0, "v4"), + build_row(1000000, 3, 10.0, "v3"), + ]; + + let mut builder = 
ColumnBlockBuilder::with_capacity(&DatumKind::Timestamp, 2); + for row in &rows { + builder.append(row[0].clone()).unwrap(); + } + let timestamp_block = builder.build(); + + let mut builder = ColumnBlockBuilder::with_capacity(&DatumKind::UInt64, 2); + for row in &rows { + builder.append(row[1].clone()).unwrap(); + } + let tsid_block = builder.build(); + + let mut builder = ColumnBlockBuilder::with_capacity(&DatumKind::Double, 2); + for row in &rows { + builder.append(row[2].clone()).unwrap(); + } + let field_block = builder.build(); + + let mut builder = ColumnBlockBuilder::with_capacity(&DatumKind::String, 2); + for row in &rows { + builder.append(row[3].clone()).unwrap(); + } + let tag_block = builder.build(); + + vec![timestamp_block, tsid_block, field_block, tag_block] + } + + fn make_sample(timestamp: i64, value: f64) -> Sample { + let mut sample = Sample::new(); + sample.set_value(value); + sample.set_timestamp(timestamp); + sample + } + + fn make_tags(tags: Vec<(String, String)>) -> BTreeMap { + tags.into_iter().collect::>() + } + + #[test] + fn test_record_convert() { + let schema = build_schema(); + let record_schema = schema.to_record_schema(); + let column_blocks = build_column_block(); + let record_batch = RecordBatch::new(record_schema, column_blocks).unwrap(); + + let column_name = ColumnNames { + timestamp: "timestamp".to_string(), + tag_keys: vec!["tag1".to_string()], + field: "field1".to_string(), + }; + let converter = RecordConverter::try_new(&column_name, &schema.to_record_schema()).unwrap(); + let mut tsid_to_tags = HashMap::new(); + let tsid_to_samples = converter.convert_to_samples(record_batch, &mut tsid_to_tags); + + assert_eq!( + tsid_to_samples.get(&1).unwrap().clone(), + vec![make_sample(1000001, 10.0), make_sample(1000002, 11.0)] + ); + assert_eq!( + tsid_to_samples.get(&2).unwrap().clone(), + vec![make_sample(1000000, 10.0)] + ); + assert_eq!( + tsid_to_samples.get(&3).unwrap().clone(), + vec![make_sample(1000000, 10.0)] + ); + 
assert_eq!( + tsid_to_tags.get(&1).unwrap().clone(), + make_tags(vec![("tag1".to_string(), "v5".to_string())]) + ); + assert_eq!( + tsid_to_tags.get(&2).unwrap().clone(), + make_tags(vec![("tag1".to_string(), "v4".to_string())]) + ); + assert_eq!( + tsid_to_tags.get(&3).unwrap().clone(), + make_tags(vec![("tag1".to_string(), "v3".to_string())]) + ); + } +} diff --git a/server/src/grpc/query.rs b/server/src/grpc/query.rs new file mode 100644 index 0000000000..9c36a196c4 --- /dev/null +++ b/server/src/grpc/query.rs @@ -0,0 +1,224 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Query handler + +use std::time::Instant; + +use catalog::manager::Manager as CatalogManager; +use ceresdbproto::{ + common::ResponseHeader, + storage::{QueryRequest, QueryResponse, QueryResponse_SchemaType}, +}; +use common_types::{record_batch::RecordBatch, request_id::RequestId}; +use common_util::time::InstantExt; +use interpreters::{context::Context as InterpreterContext, factory::Factory, interpreter::Output}; +use log::info; +use query_engine::executor::{Executor as QueryExecutor, RecordBatchVec}; +use snafu::{ensure, ResultExt}; +use sql::{ + frontend::{Context as SqlContext, Frontend}, + provider::CatalogMetaProvider, +}; + +use crate::{ + avro_util, + error::{ErrNoCause, ErrWithCause, Result, StatusCode}, + grpc::HandlerContext, +}; + +/// Schema name of the record +const RECORD_NAME: &str = "Result"; + +fn empty_ok_resp() -> QueryResponse { + let mut header = ResponseHeader::new(); + header.code = StatusCode::Ok.as_u32(); + + let mut resp = QueryResponse::new(); + resp.set_header(header); + + resp +} + +pub async fn handle_query( + ctx: &HandlerContext<'_, C, Q>, + req: QueryRequest, +) -> Result { + let output_result = fetch_query_output(ctx, &req).await?; + if let Some(output) = output_result { + convert_output(&output) + .map_err(|e| Box::new(e) as _) + .with_context(|| ErrWithCause { + code: StatusCode::InternalError, + msg: format!("Failed to 
convert output, query:{}", &req.ql), + }) + } else { + Ok(empty_ok_resp()) + } +} + +pub async fn fetch_query_output( + ctx: &HandlerContext<'_, C, Q>, + req: &QueryRequest, +) -> Result> { + let request_id = RequestId::next_id(); + let begin_instant = Instant::now(); + + info!( + "Grpc handle query begin, catalog:{}, tenant:{}, request_id:{}, request:{:?}", + ctx.catalog(), + ctx.tenant(), + request_id, + req, + ); + + let instance = &ctx.instance; + // We use tenant as schema + // TODO(yingwen): Privilege check, cannot access data of other tenant + // TODO(yingwen): Maybe move MetaProvider to instance + let provider = CatalogMetaProvider { + manager: &instance.catalog_manager, + default_catalog: ctx.catalog(), + default_schema: ctx.tenant(), + function_registry: &*instance.function_registry, + }; + let frontend = Frontend::new(provider); + + let mut sql_ctx = SqlContext::new(request_id); + // Parse sql, frontend error of invalid sql already contains sql + // TODO(yingwen): Maybe move sql from frontend error to outer error + let mut stmts = frontend + .parse_sql(&mut sql_ctx, &req.ql) + .map_err(|e| Box::new(e) as _) + .context(ErrWithCause { + code: StatusCode::InvalidArgument, + msg: "Failed to parse sql", + })?; + + if stmts.is_empty() { + return Ok(None); + } + + // TODO(yingwen): For simplicity, we only support executing one statement now + // TODO(yingwen): INSERT/UPDATE/DELETE can be batched + ensure!( + stmts.len() == 1, + ErrNoCause { + code: StatusCode::InvalidArgument, + msg: format!( + "Only support execute one statement now, current num:{}, query:{}", + stmts.len(), + req.ql + ), + } + ); + + // Create logical plan + // Note: Remember to store sql in error when creating logical plan + let plan = frontend + // TODO(yingwen): Check error, some error may indicate that the sql is invalid. 
Now we + // return internal server error in those cases + .statement_to_plan(&mut sql_ctx, stmts.remove(0)) + .map_err(|e| Box::new(e) as _) + .with_context(|| ErrWithCause { + code: StatusCode::InternalError, + msg: format!("Failed to create plan, query:{}", req.ql), + })?; + + if ctx.instance.limiter.should_limit(&plan) { + ErrNoCause { + code: StatusCode::TooManyRequests, + msg: "Query limited by reject list", + } + .fail()?; + } + + // Execute in interpreter + let interpreter_ctx = InterpreterContext::builder(request_id) + // Use current ctx's catalog and tenant as default catalog and tenant + .default_catalog_and_schema(ctx.catalog().to_string(), ctx.tenant().to_string()) + .build(); + let interpreter_factory = Factory::new( + instance.query_executor.clone(), + instance.catalog_manager.clone(), + instance.table_engine.clone(), + ); + let interpreter = interpreter_factory.create(interpreter_ctx, plan); + + let output = interpreter + .execute() + .await + .map_err(|e| Box::new(e) as _) + .with_context(|| ErrWithCause { + code: StatusCode::InternalError, + msg: format!("Failed to execute interpreter, query:{}", req.ql), + })?; + + info!( + "Grpc handle query success, catalog:{}, tenant:{}, request_id:{}, cost:{}, request:{:?}", + ctx.catalog(), + ctx.tenant(), + request_id, + begin_instant.saturating_elapsed().as_millis(), + req, + ); + + Ok(Some(output)) +} + +fn convert_output(output: &Output) -> Result { + match output { + Output::Records(records) => convert_records(records), + _ => unreachable!(), + } +} + +pub fn get_record_batch(op: &Option) -> Option<&RecordBatchVec> { + if let Some(output) = op { + match output { + Output::Records(records) => Some(records), + _ => unreachable!(), + } + } else { + None + } +} + +/// REQUIRE: records have same schema +pub fn convert_records(records: &[RecordBatch]) -> Result { + if records.is_empty() { + return Ok(empty_ok_resp()); + } + + let mut resp = empty_ok_resp(); + let mut avro_schema_opt = None; + + let total_row = 
records.iter().map(|v| v.num_rows()).sum(); + let mut rows = Vec::with_capacity(total_row); + for record_batch in records { + let avro_schema = match avro_schema_opt.as_ref() { + Some(schema) => schema, + None => { + let avro_schema = avro_util::to_avro_schema(RECORD_NAME, record_batch.schema()); + + // We only set schema_json once, so all record batches need to have same schema + resp.schema_type = QueryResponse_SchemaType::AVRO; + resp.schema_content = avro_schema.canonical_form(); + + avro_schema_opt = Some(avro_schema); + + avro_schema_opt.as_ref().unwrap() + } + }; + + avro_util::record_batch_to_avro(record_batch, avro_schema, &mut rows) + .map_err(|e| Box::new(e) as _) + .context(ErrWithCause { + code: StatusCode::InternalError, + msg: "Failed to convert record batch", + })?; + } + + resp.set_rows(rows.into()); + + Ok(resp) +} diff --git a/server/src/grpc/route.rs b/server/src/grpc/route.rs new file mode 100644 index 0000000000..ec0f354637 --- /dev/null +++ b/server/src/grpc/route.rs @@ -0,0 +1,35 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Route handler + +use std::sync::Arc; + +use catalog::manager::Manager; +use ceresdbproto::storage::{RouteRequest, RouteResponse}; + +use crate::{ + error::Result, + grpc::{self, HandlerContext}, + router::Router, +}; + +pub async fn handle_route( + ctx: &HandlerContext<'_, C, Q>, + req: RouteRequest, +) -> Result { + handle_route_sync(ctx.router.clone(), req, ctx.tenant()) +} + +fn handle_route_sync( + router: Arc, + req: RouteRequest, + schema: &str, +) -> Result { + let route_vec = router.route(schema, req)?; + + let mut resp = RouteResponse::new(); + resp.set_header(grpc::build_ok_header()); + resp.set_routes(route_vec.into()); + + Ok(resp) +} diff --git a/server/src/grpc/write.rs b/server/src/grpc/write.rs new file mode 100644 index 0000000000..55f1880d57 --- /dev/null +++ b/server/src/grpc/write.rs @@ -0,0 +1,586 @@ +// Copyright 2022 CeresDB Project Authors. 
Licensed under Apache-2.0. + +//! Write handler + +use std::collections::HashMap; + +use catalog::manager::Manager as CatalogManager; +use ceresdbproto::storage::{ + Value_oneof_value, WriteEntry, WriteMetric, WriteRequest, WriteResponse, +}; +use common_types::{ + bytes::Bytes, + datum::{Datum, DatumKind}, + request_id::RequestId, + row::{Row, RowGroupBuilder}, + schema::Schema, + time::Timestamp, +}; +use interpreters::{context::Context as InterpreterContext, factory::Factory, interpreter::Output}; +use log::debug; +use query_engine::executor::Executor as QueryExecutor; +use snafu::{ensure, OptionExt, ResultExt}; +use sql::plan::{InsertPlan, Plan}; +use table_engine::table::TableRef; + +use crate::{ + error::{ErrNoCause, ErrWithCause, Result, StatusCode}, + grpc::{self, HandlerContext}, +}; + +pub(crate) async fn handle_write( + ctx: &HandlerContext<'_, C, Q>, + req: WriteRequest, +) -> Result { + let request_id = RequestId::next_id(); + + debug!( + "Grpc handle write begin, catalog:{}, tenant:{}, request_id:{}, first_table:{:?}, num_tables:{}", + ctx.catalog(), + ctx.tenant(), + request_id, + req.get_metrics() + .first() + .map(|m| (m.get_metric(), m.get_tag_names(), m.get_field_names())), + req.get_metrics().len(), + ); + + let instance = &ctx.instance; + let plan_vec = write_request_to_insert_plan(ctx, req, request_id).await?; + + let mut success = 0; + for insert_plan in plan_vec { + debug!( + "Grpc handle write table begin, table:{}, row_num:{}", + insert_plan.table.name(), + insert_plan.rows.num_rows() + ); + let plan = Plan::Insert(insert_plan); + + if ctx.instance.limiter.should_limit(&plan) { + ErrNoCause { + code: StatusCode::TooManyRequests, + msg: "Insert limited by reject list", + } + .fail()?; + } + + let interpreter_ctx = InterpreterContext::builder(request_id) + // Use current ctx's catalog and tenant as default catalog and tenant + .default_catalog_and_schema(ctx.catalog().to_string(), ctx.tenant().to_string()) + .build(); + let 
interpreter_factory = Factory::new( + instance.query_executor.clone(), + instance.catalog_manager.clone(), + instance.table_engine.clone(), + ); + let interpreter = interpreter_factory.create(interpreter_ctx, plan); + + let row_num = match interpreter + .execute() + .await + .map_err(|e| Box::new(e) as _) + .context(ErrWithCause { + code: StatusCode::InternalError, + msg: "Failed to execute interpreter", + })? { + Output::AffectedRows(n) => n, + _ => unreachable!(), + }; + + success += row_num; + } + + let mut resp = WriteResponse::new(); + resp.set_header(grpc::build_ok_header()); + resp.set_success(success as u32); + + debug!( + "Grpc handle write finished, catalog:{}, tenant:{}, resp:{:?}", + ctx.catalog(), + ctx.tenant(), + resp + ); + + Ok(resp) +} + +async fn write_request_to_insert_plan( + ctx: &HandlerContext<'_, C, Q>, + mut write_request: WriteRequest, + request_id: RequestId, +) -> Result> { + let mut plan_vec = Vec::with_capacity(write_request.get_metrics().len()); + + for write_metric in write_request.take_metrics() { + let table_name = write_metric.get_metric(); + let mut table = try_get_table(ctx, table_name)?; + + if table.is_none() { + if let Some(config) = ctx.schema_config { + if config.auto_create_tables { + create_table(ctx, &write_metric, request_id).await?; + // try to get table again + table = try_get_table(ctx, table_name)?; + } + } + } + + match table { + Some(table) => { + let plan = write_metric_to_insert_plan(table, write_metric)?; + plan_vec.push(plan); + } + None => { + return ErrNoCause { + code: StatusCode::InvalidArgument, + msg: format!("Table not found, table:{}", write_metric.get_metric()), + } + .fail(); + } + } + } + + Ok(plan_vec) +} + +fn try_get_table( + ctx: &HandlerContext<'_, C, Q>, + table_name: &str, +) -> Result> { + ctx.instance + .catalog_manager + .catalog_by_name(ctx.catalog()) + .map_err(|e| Box::new(e) as _) + .with_context(|| ErrWithCause { + code: StatusCode::InternalError, + msg: format!("Failed to find 
catalog, catalog_name:{}", ctx.catalog()), + })? + .with_context(|| ErrNoCause { + code: StatusCode::InvalidArgument, + msg: format!("Catalog not found, catalog_name:{}", ctx.catalog()), + })? + .schema_by_name(ctx.tenant()) + .map_err(|e| Box::new(e) as _) + .with_context(|| ErrWithCause { + code: StatusCode::InternalError, + msg: format!("Failed to find tenant, tenant_name:{}", ctx.tenant()), + })? + .with_context(|| ErrNoCause { + code: StatusCode::InvalidArgument, + msg: format!("Tenant not found, tenant_name:{}", ctx.tenant()), + })? + .table_by_name(table_name) + .map_err(|e| Box::new(e) as _) + .with_context(|| ErrWithCause { + code: StatusCode::InternalError, + msg: format!("Failed to find table, table:{}", table_name), + }) +} + +async fn create_table( + ctx: &HandlerContext<'_, C, Q>, + write_metric: &WriteMetric, + request_id: RequestId, +) -> Result<()> { + let create_table_plan = grpc::write_metric_to_create_table_plan(ctx, write_metric) + .map_err(|e| Box::new(e) as _) + .with_context(|| ErrWithCause { + code: StatusCode::InternalError, + msg: format!( + "Failed to build creating table plan from metric, table:{}", + write_metric.get_metric() + ), + })?; + + debug!( + "Grpc handle create table begin, table:{}, schema: {:?}", + create_table_plan.table, create_table_plan.table_schema, + ); + let plan = Plan::Create(create_table_plan); + + let instance = &ctx.instance; + + if instance.limiter.should_limit(&plan) { + ErrNoCause { + code: StatusCode::TooManyRequests, + msg: "Create table limited by reject list", + } + .fail()?; + } + + let interpreter_ctx = InterpreterContext::builder(request_id) + // Use current ctx's catalog and tenant as default catalog and tenant + .default_catalog_and_schema(ctx.catalog().to_string(), ctx.tenant().to_string()) + .build(); + let interpreter_factory = Factory::new( + instance.query_executor.clone(), + instance.catalog_manager.clone(), + instance.table_engine.clone(), + ); + let interpreter = 
interpreter_factory.create(interpreter_ctx, plan); + + let _ = match interpreter + .execute() + .await + .map_err(|e| Box::new(e) as _) + .context(ErrWithCause { + code: StatusCode::InternalError, + msg: "Failed to execute interpreter", + })? { + Output::AffectedRows(n) => n, + _ => unreachable!(), + }; + + Ok(()) +} + +fn write_metric_to_insert_plan( + table: TableRef, + mut write_metric: WriteMetric, +) -> Result { + let schema = table.schema(); + + let mut rows_total = Vec::new(); + for write_entry in write_metric.take_entries() { + let mut rows = write_entry_to_rows( + write_metric.get_metric(), + &schema, + write_metric.get_tag_names(), + write_metric.get_field_names(), + write_entry, + )?; + rows_total.append(&mut rows); + } + // The row group builder will checks nullable. + let row_group = RowGroupBuilder::with_rows(schema, rows_total) + .map_err(|e| Box::new(e) as _) + .context(ErrWithCause { + code: StatusCode::InternalError, + msg: format!("Failed to build row group, table:{}", table.name()), + })? + .build(); + Ok(InsertPlan { + table, + rows: row_group, + }) +} + +fn write_entry_to_rows( + table_name: &str, + schema: &Schema, + tag_names: &[String], + field_names: &[String], + mut write_entry: WriteEntry, +) -> Result> { + // Init all columns by null. + let mut rows = vec![ + Row::from_datums(vec![Datum::Null; schema.num_columns()]); + write_entry.get_field_groups().len() + ]; + + // Fill tsid by default value. + if let Some(tsid_idx) = schema.index_of_tsid() { + let kind = &schema.tsid_column().unwrap().data_type; + let default_datum = Datum::empty(kind); + for row in &mut rows { + row[tsid_idx] = default_datum.clone(); + } + } + + // Fill tags. 
+ for mut tag in write_entry.take_tags() { + let name_index = tag.name_index as usize; + ensure!( + name_index < tag_names.len(), + ErrNoCause { + code: StatusCode::InvalidArgument, + msg: format!( + "tag index {} is not found in tag_names:{:?}, table:{}", + name_index, tag_names, table_name, + ), + } + ); + + let tag_name = &tag_names[name_index]; + let tag_index_in_schema = schema.index_of(tag_name).with_context(|| ErrNoCause { + code: StatusCode::InvalidArgument, + msg: format!( + "Can't find tag in schema, table:{}, tag_name:{}", + table_name, tag_name + ), + })?; + + let column_schema = schema.column(tag_index_in_schema); + ensure!( + column_schema.is_tag, + ErrNoCause { + code: StatusCode::InvalidArgument, + msg: format!( + "column {} is a field rather than a tag, table:{}", + tag_name, table_name + ), + } + ); + + let tag_value = tag.take_value().value.with_context(|| ErrNoCause { + code: StatusCode::InvalidArgument, + msg: format!( + "Tag value is needed, table:{}, tag_name:{}", + table_name, tag_name + ), + })?; + for row in &mut rows { + row[tag_index_in_schema] = convert_proto_value_to_datum( + table_name, + tag_name, + tag_value.clone(), + column_schema.data_type, + )?; + } + } + + // Fill fields. 
+ let mut field_name_index: HashMap = HashMap::new(); + for (i, mut field_group) in write_entry.take_field_groups().into_iter().enumerate() { + // timestamp + let timestamp_index_in_schema = schema.timestamp_index(); + rows[i][timestamp_index_in_schema] = + Datum::Timestamp(Timestamp::new(field_group.get_timestamp())); + + for mut field in field_group.take_fields() { + if (field.name_index as usize) < field_names.len() { + let field_name = &field_names[field.name_index as usize]; + let index_in_schema = if field_name_index.contains_key(field_name) { + field_name_index.get(field_name).unwrap().to_owned() + } else { + let index_in_schema = + schema.index_of(field_name).with_context(|| ErrNoCause { + code: StatusCode::InvalidArgument, + msg: format!( + "Can't find field in schema, table:{}, field_name:{}", + table_name, field_name + ), + })?; + field_name_index.insert(field_name.to_string(), index_in_schema); + index_in_schema + }; + let column_schema = schema.column(index_in_schema); + ensure!( + !column_schema.is_tag, + ErrNoCause { + code: StatusCode::InvalidArgument, + msg: format!( + "Column {} is a tag rather than a field, table:{}", + field_name, table_name + ) + } + ); + let field_value = field.take_value().value.with_context(|| ErrNoCause { + code: StatusCode::InvalidArgument, + msg: format!("Field is needed, table:{}", table_name), + })?; + + rows[i][index_in_schema] = convert_proto_value_to_datum( + table_name, + field_name, + field_value, + column_schema.data_type, + )?; + } + } + } + + Ok(rows) +} + +/// Convert the `Value_oneof_value` defined in protos into the datum. 
+fn convert_proto_value_to_datum( + table_name: &str, + name: &str, + value: Value_oneof_value, + data_type: DatumKind, +) -> Result { + match (value, data_type) { + (Value_oneof_value::float64_value(v), DatumKind::Double) => Ok(Datum::Double(v)), + (Value_oneof_value::string_value(v), DatumKind::String) => Ok(Datum::String(v.into())), + (Value_oneof_value::int64_value(v), DatumKind::Int64) => Ok(Datum::Int64(v)), + (Value_oneof_value::float32_value(v), DatumKind::Float) => Ok(Datum::Float(v)), + (Value_oneof_value::int32_value(v), DatumKind::Int32) => Ok(Datum::Int32(v)), + (Value_oneof_value::int16_value(v), DatumKind::Int16) => Ok(Datum::Int16(v as i16)), + (Value_oneof_value::int8_value(v), DatumKind::Int8) => Ok(Datum::Int8(v as i8)), + (Value_oneof_value::bool_value(v), DatumKind::Boolean) => Ok(Datum::Boolean(v)), + (Value_oneof_value::uint64_value(v), DatumKind::UInt64) => Ok(Datum::UInt64(v)), + (Value_oneof_value::uint32_value(v), DatumKind::UInt32) => Ok(Datum::UInt32(v)), + (Value_oneof_value::uint16_value(v), DatumKind::UInt16) => Ok(Datum::UInt16(v as u16)), + (Value_oneof_value::uint8_value(v), DatumKind::UInt8) => Ok(Datum::UInt8(v as u8)), + (Value_oneof_value::timestamp_value(v), DatumKind::Timestamp) => Ok(Datum::Timestamp(Timestamp::new(v))), + (Value_oneof_value::varbinary_value(v), DatumKind::Varbinary) => Ok(Datum::Varbinary(Bytes::from(v))), + (v, _) => ErrNoCause { + code: StatusCode::InvalidArgument, + msg: format!( + "Value type is not same, table:{}, value_name:{}, schema_type:{:?}, actual_value:{:?}", + table_name, + name, + data_type, + v + ), + } + .fail(), + } +} + +#[cfg(test)] +mod test { + use ceresdbproto::storage::{Field, FieldGroup, Tag, Value}; + use common_types::{ + column_schema::{self, ColumnSchema}, + schema::Builder, + }; + use system_catalog::sys_catalog_table::TIMESTAMP_COLUMN_NAME; + + use super::*; + + const TAG_K: &str = "tagk"; + const TAG_V: &str = "tagv"; + const TAG_K1: &str = "tagk1"; + const TAG_V1: &str = 
"tagv1"; + const FIELD_NAME: &str = "field"; + const FIELD_NAME1: &str = "field1"; + const FIELD_VALUE_STRING: &str = "stringValue"; + + // tag_names field_names write_entry + fn generate_write_entry() -> (Schema, Vec, Vec, WriteEntry) { + let tag_names = vec![TAG_K.to_string(), TAG_K1.to_string()]; + let field_names = vec![FIELD_NAME.to_string(), FIELD_NAME1.to_string()]; + + let mut tag = Tag::new(); + tag.set_name_index(0); + let mut tag_val = Value::new(); + tag_val.set_string_value(TAG_V.to_string()); + tag.set_value(tag_val); + + let mut tag1 = Tag::new(); + tag1.set_name_index(1); + let mut tag_val1 = Value::new(); + tag_val1.set_string_value(TAG_V1.to_string()); + tag1.set_value(tag_val1); + let tags = vec![tag, tag1]; + + let mut field = Field::new(); + field.set_name_index(0); + let mut field_val = Value::new(); + field_val.set_float64_value(100.0); + field.set_value(field_val); + let mut field1 = Field::new(); + field1.set_name_index(1); + let mut field_val1 = Value::new(); + field_val1.set_string_value(FIELD_VALUE_STRING.to_string()); + field1.set_value(field_val1); + let mut field_group = FieldGroup::new(); + field_group.set_timestamp(1000); + field_group.set_fields(vec![field].into()); + + let mut field_group1 = FieldGroup::new(); + field_group1.set_timestamp(2000); + field_group1.set_fields(vec![field1.clone()].into()); + + let mut field_group2 = FieldGroup::new(); + field_group2.set_timestamp(3000); + field_group2.set_fields(vec![field1].into()); + + let mut write_entry = WriteEntry::new(); + + write_entry.set_tags(tags.into()); + + write_entry.set_field_groups(vec![field_group, field_group1, field_group2].into()); + + let schema_builder = Builder::new(); + let schema = schema_builder + .auto_increment_column_id(true) + .add_key_column(ColumnSchema { + id: column_schema::COLUMN_ID_UNINIT, + name: TIMESTAMP_COLUMN_NAME.to_string(), + data_type: DatumKind::Timestamp, + is_nullable: false, + is_tag: false, + comment: String::new(), + }) + .unwrap() + 
.add_key_column(ColumnSchema { + id: column_schema::COLUMN_ID_UNINIT, + name: TAG_K.to_string(), + data_type: DatumKind::String, + is_nullable: false, + is_tag: true, + comment: String::new(), + }) + .unwrap() + .add_normal_column(ColumnSchema { + id: column_schema::COLUMN_ID_UNINIT, + name: TAG_K1.to_string(), + data_type: DatumKind::String, + is_nullable: false, + is_tag: true, + comment: String::new(), + }) + .unwrap() + .add_normal_column(ColumnSchema { + id: column_schema::COLUMN_ID_UNINIT, + name: FIELD_NAME.to_string(), + data_type: DatumKind::Double, + is_nullable: true, + is_tag: false, + comment: String::new(), + }) + .unwrap() + .add_normal_column(ColumnSchema { + id: column_schema::COLUMN_ID_UNINIT, + name: FIELD_NAME1.to_string(), + data_type: DatumKind::String, + is_nullable: true, + is_tag: false, + comment: String::new(), + }) + .unwrap() + .build() + .unwrap(); + (schema, tag_names, field_names, write_entry) + } + + #[test] + fn test_write_entry_to_row_group() { + let (schema, tag_names, field_names, write_entry) = generate_write_entry(); + let rows = + write_entry_to_rows("test_table", &schema, &tag_names, &field_names, write_entry) + .unwrap(); + let row0 = vec![ + Datum::Timestamp(Timestamp::new(1000)), + Datum::String(TAG_V.into()), + Datum::String(TAG_V1.into()), + Datum::Double(100.0), + Datum::Null, + ]; + let row1 = vec![ + Datum::Timestamp(Timestamp::new(2000)), + Datum::String(TAG_V.into()), + Datum::String(TAG_V1.into()), + Datum::Null, + Datum::String(FIELD_VALUE_STRING.into()), + ]; + let row2 = vec![ + Datum::Timestamp(Timestamp::new(3000)), + Datum::String(TAG_V.into()), + Datum::String(TAG_V1.into()), + Datum::Null, + Datum::String(FIELD_VALUE_STRING.into()), + ]; + + let expect_rows = vec![ + Row::from_datums(row0), + Row::from_datums(row1), + Row::from_datums(row2), + ]; + assert_eq!(rows, expect_rows); + } +} diff --git a/server/src/handlers/admin.rs b/server/src/handlers/admin.rs new file mode 100644 index 0000000000..1779e917c6 
--- /dev/null +++ b/server/src/handlers/admin.rs @@ -0,0 +1,71 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::collections::BTreeSet; + +use crate::handlers::prelude::*; + +#[derive(Debug, Deserialize)] +pub enum Operation { + Add, + Set, + Remove, +} + +#[derive(Debug, Deserialize)] +pub struct RejectRequest { + operation: Operation, + write_reject_list: Vec, + read_reject_list: Vec, +} + +#[derive(Serialize)] +pub struct RejectResponse { + write_reject_list: BTreeSet, + read_reject_list: BTreeSet, +} + +pub async fn handle_reject( + _ctx: RequestContext, + instance: InstanceRef, + request: RejectRequest, +) -> Result { + match request.operation { + Operation::Add => { + instance + .limiter + .add_write_reject_list(request.write_reject_list); + instance + .limiter + .add_read_reject_list(request.read_reject_list); + } + Operation::Set => { + instance + .limiter + .set_write_reject_list(request.write_reject_list); + instance + .limiter + .set_read_reject_list(request.read_reject_list); + } + Operation::Remove => { + instance + .limiter + .remove_write_reject_list(request.write_reject_list); + instance + .limiter + .remove_read_reject_list(request.read_reject_list); + } + } + + Ok(RejectResponse { + write_reject_list: instance + .limiter + .get_write_reject_list() + .into_iter() + .collect::>(), + read_reject_list: instance + .limiter + .get_read_reject_list() + .into_iter() + .collect::>(), + }) +} diff --git a/server/src/handlers/error.rs b/server/src/handlers/error.rs new file mode 100644 index 0000000000..0d781f2560 --- /dev/null +++ b/server/src/handlers/error.rs @@ -0,0 +1,52 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Error of handlers + +use snafu::{Backtrace, Snafu}; + +// TODO(yingwen): Avoid printing huge sql string +// TODO(yingwen): Maybe add an error type to sql sub mod +#[derive(Debug, Snafu)] +#[snafu(visibility(pub(crate)))] +pub enum Error { + #[snafu(display("Failed to parse sql, err:{}", source))] + ParseSql { source: sql::frontend::Error }, + + #[snafu(display("Failed to create plan, query:{}, err:{}", query, source))] + CreatePlan { + query: String, + source: sql::frontend::Error, + }, + + #[snafu(display( + "Only support execute one statement now, current num:{}, query:{}.\nBacktrace:\n{}", + len, + query, + backtrace, + ))] + TooMuchStmt { + len: usize, + query: String, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to execute interpreter, query:{}, err:{}", query, source))] + InterpreterExec { + query: String, + source: interpreters::interpreter::Error, + }, + + #[snafu(display( + "Failed to convert arrow to string, query:{}, err:{}.\nBacktrace:\n{}", + query, + source, + backtrace + ))] + ArrowToString { + query: String, + source: arrow_deps::arrow::error::ArrowError, + backtrace: Backtrace, + }, +} + +define_result!(Error); diff --git a/server/src/handlers/mod.rs b/server/src/handlers/mod.rs new file mode 100644 index 0000000000..e695b3b610 --- /dev/null +++ b/server/src/handlers/mod.rs @@ -0,0 +1,21 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Request handlers + +pub mod admin; +pub mod error; +pub mod sql; + +mod prelude { + pub use catalog::manager::Manager as CatalogManager; + pub use query_engine::executor::Executor as QueryExecutor; + pub use serde_derive::{Deserialize, Serialize}; + pub use snafu::ResultExt; + pub use warp::Filter; + + pub use crate::{ + context::RequestContext, + handlers::error::{Error, Result}, + instance::InstanceRef, + }; +} diff --git a/server/src/handlers/sql.rs b/server/src/handlers/sql.rs new file mode 100644 index 0000000000..1fa96b1d54 --- /dev/null +++ b/server/src/handlers/sql.rs @@ -0,0 +1,148 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! SQL request handler + +use std::collections::HashMap; + +use arrow_deps::arrow::error::Result as ArrowResult; +use common_types::{datum::Datum, request_id::RequestId}; +use interpreters::{context::Context as InterpreterContext, factory::Factory, interpreter::Output}; +use log::info; +use query_engine::executor::RecordBatchVec; +use serde_derive::Serialize; +use snafu::ensure; +use sql::{ + frontend::{Context as SqlContext, Frontend}, + provider::CatalogMetaProvider, +}; + +use crate::handlers::{ + error::{ArrowToString, CreatePlan, InterpreterExec, ParseSql, TooMuchStmt}, + prelude::*, +}; + +#[derive(Debug, Deserialize)] +pub struct Request { + query: String, +} + +// TODO(yingwen): Improve serialize performance +#[derive(Serialize)] +#[serde(rename_all = "snake_case")] +pub enum Response { + AffectedRows(usize), + Rows(Vec>), +} + +pub async fn handle_sql( + ctx: RequestContext, + instance: InstanceRef, + request: Request, +) -> Result { + let request_id = RequestId::next_id(); + + info!( + "sql handler try to process request, request_id:{}, request:{:?}", + request_id, request + ); + + // We use tenant as schema + // TODO(yingwen): Privilege check, cannot access data of other tenant + // TODO(yingwen): Maybe move MetaProvider to instance + let provider = CatalogMetaProvider { + manager: 
&instance.catalog_manager, + default_catalog: &ctx.catalog, + default_schema: &ctx.tenant, + function_registry: &*instance.function_registry, + }; + let frontend = Frontend::new(provider); + + let mut sql_ctx = SqlContext::new(request_id); + // Parse sql, frontend error of invalid sql already contains sql + // TODO(yingwen): Maybe move sql from frontend error to outer error + let mut stmts = frontend + .parse_sql(&mut sql_ctx, &request.query) + .context(ParseSql)?; + + if stmts.is_empty() { + return Ok(Response::AffectedRows(0)); + } + + // TODO(yingwen): For simplicity, we only support executing one statement now + // TODO(yingwen): INSERT/UPDATE/DELETE can be batched + ensure!( + stmts.len() == 1, + TooMuchStmt { + len: stmts.len(), + query: request.query, + } + ); + + // Create logical plan + // Note: Remember to store sql in error when creating logical plan + let plan = frontend + .statement_to_plan(&mut sql_ctx, stmts.remove(0)) + .context(CreatePlan { + query: &request.query, + })?; + + // Execute in interpreter + let interpreter_ctx = InterpreterContext::builder(request_id) + // Use current ctx's catalog and tenant as default catalog and tenant + .default_catalog_and_schema(ctx.catalog, ctx.tenant) + .build(); + let interpreter_factory = Factory::new( + instance.query_executor.clone(), + instance.catalog_manager.clone(), + instance.table_engine.clone(), + ); + let interpreter = interpreter_factory.create(interpreter_ctx, plan); + + let output = interpreter.execute().await.context(InterpreterExec { + query: &request.query, + })?; + + // Convert output to json + let resp = convert_output(output).context(ArrowToString { + query: &request.query, + })?; + + info!( + "sql handler finished processing request, request:{:?}", + request + ); + + Ok(resp) +} + +fn convert_output(output: Output) -> ArrowResult { + match output { + Output::AffectedRows(n) => Ok(Response::AffectedRows(n)), + Output::Records(records) => convert_records(records), + } +} + +fn 
convert_records(records: RecordBatchVec) -> ArrowResult { + let total_rows = records.iter().map(|v| v.num_rows()).sum(); + let mut resp = Vec::with_capacity(total_rows); + for record_batch in records { + let num_cols = record_batch.num_columns(); + let num_rows = record_batch.num_rows(); + let schema = record_batch.schema(); + + for row_idx in 0..num_rows { + let mut row = HashMap::with_capacity(num_cols); + for col_idx in 0..num_cols { + let column = record_batch.column(col_idx); + let column = column.datum(row_idx); + + let column_name = schema.column(col_idx).name.clone(); + row.insert(column_name, column); + } + + resp.push(row); + } + } + + Ok(Response::Rows(resp)) +} diff --git a/server/src/http.rs b/server/src/http.rs new file mode 100644 index 0000000000..7318d60433 --- /dev/null +++ b/server/src/http.rs @@ -0,0 +1,341 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Http service + +use std::{convert::Infallible, net::IpAddr, sync::Arc}; + +use catalog::manager::Manager as CatalogManager; +use log::error; +use profile::Profiler; +use query_engine::executor::Executor as QueryExecutor; +use serde_derive::Serialize; +use snafu::{Backtrace, OptionExt, ResultExt, Snafu}; +use table_engine::engine::EngineRuntimes; +use tokio::sync::oneshot::{self, Sender}; +use warp::{ + header, + http::StatusCode, + reject, + reply::{self, Reply}, + Filter, +}; + +use crate::{consts, context::RequestContext, error, handlers, instance::InstanceRef, metrics}; + +#[derive(Debug)] +pub struct Config { + pub ip: String, + pub port: u16, +} + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to create request context, err:{}", source))] + CreateContext { source: crate::context::Error }, + + #[snafu(display("Failed to handle request, err:{}", source))] + HandleRequest { + source: crate::handlers::error::Error, + }, + + #[snafu(display("Missing runtimes to build service.\nBacktrace:\n{}", backtrace))] + MissingRuntimes { backtrace: 
Backtrace }, + + #[snafu(display("Missing instance to build service.\nBacktrace:\n{}", backtrace))] + MissingInstance { backtrace: Backtrace }, + + #[snafu(display( + "Fail to do heap profiling, err:{}.\nBacktrace:\n{}", + source, + backtrace + ))] + ProfileHeap { + source: profile::Error, + backtrace: Backtrace, + }, + + #[snafu(display("Fail to join async task, err:{}.", source))] + JoinAsyncTask { source: common_util::runtime::Error }, + + #[snafu(display( + "Failed to parse ip addr, ip:{}, err:{}.\nBacktrace:\n{}", + ip, + source, + backtrace + ))] + ParseIpAddr { + ip: String, + source: std::net::AddrParseError, + backtrace: Backtrace, + }, +} + +define_result!(Error); + +impl reject::Reject for Error {} + +/// Http service +/// +/// Note that the service does not owns the runtime +pub struct Service { + runtimes: Arc, + instance: InstanceRef, + profiler: Arc, + tx: Sender<()>, +} + +impl Service { + // TODO(yingwen): Maybe log error or return error + pub fn stop(self) { + let _ = self.tx.send(()); + } +} + +// TODO(yingwen): How to support non json response? 
+impl Service { + fn routes(&self) -> impl Filter + Clone { + self.home() + .or(self.metrics()) + .or(self.sql()) + .or(self.heap_profile()) + .or(self.admin_reject()) + } + + fn home(&self) -> impl Filter + Clone { + warp::path::end().and(warp::get()).map(|| { + use std::collections::HashMap; + let mut resp = HashMap::new(); + resp.insert("status", "ok"); + reply::json(&resp) + }) + } + + // TODO(yingwen): Avoid boilterplate code if there are more handlers + fn sql(&self) -> impl Filter + Clone { + warp::path!("sql") + .and(warp::post()) + // TODO(yingwen): content length limit + .and(warp::body::json()) + .and(self.with_context()) + .and(self.with_instance()) + .and_then(|req, ctx, instance| async { + // TODO(yingwen): Wrap common logic such as metrics, trace and error log + let result = handlers::sql::handle_sql(ctx, instance, req) + .await + .map_err(|e| { + // TODO(yingwen): Maybe truncate and print the sql + error!("Http service Failed to handle sql, err:{}", e); + e + }) + .context(HandleRequest); + match result { + Ok(res) => Ok(reply::json(&res)), + Err(e) => Err(reject::custom(e)), + } + }) + } + + fn metrics(&self) -> impl Filter + Clone { + warp::path!("metrics").and(warp::get()).map(metrics::dump) + } + + fn heap_profile( + &self, + ) -> impl Filter + Clone { + warp::path!("debug" / "heap_profile" / ..) 
+ .and(warp::path::param::()) + .and(warp::get()) + .and(self.with_context()) + .and(self.with_profiler()) + .and_then( + |duration_sec: u64, ctx: RequestContext, profiler: Arc| async move { + let handle = ctx.runtime.spawn_blocking(move || { + profiler.dump_mem_prof(duration_sec).context(ProfileHeap) + }); + let result = handle.await.context(JoinAsyncTask); + match result { + Ok(Ok(prof_data)) => Ok(prof_data.into_response()), + Ok(Err(e)) => Err(reject::custom(e)), + Err(e) => Err(reject::custom(e)), + } + }, + ) + } + + fn with_context( + &self, + ) -> impl Filter + Clone { + let default_catalog = self + .instance + .catalog_manager + .default_catalog_name() + .to_string(); + let default_schema = self + .instance + .catalog_manager + .default_schema_name() + .to_string(); + //TODO(boyan) use read/write runtime by sql type. + let runtime = self.runtimes.bg_runtime.clone(); + + header::optional::(consts::CATALOG_HEADER) + .and(header::optional::(consts::TENANT_HEADER)) + .and_then(move |catalog: Option<_>, tenant: Option<_>| { + // Clone the captured variables + let default_catalog = default_catalog.clone(); + let default_schema = default_schema.clone(); + let runtime = runtime.clone(); + async { + RequestContext::builder() + .catalog(catalog.unwrap_or(default_catalog)) + .tenant(tenant.unwrap_or(default_schema)) + .runtime(runtime) + .build() + .context(CreateContext) + .map_err(reject::custom) + } + }) + } + + fn with_profiler(&self) -> impl Filter,), Error = Infallible> + Clone { + let profiler = self.profiler.clone(); + warp::any().map(move || profiler.clone()) + } + + fn with_instance( + &self, + ) -> impl Filter,), Error = Infallible> + Clone { + let instance = self.instance.clone(); + warp::any().map(move || instance.clone()) + } + + fn admin_reject( + &self, + ) -> impl Filter + Clone { + warp::path!("reject") + .and(warp::post()) + .and(warp::body::json()) + .and(self.with_context()) + .and(self.with_instance()) + .and_then(|req, ctx, instance| async { + 
let result = handlers::admin::handle_reject(ctx, instance, req) + .await + .map_err(|e| { + error!("Http service failed to handle admin reject, err:{}", e); + e + }) + .context(HandleRequest); + + match result { + Ok(res) => Ok(reply::json(&res)), + Err(e) => Err(reject::custom(e)), + } + }) + } +} + +/// Service builder +pub struct Builder { + config: Config, + runtimes: Option>, + instance: Option>, +} + +impl Builder { + pub fn new(config: Config) -> Self { + Self { + config, + runtimes: None, + instance: None, + } + } + + pub fn runtimes(mut self, runtimes: Arc) -> Self { + self.runtimes = Some(runtimes); + self + } + + pub fn instance(mut self, instance: InstanceRef) -> Self { + self.instance = Some(instance); + self + } +} + +impl Builder { + /// Build and start the service + pub fn build(self) -> Result> { + let runtimes = self.runtimes.context(MissingRuntimes)?; + let instance = self.instance.context(MissingInstance)?; + let (tx, rx) = oneshot::channel(); + + let service = Service { + runtimes: runtimes.clone(), + instance, + profiler: Arc::new(Profiler::default()), + tx, + }; + + let ip_addr: IpAddr = self + .config + .ip + .parse() + .context(ParseIpAddr { ip: self.config.ip })?; + + // Register filters to warp and rejection handler + let routes = service.routes().recover(handle_rejection); + let (_addr, server) = + warp::serve(routes).bind_with_graceful_shutdown((ip_addr, self.config.port), async { + rx.await.ok(); + }); + // Run the service + runtimes.bg_runtime.spawn(server); + + Ok(service) + } +} + +#[derive(Debug, Serialize)] +struct ErrorResponse { + code: u16, + message: String, +} + +fn error_to_status_code(err: &Error) -> StatusCode { + match err { + Error::CreateContext { .. } => StatusCode::BAD_REQUEST, + // TODO(yingwen): Map handle request error to more accurate status code + Error::HandleRequest { .. } + | Error::MissingRuntimes { .. } + | Error::MissingInstance { .. } + | Error::ParseIpAddr { .. } + | Error::ProfileHeap { .. 
} + | Error::JoinAsyncTask { .. } => StatusCode::INTERNAL_SERVER_ERROR, + } +} + +async fn handle_rejection( + rejection: warp::Rejection, +) -> std::result::Result { + let code; + let message; + + if rejection.is_not_found() { + code = StatusCode::NOT_FOUND; + message = String::from("NOT_FOUND"); + } else if let Some(err) = rejection.find() { + code = error_to_status_code(err); + let err_string = err.to_string(); + message = error::first_line_in_error(&err_string).to_string(); + } else { + error!("handle error: {:?}", rejection); + code = StatusCode::INTERNAL_SERVER_ERROR; + message = format!("UNKNOWN_ERROR: {:?}", rejection); + } + + let json = reply::json(&ErrorResponse { + code: code.as_u16(), + message, + }); + + Ok(reply::with_status(json, code)) +} diff --git a/server/src/instance.rs b/server/src/instance.rs new file mode 100644 index 0000000000..64d3ada775 --- /dev/null +++ b/server/src/instance.rs @@ -0,0 +1,26 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Instance contains shared states of service + +use std::sync::Arc; + +use table_engine::engine::TableEngineRef; +use udf::registry::FunctionRegistryRef; + +use crate::limiter::Limiter; + +/// A cluster instance. Usually there is only one instance per cluster +/// +/// C: catalog::manager::Manager +/// Q: query_engine::executor::Executor +pub struct Instance { + pub catalog_manager: C, + pub query_executor: Q, + pub table_engine: TableEngineRef, + // User defined functions registry. + pub function_registry: FunctionRegistryRef, + pub limiter: Limiter, +} + +/// A reference counted instance pointer +pub type InstanceRef = Arc>; diff --git a/server/src/lib.rs b/server/src/lib.rs new file mode 100644 index 0000000000..122735a07f --- /dev/null +++ b/server/src/lib.rs @@ -0,0 +1,25 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Rpc server + +// TODO(yingwen): +// Borrow some ideas from tikv: https://github.com/tikv/tikv/blob/dc8ce2cf6a8904cb3dad556f71b11bac3531689b/src/server/service/kv.rs#L51 + +#[macro_use] +extern crate common_util; + +mod avro_util; +pub mod config; +mod consts; +mod context; +mod error; +mod grpc; +mod handlers; +mod http; +mod instance; +pub mod limiter; +pub mod logger; +mod metrics; +mod router; +pub mod server; +pub mod table_engine; diff --git a/server/src/limiter.rs b/server/src/limiter.rs new file mode 100644 index 0000000000..f594b2028b --- /dev/null +++ b/server/src/limiter.rs @@ -0,0 +1,194 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::{collections::HashSet, sync::RwLock}; + +use arrow_deps::datafusion::catalog::TableReference; +use sql::plan::Plan; + +pub struct Limiter { + write_reject_list: RwLock>, + read_reject_list: RwLock>, +} + +impl Default for Limiter { + fn default() -> Self { + Self { + write_reject_list: RwLock::new(HashSet::new()), + read_reject_list: RwLock::new(HashSet::new()), + } + } +} + +impl Limiter { + pub fn should_limit(&self, plan: &Plan) -> bool { + match plan { + Plan::Query(query) => { + let read_reject_list = self.read_reject_list.read().unwrap().clone(); + for table in read_reject_list { + if query + .tables + .get(TableReference::from(table.as_str())) + .is_some() + { + return true; + } + } + false + } + Plan::Insert(insert) => self + .write_reject_list + .read() + .unwrap() + .contains(insert.table.name()), + _ => false, + } + } + + pub fn add_write_reject_list(&self, reject_list: Vec) { + self.write_reject_list + .write() + .unwrap() + .extend(reject_list.into_iter()) + } + + pub fn add_read_reject_list(&self, reject_list: Vec) { + self.read_reject_list + .write() + .unwrap() + .extend(reject_list.into_iter()) + } + + pub fn set_write_reject_list(&self, reject_list: Vec) { + *self.write_reject_list.write().unwrap() = reject_list.into_iter().collect(); + } + + pub fn 
set_read_reject_list(&self, reject_list: Vec) { + *self.read_reject_list.write().unwrap() = reject_list.into_iter().collect(); + } + + pub fn get_write_reject_list(&self) -> HashSet { + self.write_reject_list.read().unwrap().clone() + } + + pub fn get_read_reject_list(&self) -> HashSet { + self.read_reject_list.read().unwrap().clone() + } + + pub fn remove_write_reject_list(&self, reject_list: Vec) { + let mut write_reject_list = self.write_reject_list.write().unwrap(); + for value in reject_list { + write_reject_list.remove(&value); + } + } + + pub fn remove_read_reject_list(&self, reject_list: Vec) { + let mut read_reject_list = self.read_reject_list.write().unwrap(); + for value in reject_list { + read_reject_list.remove(&value); + } + } +} + +#[cfg(test)] +mod tests { + use common_types::request_id::RequestId; + use sql::{parser::Parser, plan::Plan, planner::Planner, tests::MockMetaProvider}; + + use crate::limiter::Limiter; + + fn sql_to_plan(meta_provider: &MockMetaProvider, sql: &str) -> Plan { + let planner = Planner::new(meta_provider, RequestId::next_id(), 1); + let mut statements = Parser::parse_sql(sql).unwrap(); + assert_eq!(statements.len(), 1); + planner.statement_to_plan(statements.remove(0)).unwrap() + } + + fn prepare() -> (MockMetaProvider, Limiter) { + let mock = MockMetaProvider::default(); + + let reject_list = vec!["test_table".to_string()]; + let limiter = Limiter::default(); + limiter.set_read_reject_list(reject_list.clone()); + limiter.set_write_reject_list(reject_list); + (mock, limiter) + } + + #[test] + fn test_limiter() { + let (mock, limiter) = prepare(); + let query = "select * from test_table"; + let query_plan = sql_to_plan(&mock, query); + assert!(limiter.should_limit(&query_plan)); + + let insert="INSERT INTO test_table(key1, key2, field1,field2) VALUES('tagk', 1638428434000,100, 'hello3')"; + let insert_plan = sql_to_plan(&mock, insert); + assert!(limiter.should_limit(&insert_plan)); + } + + #[test] + fn test_limiter_remove() 
{ + let (mock, limiter) = prepare(); + let test_data = vec!["test_table".to_string()]; + + let query = "select * from test_table"; + let query_plan = sql_to_plan(&mock, query); + assert!(limiter.should_limit(&query_plan)); + + let insert="INSERT INTO test_table(key1, key2, field1,field2) VALUES('tagk', 1638428434000,100, 'hello3')"; + let insert_plan = sql_to_plan(&mock, insert); + assert!(limiter.should_limit(&insert_plan)); + + limiter.remove_write_reject_list(test_data.clone()); + limiter.remove_read_reject_list(test_data); + assert!(!limiter.should_limit(&query_plan)); + assert!(!limiter.should_limit(&insert_plan)); + } + + #[test] + fn test_limiter_add() { + let (mock, limiter) = prepare(); + let test_data = vec!["test_table2".to_string()]; + + let query = "select * from test_table2"; + let query_plan = sql_to_plan(&mock, query); + assert!(!limiter.should_limit(&query_plan)); + + let insert="INSERT INTO test_table2(key1, key2, field1,field2) VALUES('tagk', 1638428434000,100, 'hello3')"; + let insert_plan = sql_to_plan(&mock, insert); + assert!(!limiter.should_limit(&insert_plan)); + + limiter.add_write_reject_list(test_data.clone()); + limiter.add_read_reject_list(test_data); + assert!(limiter.should_limit(&query_plan)); + assert!(limiter.should_limit(&insert_plan)); + } + + #[test] + fn test_limiter_set() { + let (mock, limiter) = prepare(); + let test_data = vec!["test_table2".to_string()]; + + let query = "select * from test_table"; + let query_plan = sql_to_plan(&mock, query); + assert!(limiter.should_limit(&query_plan)); + + let query2 = "select * from test_table2"; + let query_plan2 = sql_to_plan(&mock, query2); + assert!(!limiter.should_limit(&query_plan2)); + + let insert="INSERT INTO test_table(key1, key2, field1,field2) VALUES('tagk', 1638428434000,100, 'hello3')"; + let insert_plan = sql_to_plan(&mock, insert); + assert!(limiter.should_limit(&insert_plan)); + + let insert2="INSERT INTO test_table2(key1, key2, field1,field2) VALUES('tagk', 
1638428434000,100, 'hello3')"; + let insert_plan2 = sql_to_plan(&mock, insert2); + assert!(!limiter.should_limit(&insert_plan2)); + + limiter.set_read_reject_list(test_data.clone()); + limiter.set_write_reject_list(test_data); + assert!(!limiter.should_limit(&query_plan)); + assert!(!limiter.should_limit(&insert_plan)); + assert!(limiter.should_limit(&query_plan2)); + assert!(limiter.should_limit(&insert_plan2)); + } +} diff --git a/server/src/logger.rs b/server/src/logger.rs new file mode 100644 index 0000000000..a05ecd44ec --- /dev/null +++ b/server/src/logger.rs @@ -0,0 +1,32 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::str::FromStr; + +use log::SetLoggerError; +use logger::{Level, LogDispatcher, RuntimeLevel}; + +use crate::config::Config; + +pub fn init_log(config: &Config) -> Result { + let level = match Level::from_str(&config.log_level) { + Ok(v) => v, + Err(e) => { + panic!( + "Parse log level failed, level: {}, err: {:?}", + &config.log_level, e + ); + } + }; + + let term_drain = logger::term_drainer(); + let drain = LogDispatcher::new(term_drain); + + // Use async and init stdlog + logger::init_log( + drain, + level, + config.enable_async_log, + config.async_log_channel_len, + true, + ) +} diff --git a/server/src/metrics.rs b/server/src/metrics.rs new file mode 100644 index 0000000000..89dd08fdbd --- /dev/null +++ b/server/src/metrics.rs @@ -0,0 +1,19 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Metrics util for server. + +use log::warn; +use prometheus::{Encoder, TextEncoder}; + +/// Gather and dump prometheus to string. 
+pub fn dump() -> String { + let mut buffer = vec![]; + let encoder = TextEncoder::new(); + let metric_families = prometheus::gather(); + for mf in metric_families { + if let Err(e) = encoder.encode(&[mf], &mut buffer) { + warn!("prometheus encoding error, err:{}", e); + } + } + String::from_utf8(buffer).unwrap() +} diff --git a/server/src/router.rs b/server/src/router.rs new file mode 100644 index 0000000000..aa687c714b --- /dev/null +++ b/server/src/router.rs @@ -0,0 +1,196 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::{ + collections::HashMap, + hash::{Hash, Hasher}, + sync::Arc, +}; + +use ceresdbproto::storage::{Endpoint, Route, RouteRequest}; +use log::info; +use meta_client::{MetaClient, ShardId}; +use serde_derive::Deserialize; +use twox_hash::XxHash64; + +use crate::error::{ErrNoCause, Result, StatusCode}; + +/// Hash seed to build hasher. Modify the seed will result in different route +/// result! +const HASH_SEED: u64 = 0; + +pub type RouterRef = Arc; + +pub trait Router { + fn route(&self, schema: &str, req: RouteRequest) -> Result>; +} + +#[derive(Debug, Deserialize)] +pub struct PrefixRule { + /// Schema name of the prefix. + pub schema: String, + /// Prefix of the table name. + pub prefix: String, + /// The shard of matched tables. + pub shard: ShardId, +} + +#[derive(Debug, Deserialize)] +pub struct HashRule { + /// Schema name of the prefix. + pub schema: String, + /// The shard list for hash rule. 
+ pub shards: Vec, +} + +#[derive(Debug, Default, Deserialize)] +pub struct RuleList { + pub prefix_rules: Vec, + pub hash_rules: Vec, +} + +impl RuleList { + pub fn split_by_schema(self) -> SchemaRules { + let mut schema_rules = HashMap::new(); + + for rule in self.prefix_rules { + let rule_list = match schema_rules.get_mut(&rule.schema) { + Some(v) => v, + None => schema_rules + .entry(rule.schema.clone()) + .or_insert_with(RuleList::default), + }; + + rule_list.prefix_rules.push(rule); + } + + for rule in self.hash_rules { + let rule_list = match schema_rules.get_mut(&rule.schema) { + Some(v) => v, + None => schema_rules + .entry(rule.schema.clone()) + .or_insert_with(RuleList::default), + }; + + rule_list.hash_rules.push(rule); + } + + schema_rules + } +} + +// Schema -> Rule list of the schema. +type SchemaRules = HashMap; + +pub struct RuleBasedRouter { + meta_client: Arc, + schema_rules: SchemaRules, +} + +impl RuleBasedRouter { + pub fn new(meta_client: Arc, rules: RuleList) -> Self { + let schema_rules = rules.split_by_schema(); + + info!("RuleBasedRouter init with rules, rules:{:?}", schema_rules); + + Self { + meta_client, + schema_rules, + } + } + + fn maybe_route_by_rule(metric: &str, rule_list: &RuleList) -> Option { + for prefix_rule in &rule_list.prefix_rules { + if metric.starts_with(&prefix_rule.prefix) { + return Some(prefix_rule.shard); + } + } + + if let Some(hash_rule) = rule_list.hash_rules.get(0) { + let total_shards = hash_rule.shards.len(); + let hash_value = hash_metric(metric); + let index = hash_value as usize % total_shards; + + return Some(hash_rule.shards[index]); + } + + None + } + + #[inline] + fn route_by_hash(metric: &str, total_shards: usize) -> ShardId { + let hash_value = hash_metric(metric); + (hash_value as usize % total_shards) as ShardId + } + + fn route_metric( + metric: &str, + rule_list_opt: Option<&RuleList>, + total_shards: usize, + ) -> ShardId { + if let Some(rule_list) = rule_list_opt { + if let Some(shard_id) = 
Self::maybe_route_by_rule(metric, rule_list) { + return shard_id; + } + } + + // Fallback to hash route rule. + Self::route_by_hash(metric, total_shards) + } +} + +impl Router for RuleBasedRouter { + fn route(&self, schema: &str, req: RouteRequest) -> Result> { + let cluster_view = self.meta_client.get_cluster_view(); + if let Some(shard_view_map) = cluster_view.schema_shards.get(schema) { + if shard_view_map.is_empty() { + return ErrNoCause { + code: StatusCode::NotFound, + msg: "shards from meta is empty", + } + .fail(); + } + + // Get rule list of this schema. + let rule_list_opt = self.schema_rules.get(schema); + + // TODO(yingwen): Better way to get total shard number + let total_shards = shard_view_map.len(); + let mut route_vec = Vec::with_capacity(req.metrics.len()); + for metric in req.metrics { + let mut route = Route::new(); + route.set_metric(metric); + + let shard_id = Self::route_metric(route.get_metric(), rule_list_opt, total_shards); + + let mut endpoint = Endpoint::new(); + if let Some(shard_view) = shard_view_map.get(&shard_id) { + let node = &shard_view.node; + endpoint.set_ip(node.addr.clone()); + endpoint.set_port(node.port); + } else { + return ErrNoCause { + code: StatusCode::NotFound, + msg: format!( + "Shard not found, metric:{}, shard_id:{}", + route.get_metric(), + shard_id + ), + } + .fail(); + } + + route.set_endpoint(endpoint); + route_vec.push(route); + } + return Ok(route_vec); + } + + Ok(Vec::new()) + } +} + +fn hash_metric(metric: &str) -> u64 { + let mut hasher = XxHash64::with_seed(HASH_SEED); + metric.hash(&mut hasher); + hasher.finish() +} diff --git a/server/src/server.rs b/server/src/server.rs new file mode 100644 index 0000000000..90e5a999b9 --- /dev/null +++ b/server/src/server.rs @@ -0,0 +1,180 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! 
Server + +use std::sync::Arc; + +use catalog::manager::Manager as CatalogManager; +use grpcio::Environment; +use query_engine::executor::Executor as QueryExecutor; +use snafu::{Backtrace, OptionExt, ResultExt, Snafu}; +use table_engine::engine::{EngineRuntimes, TableEngineRef}; +use udf::registry::FunctionRegistryRef; + +use crate::{ + config::Config, + grpc::{self, RpcServices}, + http::{self, Service}, + instance::{Instance, InstanceRef}, + limiter::Limiter, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Missing runtimes.\nBacktrace:\n{}", backtrace))] + MissingRuntimes { backtrace: Backtrace }, + + #[snafu(display("Missing catalog manager.\nBacktrace:\n{}", backtrace))] + MissingCatalogManager { backtrace: Backtrace }, + + #[snafu(display("Missing query executor.\nBacktrace:\n{}", backtrace))] + MissingQueryExecutor { backtrace: Backtrace }, + + #[snafu(display("Missing table engine.\nBacktrace:\n{}", backtrace))] + MissingTableEngine { backtrace: Backtrace }, + + #[snafu(display("Missing function registry.\nBacktrace:\n{}", backtrace))] + MissingFunctionRegistry { backtrace: Backtrace }, + + #[snafu(display("Missing limiter.\nBacktrace:\n{}", backtrace))] + MissingLimiter { backtrace: Backtrace }, + + #[snafu(display("Failed to start http service, err:{}", source))] + StartHttpService { source: crate::http::Error }, + + #[snafu(display("Failed to register system catalog, err:{}", source))] + RegisterSystemCatalog { source: catalog::manager::Error }, + + #[snafu(display("Failed to build grpc service, err:{}", source))] + BuildGrpcService { source: crate::grpc::Error }, + + #[snafu(display("Failed to start grpc service, err:{}", source))] + StartGrpcService { source: crate::grpc::Error }, +} + +define_result!(Error); + +// TODO(yingwen): Consider a config manager +/// Server +pub struct Server { + http_service: Service, + rpc_services: RpcServices, +} + +impl Server { + pub fn stop(mut self) { + self.rpc_services.shutdown(); + 
self.http_service.stop(); + } + + pub async fn start(&mut self) -> Result<()> { + self.rpc_services.start().await.context(StartGrpcService) + } +} + +#[must_use] +pub struct Builder { + config: Config, + runtimes: Option>, + catalog_manager: Option, + query_executor: Option, + table_engine: Option, + function_registry: Option, + limiter: Limiter, +} + +impl Builder { + pub fn new(config: Config) -> Self { + Self { + config, + runtimes: None, + catalog_manager: None, + query_executor: None, + table_engine: None, + function_registry: None, + limiter: Limiter::default(), + } + } + + pub fn runtimes(mut self, runtimes: Arc) -> Self { + self.runtimes = Some(runtimes); + self + } + + pub fn catalog_manager(mut self, val: C) -> Self { + self.catalog_manager = Some(val); + self + } + + pub fn query_executor(mut self, val: Q) -> Self { + self.query_executor = Some(val); + self + } + + pub fn table_engine(mut self, val: TableEngineRef) -> Self { + self.table_engine = Some(val); + self + } + + pub fn function_registry(mut self, val: FunctionRegistryRef) -> Self { + self.function_registry = Some(val); + self + } + + pub fn limiter(mut self, val: Limiter) -> Self { + self.limiter = val; + self + } + + /// Build and run the server + pub fn build(self) -> Result> { + // Build runtimes + let runtimes = self.runtimes.context(MissingRuntimes)?; + + // Build instance + let catalog_manager = self.catalog_manager.context(MissingCatalogManager)?; + let query_executor = self.query_executor.context(MissingQueryExecutor)?; + let table_engine = self.table_engine.context(MissingTableEngine)?; + let function_registry = self.function_registry.context(MissingFunctionRegistry)?; + let instance = Instance { + catalog_manager, + query_executor, + table_engine, + function_registry, + limiter: self.limiter, + }; + let instance = InstanceRef::new(instance); + + // Create http config + let http_config = http::Config { + ip: self.config.bind_addr.clone(), + port: self.config.http_port, + }; + + // 
Start http service + let http_service = http::Builder::new(http_config) + .runtimes(runtimes.clone()) + .instance(instance.clone()) + .build() + .context(StartHttpService)?; + + let meta_client_config = self.config.meta_client; + let env = Arc::new(Environment::new(self.config.grpc_server_cq_count)); + let rpc_services = grpc::Builder::new() + .bind_addr(self.config.bind_addr) + .port(self.config.grpc_port) + .meta_client_config(meta_client_config) + .env(env) + .runtimes(runtimes) + .instance(instance) + .route_rules(self.config.route_rules) + .build() + .context(BuildGrpcService)?; + + let server = Server { + http_service, + rpc_services, + }; + Ok(server) + } +} diff --git a/server/src/table_engine.rs b/server/src/table_engine.rs new file mode 100644 index 0000000000..7f7083b91c --- /dev/null +++ b/server/src/table_engine.rs @@ -0,0 +1,97 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Table engine implementation + +use std::sync::Arc; + +use analytic_engine::AnalyticTableEngine; +use async_trait::async_trait; +use table_engine::{ + engine::{ + CreateTableRequest, DropTableRequest, OpenTableRequest, Result, TableEngine, + UnknownEngineType, + }, + memory::MemoryTable, + table::TableRef, + ANALYTIC_ENGINE_TYPE, MEMORY_ENGINE_TYPE, +}; + +/// Memory table engine implementation +// Mainly for test purpose now +pub struct MemoryTableEngine; + +#[async_trait] +impl TableEngine for MemoryTableEngine { + fn engine_type(&self) -> &str { + MEMORY_ENGINE_TYPE + } + + async fn close(&self) -> Result<()> { + Ok(()) + } + + async fn create_table(&self, request: CreateTableRequest) -> Result { + Ok(Arc::new(MemoryTable::new( + request.table_name, + request.table_id, + request.table_schema, + MEMORY_ENGINE_TYPE.to_string(), + ))) + } + + async fn drop_table(&self, _request: DropTableRequest) -> Result { + Ok(true) + } + + async fn open_table(&self, _request: OpenTableRequest) -> Result> { + Ok(None) + } +} + +/// Route [CreateTableRequest] to 
the correct engine by its engine type +pub struct TableEngineProxy { + /// Memory table engine + pub memory: MemoryTableEngine, + /// Analytic table engine + pub analytic: AnalyticTableEngine, +} + +#[async_trait] +impl TableEngine for TableEngineProxy { + fn engine_type(&self) -> &str { + "TableEngineProxy" + } + + async fn close(&self) -> Result<()> { + self.memory.close().await?; + self.analytic.close().await?; + + Ok(()) + } + + async fn create_table(&self, request: CreateTableRequest) -> Result { + // TODO(yingwen): Use a map + match request.engine.as_str() { + MEMORY_ENGINE_TYPE => self.memory.create_table(request).await, + ANALYTIC_ENGINE_TYPE => self.analytic.create_table(request).await, + engine_type => UnknownEngineType { engine_type }.fail(), + } + } + + async fn drop_table(&self, request: DropTableRequest) -> Result { + match request.engine.as_str() { + MEMORY_ENGINE_TYPE => self.memory.drop_table(request).await, + ANALYTIC_ENGINE_TYPE => self.analytic.drop_table(request).await, + engine_type => UnknownEngineType { engine_type }.fail(), + } + } + + /// Open table, return error if table not exists + async fn open_table(&self, request: OpenTableRequest) -> Result> { + match request.engine.as_str() { + MEMORY_ENGINE_TYPE => self.memory.open_table(request).await, + ANALYTIC_ENGINE_TYPE => self.analytic.open_table(request).await, + engine_type => UnknownEngineType { engine_type }.fail(), + } + } +} diff --git a/sql/Cargo.toml b/sql/Cargo.toml new file mode 100644 index 0000000000..3056272218 --- /dev/null +++ b/sql/Cargo.toml @@ -0,0 +1,29 @@ +[package] +name = "sql" +version = "0.1.0" +authors = ["CeresDB Authors "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[features] +test = [] + +[dependencies] +# In alphabetical order +arrow_deps = { path = "../arrow_deps" } +catalog = { path = "../catalog" } +common_types = { path = "../common_types"} +common_util = { path = "../common_util" 
} +log = "0.4" +paste = "1.0" +snafu = { version ="0.6.10", features = ["backtraces"]} +sqlparser = "0.13.0" +table_engine = { path = "../table_engine" } +udf = { path = "../udf" } +ceresdbproto = { git = "https://github.com/CeresDB/ceresdbproto.git"} +regex = "1" + +[dev-dependencies] +common_types = { path = "../common_types", features = ["test"] } +tokio = { version = "1.0", features = ["full"] } diff --git a/sql/src/ast.rs b/sql/src/ast.rs new file mode 100644 index 0000000000..e68e486e43 --- /dev/null +++ b/sql/src/ast.rs @@ -0,0 +1,80 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! SQL statement + +use sqlparser::ast::{ + ColumnDef, ObjectName, SqlOption, Statement as SqlStatement, TableConstraint, +}; + +/// Statement representations +#[derive(Debug, PartialEq)] +pub enum Statement { + /// ANSI SQL AST node + Standard(Box), + // Other extensions + /// CREATE TABLE + Create(CreateTable), + /// Drop TABLE + Drop(DropTable), + Describe(DescribeTable), + AlterModifySetting(AlterModifySetting), + AlterAddColumn(AlterAddColumn), + /// SHOW CREATE TABLE + ShowCreate(ShowCreate), + Exists(ExistsTable), +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +pub enum ShowCreateObject { + Table, +} + +#[derive(Debug, PartialEq)] +pub struct CreateTable { + /// Create if not exists + pub if_not_exists: bool, + /// Table name + pub name: ObjectName, + pub columns: Vec, + pub engine: String, + pub constraints: Vec, + /// Table options in `WITH`. 
+ pub options: Vec, +} + +#[derive(Debug, PartialEq)] +pub struct DropTable { + /// Table name + pub name: ObjectName, + pub if_exists: bool, + pub engine: String, +} + +#[derive(Debug, PartialEq)] +pub struct DescribeTable { + pub table_name: ObjectName, +} + +#[derive(Debug, PartialEq)] +pub struct AlterModifySetting { + pub table_name: ObjectName, + pub options: Vec, +} + +#[derive(Debug, PartialEq)] +pub struct AlterAddColumn { + pub table_name: ObjectName, + pub columns: Vec, +} + +#[derive(Debug, PartialEq)] +pub struct ShowCreate { + pub obj_type: ShowCreateObject, + pub obj_name: ObjectName, +} + +#[derive(Debug, PartialEq)] +pub struct ExistsTable { + pub table_name: ObjectName, +} diff --git a/sql/src/container.rs b/sql/src/container.rs new file mode 100644 index 0000000000..eac30eb737 --- /dev/null +++ b/sql/src/container.rs @@ -0,0 +1,175 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Table container + +use std::{collections::HashMap, sync::Arc}; + +pub use arrow_deps::datafusion::catalog::{ResolvedTableReference, TableReference}; +use table_engine::provider::TableProviderAdapter; + +// Rust has poor support of using tuple as map key, so we use a 3 level +// map to store catalog -> schema -> table mapping +type CatalogMap = HashMap; +type SchemaMap = HashMap; +type TableMap = HashMap>; + +/// Container to hold table adapters +/// +/// Optimized for default catalog and schema +#[derive(Default)] +pub struct TableContainer { + default_catalog: String, + default_schema: String, + default_tables: HashMap>, + other_tables: CatalogMap, +} + +impl TableContainer { + pub fn new(default_catalog: String, default_schema: String) -> Self { + Self { + default_catalog, + default_schema, + default_tables: HashMap::new(), + other_tables: CatalogMap::new(), + } + } + + /// Catalog num + pub fn num_catalogs(&self) -> usize { + if self.other_tables.is_empty() { + 1 + } else { + self.other_tables.len() + 1 + } + } + + pub fn get(&self, 
name: TableReference) -> Option> { + match name { + TableReference::Bare { table } => self.get_default(table), + TableReference::Partial { schema, table } => { + if schema == self.default_schema { + self.get_default(table) + } else { + self.get_other(&self.default_catalog, schema, table) + } + } + TableReference::Full { + catalog, + schema, + table, + } => { + if catalog == self.default_catalog && schema == self.default_schema { + self.get_default(table) + } else { + self.get_other(catalog, schema, table) + } + } + } + } + + fn get_default(&self, table: &str) -> Option> { + self.default_tables.get(table).cloned() + } + + fn get_other( + &self, + catalog: &str, + schema: &str, + table: &str, + ) -> Option> { + self.other_tables + .get(catalog) + .and_then(|schemas| schemas.get(schema)) + .and_then(|tables| tables.get(table)) + .cloned() + } + + pub fn insert(&mut self, name: TableReference, table_adapter: Arc) { + match name { + TableReference::Bare { table } => self.insert_default(table, table_adapter), + TableReference::Partial { schema, table } => { + if schema == self.default_schema { + self.insert_default(table, table_adapter) + } else { + self.insert_other( + self.default_catalog.clone(), + schema.to_string(), + table.to_string(), + table_adapter, + ) + } + } + TableReference::Full { + catalog, + schema, + table, + } => { + if catalog == self.default_catalog && schema == self.default_schema { + self.insert_default(table, table_adapter) + } else { + self.insert_other( + catalog.to_string(), + schema.to_string(), + table.to_string(), + table_adapter, + ) + } + } + } + } + + fn insert_default(&mut self, table: &str, table_adapter: Arc) { + self.default_tables.insert(table.to_string(), table_adapter); + } + + fn insert_other( + &mut self, + catalog: String, + schema: String, + table: String, + table_adapter: Arc, + ) { + self.other_tables + .entry(catalog) + .or_insert_with(HashMap::new) + .entry(schema) + .or_insert_with(HashMap::new) + .insert(table, 
table_adapter); + } + + /// Visit all tables + /// + /// If f returns error, stop iteration and return the error + pub fn visit(&self, mut f: F) -> Result<(), E> + where + F: FnMut(ResolvedTableReference, &Arc) -> Result<(), E>, + { + // Visit default tables first + for (table, adapter) in &self.default_tables { + // default_catalog/default_schema can be empty string, but that's + // ok since we have table under them + let table_ref = ResolvedTableReference { + catalog: &self.default_catalog, + schema: &self.default_schema, + table, + }; + f(table_ref, adapter)?; + } + + // Visit other tables + for (catalog, schemas) in &self.other_tables { + for (schema, tables) in schemas { + for (table, adapter) in tables { + let table_ref = ResolvedTableReference { + catalog, + schema, + table, + }; + f(table_ref, adapter)?; + } + } + } + + Ok(()) + } +} diff --git a/sql/src/frontend.rs b/sql/src/frontend.rs new file mode 100644 index 0000000000..f45e6def4d --- /dev/null +++ b/sql/src/frontend.rs @@ -0,0 +1,108 @@ +// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Frontend + +use std::{convert::TryInto, sync::Arc}; + +use ceresdbproto::prometheus::PrometheusQueryRequest; +use common_types::request_id::RequestId; +use snafu::{ResultExt, Snafu}; +use table_engine::table; + +use crate::{ + ast::Statement, + parser::Parser, + plan::Plan, + planner::Planner, + promql::{ColumnNames, Expr}, + provider::MetaProvider, +}; + +#[derive(Debug, Snafu)] +pub enum Error { + // Invalid sql is quite common, so we don't provide a backtrace now. + #[snafu(display("Invalid sql, sql:{}, err:{}", sql, source))] + InvalidSql { + sql: String, + source: sqlparser::parser::ParserError, + }, + + // TODO(yingwen): Should we store stmt here? 
+ #[snafu(display("Failed to create plan, err:{}", source))] + CreatePlan { source: crate::planner::Error }, + + #[snafu(display("Invalid prom request, err:{}", source))] + InvalidPromRequest { source: crate::promql::Error }, +} + +define_result!(Error); + +pub type StatementVec = Vec; + +/// Context used by Frontend +/// +/// We can collect metrics and trace info in it instead of using global +/// metrics or trace collector. +pub struct Context { + /// Id of the query request. + pub request_id: RequestId, + /// Parallelism to read table. + pub read_parallelism: usize, +} + +impl Context { + pub fn new(request_id: RequestId) -> Self { + Self { + request_id, + read_parallelism: table::DEFAULT_READ_PARALLELISM, + } + } +} + +/// SQL frontend implementation +/// +/// Thought the parser supports using multiple statements in a sql, but +/// this frontend only support planning one statement at a time now +#[derive(Debug)] +pub struct Frontend